pyspiral 0.2.5__cp310-abi3-macosx_11_0_arm64.whl → 0.4.0__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/METADATA +12 -14
- pyspiral-0.4.0.dist-info/RECORD +98 -0
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/WHEEL +1 -1
- spiral/__init__.py +6 -7
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +21 -14
- spiral/api/__init__.py +15 -172
- spiral/api/admin.py +12 -26
- spiral/api/client.py +160 -0
- spiral/api/filesystems.py +100 -72
- spiral/api/organizations.py +45 -58
- spiral/api/projects.py +171 -134
- spiral/api/telemetry.py +19 -0
- spiral/api/types.py +20 -0
- spiral/api/workloads.py +32 -25
- spiral/{arrow.py → arrow_.py} +12 -0
- spiral/cli/__init__.py +2 -5
- spiral/cli/admin.py +7 -12
- spiral/cli/app.py +23 -6
- spiral/cli/console.py +1 -1
- spiral/cli/fs.py +83 -18
- spiral/cli/iceberg/__init__.py +7 -0
- spiral/cli/iceberg/namespaces.py +47 -0
- spiral/cli/iceberg/tables.py +60 -0
- spiral/cli/indexes/__init__.py +19 -0
- spiral/cli/login.py +14 -5
- spiral/cli/orgs.py +90 -0
- spiral/cli/printer.py +9 -1
- spiral/cli/projects.py +136 -0
- spiral/cli/state.py +2 -0
- spiral/cli/tables/__init__.py +121 -0
- spiral/cli/telemetry.py +18 -0
- spiral/cli/types.py +8 -10
- spiral/cli/{workload.py → workloads.py} +11 -11
- spiral/{catalog.py → client.py} +22 -21
- spiral/core/client/__init__.pyi +117 -0
- spiral/core/index/__init__.pyi +15 -0
- spiral/core/table/__init__.pyi +108 -0
- spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
- spiral/core/table/metastore/__init__.pyi +62 -0
- spiral/core/{spec → table/spec}/__init__.pyi +49 -92
- spiral/datetime_.py +27 -0
- spiral/expressions/__init__.py +40 -17
- spiral/expressions/base.py +5 -5
- spiral/expressions/list_.py +1 -1
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +23 -9
- spiral/expressions/struct.py +7 -5
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +88 -88
- spiral/expressions/udf.py +3 -3
- spiral/iceberg/__init__.py +3 -0
- spiral/iceberg/client.py +33 -0
- spiral/indexes/__init__.py +5 -0
- spiral/indexes/client.py +137 -0
- spiral/indexes/index.py +34 -0
- spiral/indexes/scan.py +22 -0
- spiral/project.py +19 -110
- spiral/{proto → protogen}/_/scandal/__init__.py +32 -77
- spiral/protogen/_/spiral/table/__init__.py +22 -0
- spiral/protogen/substrait/__init__.py +3399 -0
- spiral/protogen/substrait/extensions/__init__.py +115 -0
- spiral/server.py +17 -0
- spiral/settings.py +31 -87
- spiral/substrait_.py +10 -6
- spiral/tables/__init__.py +12 -0
- spiral/tables/client.py +130 -0
- spiral/{dataset.py → tables/dataset.py} +36 -25
- spiral/tables/debug/manifests.py +70 -0
- spiral/tables/debug/metrics.py +56 -0
- spiral/{debug.py → tables/debug/scan.py} +6 -9
- spiral/tables/maintenance.py +12 -0
- spiral/tables/scan.py +193 -0
- spiral/tables/snapshot.py +78 -0
- spiral/tables/table.py +157 -0
- spiral/tables/transaction.py +52 -0
- pyspiral-0.2.5.dist-info/RECORD +0 -81
- spiral/api/tables.py +0 -94
- spiral/api/tokens.py +0 -56
- spiral/authn/authn.py +0 -89
- spiral/authn/device.py +0 -206
- spiral/authn/github_.py +0 -33
- spiral/authn/modal_.py +0 -18
- spiral/cli/org.py +0 -90
- spiral/cli/project.py +0 -107
- spiral/cli/table.py +0 -20
- spiral/cli/token.py +0 -27
- spiral/config.py +0 -26
- spiral/core/core/__init__.pyi +0 -53
- spiral/core/metastore/__init__.pyi +0 -91
- spiral/proto/_/spfs/__init__.py +0 -36
- spiral/proto/_/spiral/table/__init__.py +0 -225
- spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
- spiral/proto/__init__.py +0 -0
- spiral/proto/scandal/__init__.py +0 -45
- spiral/proto/spiral/__init__.py +0 -0
- spiral/proto/spiral/table/__init__.py +0 -96
- spiral/scan_.py +0 -168
- spiral/table.py +0 -157
- {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/entry_points.txt +0 -0
- /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
- /spiral/{core → protogen/_}/__init__.py +0 -0
- /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
- /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
- /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
- /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
- /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
- /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
- /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
- /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
- /spiral/{proto → protogen}/util.py +0 -0
- /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
spiral/scan_.py
DELETED
@@ -1,168 +0,0 @@
from collections.abc import Iterator
from datetime import datetime
from typing import TYPE_CHECKING, Any

import pyarrow as pa
from opentelemetry import trace

from spiral.core.core import TableScan
from spiral.core.spec import KeyRange, Schema
from spiral.expressions.base import ExprLike

if TYPE_CHECKING:
    import dask.dataframe as dd
    import pandas as pd
    import polars as pl
    from datasets import iterable_dataset

tracer = trace.get_tracer("pyspiral.client.scan")


def scan(
    *projections: ExprLike,
    where: ExprLike | None = None,
    asof: datetime | int | str = None,
    exclude_keys: bool = False,
    # TODO(marko): Support config.
    # config: Config | None = None,
) -> "Scan":
    """Starts a read transaction on the spiral.

    Args:
        projections: a set of expressions that return struct arrays.
        where: a query expression to apply to the data.
        asof: only data written before the given timestamp will be returned, caveats around compaction.
        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
    """
    from spiral import expressions as se

    # Combine all projections into a single struct.
    projection = se.merge(*projections)
    if where is not None:
        where = se.lift(where)

    return Scan(
        TableScan(
            projection.__expr__,
            filter=where.__expr__ if where else None,
            asof=asof,
            exclude_keys=exclude_keys,
        ),
        # config=config,
    )


class Scan:
    """Scan object."""

    def __init__(
        self,
        scan: TableScan,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ):
        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
        self._scan = scan

    @property
    def metrics(self) -> dict[str, Any]:
        """Returns metrics about the scan."""
        return self._scan.metrics()

    @property
    def schema(self) -> Schema:
        """Returns the schema of the scan."""
        return self._scan.schema()

    def is_empty(self) -> bool:
        """Check if the Spiral is empty for the given key range.

        **IMPORTANT**: False negatives are possible, but false positives are not,
        i.e. is_empty can return False and scan can return zero rows.
        """
        return self._scan.is_empty()

    def to_record_batches(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> pa.RecordBatchReader:
        """Read as a stream of RecordBatches.

        Args:
            key_table: a table of keys to "take" (including aux columns for cell-push-down).
        """
        if isinstance(key_table, pa.RecordBatchReader):
            raise NotImplementedError("RecordBatchReader is not supported as key_table")

        # Prefix non-key columns in the key table with # (auxiliary) to avoid conflicts with the scan schema.
        if key_table is not None:
            key_columns = list(self._scan.key_schema().to_arrow().names)
            key_table = key_table.rename_columns(
                {name: f"#{name}" if name not in key_columns else name for name in key_table.schema.names}
            )

        return self._scan.to_record_batches(aux_table=key_table)

    def to_table(self) -> pa.Table:
        """Read into a single PyArrow Table."""
        return self.to_record_batches().read_all()

    def to_dask(self) -> "dd.DataFrame":
        """Read into a Dask DataFrame.

        Requires the `dask` package to be installed.
        """
        import dask.dataframe as dd
        import pandas as pd

        def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
            raise NotImplementedError()

        # Fetch a set of partition ranges
        return dd.from_map(_read_key_range, self.split())

    def to_pandas(self) -> "pd.DataFrame":
        """Read into a Pandas DataFrame.

        Requires the `pandas` package to be installed.
        """
        return self.to_table().to_pandas()

    def to_polars(self) -> "pl.DataFrame":
        """Read into a Polars DataFrame.

        Requires the `polars` package to be installed.
        """
        import polars as pl

        # TODO(ngates): PR PyArrow to support lazy datasets
        return pl.from_arrow(self.to_record_batches())

    def to_pytorch(self) -> "iterable_dataset.IterableDataset":
        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.

        Requires the `datasets` package to be installed.
        """
        from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

        def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
            stream = self.to_record_batches()

            # This key is unused when training with IterableDataset.
            # Default implementation returns shard id, e.g. parquet row group id.
            for i, rb in enumerate(stream):
                yield i, pa.Table.from_batches([rb], stream.schema)

        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
        return IterableDataset(ex_iterable=ex_iterable)

    def split(self) -> list[KeyRange]:
        return self._scan.split()

    def debug(self):
        # Visualizes the scan, mainly for debugging purposes.
        # NOTE: This is not part of the API and may disappear at any moment.
        from spiral.debug import show_scan

        show_scan(self._scan)
spiral/table.py
DELETED
@@ -1,157 +0,0 @@
from datetime import datetime
from typing import TYPE_CHECKING, Literal

import pyarrow as pa

from spiral import expressions as se
from spiral.config import FILE_FORMAT, Config
from spiral.core.core import Table as CoreTable
from spiral.core.core import flush_wal, write
from spiral.expressions.base import Expr, ExprLike

if TYPE_CHECKING:
    import duckdb
    import polars as pl
    import pyarrow.dataset

    from spiral.scan_ import Scan


class Table(Expr):
    """API for interacting with a SpiralDB's Table.

    Different catalog implementations should ultimately construct a Table object.
    """

    def __init__(
        self,
        table: CoreTable,
        name: str | None = None,
    ):
        super().__init__(table.__expr__)

        self._table = table
        self._name = name or self._table.id
        self._key_schema = self._table.key_schema.to_arrow()
        self._key_columns = set(self._key_schema.names)

    @property
    def table_id(self) -> str:
        return self._table.id

    @property
    def last_modified_at(self) -> int:
        return self._table.get_wal(asof=None).last_modified_at

    def __str__(self):
        return self._name

    def __repr__(self):
        return f'Table("{self._name}")'

    def __getitem__(self, item: str) -> Expr:
        from spiral import expressions as se

        if item in self._key_columns:
            return se.var(name=item)

        return super().__getitem__(item)

    def select(self, *paths: str, exclude: list[str] = None) -> "Expr":
        # Override an expression select in the root column group to split between keys and columns.
        if exclude is not None:
            if set(exclude) & self._key_columns:
                raise ValueError(
                    "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
                )

        key_paths = set(paths) & self._key_columns
        other_paths = set(paths) - key_paths
        if not key_paths:
            return super().select(*paths, exclude=exclude)

        from spiral import expressions as se

        return se.merge(se.pack({key: se.var(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))

    @property
    def key_schema(self) -> pa.Schema:
        """Returns the key schema of the table."""
        return self._key_schema

    @property
    def schema(self) -> pa.Schema:
        """Returns the FULL schema of the table.

        NOTE: This can be expensive for large tables.
        """
        return self._table.get_schema(asof=None)

    def to_dataset(self) -> "pyarrow.dataset.Dataset":
        """Returns a PyArrow Dataset representing the table."""
        from .dataset import TableDataset

        return TableDataset(self)

    def to_polars(self) -> "pl.LazyFrame":
        """Returns a Polars LazyFrame for the Spiral table."""
        import polars as pl

        return pl.scan_pyarrow_dataset(self.to_dataset())

    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
        """Returns a DuckDB relation for the Spiral table."""
        import duckdb

        return duckdb.from_arrow(self.to_dataset())

    def scan(
        self,
        *projections: ExprLike,
        where: ExprLike | None = None,
        asof: datetime | int | str = None,
        exclude_keys: bool = False,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ) -> "Scan":
        """Reads the table. If projections are not provided, the entire table is read.

        See `spiral.scan` for more information.
        """
        from spiral.scan_ import scan

        if not projections:
            projections = [self]

        return scan(
            *projections,
            where=where,
            asof=asof,
            exclude_keys=exclude_keys,
            # config=config,
        )

    # NOTE: "vortex" is valid format. We don't want that visible in the API docs.
    def write(
        self,
        expr: ExprLike,
        *,
        format: Literal["parquet"] | None = None,
        # TODO(joe): support group_by, and config
        config: Config | None = None,
    ) -> None:
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param format: the format to write the data in. Defaults to "parquet".
        :param config: The configuration to use for this write.
        """
        write(
            self._table,
            se.lift(expr).__expr__,
            format=format or FILE_FORMAT,
            partition_size=config.partition_file_min_size if config else None,
        )
        # Flush the WAL if configured.
        if config is not None and config.flush_wal_on_write:
            flush_wal(self._table, manifest_format=format or FILE_FORMAT)
13 files without changes (path moves only; see the +0 -0 entries in the summary above).