pyspiral-0.6.6-cp312-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.6.6.dist-info/METADATA +51 -0
- pyspiral-0.6.6.dist-info/RECORD +102 -0
- pyspiral-0.6.6.dist-info/WHEEL +4 -0
- pyspiral-0.6.6.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +35 -0
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +411 -0
- spiral/api/__init__.py +78 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +164 -0
- spiral/api/filesystems.py +134 -0
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +219 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +22 -0
- spiral/api/workers.py +40 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +216 -0
- spiral/cli/__init__.py +88 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +14 -0
- spiral/cli/app.py +104 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +76 -0
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +89 -0
- spiral/cli/login.py +24 -0
- spiral/cli/orgs.py +89 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +147 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables.py +174 -0
- spiral/cli/telemetry.py +17 -0
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +50 -0
- spiral/cli/workloads.py +58 -0
- spiral/client.py +178 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +237 -0
- spiral/core/table/__init__.pyi +101 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +58 -0
- spiral/core/table/spec/__init__.pyi +213 -0
- spiral/dataloader.py +285 -0
- spiral/dataset.py +255 -0
- spiral/datetime_.py +27 -0
- spiral/debug/__init__.py +0 -0
- spiral/debug/manifests.py +87 -0
- spiral/debug/metrics.py +56 -0
- spiral/debug/scan.py +266 -0
- spiral/expressions/__init__.py +276 -0
- spiral/expressions/base.py +157 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg.py +31 -0
- spiral/iterable_dataset.py +106 -0
- spiral/key_space_index.py +44 -0
- spiral/project.py +199 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
- spiral/protogen/_/google/__init__.py +0 -0
- spiral/protogen/_/google/protobuf/__init__.py +2310 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +190 -0
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +6196 -0
- spiral/protogen/_/substrait/extensions/__init__.py +169 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/scan.py +285 -0
- spiral/server.py +17 -0
- spiral/settings.py +114 -0
- spiral/snapshot.py +56 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +133 -0
- spiral/streaming_/stream.py +157 -0
- spiral/substrait_.py +274 -0
- spiral/table.py +293 -0
- spiral/text_index.py +17 -0
- spiral/transaction.py +58 -0
- spiral/types_.py +6 -0
spiral/table.py
ADDED
@@ -0,0 +1,293 @@
from datetime import datetime
from typing import TYPE_CHECKING, Any

from spiral.core.table import Table as CoreTable
from spiral.core.table.spec import Schema
from spiral.expressions.base import Expr, ExprLike
from spiral.settings import settings
from spiral.snapshot import Snapshot
from spiral.transaction import Transaction

if TYPE_CHECKING:
    import duckdb
    import polars as pl
    import pyarrow.dataset as ds
    import streaming
    import torch.utils.data as torchdata  # noqa

    from spiral.client import Spiral
    from spiral.dataloader import SpiralDataLoader
    from spiral.key_space_index import KeySpaceIndex


class Table(Expr):
    """API for interacting with a SpiralDB Table.

    A Spiral Table is a powerful and flexible way to store, analyze,
    and query massive and/or multimodal datasets. The data model will feel familiar
    to users of SQL- or DataFrame-style systems, yet it is designed to be more flexible,
    more powerful, and more useful in the context of modern data processing.

    Tables are stored in and queried directly from object storage.
    """

    def __init__(self, spiral: "Spiral", core: CoreTable, *, identifier: str | None = None):
        super().__init__(core.__expr__)

        self.spiral = spiral
        self.core = core

        self._key_schema = core.key_schema
        self._key_columns = set(self._key_schema.names)
        self._identifier = identifier

    @property
    def table_id(self) -> str:
        return self.core.id

    @property
    def identifier(self) -> str:
        """Returns the fully qualified identifier of the table."""
        return self._identifier or self.table_id

    @property
    def dataset(self) -> str | None:
        """Returns the dataset of the table."""
        if self._identifier is None:
            return None
        _, dataset, _ = self._identifier.split(".")
        return dataset

    @property
    def name(self) -> str | None:
        """Returns the name of the table."""
        if self._identifier is None:
            return None
        _, _, name = self._identifier.split(".")
        return name

    def last_modified_at(self) -> int:
        return self.core.get_wal(asof=None).last_modified_at

    def __str__(self):
        return self.identifier

    def __repr__(self):
        return f'Table("{self.identifier}")'

    def __getitem__(self, item: str) -> Expr:
        return super().__getitem__(item)

    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
        return super().select(*paths, exclude=exclude)

    @property
    def key_schema(self) -> Schema:
        """Returns the key schema of the table."""
        return self._key_schema

    def schema(self) -> Schema:
        """Returns the FULL schema of the table.

        NOTE: This can be expensive for large tables.
        """
        return self.core.get_schema(asof=None)

    def write(
        self,
        expr: ExprLike,
        *,
        partition_size_bytes: int | None = None,
    ) -> None:
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param partition_size_bytes: The maximum partition size in bytes.
        """
        with self.txn() as txn:
            txn.write(
                expr,
                partition_size_bytes=partition_size_bytes,
            )

    def drop_columns(self, column_paths: list[str]) -> None:
        """Drops the specified columns from the table.

        :param column_paths: Fully qualified column names (e.g., "column_name" or "nested.field").
            All columns must exist; if a column doesn't exist, the function raises an error.
        """
        with self.txn() as txn:
            txn.drop_columns(column_paths)

    def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
        """Returns a snapshot of the table at the given timestamp."""
        if isinstance(asof, datetime):
            asof = int(asof.timestamp() * 1_000_000)
        return Snapshot(self, self.core.get_snapshot(asof=asof))

    def txn(self, retries: int | None = 3) -> Transaction:
        """Begins a new transaction. A transaction must be committed for its writes to become visible.

        :param retries: Maximum number of retry attempts on conflict (default: 3). Set to None for a single attempt.

        IMPORTANT: While a transaction can be used to write data to the table atomically,
        the primary key columns must be unique within the transaction.
        The behavior is undefined if this is not the case.
        """
        return Transaction(self.spiral._core.transaction(self.core, settings().file_format, retries=retries))

    def to_dataset(self) -> "ds.Dataset":
        """Returns a PyArrow Dataset representing the table."""
        return self.snapshot().to_dataset()

    def to_polars(self) -> "pl.LazyFrame":
        """Returns a Polars LazyFrame for the Spiral table."""
        return self.snapshot().to_polars()

    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
        """Returns a DuckDB relation for the Spiral table."""
        return self.snapshot().to_duckdb()

    def to_data_loader(
        self,
        index: "KeySpaceIndex",
        *,
        projection: Expr | None = None,
        **kwargs,
    ) -> "SpiralDataLoader":
        """Read into a Torch-compatible DataLoader for single-node training.

        Args:
            index: Prebuilt KeySpaceIndex to use when creating the stream.
                The index's `asof` will be used when scanning.
            projection: Optional projection to use when scanning the table instead of the index's projection.
                The projection must be compatible with the index's projection for correctness.
            **kwargs: Additional arguments passed to the SpiralDataLoader constructor.
                See Scan#to_data_loader for supported arguments.

        Returns:
            SpiralDataLoader with shuffled shards.
        """
        if index.table_id != self.table_id:
            raise ValueError("Index must be built on the same table as the scan.")
        if index.asof == 0:
            raise ValueError("Index must be synced before it can be used.")

        shards = self.spiral._core._ops().compute_shards(index=index.core)

        return self.spiral.scan(
            projection if projection is not None else index.projection,
            where=index.filter,
            asof=index.asof,
        ).to_data_loader(shards=shards, **kwargs)

    def to_distributed_data_loader(
        self,
        index: "KeySpaceIndex",
        *,
        projection: Expr | None = None,
        **kwargs,
    ) -> "SpiralDataLoader":
        """Read into a Torch-compatible DataLoader for distributed training.

        Args:
            index: Prebuilt KeySpaceIndex to use when creating the stream.
                The index's `asof` will be used when scanning.
            projection: Optional projection to use when scanning the table instead of the index's projection.
                The projection must be compatible with the index's projection for correctness.
            **kwargs: Additional arguments passed to the SpiralDataLoader constructor.
                See Scan#to_distributed_data_loader for supported arguments.

        Returns:
            SpiralDataLoader with shuffled shards.
        """
        if index.table_id != self.table_id:
            raise ValueError("Index must be built on the same table as the scan.")
        if index.asof == 0:
            raise ValueError("Index must be synced before it can be used.")

        shards = self.spiral._core._ops().compute_shards(index=index.core)

        return self.spiral.scan(
            projection if projection is not None else index.projection,
            where=index.filter,
            asof=index.asof,
        ).to_distributed_data_loader(shards=shards, **kwargs)

    def resume_data_loader(
        self,
        index: "KeySpaceIndex",
        *,
        state: dict[str, Any],
        **kwargs,
    ) -> "SpiralDataLoader":
        """Resume a previously created DataLoader from a saved state.

        Args:
            index: Prebuilt KeySpaceIndex to use when creating the stream.
                The index's `asof` will be used when scanning.
            state: State dictionary returned from a previous DataLoader's `state_dict()` method.
            **kwargs: Additional arguments passed to the SpiralDataLoader constructor.
                See Scan#to_data_loader for supported arguments.

        Returns:
            SpiralDataLoader with shuffled shards.
        """
        if index.table_id != self.table_id:
            raise ValueError("Index must be built on the same table as the scan.")
        if index.asof == 0:
            raise ValueError("Index must be synced before it can be used.")

        shards = self.spiral._core._ops().compute_shards(index=index.core)

        return self.spiral.scan(
            index.projection,
            where=index.filter,
            asof=index.asof,
        ).resume_data_loader(shards=shards, state=state, **kwargs)

    def to_streaming(
        self,
        index: "KeySpaceIndex",
        *,
        projection: Expr | None = None,
        cache_dir: str | None = None,
        shard_row_block_size: int | None = None,
    ) -> "streaming.Stream":
        """Returns a stream to be used with MosaicML's StreamingDataset.

        Requires the `streaming` package to be installed.

        Args:
            index: Prebuilt KeySpaceIndex to use when creating the stream.
                The index's `asof` will be used when scanning.
            projection: Optional projection to use when scanning the table instead of the index's projection.
                The projection must be compatible with the index's projection for correctness.
            cache_dir: Directory to use for caching data. If None, a temporary directory will be used.
            shard_row_block_size: Number of rows per segment of a shard file. Defaults to 8192.
                Use a lower value for larger rows.
        """
        from spiral.streaming_ import SpiralStream

        if index.table_id != self.table_id:
            raise ValueError("Index must be built on the same table as the scan.")
        if index.asof == 0:
            raise ValueError("Index must be synced before it can be used.")

        # We know the table from the projection is in the session because this method is on it.
        scan = self.spiral.scan(
            projection if projection is not None else index.projection,
            where=index.filter,
            asof=index.asof,
        )
        shards = self.spiral._core._ops().compute_shards(index=index.core)

        return SpiralStream(
            sp=self.spiral,
            scan=scan,
            shards=shards,
            cache_dir=cache_dir,
            shard_row_block_size=shard_row_block_size,
        )  # type: ignore[return-value]
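Taken together, the class above gives a write path through txn() (the context manager commits on clean exit and aborts on exception) and a read path through snapshot() and its converters. The sketch below illustrates that flow; note that the default Spiral() construction and the sp.table() lookup are assumptions for illustration only and are not confirmed by this diff.

from datetime import datetime

from spiral.client import Spiral

sp = Spiral()  # assumed: default-constructible client (spiral/client.py)
table = sp.table("my-project.my-dataset.events")  # hypothetical lookup; identifiers have three dot-separated parts

# Atomic write: commit happens automatically when the block exits cleanly.
with table.txn() as txn:
    txn.write(some_struct_expr)  # placeholder ExprLike; must evaluate to a struct array

# Reads go through a snapshot, optionally pinned to a point in time.
snap = table.snapshot(asof=datetime(2025, 1, 1))  # datetimes are converted to microsecond timestamps
lazy = table.to_polars()  # Polars LazyFrame over the latest snapshot
rel = table.to_duckdb()   # DuckDB relation over the latest snapshot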
spiral/text_index.py
ADDED
@@ -0,0 +1,17 @@
from spiral.core.client import TextIndex as CoreTextIndex
from spiral.expressions import Expr


class TextIndex(Expr):
    def __init__(self, core: CoreTextIndex, *, name: str | None = None):
        super().__init__(core.__expr__)
        self.core = core
        self._name = name

    @property
    def index_id(self) -> str:
        return self.core.id

    @property
    def name(self) -> str:
        return self._name or self.index_id
spiral/transaction.py
ADDED
@@ -0,0 +1,58 @@
from spiral.core.table import Transaction as CoreTransaction
from spiral.expressions.base import ExprLike


class Transaction:
    """Spiral table transaction.

    IMPORTANT: While a transaction can be used to write data to the table atomically,
    the primary key columns must be unique within the transaction.
    """

    def __init__(self, core: CoreTransaction):
        self._core = core

    @property
    def status(self) -> str:
        """The status of the transaction."""
        return self._core.status

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self._core.commit()
        else:
            self._core.abort()

    def write(self, expr: ExprLike, *, partition_size_bytes: int | None = None):
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param partition_size_bytes: The maximum partition size in bytes.
            If not provided, the default partition size is used.
        """
        from spiral import expressions as se

        record_batches = se.evaluate(expr)

        self._core.write(record_batches, partition_size_bytes=partition_size_bytes)

    def drop_columns(self, column_paths: list[str]):
        """Drops the specified columns from the table.

        :param column_paths: Fully qualified column names (e.g., "column_name" or "nested.field").
            All columns must exist; if a column doesn't exist, the function raises an error.
        """
        self._core.drop_columns(column_paths)

    def commit(self):
        """Commit the transaction."""
        self._core.commit()

    def abort(self):
        """Abort the transaction."""
        self._core.abort()
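Because __exit__ commits or aborts automatically, the explicit commit() and abort() methods matter mainly when a transaction is managed outside a with block. A minimal sketch of that manual flow, assuming a Table handle named table as defined in spiral/table.py above and placeholder ExprLike values:

txn = table.txn(retries=None)  # single attempt, no conflict retries
try:
    txn.write(batch_one)  # placeholder expressions; primary keys must be
    txn.write(batch_two)  # unique across the whole transaction
    txn.commit()          # writes become visible only after commit
except Exception:
    txn.abort()           # aborted writes are never visible
    raise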