pyspiral-0.6.11-cp312-abi3-manylinux_2_28_aarch64.whl → pyspiral-0.6.13-cp312-abi3-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of pyspiral might be problematic.
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/METADATA +8 -5
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/RECORD +36 -30
- spiral/__init__.py +7 -0
- spiral/_lib.abi3.so +0 -0
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +15 -1
- spiral/cli/tables.py +3 -3
- spiral/client.py +12 -11
- spiral/core/client/__init__.pyi +8 -8
- spiral/core/expr/__init__.pyi +15 -0
- spiral/core/expr/images/__init__.pyi +3 -0
- spiral/core/expr/list_/__init__.pyi +4 -0
- spiral/core/expr/refs/__init__.pyi +4 -0
- spiral/core/expr/str_/__init__.pyi +3 -0
- spiral/core/expr/struct_/__init__.pyi +6 -0
- spiral/core/expr/text/__init__.pyi +5 -0
- spiral/core/expr/udf/__init__.pyi +14 -0
- spiral/core/expr/video/__init__.pyi +3 -0
- spiral/core/table/__init__.pyi +19 -1
- spiral/core/table/spec/__init__.pyi +6 -0
- spiral/dataloader.py +52 -38
- spiral/enrichment.py +153 -0
- spiral/expressions/__init__.py +15 -19
- spiral/expressions/base.py +9 -4
- spiral/expressions/http.py +10 -80
- spiral/expressions/s3.py +15 -0
- spiral/expressions/tiff.py +2 -3
- spiral/expressions/udf.py +38 -24
- spiral/project.py +6 -6
- spiral/scan.py +76 -33
- spiral/settings.py +9 -6
- spiral/streaming_/stream.py +1 -1
- spiral/table.py +41 -9
- spiral/transaction.py +42 -0
- spiral/expressions/io.py +0 -100
- spiral/expressions/mp4.py +0 -62
- spiral/expressions/png.py +0 -18
- spiral/expressions/qoi.py +0 -18
- spiral/expressions/refs.py +0 -58
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/WHEEL +0 -0
- {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/entry_points.txt +0 -0
spiral/scan.py
CHANGED
@@ -1,8 +1,10 @@
+from functools import partial
 from typing import TYPE_CHECKING, Any, Optional
 
 import pyarrow as pa
 
 from spiral.core.client import Shard, ShuffleConfig
+from spiral.core.table import KeyRange
 from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV
@@ -15,13 +17,15 @@ if TYPE_CHECKING:
     import streaming  # noqa
     import torch.utils.data as torchdata  # noqa
 
+    from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader, World  # noqa
 
 
 class Scan:
     """Scan object."""
 
-    def __init__(self, core: CoreScan):
+    def __init__(self, spiral: "Spiral", core: CoreScan):
+        self.spiral = spiral
         self.core = core
 
     @property
@@ -34,6 +38,11 @@ class Scan:
         """Returns the schema of the scan."""
         return self.core.schema()
 
+    @property
+    def key_schema(self) -> Schema:
+        """Returns the key schema of the scan."""
+        return self.core.key_schema()
+
     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
 
@@ -44,6 +53,8 @@ class Scan:
 
     def to_record_batches(
         self,
+        *,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
         batch_size: int | None = None,
         batch_readahead: int | None = None,
@@ -51,6 +62,9 @@ class Scan:
         """Read as a stream of RecordBatches.
 
         Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
             batch_size: the maximum number of rows per returned batch.
@@ -58,6 +72,9 @@ class Scan:
                 RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
            batch_readahead: the number of batches to prefetch in the background.
        """
+        if key_range is not None and key_table is not None:
+            raise ValueError("Only one of key_range or key_table can be provided.")
+
         if isinstance(key_table, pa.RecordBatchReader):
             if batch_size is not None:
                 raise ValueError(
@@ -66,46 +83,54 @@ class Scan:
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)
 
-        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_range=key_range, key_table=key_table, batch_readahead=batch_readahead)
 
     def to_table(
         self,
+        *,
+        key_range: KeyRange | None = None,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
     ) -> pa.Table:
         """Read into a single PyArrow Table.
 
         Args:
+            key_range: Optional key range to filter the scan.
+                If provided, the scan will only return rows within the key range.
+                Only one of key_range or key_table can be provided.
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
         """
         # NOTE: Evaluates fully on Rust side which improved debuggability.
-        if DEV and not CI and key_table is None:
+        if DEV and not CI and key_table is None and key_range is None:
             rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])
 
-        return self.to_record_batches(key_table=key_table).read_all()
+        return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()
 
     def to_dask(self) -> "dd.DataFrame":
         """Read into a Dask DataFrame.
 
         Requires the `dask` package to be installed.
+
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, please reach out to the support for assistance.
         """
         import dask.dataframe as dd
-        import pandas as pd
 
-
-
-
-
-
+        _read_shard = partial(
+            _read_shard_task,
+            settings_dict=self.spiral.config.model_dump(),
+            state_json=self.core.scan_state().to_json(),
+        )
         return dd.from_map(_read_shard, self.shards())
 
-    def to_pandas(self) -> "pd.DataFrame":
+    def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
 
         Requires the `pandas` package to be installed.
         """
-        return self.to_table().to_pandas()
+        return self.to_table(key_range=key_range).to_pandas()
 
     def to_polars(self) -> "pl.DataFrame":
         """Read into a Polars DataFrame.
@@ -160,16 +185,18 @@ class Scan:
 
         Returns:
             SpiralDataLoader with shards partitioned for this rank.
-        """
-        # Example usage:
-        #
-        # Auto-detect from PyTorch distributed:
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
-        #
-        # Explicit world configuration:
-        # world = World(rank=0, world_size=4)
-        # loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
 
+        Auto-detect from PyTorch distributed:
+        ```python
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
+        ```
+
+        Explicit world configuration:
+        ```python
+        world = World(rank=0, world_size=4)
+        loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
+        ```
+        """
         from spiral.dataloader import SpiralDataLoader, World
 
         if world is None:
@@ -203,19 +230,21 @@ class Scan:
 
         Returns:
             New SpiralDataLoader instance configured to resume from the checkpoint.
+
+        Save checkpoint during training:
+        ```python
+        loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
+        checkpoint = loader.state_dict()
+        ```
+
+        Resume later - uses same shards from checkpoint:
+        ```python
+        resumed_loader = scan.resume_data_loader(
+            checkpoint,
+            batch_size=32,
+            transform_fn=my_transform,
+        )
         """
-        # Example usage:
-        #
-        # Save checkpoint during training:
-        # loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
-        # checkpoint = loader.state_dict()
-        #
-        # Resume later - uses same shards from checkpoint:
-        # resumed_loader = scan.resume_data_loader(
-        #     checkpoint,
-        #     batch_size=32,
-        #     transform_fn=my_transform,
-        # )
         from spiral.dataloader import SpiralDataLoader
 
         return SpiralDataLoader.from_state_dict(self, state, **kwargs)
@@ -283,3 +312,17 @@ class Scan:
         from spiral.debug.metrics import display_metrics
 
         display_metrics(self.metrics)
+
+
+# NOTE(marko): This function must be picklable!
+def _read_shard_task(shard: Shard, *, settings_dict, state_json) -> "pd.DataFrame":
+    from spiral import Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import Settings
+
+    settings: Settings = Settings.model_validate(settings_dict)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+
+    return task_scan.to_pandas(key_range=shard.key_range)
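The new `key_range` keyword threads through `to_record_batches`, `to_table`, and `to_pandas`, and `_read_shard_task` shows the intended pattern: each Dask task re-hydrates a `Scan` and reads only its shard's key range. A minimal sketch of the same pattern in user code, reusing `shard.key_range` exactly as `_read_shard_task` does; `scan` is an existing `Scan`, and `process` is a hypothetical downstream handler:

```python
# Read a scan shard-by-shard via the new key_range parameter.
# scan.shards() and shard.key_range are taken from the diff above;
# process() is a hypothetical stand-in for downstream handling.
for shard in scan.shards():
    df = scan.to_pandas(key_range=shard.key_range)
    process(df)
```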
spiral/settings.py
CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Annotated
 
 import typer
-from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
+from pydantic import Field, PlainSerializer, ValidatorFunctionWrapHandler, WrapValidator
 from pydantic_settings import (
     BaseSettings,
     InitSettingsSource,
@@ -28,13 +28,16 @@ PACKAGE_NAME = "pyspiral"
 
 
 def validate_token(v, handler: ValidatorFunctionWrapHandler):
-    if isinstance(v, str):
-
-
-    raise ValueError("Token value must be a string")
+    if not isinstance(v, str):
+        raise ValueError("Token value (SPIRAL__SPIRALDB__TOKEN) must be a string")
+    return Token(v)
 
 
-TokenType = Annotated[
+TokenType = Annotated[
+    Token,
+    WrapValidator(validate_token),
+    PlainSerializer(lambda token: token.expose_secret(), return_type=str),
+]
 
 
 class SpiralDBSettings(BaseSettings):
spiral/streaming_/stream.py
CHANGED
spiral/table.py
CHANGED
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
 
 from spiral.core.table import Table as CoreTable
 from spiral.core.table.spec import Schema
+from spiral.enrichment import Enrichment
 from spiral.expressions.base import Expr, ExprLike
 from spiral.settings import settings
 from spiral.snapshot import Snapshot
@@ -12,12 +13,11 @@ if TYPE_CHECKING:
     import duckdb
     import polars as pl
     import pyarrow.dataset as ds
-    import streaming
-    import torch.utils.data as torchdata  # noqa
 
     from spiral.client import Spiral
     from spiral.dataloader import SpiralDataLoader
     from spiral.key_space_index import KeySpaceIndex
+    from spiral.streaming_ import SpiralStream
 
 
 class Table(Expr):
@@ -50,6 +50,14 @@ class Table(Expr):
         """Returns the fully qualified identifier of the table."""
         return self._identifier or self.table_id
 
+    @property
+    def project(self) -> str | None:
+        """Returns the project of the table."""
+        if self._identifier is None:
+            return None
+        project, _, _ = self._identifier.split(".")
+        return project
+
     @property
     def dataset(self) -> str | None:
         """Returns the dataset of the table."""
@@ -110,6 +118,30 @@ class Table(Expr):
             partition_size_bytes=partition_size_bytes,
         )
 
+    def enrich(
+        self,
+        *projections: ExprLike,
+        where: ExprLike | None = None,
+    ) -> Enrichment:
+        """Returns an Enrichment object that, when applied, produces new columns.
+
+        Enrichment can be applied in different ways, e.g. distributed.
+
+        :param projections: Projection expressions deriving new columns to write back.
+            Expressions can be over multiple Spiral tables, but all tables including
+            this one must share the same key schema.
+        :param where: Optional filter expression to apply when reading the input tables.
+        """
+        from spiral import expressions as se
+
+        # Combine table with all projections into a single struct.
+        # The table is included to ensure key columns are present in the scan output.
+        projection = se.merge(self, *projections)
+        if where is not None:
+            where = se.lift(where)
+
+        return Enrichment(self, projection, where)
+
     def drop_columns(self, column_paths: list[str]) -> None:
         """
         Drops the specified columns from the table.
@@ -136,7 +168,7 @@ class Table(Expr):
         it is important that the primary key columns are unique within the transaction.
         The behavior is undefined if this is not the case.
         """
-        return Transaction(self.spiral.
+        return Transaction(self.spiral.core.transaction(self.core, settings().file_format, retries=retries))
 
     def to_dataset(self) -> "ds.Dataset":
         """Returns a PyArrow Dataset representing the table."""
@@ -175,7 +207,7 @@ class Table(Expr):
         if index.asof == 0:
             raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral.
+        shards = self.spiral.internal.compute_shards(index=index.core)
 
         return self.spiral.scan(
             projection if projection is not None else index.projection,
@@ -208,7 +240,7 @@ class Table(Expr):
         if index.asof == 0:
             raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral.
+        shards = self.spiral.core.internal.compute_shards(index=index.core)
 
         return self.spiral.scan(
             projection if projection is not None else index.projection,
@@ -240,7 +272,7 @@ class Table(Expr):
         if index.asof == 0:
             raise ValueError("Index have to be synced before it can be used.")
 
-        shards = self.spiral.
+        shards = self.spiral.core.internal.compute_shards(index=index.core)
 
         return self.spiral.scan(
             index.projection,
@@ -255,7 +287,7 @@ class Table(Expr):
         projection: Expr | None = None,
         cache_dir: str | None = None,
         shard_row_block_size: int | None = None,
-    ) -> "
+    ) -> "SpiralStream":
         """Returns a stream to be used with MosaicML's StreamingDataset.
 
         Requires `streaming` package to be installed.
@@ -282,7 +314,7 @@ class Table(Expr):
             where=index.filter,
             asof=index.asof,
         )
-        shards = self.spiral.
+        shards = self.spiral.internal.compute_shards(index=index.core)
 
         return SpiralStream(
             sp=self.spiral,
@@ -290,4 +322,4 @@ class Table(Expr):
             shards=shards,
             cache_dir=cache_dir,
             shard_row_block_size=shard_row_block_size,
-        )
+        )
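The new `enrich` pairs with `Transaction.writeback` in the next file: derive columns via a projection, then write the scan results back. A hypothetical sketch; only `enrich`'s signature comes from the diff, while `derived_expr`, `filter_expr`, and the `Enrichment` application API (defined in the new spiral/enrichment.py, +153 lines, not shown here) are assumptions:

```python
# Hypothetical: stage an enrichment that derives new columns.
# derived_expr / filter_expr stand in for real spiral expressions.
enrichment = table.enrich(
    derived_expr,       # projection deriving new columns (same key schema)
    where=filter_expr,  # optional filter applied when reading the inputs
)
# Applying the Enrichment (locally, distributed, ...) is implemented in
# spiral/enrichment.py, which this diff does not include.
```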
spiral/transaction.py
CHANGED
@@ -1,5 +1,8 @@
+from spiral.core.table import KeyRange
 from spiral.core.table import Transaction as CoreTransaction
+from spiral.core.table.spec import Operation
 from spiral.expressions.base import ExprLike
+from spiral.scan import Scan
 
 
 class Transaction:
@@ -17,6 +20,10 @@ class Transaction:
         """The status of the transaction."""
         return self._core.status
 
+    def is_empty(self) -> bool:
+        """Check if the transaction has no operations."""
+        return self._core.is_empty()
+
     def __enter__(self):
         return self
 
@@ -39,6 +46,27 @@ class Transaction:
 
         self._core.write(record_batches, partition_size_bytes=partition_size_bytes)
 
+    def writeback(
+        self,
+        scan: Scan,
+        *,
+        key_range: KeyRange | None = None,
+        partition_size_bytes: int | None = None,
+        batch_readahead: int | None = None,
+    ):
+        """Write back the results of a scan to the table.
+
+        :param scan: The scan to write back.
+            The scan does NOT need to be over the same table as transaction,
+            but it does need to have the same key schema.
+        :param key_range: Optional key range to limit the writeback to.
+        :param partition_size_bytes: The maximum partition size in bytes.
+        :param batch_readahead: The number of batches to read ahead when evaluating the scan.
+        """
+        self._core.writeback(
+            scan.core, key_range=key_range, partition_size_bytes=partition_size_bytes, batch_readahead=batch_readahead
+        )
+
     def drop_columns(self, column_paths: list[str]):
         """
         Drops the specified columns from the table.
@@ -49,6 +77,20 @@ class Transaction:
         """
         self._core.drop_columns(column_paths)
 
+    def take(self) -> list[Operation]:
+        """Take the operations from the transaction
+
+        Transaction can no longer be committed or aborted after calling this method.
+        ."""
+        return self._core.take()
+
+    def include(self, ops: list[Operation]):
+        """Include the given operations in the transaction.
+
+        Checks for conflicts between the included operations and any existing operations.
+        """
+        self._core.include(ops)
+
     def commit(self):
         """Commit the transaction."""
         self._core.commit()
spiral/expressions/io.py
DELETED
@@ -1,100 +0,0 @@
-import tarfile
-from io import BytesIO
-
-import pyarrow as pa
-
-from spiral.expressions.base import Expr, ExprLike
-from spiral.expressions.struct import pack
-from spiral.expressions.udf import UDF
-
-
-def read_file(path: ExprLike) -> Expr:
-    """
-    Read file path(s) from disk into a struct with a single field "bytes" containing the file contents.
-
-    Args:
-        path: Expression evaluating to an array of strings representing local disk paths.
-    """
-    to_pack = {"path": path}
-    return FileRead()(pack(to_pack))
-
-
-class FileRead(UDF):
-    RES_DTYPE: pa.DataType = pa.struct(
-        [
-            pa.field("bytes", pa.large_binary()),
-        ]
-    )
-
-    def __init__(self):
-        super().__init__("file.read")
-
-    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-        return FileRead.RES_DTYPE
-
-    def invoke(self, *input_args: pa.Array) -> pa.Array:
-        if len(input_args) != 1:
-            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-        arg = input_args[0]
-
-        res = []
-        for req in arg:
-            with open(req["path"].as_py(), "rb") as f:
-                res.append({"bytes": f.read()})
-
-        return pa.array(res, type=FileRead.RES_DTYPE)
-
-
-def read_tar(path: ExprLike = None, bytes_: ExprLike = None) -> "Expr":
-    # Untar a vector of paths / byte arrays representing tarballs.
-    if path is None and bytes_ is None:
-        raise ValueError("Expected either path or bytes_ to be provided")
-    to_pack = {}
-    if path is not None:
-        to_pack["path"] = path
-    if bytes_ is not None:
-        to_pack["bytes"] = bytes_
-    return TarRead()(pack(to_pack))
-
-
-class TarRead(UDF):
-    RES_DTYPE = pa.list_(
-        pa.struct(
-            [
-                pa.field("name", pa.string()),
-                pa.field("bytes", pa.large_binary()),
-            ]
-        )
-    )
-
-    def __init__(self):
-        super().__init__("tar.read")
-
-    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-        return TarRead.RES_DTYPE
-
-    def invoke(self, *input_args: pa.Array) -> pa.Array:
-        if len(input_args) != 1:
-            raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-        arg = input_args[0]
-
-        res = []
-        for req in arg:
-            if "path" in req:
-                kwargs = {"name": req["path"].as_py()}
-            elif "bytes" in req:
-                kwargs = {"fileobj": BytesIO(req["bytes"].as_py())}
-            else:
-                raise ValueError("Expected path or bytes_ to be provided")
-
-            files = []
-            with tarfile.open(**kwargs) as f:
-                for m in f.getmembers():
-                    m: tarfile.TarInfo
-                    if m.type == tarfile.DIRTYPE:
-                        continue
-                    # TODO(ngates): skip other types too maybe? Why are we even skipping directories?
-                    files.append({"name": m.name, "bytes": f.extractfile(m).read()})
-            res.append(files)
-
-        return pa.array(res, type=TarRead.RES_DTYPE)
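With spiral/expressions/io.py deleted and no replacement visible in this diff, callers that relied on `read_file` need their own equivalent. A minimal standalone sketch, derived from the deleted `FileRead.invoke` above, that builds the same `{bytes}` struct array with plain PyArrow:

```python
import pyarrow as pa

# Same struct layout the removed FileRead UDF produced.
RES_DTYPE = pa.struct([pa.field("bytes", pa.large_binary())])


def read_files(paths: list[str]) -> pa.Array:
    """Read local files into a struct array with a single 'bytes' field."""
    res = []
    for path in paths:
        with open(path, "rb") as f:
            res.append({"bytes": f.read()})
    return pa.array(res, type=RES_DTYPE)
```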
spiral/expressions/mp4.py
DELETED
@@ -1,62 +0,0 @@
-import pyarrow as pa
-
-from spiral.expressions.base import Expr, ExprLike
-
-_MP4_RES_DTYPE: pa.DataType = pa.struct(
-    [
-        pa.field("pixels", pa.large_binary()),
-        pa.field("height", pa.uint32()),
-        pa.field("width", pa.uint32()),
-        pa.field("frames", pa.uint32()),
-    ]
-)
-
-
-# TODO(marko): Support optional range and crop.
-# IMPORTANT: Frames is currently broken and defaults to full.
-def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str):
-    """
-    Read referenced cell in a `MP4` format. Requires `ffmpeg`.
-
-    Args:
-        expr: The referenced `Mp4` bytes.
-            A str is assumed to be the `se.aux` expression.
-        frames: The range of frames to read. Each element must be a list of two uint32,
-            frame start and frame end, or null / empty list to read all frames.
-            A str is assumed to be the `se.aux` expression.
-        crop: The crop of the frames to read. Each element must be a list of four uint32,
-            x, y, width, height or null / empty list to read full frames.
-            A str is assumed to be the `se.aux` expression.
-
-    Returns:
-        An array where each element is a decoded cropped video with fields:
-            pixels: RGB8 bytes, frames * width * height * 3.
-            width: Width of the image with type `pa.uint32()`.
-            height: Height of the image with type `pa.uint32()`.
-            frames: Number of frames with type `pa.uint32()`.
-    """
-    from spiral import _lib
-    from spiral.expressions import aux, lift
-
-    if isinstance(expr, str):
-        expr = aux(
-            expr,
-            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
-        )
-    if isinstance(frames, str):
-        frames = aux(frames, pa.list_(pa.uint32()))
-    if isinstance(crop, str):
-        crop = aux(crop, pa.list_(pa.uint32()))
-
-    expr = lift(expr)
-    frames = lift(frames)
-    crop = lift(crop)
-
-    return Expr(
-        _lib.expr.video.read(
-            expr.__expr__,
-            frames.__expr__,
-            crop.__expr__,
-            format="mp4",
-        )
-    )
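The removed helper delegated decoding to `_lib.expr.video.read` and already required `ffmpeg`. For callers that only need the decode step, a rough standalone equivalent using the `ffmpeg` CLI; frame ranges and cropping, which the old helper exposed, are omitted here:

```python
import subprocess


def decode_mp4_rgb(path: str) -> bytes:
    """Decode a video file to raw RGB8 pixel bytes (frames * width * height * 3)."""
    return subprocess.run(
        ["ffmpeg", "-v", "error", "-i", path, "-f", "rawvideo", "-pix_fmt", "rgb24", "-"],
        check=True,
        capture_output=True,
    ).stdout
```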
spiral/expressions/png.py
DELETED
@@ -1,18 +0,0 @@
-from spiral.expressions.base import Expr, ExprLike
-
-
-def encode(expr: ExprLike) -> Expr:
-    """Encode the given expression as a PNG image.
-
-    Args:
-        expr: The expression to encode.
-            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
-
-    Returns:
-        The encoded PNG images.
-    """
-    from spiral import _lib
-    from spiral.expressions import lift
-
-    expr = lift(expr)
-    return Expr(_lib.expr.img.encode(expr.__expr__, format="png"))
spiral/expressions/qoi.py
DELETED
@@ -1,18 +0,0 @@
-from spiral.expressions.base import Expr, ExprLike
-
-
-def encode(expr: ExprLike) -> Expr:
-    """Encode the given expression as a QOI image.
-
-    Args:
-        expr: The expression to encode.
-            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
-
-    Returns:
-        The encoded QOI images.
-    """
-    from spiral import _lib
-    from spiral.expressions import lift
-
-    expr = lift(expr)
-    return Expr(_lib.expr.img.encode(expr.__expr__, format="qoi"))
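Both deleted image encoders were thin wrappers over `_lib.expr.img.encode`; whether that native call remains available is not visible in this diff, though the file list does show a new spiral/core/expr/images/__init__.pyi. As a library-agnostic PNG fallback, Pillow can encode the same RGB8 pixel layout the old helpers expected (QOI has no Pillow equivalent and would need a dedicated package):

```python
from io import BytesIO

from PIL import Image


def encode_png(pixels: bytes, width: int, height: int) -> bytes:
    """Encode raw RGB8 pixels (the layout the removed helpers expected) as PNG."""
    img = Image.frombytes("RGB", (width, height), pixels)
    buf = BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
```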