pyspiral 0.7.8__cp312-abi3-macosx_11_0_arm64.whl → 0.7.10__cp312-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyspiral might be problematic. Click here for more details.
- {pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/METADATA +1 -1
- {pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/RECORD +9 -9
- spiral/_lib.abi3.so +0 -0
- spiral/cli/tables.py +15 -0
- spiral/core/table/__init__.pyi +11 -0
- spiral/dataset.py +10 -1
- spiral/enrichment.py +123 -14
- {pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/WHEEL +0 -0
- {pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/entry_points.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
pyspiral-0.7.
|
|
2
|
-
pyspiral-0.7.
|
|
3
|
-
pyspiral-0.7.
|
|
1
|
+
pyspiral-0.7.10.dist-info/METADATA,sha256=rCgXce3dHmwg5oIvXTHJnnMMEX93NFLvTRbUY0Ns9-Y,1875
|
|
2
|
+
pyspiral-0.7.10.dist-info/WHEEL,sha256=KQvxBiy7GLcML6Ad3w_ZPrgSvER1uXd7aYb6wy6b44Y,103
|
|
3
|
+
pyspiral-0.7.10.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
|
|
4
4
|
spiral/__init__.py,sha256=PwaYBWFBtB7cYi7peMmhk_Lm5XzjRoLwOtLbUhc1ZDo,1449
|
|
5
|
-
spiral/_lib.abi3.so,sha256=
|
|
5
|
+
spiral/_lib.abi3.so,sha256=rkT4wXMbVHeuV_ika6WCBCV5ByzXHhzho67BdWq7VmM,71030800
|
|
6
6
|
spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
|
|
7
7
|
spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
|
|
8
8
|
spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
|
|
@@ -30,7 +30,7 @@ spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
|
|
|
30
30
|
spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
|
|
31
31
|
spiral/cli/projects.py,sha256=1M1nGrBT-t0aY9RV5Cnmzy7YrhIvmHwdkpa3y9j8rG8,5756
|
|
32
32
|
spiral/cli/state.py,sha256=10wTIVQ0SJkY67Z6-KQ1LFlt3aVIPmZhoHFdTwp4kNA,130
|
|
33
|
-
spiral/cli/tables.py,sha256=
|
|
33
|
+
spiral/cli/tables.py,sha256=6vt6EBGt7I9b0kAQ6sQORbmWiKbRdH4ubQYjjuNBXEg,6900
|
|
34
34
|
spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
|
|
35
35
|
spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
|
|
36
36
|
spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
|
|
@@ -49,18 +49,18 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
|
|
|
49
49
|
spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
|
|
50
50
|
spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
|
|
51
51
|
spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
|
|
52
|
-
spiral/core/table/__init__.pyi,sha256=
|
|
52
|
+
spiral/core/table/__init__.pyi,sha256=h84QDg6hLuPcmRpavx5zOZM77ZCi2-YwIlrrUZJp1sE,4374
|
|
53
53
|
spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
|
|
54
54
|
spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
|
|
55
55
|
spiral/core/table/spec/__init__.pyi,sha256=fVuc2j3uoTdWfYNm720OfUIgrLYw9fRwj44maI5bgdY,5709
|
|
56
56
|
spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
|
|
57
|
-
spiral/dataset.py,sha256=
|
|
57
|
+
spiral/dataset.py,sha256=S8pdiBXIhwMxQiJYgF7UI_8HkN7pZO798UzlO1LNXy4,8409
|
|
58
58
|
spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
|
|
59
59
|
spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
60
|
spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
|
|
61
61
|
spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
|
|
62
62
|
spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
|
|
63
|
-
spiral/enrichment.py,sha256=
|
|
63
|
+
spiral/enrichment.py,sha256=t3CFnidG1kGHeJk1zIMVyImKapUJQx1OXvGn88brOo4,11059
|
|
64
64
|
spiral/expressions/__init__.py,sha256=ZsD8g7vB0G7xy19GUiH4m79kw7KEkTQRwJl5Gn1cgtw,8049
|
|
65
65
|
spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
|
|
66
66
|
spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
|
|
@@ -106,4 +106,4 @@ spiral/table.py,sha256=p95AYv6b7e14F3t7j-B-r45k9CtG84ngikdlAhh9WxA,12260
|
|
|
106
106
|
spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
|
|
107
107
|
spiral/transaction.py,sha256=bI5oqBAmPMSF0yOOYcPfGbV37Xc1-_V-wQNKw1xOlTA,4136
|
|
108
108
|
spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
|
|
109
|
-
pyspiral-0.7.
|
|
109
|
+
pyspiral-0.7.10.dist-info/RECORD,,
|
spiral/_lib.abi3.so
CHANGED
|
Binary file
|
spiral/cli/tables.py
CHANGED
|
@@ -60,6 +60,21 @@ def ls(
|
|
|
60
60
|
CONSOLE.print(rich_table)
|
|
61
61
|
|
|
62
62
|
|
|
63
|
+
@app.command(help="Show the leading rows of the table.")
|
|
64
|
+
def head(
|
|
65
|
+
project: ProjectArg,
|
|
66
|
+
table: Annotated[str | None, Option(help="Table name.")] = None,
|
|
67
|
+
dataset: Annotated[str | None, Option(help="Dataset name.")] = None,
|
|
68
|
+
n: Annotated[int, Option("-n", help="Maximum number of rows to show. Defaults to 10.")] = 10,
|
|
69
|
+
):
|
|
70
|
+
import polars as pl
|
|
71
|
+
|
|
72
|
+
_, t = get_table(project, table, dataset)
|
|
73
|
+
|
|
74
|
+
with pl.Config(tbl_rows=-1):
|
|
75
|
+
CONSOLE.print(t.to_polars().limit(n).collect())
|
|
76
|
+
|
|
77
|
+
|
|
63
78
|
def validate_non_empty_str(text: str) -> bool | str:
|
|
64
79
|
if len(text) > 0:
|
|
65
80
|
return True
|
spiral/core/table/__init__.pyi
CHANGED
|
@@ -60,6 +60,13 @@ class ScanState:
|
|
|
60
60
|
class MaterializablePlan:
|
|
61
61
|
pass
|
|
62
62
|
|
|
63
|
+
class EvaluatedExecutablePlan:
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
class EvaluatedPlanStream:
|
|
67
|
+
def __next__(self) -> EvaluatedExecutablePlan: ...
|
|
68
|
+
def __iter__(self) -> EvaluatedPlanStream: ...
|
|
69
|
+
|
|
63
70
|
class Scan:
|
|
64
71
|
def key_schema(self) -> Schema: ...
|
|
65
72
|
def schema(self) -> Schema: ...
|
|
@@ -90,6 +97,10 @@ class Scan:
|
|
|
90
97
|
# If `infinite` is True, shards are shuffled after exhausted but not before the first pass.
|
|
91
98
|
# Otherwise, shards are not shuffle and shuffle config is only used for shuffle buffer.
|
|
92
99
|
...
|
|
100
|
+
|
|
101
|
+
def evaluate_analyze(
|
|
102
|
+
self, key_table: pa.Table | pa.RecordBatch | None = None, batch_readahead: int | None = None
|
|
103
|
+
) -> EvaluatedPlanStream: ...
|
|
93
104
|
def metrics(self) -> dict[str, Any]: ...
|
|
94
105
|
|
|
95
106
|
class KeySpaceState:
|
spiral/dataset.py
CHANGED
|
@@ -226,7 +226,16 @@ class TableScanner(ds.Scanner):
|
|
|
226
226
|
|
|
227
227
|
def head(self, num_rows: int):
|
|
228
228
|
"""Return the first `num_rows` rows of the dataset."""
|
|
229
|
-
|
|
229
|
+
|
|
230
|
+
kwargs = {}
|
|
231
|
+
if num_rows <= 10_000:
|
|
232
|
+
# We are unlikely to need more than a couple batches
|
|
233
|
+
kwargs["batch_readahead"] = 1
|
|
234
|
+
# The progress bar length is the total number of splits in this dataset. We will likely
|
|
235
|
+
# stop streaming early. As a result, the progress bar is misleading.
|
|
236
|
+
kwargs["hide_progress_bar"] = True
|
|
237
|
+
|
|
238
|
+
reader = self._scan.to_record_batches(key_table=self.key_table, **kwargs)
|
|
230
239
|
batches = []
|
|
231
240
|
row_count = 0
|
|
232
241
|
for batch in reader:
|
spiral/enrichment.py
CHANGED
|
@@ -4,14 +4,14 @@ from functools import partial
|
|
|
4
4
|
from typing import TYPE_CHECKING, Optional
|
|
5
5
|
|
|
6
6
|
from spiral.core.client import Shard
|
|
7
|
-
from spiral.core.table import
|
|
8
|
-
from spiral.core.table.spec import Operation
|
|
7
|
+
from spiral.core.table import KeyRange
|
|
8
|
+
from spiral.core.table.spec import Key, Operation
|
|
9
9
|
from spiral.expressions import Expr
|
|
10
10
|
|
|
11
11
|
if TYPE_CHECKING:
|
|
12
12
|
import dask.distributed
|
|
13
13
|
|
|
14
|
-
from spiral import KeySpaceIndex, Table
|
|
14
|
+
from spiral import KeySpaceIndex, Scan, Table
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
@@ -50,7 +50,7 @@ class Enrichment:
|
|
|
50
50
|
"""The filter expression."""
|
|
51
51
|
return self._where
|
|
52
52
|
|
|
53
|
-
def _scan(self) -> Scan:
|
|
53
|
+
def _scan(self) -> "Scan":
|
|
54
54
|
return self._table.spiral.scan(self._projection, where=self._where)
|
|
55
55
|
|
|
56
56
|
def apply(
|
|
@@ -90,6 +90,7 @@ class Enrichment:
|
|
|
90
90
|
index: Optional["KeySpaceIndex"] = None,
|
|
91
91
|
partition_size_bytes: int | None = None,
|
|
92
92
|
tx_dump: str | None = None,
|
|
93
|
+
checkpoint_dump: str | None = None,
|
|
93
94
|
client: Optional["dask.distributed.Client"] = None,
|
|
94
95
|
**kwargs,
|
|
95
96
|
) -> None:
|
|
@@ -109,6 +110,7 @@ class Enrichment:
|
|
|
109
110
|
partition_size_bytes: The maximum partition size in bytes.
|
|
110
111
|
If not provided, the default partition size is used.
|
|
111
112
|
tx_dump: Optional path to dump the transaction JSON for debugging.
|
|
113
|
+
checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
|
|
112
114
|
client: Optional Dask distributed client. If not provided, a new client will be created
|
|
113
115
|
**kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
|
|
114
116
|
such as `address` to connect to an existing cluster.
|
|
@@ -126,11 +128,23 @@ class Enrichment:
|
|
|
126
128
|
tx = self._table.txn()
|
|
127
129
|
plan_scan = self._scan()
|
|
128
130
|
|
|
129
|
-
# Determine the "tasks".
|
|
130
|
-
shards =
|
|
131
|
-
|
|
131
|
+
# Determine the "tasks".
|
|
132
|
+
shards = None
|
|
133
|
+
# Use checkpoint, if provided.
|
|
134
|
+
if checkpoint_dump is not None:
|
|
135
|
+
checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
|
|
136
|
+
if checkpoint is None:
|
|
137
|
+
logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
|
|
138
|
+
else:
|
|
139
|
+
logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
|
|
140
|
+
shards = [Shard(kr, None) for kr in checkpoint]
|
|
141
|
+
# Fallback to index-based sharding.
|
|
142
|
+
if shards is None and index is not None:
|
|
132
143
|
# TODO(marko): This will use index's asof automatically.
|
|
133
144
|
shards = self._table.spiral.internal.compute_shards(index.core)
|
|
145
|
+
# Fallback to default sharding.
|
|
146
|
+
if shards is None:
|
|
147
|
+
shards = plan_scan.shards()
|
|
134
148
|
|
|
135
149
|
# Partially bind the enrichment function.
|
|
136
150
|
_compute = partial(
|
|
@@ -139,13 +153,60 @@ class Enrichment:
|
|
|
139
153
|
state_json=plan_scan.core.plan_state().to_json(),
|
|
140
154
|
output_table_id=self._table.table_id,
|
|
141
155
|
partition_size_bytes=partition_size_bytes,
|
|
156
|
+
incremental=checkpoint_dump is not None,
|
|
142
157
|
)
|
|
143
158
|
enrichments = client.map(_compute, shards)
|
|
144
159
|
|
|
145
160
|
logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
161
|
+
|
|
162
|
+
failed_ranges = []
|
|
163
|
+
try:
|
|
164
|
+
for result, shard in zip(client.gather(enrichments), shards):
|
|
165
|
+
result: EnrichmentTaskResult
|
|
166
|
+
|
|
167
|
+
if result.error is not None:
|
|
168
|
+
logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
|
|
169
|
+
failed_ranges.append(shard.key_range)
|
|
170
|
+
continue
|
|
171
|
+
|
|
172
|
+
tx.include(result.ops)
|
|
173
|
+
except Exception as e:
|
|
174
|
+
# If not incremental, re-raise the exception.
|
|
175
|
+
if checkpoint_dump is None:
|
|
176
|
+
raise e
|
|
177
|
+
|
|
178
|
+
# Handle worker failures (e.g., KilledWorker from Dask)
|
|
179
|
+
from dask.distributed import KilledWorker
|
|
180
|
+
|
|
181
|
+
if not isinstance(e, KilledWorker):
|
|
182
|
+
# Re-raise other exceptions
|
|
183
|
+
raise e
|
|
184
|
+
|
|
185
|
+
logger.error(f"Dask worker was killed during enrichment: {e}")
|
|
186
|
+
|
|
187
|
+
# Try to gather partial results and mark remaining tasks as failed
|
|
188
|
+
for future, shard in zip(enrichments, shards):
|
|
189
|
+
if future.done() and not future.exception():
|
|
190
|
+
try:
|
|
191
|
+
result = future.result()
|
|
192
|
+
|
|
193
|
+
if result.error is not None:
|
|
194
|
+
logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
|
|
195
|
+
failed_ranges.append(shard.key_range)
|
|
196
|
+
continue
|
|
197
|
+
|
|
198
|
+
tx.include(result.ops)
|
|
199
|
+
except Exception:
|
|
200
|
+
# Task failed or incomplete, add to failed ranges
|
|
201
|
+
failed_ranges.append(shard.key_range)
|
|
202
|
+
else:
|
|
203
|
+
# Task didn't complete, add to failed ranges
|
|
204
|
+
failed_ranges.append(shard.key_range)
|
|
205
|
+
|
|
206
|
+
# Dump checkpoint of failed ranges, if any.
|
|
207
|
+
if checkpoint_dump is not None:
|
|
208
|
+
logger.info(f"Dumping checkpoint with {len(failed_ranges)} failed ranges to {checkpoint_dump}.")
|
|
209
|
+
_checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)
|
|
149
210
|
|
|
150
211
|
if tx.is_empty():
|
|
151
212
|
logger.warning("Transaction not committed. No rows were read for enrichment.")
|
|
@@ -155,20 +216,58 @@ class Enrichment:
|
|
|
155
216
|
tx.commit(compact=True, tx_dump=tx_dump)
|
|
156
217
|
|
|
157
218
|
|
|
219
|
+
def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
|
|
220
|
+
import json
|
|
221
|
+
import os
|
|
222
|
+
|
|
223
|
+
if not os.path.exists(checkpoint_dump):
|
|
224
|
+
return None
|
|
225
|
+
|
|
226
|
+
with open(checkpoint_dump) as f:
|
|
227
|
+
data = json.load(f)
|
|
228
|
+
return [
|
|
229
|
+
KeyRange(begin=Key(bytes.fromhex(r["begin"])), end=Key(bytes.fromhex(r["end"])))
|
|
230
|
+
for r in data.get("key_ranges", [])
|
|
231
|
+
]
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
|
|
235
|
+
import json
|
|
236
|
+
import os
|
|
237
|
+
|
|
238
|
+
os.makedirs(os.path.dirname(checkpoint_dump), exist_ok=True)
|
|
239
|
+
with open(checkpoint_dump, "w") as f:
|
|
240
|
+
json.dump(
|
|
241
|
+
{"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
|
|
242
|
+
f,
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
|
|
158
246
|
@dataclasses.dataclass
|
|
159
247
|
class EnrichmentTaskResult:
|
|
160
248
|
ops: list[Operation]
|
|
249
|
+
error: str | None = None
|
|
161
250
|
|
|
162
251
|
def __getstate__(self):
|
|
163
|
-
return {
|
|
252
|
+
return {
|
|
253
|
+
"ops": [op.to_json() for op in self.ops],
|
|
254
|
+
"error": self.error,
|
|
255
|
+
}
|
|
164
256
|
|
|
165
257
|
def __setstate__(self, state):
|
|
166
258
|
self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
|
|
259
|
+
self.error = state["error"]
|
|
167
260
|
|
|
168
261
|
|
|
169
262
|
# NOTE(marko): This function must be picklable!
|
|
170
263
|
def _enrichment_task(
|
|
171
|
-
shard: Shard,
|
|
264
|
+
shard: Shard,
|
|
265
|
+
*,
|
|
266
|
+
settings_dict,
|
|
267
|
+
state_json,
|
|
268
|
+
output_table_id,
|
|
269
|
+
partition_size_bytes: int | None,
|
|
270
|
+
incremental: bool,
|
|
172
271
|
) -> EnrichmentTaskResult:
|
|
173
272
|
# Returns operations that can be included in a transaction.
|
|
174
273
|
from spiral import Scan, Spiral
|
|
@@ -182,5 +281,15 @@ def _enrichment_task(
|
|
|
182
281
|
table = sp.table(output_table_id)
|
|
183
282
|
|
|
184
283
|
task_tx = table.txn()
|
|
185
|
-
|
|
186
|
-
|
|
284
|
+
|
|
285
|
+
try:
|
|
286
|
+
task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
|
|
287
|
+
return EnrichmentTaskResult(ops=task_tx.take())
|
|
288
|
+
except Exception as e:
|
|
289
|
+
task_tx.abort()
|
|
290
|
+
|
|
291
|
+
if incremental:
|
|
292
|
+
return EnrichmentTaskResult(ops=[], error=str(e))
|
|
293
|
+
|
|
294
|
+
logger.error(f"Enrichment task failed for shard {shard}: {e}")
|
|
295
|
+
raise e
|
|
File without changes
|
|
File without changes
|