pyspiral 0.7.8__cp312-abi3-manylinux_2_28_aarch64.whl → 0.7.10__cp312-abi3-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyspiral might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyspiral
3
- Version: 0.7.8
3
+ Version: 0.7.10
4
4
  Classifier: Intended Audience :: Science/Research
5
5
  Classifier: Operating System :: OS Independent
6
6
  Classifier: Programming Language :: Python
@@ -1,8 +1,8 @@
1
- pyspiral-0.7.8.dist-info/METADATA,sha256=tGCOA2CfvPk_EGKZ35MCaeZ0x6SIm-XZNnLMHd5ao-U,1874
2
- pyspiral-0.7.8.dist-info/WHEEL,sha256=I5JYpyYzeAl2SOerY_wvkm-HJti0rDQc6zMeJs35MpM,108
3
- pyspiral-0.7.8.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
1
+ pyspiral-0.7.10.dist-info/METADATA,sha256=rCgXce3dHmwg5oIvXTHJnnMMEX93NFLvTRbUY0Ns9-Y,1875
2
+ pyspiral-0.7.10.dist-info/WHEEL,sha256=I5JYpyYzeAl2SOerY_wvkm-HJti0rDQc6zMeJs35MpM,108
3
+ pyspiral-0.7.10.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
4
4
  spiral/__init__.py,sha256=PwaYBWFBtB7cYi7peMmhk_Lm5XzjRoLwOtLbUhc1ZDo,1449
5
- spiral/_lib.abi3.so,sha256=Xd_FEh5WITGs_pTl7JlGA3JZc7Vlzn4Wxj6cY14Rmfc,61716216
5
+ spiral/_lib.abi3.so,sha256=uQdP0Z3oFR0Ay_HN-lo05jSZ4e9AyvOs8wTrzR3TDGU,61828040
6
6
  spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
7
7
  spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
8
8
  spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -30,7 +30,7 @@ spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
30
30
  spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
31
31
  spiral/cli/projects.py,sha256=1M1nGrBT-t0aY9RV5Cnmzy7YrhIvmHwdkpa3y9j8rG8,5756
32
32
  spiral/cli/state.py,sha256=10wTIVQ0SJkY67Z6-KQ1LFlt3aVIPmZhoHFdTwp4kNA,130
33
- spiral/cli/tables.py,sha256=Mv6M8zlgG_1i_GsguYIzU-CY2GXq2fMmKnabSMWE1qI,6402
33
+ spiral/cli/tables.py,sha256=6vt6EBGt7I9b0kAQ6sQORbmWiKbRdH4ubQYjjuNBXEg,6900
34
34
  spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
35
35
  spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
36
36
  spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
@@ -49,18 +49,18 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
49
49
  spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
50
50
  spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
51
51
  spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
52
- spiral/core/table/__init__.pyi,sha256=zcf4GripPZtiwh6uHkPgVyDij1g2nYL1DogN83z5ISU,4037
52
+ spiral/core/table/__init__.pyi,sha256=h84QDg6hLuPcmRpavx5zOZM77ZCi2-YwIlrrUZJp1sE,4374
53
53
  spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
54
54
  spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
55
55
  spiral/core/table/spec/__init__.pyi,sha256=fVuc2j3uoTdWfYNm720OfUIgrLYw9fRwj44maI5bgdY,5709
56
56
  spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
57
- spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
57
+ spiral/dataset.py,sha256=S8pdiBXIhwMxQiJYgF7UI_8HkN7pZO798UzlO1LNXy4,8409
58
58
  spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
59
59
  spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
61
61
  spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
62
62
  spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
63
- spiral/enrichment.py,sha256=w0MrZ93wDuvS4sazw_8dPmnhzkQ4SAU5A1CGE7WF-F8,7046
63
+ spiral/enrichment.py,sha256=t3CFnidG1kGHeJk1zIMVyImKapUJQx1OXvGn88brOo4,11059
64
64
  spiral/expressions/__init__.py,sha256=ZsD8g7vB0G7xy19GUiH4m79kw7KEkTQRwJl5Gn1cgtw,8049
65
65
  spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
66
66
  spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
@@ -106,4 +106,4 @@ spiral/table.py,sha256=p95AYv6b7e14F3t7j-B-r45k9CtG84ngikdlAhh9WxA,12260
106
106
  spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
107
107
  spiral/transaction.py,sha256=bI5oqBAmPMSF0yOOYcPfGbV37Xc1-_V-wQNKw1xOlTA,4136
108
108
  spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
109
- pyspiral-0.7.8.dist-info/RECORD,,
109
+ pyspiral-0.7.10.dist-info/RECORD,,
spiral/_lib.abi3.so CHANGED
Binary file
spiral/cli/tables.py CHANGED
@@ -60,6 +60,21 @@ def ls(
60
60
  CONSOLE.print(rich_table)
61
61
 
62
62
 
63
+ @app.command(help="Show the leading rows of the table.")
64
+ def head(
65
+ project: ProjectArg,
66
+ table: Annotated[str | None, Option(help="Table name.")] = None,
67
+ dataset: Annotated[str | None, Option(help="Dataset name.")] = None,
68
+ n: Annotated[int, Option("-n", help="Maximum number of rows to show. Defaults to 10.")] = 10,
69
+ ):
70
+ import polars as pl
71
+
72
+ _, t = get_table(project, table, dataset)
73
+
74
+ with pl.Config(tbl_rows=-1):
75
+ CONSOLE.print(t.to_polars().limit(n).collect())
76
+
77
+
63
78
  def validate_non_empty_str(text: str) -> bool | str:
64
79
  if len(text) > 0:
65
80
  return True
@@ -60,6 +60,13 @@ class ScanState:
60
60
  class MaterializablePlan:
61
61
  pass
62
62
 
63
+ class EvaluatedExecutablePlan:
64
+ pass
65
+
66
+ class EvaluatedPlanStream:
67
+ def __next__(self) -> EvaluatedExecutablePlan: ...
68
+ def __iter__(self) -> EvaluatedPlanStream: ...
69
+
63
70
  class Scan:
64
71
  def key_schema(self) -> Schema: ...
65
72
  def schema(self) -> Schema: ...
@@ -90,6 +97,10 @@ class Scan:
90
97
  # If `infinite` is True, shards are shuffled after exhausted but not before the first pass.
91
98
  # Otherwise, shards are not shuffle and shuffle config is only used for shuffle buffer.
92
99
  ...
100
+
101
+ def evaluate_analyze(
102
+ self, key_table: pa.Table | pa.RecordBatch | None = None, batch_readahead: int | None = None
103
+ ) -> EvaluatedPlanStream: ...
93
104
  def metrics(self) -> dict[str, Any]: ...
94
105
 
95
106
  class KeySpaceState:
spiral/dataset.py CHANGED
@@ -226,7 +226,16 @@ class TableScanner(ds.Scanner):
226
226
 
227
227
  def head(self, num_rows: int):
228
228
  """Return the first `num_rows` rows of the dataset."""
229
- reader = self.to_reader()
229
+
230
+ kwargs = {}
231
+ if num_rows <= 10_000:
232
+ # We are unlikely to need more than a couple batches
233
+ kwargs["batch_readahead"] = 1
234
+ # The progress bar length is the total number of splits in this dataset. We will likely
235
+ # stop streaming early. As a result, the progress bar is misleading.
236
+ kwargs["hide_progress_bar"] = True
237
+
238
+ reader = self._scan.to_record_batches(key_table=self.key_table, **kwargs)
230
239
  batches = []
231
240
  row_count = 0
232
241
  for batch in reader:
spiral/enrichment.py CHANGED
@@ -4,14 +4,14 @@ from functools import partial
4
4
  from typing import TYPE_CHECKING, Optional
5
5
 
6
6
  from spiral.core.client import Shard
7
- from spiral.core.table import Scan
8
- from spiral.core.table.spec import Operation
7
+ from spiral.core.table import KeyRange
8
+ from spiral.core.table.spec import Key, Operation
9
9
  from spiral.expressions import Expr
10
10
 
11
11
  if TYPE_CHECKING:
12
12
  import dask.distributed
13
13
 
14
- from spiral import KeySpaceIndex, Table
14
+ from spiral import KeySpaceIndex, Scan, Table
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
@@ -50,7 +50,7 @@ class Enrichment:
50
50
  """The filter expression."""
51
51
  return self._where
52
52
 
53
- def _scan(self) -> Scan:
53
+ def _scan(self) -> "Scan":
54
54
  return self._table.spiral.scan(self._projection, where=self._where)
55
55
 
56
56
  def apply(
@@ -90,6 +90,7 @@ class Enrichment:
90
90
  index: Optional["KeySpaceIndex"] = None,
91
91
  partition_size_bytes: int | None = None,
92
92
  tx_dump: str | None = None,
93
+ checkpoint_dump: str | None = None,
93
94
  client: Optional["dask.distributed.Client"] = None,
94
95
  **kwargs,
95
96
  ) -> None:
@@ -109,6 +110,7 @@ class Enrichment:
109
110
  partition_size_bytes: The maximum partition size in bytes.
110
111
  If not provided, the default partition size is used.
111
112
  tx_dump: Optional path to dump the transaction JSON for debugging.
113
+ checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
112
114
  client: Optional Dask distributed client. If not provided, a new client will be created
113
115
  **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
114
116
  such as `address` to connect to an existing cluster.
@@ -126,11 +128,23 @@ class Enrichment:
126
128
  tx = self._table.txn()
127
129
  plan_scan = self._scan()
128
130
 
129
- # Determine the "tasks". Use the index if provided.
130
- shards = plan_scan.shards()
131
- if index is not None:
131
+ # Determine the "tasks".
132
+ shards = None
133
+ # Use checkpoint, if provided.
134
+ if checkpoint_dump is not None:
135
+ checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
136
+ if checkpoint is None:
137
+ logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
138
+ else:
139
+ logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
140
+ shards = [Shard(kr, None) for kr in checkpoint]
141
+ # Fallback to index-based sharding.
142
+ if shards is None and index is not None:
132
143
  # TODO(marko): This will use index's asof automatically.
133
144
  shards = self._table.spiral.internal.compute_shards(index.core)
145
+ # Fallback to default sharding.
146
+ if shards is None:
147
+ shards = plan_scan.shards()
134
148
 
135
149
  # Partially bind the enrichment function.
136
150
  _compute = partial(
@@ -139,13 +153,60 @@ class Enrichment:
139
153
  state_json=plan_scan.core.plan_state().to_json(),
140
154
  output_table_id=self._table.table_id,
141
155
  partition_size_bytes=partition_size_bytes,
156
+ incremental=checkpoint_dump is not None,
142
157
  )
143
158
  enrichments = client.map(_compute, shards)
144
159
 
145
160
  logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
146
- for result in client.gather(enrichments):
147
- result: EnrichmentTaskResult
148
- tx.include(result.ops)
161
+
162
+ failed_ranges = []
163
+ try:
164
+ for result, shard in zip(client.gather(enrichments), shards):
165
+ result: EnrichmentTaskResult
166
+
167
+ if result.error is not None:
168
+ logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
169
+ failed_ranges.append(shard.key_range)
170
+ continue
171
+
172
+ tx.include(result.ops)
173
+ except Exception as e:
174
+ # If not incremental, re-raise the exception.
175
+ if checkpoint_dump is None:
176
+ raise e
177
+
178
+ # Handle worker failures (e.g., KilledWorker from Dask)
179
+ from dask.distributed import KilledWorker
180
+
181
+ if not isinstance(e, KilledWorker):
182
+ # Re-raise other exceptions
183
+ raise e
184
+
185
+ logger.error(f"Dask worker was killed during enrichment: {e}")
186
+
187
+ # Try to gather partial results and mark remaining tasks as failed
188
+ for future, shard in zip(enrichments, shards):
189
+ if future.done() and not future.exception():
190
+ try:
191
+ result = future.result()
192
+
193
+ if result.error is not None:
194
+ logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
195
+ failed_ranges.append(shard.key_range)
196
+ continue
197
+
198
+ tx.include(result.ops)
199
+ except Exception:
200
+ # Task failed or incomplete, add to failed ranges
201
+ failed_ranges.append(shard.key_range)
202
+ else:
203
+ # Task didn't complete, add to failed ranges
204
+ failed_ranges.append(shard.key_range)
205
+
206
+ # Dump checkpoint of failed ranges, if any.
207
+ if checkpoint_dump is not None:
208
+ logger.info(f"Dumping checkpoint with {len(failed_ranges)} failed ranges to {checkpoint_dump}.")
209
+ _checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)
149
210
 
150
211
  if tx.is_empty():
151
212
  logger.warning("Transaction not committed. No rows were read for enrichment.")
@@ -155,20 +216,58 @@ class Enrichment:
155
216
  tx.commit(compact=True, tx_dump=tx_dump)
156
217
 
157
218
 
219
+ def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
220
+ import json
221
+ import os
222
+
223
+ if not os.path.exists(checkpoint_dump):
224
+ return None
225
+
226
+ with open(checkpoint_dump) as f:
227
+ data = json.load(f)
228
+ return [
229
+ KeyRange(begin=Key(bytes.fromhex(r["begin"])), end=Key(bytes.fromhex(r["end"])))
230
+ for r in data.get("key_ranges", [])
231
+ ]
232
+
233
+
234
+ def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
235
+ import json
236
+ import os
237
+
238
+ os.makedirs(os.path.dirname(checkpoint_dump), exist_ok=True)
239
+ with open(checkpoint_dump, "w") as f:
240
+ json.dump(
241
+ {"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
242
+ f,
243
+ )
244
+
245
+
158
246
  @dataclasses.dataclass
159
247
  class EnrichmentTaskResult:
160
248
  ops: list[Operation]
249
+ error: str | None = None
161
250
 
162
251
  def __getstate__(self):
163
- return {"ops": [op.to_json() for op in self.ops]}
252
+ return {
253
+ "ops": [op.to_json() for op in self.ops],
254
+ "error": self.error,
255
+ }
164
256
 
165
257
  def __setstate__(self, state):
166
258
  self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
259
+ self.error = state["error"]
167
260
 
168
261
 
169
262
  # NOTE(marko): This function must be picklable!
170
263
  def _enrichment_task(
171
- shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
264
+ shard: Shard,
265
+ *,
266
+ settings_dict,
267
+ state_json,
268
+ output_table_id,
269
+ partition_size_bytes: int | None,
270
+ incremental: bool,
172
271
  ) -> EnrichmentTaskResult:
173
272
  # Returns operations that can be included in a transaction.
174
273
  from spiral import Scan, Spiral
@@ -182,5 +281,15 @@ def _enrichment_task(
182
281
  table = sp.table(output_table_id)
183
282
 
184
283
  task_tx = table.txn()
185
- task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
186
- return EnrichmentTaskResult(ops=task_tx.take())
284
+
285
+ try:
286
+ task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
287
+ return EnrichmentTaskResult(ops=task_tx.take())
288
+ except Exception as e:
289
+ task_tx.abort()
290
+
291
+ if incremental:
292
+ return EnrichmentTaskResult(ops=[], error=str(e))
293
+
294
+ logger.error(f"Enrichment task failed for shard {shard}: {e}")
295
+ raise e