pyspiral-0.7.7-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → pyspiral-0.7.9-cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyspiral
-Version: 0.7.7
+Version: 0.7.9
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
@@ -1,8 +1,8 @@
-pyspiral-0.7.7.dist-info/METADATA,sha256=5KbbTylag9ZfdffltxdGDcIK2fdzoMZHkQ3ASWi0ZRM,1874
-pyspiral-0.7.7.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
-pyspiral-0.7.7.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
+pyspiral-0.7.9.dist-info/METADATA,sha256=La_MkKyTCJ_qrYYT2eGLaPYkaf9OSpNa36nYOfMaN_4,1874
+pyspiral-0.7.9.dist-info/WHEEL,sha256=0ecHyBdkJfSXYIVmWsPh7S-4h4fSrB4FlXhlnIu9c_A,130
+pyspiral-0.7.9.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
 spiral/__init__.py,sha256=PwaYBWFBtB7cYi7peMmhk_Lm5XzjRoLwOtLbUhc1ZDo,1449
-spiral/_lib.abi3.so,sha256=YiUSFOqsNbrac9quToKru5_5TKEClQhhQ2nMi_V567c,61716472
+spiral/_lib.abi3.so,sha256=lLD_ZKRMNTADiM96MnkJT-0rIeZFDSzx_yiNOk0UtrU,61819128
 spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
 spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
 spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -30,7 +30,7 @@ spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
 spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
 spiral/cli/projects.py,sha256=1M1nGrBT-t0aY9RV5Cnmzy7YrhIvmHwdkpa3y9j8rG8,5756
 spiral/cli/state.py,sha256=10wTIVQ0SJkY67Z6-KQ1LFlt3aVIPmZhoHFdTwp4kNA,130
-spiral/cli/tables.py,sha256=gp7v8K-6EUTa0P7ZcHXD9gYPPpp4nHABloo_elvUrBw,7757
+spiral/cli/tables.py,sha256=6vt6EBGt7I9b0kAQ6sQORbmWiKbRdH4ubQYjjuNBXEg,6900
 spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
 spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
 spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
@@ -39,7 +39,7 @@ spiral/client.py,sha256=53dVv8wxYMmozUfR8MVcUufKGqdVIdb0yZ0gchczBoQ,6426
 spiral/core/__init__.pyi,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/core/_tools/__init__.pyi,sha256=b2KLfTOQ67pjfbYt07o0IGiTu5o2bZw69lllV8v0Dps,143
 spiral/core/authn/__init__.pyi,sha256=z_GWyIS62fuiYQrYO8hzw4W8oGaiciqS1u5qtAt54VY,769
-spiral/core/client/__init__.pyi,sha256=MmuIzluwaxAFFamr_J8iT15qP8AkA4smkz4zrF-yGnM,6993
+spiral/core/client/__init__.pyi,sha256=YgDM-MoIt3J-QKxvsfs5gRiaTBtOA6TphbNBIAnrFCw,6956
 spiral/core/expr/__init__.pyi,sha256=3HSKjkotiEkxBvGBALXEBIie0JiyI9bCpehwA3nMQkU,571
 spiral/core/expr/images/__init__.pyi,sha256=wnE_wZXq7a4iqTg3SVm-ssxGw1WQZyk5dGOPaP4Btko,73
 spiral/core/expr/list_/__init__.pyi,sha256=Q_9c87eIQfZbqlaw_rq3fvs93YEsW7K5VYk6VZ4g6mU,126
@@ -49,18 +49,18 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
 spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
 spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
 spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
-spiral/core/table/__init__.pyi,sha256=zcf4GripPZtiwh6uHkPgVyDij1g2nYL1DogN83z5ISU,4037
+spiral/core/table/__init__.pyi,sha256=h84QDg6hLuPcmRpavx5zOZM77ZCi2-YwIlrrUZJp1sE,4374
 spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
 spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
 spiral/core/table/spec/__init__.pyi,sha256=fVuc2j3uoTdWfYNm720OfUIgrLYw9fRwj44maI5bgdY,5709
 spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
-spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
+spiral/dataset.py,sha256=S8pdiBXIhwMxQiJYgF7UI_8HkN7pZO798UzlO1LNXy4,8409
 spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
 spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
 spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
 spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
-spiral/enrichment.py,sha256=w0MrZ93wDuvS4sazw_8dPmnhzkQ4SAU5A1CGE7WF-F8,7046
+spiral/enrichment.py,sha256=j8CzWJqLRq0Zko-qz3NzicsFOAvdzQSRJ58vdmqxPsk,9879
 spiral/expressions/__init__.py,sha256=ZsD8g7vB0G7xy19GUiH4m79kw7KEkTQRwJl5Gn1cgtw,8049
 spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
 spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
@@ -106,4 +106,4 @@ spiral/table.py,sha256=p95AYv6b7e14F3t7j-B-r45k9CtG84ngikdlAhh9WxA,12260
 spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
 spiral/transaction.py,sha256=bI5oqBAmPMSF0yOOYcPfGbV37Xc1-_V-wQNKw1xOlTA,4136
 spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
-pyspiral-0.7.7.dist-info/RECORD,,
+pyspiral-0.7.9.dist-info/RECORD,,
spiral/_lib.abi3.so CHANGED
Binary file
spiral/cli/tables.py CHANGED
@@ -1,6 +1,5 @@
-import datetime
 from collections.abc import Callable
-from typing import Annotated, Literal
+from typing import Annotated
 
 import questionary
 import rich
@@ -61,6 +60,21 @@ def ls(
     CONSOLE.print(rich_table)
 
 
+@app.command(help="Show the leading rows of the table.")
+def head(
+    project: ProjectArg,
+    table: Annotated[str | None, Option(help="Table name.")] = None,
+    dataset: Annotated[str | None, Option(help="Dataset name.")] = None,
+    n: Annotated[int, Option("-n", help="Maximum number of rows to show. Defaults to 10.")] = 10,
+):
+    import polars as pl
+
+    _, t = get_table(project, table, dataset)
+
+    with pl.Config(tbl_rows=-1):
+        CONSOLE.print(t.to_polars().limit(n).collect())
+
+
 def validate_non_empty_str(text: str) -> bool | str:
     if len(text) > 0:
         return True
@@ -137,40 +151,9 @@ def flush(
     project: ProjectArg,
     table: Annotated[str | None, Option(help="Table name.")] = None,
     dataset: Annotated[str | None, Option(help="Dataset name.")] = None,
-    keep: Annotated[
-        Literal["1h", "2h", "4h"] | None,
-        Option(help="Duration string that indicates how much WAL to keep. Defaults to 24h."),
-    ] = None,
-    full: Annotated[bool, Option(help="Flush full Write-Ahead-Log.")] = False,
 ):
-    # TODO(marko): Use some human-readable duration parsing library.
-    duration = None
-    if keep is not None:
-        if full:
-            raise ValueError("Cannot specify both --keep and --full")
-        match keep:
-            case "1h":
-                duration = datetime.timedelta(hours=1)
-            case "2h":
-                duration = datetime.timedelta(hours=2)
-            case "4h":
-                duration = datetime.timedelta(hours=4)
-            case _:
-                raise ValueError(f"Invalid duration string: {keep}")
-
-    if full:
-        # Warn and wait for confirmation.
-        ERR_CONSOLE.print("[bold yellow]Warning: All currently open transaction will fail to commit.[/bold yellow]")
-        if not questionary.confirm("Are you sure you want to continue?", default=False).ask():  # pyright: ignore[reportAny]
-            ERR_CONSOLE.print("Aborting.")
-            raise typer.Exit(1)
-
-        duration = datetime.timedelta(hours=0)
-
-    keep_latest_s = int(duration.total_seconds()) if duration is not None else None
-
     identifier, t = get_table(project, table, dataset)
-    state.spiral.internal.flush_wal(t.core, keep_latest_s=keep_latest_s)  # pyright: ignore[reportPrivateUsage]
+    state.spiral.internal.flush_wal(t.core)  # pyright: ignore[reportPrivateUsage]
     CONSOLE.print(f"Flushed WAL for table {identifier} in project {project}.")
 
 
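The new `head` subcommand is a thin wrapper over the Python API. Below is a minimal sketch of the same pattern outside the CLI; the `Spiral()` construction and the table identifier string are assumptions, while `table()`, `to_polars()`, `limit()`, and `collect()` all appear in this diff or elsewhere in the package.

import polars as pl

from spiral import Spiral

# Placeholder setup: constructor arguments and the identifier are assumptions.
sp = Spiral()
t = sp.table("my-project.my-table")

# Same pattern the new CLI command uses: lazily limit the scan, then collect.
with pl.Config(tbl_rows=-1):  # -1 renders every collected row
    print(t.to_polars().limit(10).collect())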
spiral/core/client/__init__.pyi CHANGED
@@ -175,7 +175,7 @@ class ShuffleConfig:
     ): ...
 
 class Internal:
-    def flush_wal(self, table: Table, *, keep_latest_s: int | None = None) -> None:
+    def flush_wal(self, table: Table) -> None:
         """
         Flush the write-ahead log of the table.
         """
spiral/core/table/__init__.pyi CHANGED
@@ -60,6 +60,13 @@ class ScanState:
 class MaterializablePlan:
     pass
 
+class EvaluatedExecutablePlan:
+    pass
+
+class EvaluatedPlanStream:
+    def __next__(self) -> EvaluatedExecutablePlan: ...
+    def __iter__(self) -> EvaluatedPlanStream: ...
+
 class Scan:
     def key_schema(self) -> Schema: ...
     def schema(self) -> Schema: ...
@@ -90,6 +97,10 @@ class Scan:
         # If `infinite` is True, shards are shuffled after exhausted but not before the first pass.
         # Otherwise, shards are not shuffle and shuffle config is only used for shuffle buffer.
         ...
+
+    def evaluate_analyze(
+        self, key_table: pa.Table | pa.RecordBatch | None = None, batch_readahead: int | None = None
+    ) -> EvaluatedPlanStream: ...
     def metrics(self) -> dict[str, Any]: ...
 
 class KeySpaceState:
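Per the stubs above, `EvaluatedPlanStream` implements the iterator protocol, so the new `evaluate_analyze` can be consumed with a plain for loop. A sketch, assuming `scan` is an existing `spiral.core.table.Scan` and that a small readahead is acceptable:

# `scan` is assumed to be a spiral.core.table.Scan obtained elsewhere.
for plan in scan.evaluate_analyze(key_table=None, batch_readahead=2):
    # Each item is an EvaluatedExecutablePlan; what to do with it is caller-defined.
    print(plan)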
spiral/dataset.py CHANGED
@@ -226,7 +226,16 @@ class TableScanner(ds.Scanner):
 
     def head(self, num_rows: int):
         """Return the first `num_rows` rows of the dataset."""
-        reader = self.to_reader()
+
+        kwargs = {}
+        if num_rows <= 10_000:
+            # We are unlikely to need more than a couple batches
+            kwargs["batch_readahead"] = 1
+            # The progress bar length is the total number of splits in this dataset. We will likely
+            # stop streaming early. As a result, the progress bar is misleading.
+            kwargs["hide_progress_bar"] = True
+
+        reader = self._scan.to_record_batches(key_table=self.key_table, **kwargs)
         batches = []
         row_count = 0
         for batch in reader:
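The effect is that small `head()` calls no longer pay for a full-scan reader: at or below the 10,000-row threshold the scanner limits batch readahead to 1 and hides the progress bar before streaming. A usage sketch, where `scanner` stands in for a `TableScanner`:

# `scanner` is assumed to be a TableScanner for some Spiral dataset.
small = scanner.head(100)     # <= 10_000 rows: readahead of 1, no progress bar
large = scanner.head(50_000)  # > 10_000 rows: default readahead and progress bar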
spiral/enrichment.py CHANGED
@@ -4,14 +4,14 @@ from functools import partial
 from typing import TYPE_CHECKING, Optional
 
 from spiral.core.client import Shard
-from spiral.core.table import Scan
-from spiral.core.table.spec import Operation
+from spiral.core.table import KeyRange
+from spiral.core.table.spec import Key, Operation
 from spiral.expressions import Expr
 
 if TYPE_CHECKING:
     import dask.distributed
 
-    from spiral import KeySpaceIndex, Table
+    from spiral import KeySpaceIndex, Scan, Table
 
 logger = logging.getLogger(__name__)
@@ -50,7 +50,7 @@ class Enrichment:
         """The filter expression."""
         return self._where
 
-    def _scan(self) -> Scan:
+    def _scan(self) -> "Scan":
         return self._table.spiral.scan(self._projection, where=self._where)
 
     def apply(
@@ -90,6 +90,7 @@ class Enrichment:
         index: Optional["KeySpaceIndex"] = None,
         partition_size_bytes: int | None = None,
         tx_dump: str | None = None,
+        checkpoint_dump: str | None = None,
         client: Optional["dask.distributed.Client"] = None,
         **kwargs,
     ) -> None:
@@ -109,6 +110,7 @@
             partition_size_bytes: The maximum partition size in bytes.
                 If not provided, the default partition size is used.
             tx_dump: Optional path to dump the transaction JSON for debugging.
+            checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
             client: Optional Dask distributed client. If not provided, a new client will be created
             **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
                 such as `address` to connect to an existing cluster.
@@ -126,11 +128,23 @@
         tx = self._table.txn()
         plan_scan = self._scan()
 
-        # Determine the "tasks". Use the index if provided.
-        shards = plan_scan.shards()
-        if index is not None:
+        # Determine the "tasks".
+        shards = None
+        # Use checkpoint, if provided.
+        if checkpoint_dump is not None:
+            checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
+            if checkpoint is None:
+                logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
+            else:
+                logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
+                shards = [Shard(kr, None) for kr in checkpoint]
+        # Fallback to index-based sharding.
+        if shards is None and index is not None:
             # TODO(marko): This will use index's asof automatically.
             shards = self._table.spiral.internal.compute_shards(index.core)
+        # Fallback to default sharding.
+        if shards is None:
+            shards = plan_scan.shards()
 
         # Partially bind the enrichment function.
         _compute = partial(
@@ -139,14 +153,28 @@
             state_json=plan_scan.core.plan_state().to_json(),
             output_table_id=self._table.table_id,
             partition_size_bytes=partition_size_bytes,
+            incremental=checkpoint_dump is not None,
         )
         enrichments = client.map(_compute, shards)
 
         logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
+
+        failed_ranges = []
         for result in client.gather(enrichments):
             result: EnrichmentTaskResult
+
+            if result.error is not None:
+                logger.error(f"Enrichment task failed for range {result.key_range}: {result.error}")
+                failed_ranges.append(result.key_range)
+                continue
+
             tx.include(result.ops)
 
+        # Dump checkpoint of failed ranges, if any.
+        if checkpoint_dump is not None:
+            logger.info(f"Dumping checkpoint with {len(failed_ranges)} failed ranges to {checkpoint_dump}.")
+            _checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)
+
         if tx.is_empty():
             logger.warning("Transaction not committed. No rows were read for enrichment.")
             return
@@ -155,20 +183,62 @@
 
         tx.commit(compact=True, tx_dump=tx_dump)
 
+def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
+    import json
+    import os
+
+    if not os.path.exists(checkpoint_dump):
+        return None
+
+    with open(checkpoint_dump) as f:
+        data = json.load(f)
+    return [
+        KeyRange(begin=Key(bytes.fromhex(r["begin"])), end=Key(bytes.fromhex(r["end"])))
+        for r in data.get("key_ranges", [])
+    ]
+
+
+def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
+    import json
+    import os
+
+    os.makedirs(os.path.dirname(checkpoint_dump), exist_ok=True)
+    with open(checkpoint_dump, "w") as f:
+        json.dump(
+            {"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
+            f,
+        )
+
+
 @dataclasses.dataclass
 class EnrichmentTaskResult:
+    key_range: KeyRange
     ops: list[Operation]
+    error: str | None = None
 
     def __getstate__(self):
-        return {"ops": [op.to_json() for op in self.ops]}
+        return {
+            "ops": [op.to_json() for op in self.ops],
+            "error": self.error,
+            "begin": bytes(self.key_range.begin),
+            "end": bytes(self.key_range.end),
+        }
 
     def __setstate__(self, state):
+        self.key_range = KeyRange(begin=Key(state["begin"]), end=Key(state["end"]))
        self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+        self.error = state["error"]
 
 
 # NOTE(marko): This function must be picklable!
 def _enrichment_task(
-    shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+    shard: Shard,
+    *,
+    settings_dict,
+    state_json,
+    output_table_id,
+    partition_size_bytes: int | None,
+    incremental: bool,
 ) -> EnrichmentTaskResult:
     # Returns operations that can be included in a transaction.
     from spiral import Scan, Spiral
@@ -182,5 +252,15 @@
     table = sp.table(output_table_id)
 
     task_tx = table.txn()
-    task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
-    return EnrichmentTaskResult(ops=task_tx.take())
+
+    try:
+        task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+        return EnrichmentTaskResult(key_range=shard.key_range, ops=task_tx.take())
+    except Exception as e:
+        task_tx.abort()
+
+        if incremental:
+            return EnrichmentTaskResult(key_range=shard.key_range, ops=[], error=str(e))
+
+        logger.error(f"Enrichment task failed for shard {shard}: {e}")
+        raise e
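The checkpoint file written by `_checkpoint_dump_key_ranges` is plain JSON with hex-encoded key bounds, so re-running the enrichment with the same `checkpoint_dump` path retries only the ranges that previously failed. A self-contained sketch of the on-disk format; the sample key bytes are made up:

import json

# One entry per failed key range; begin/end are serialized via bytes(...).hex().
checkpoint = {
    "key_ranges": [
        {"begin": b"key-0001".hex(), "end": b"key-0500".hex()},
    ]
}
print(json.dumps(checkpoint))

# _checkpoint_load_key_ranges reverses this with bytes.fromhex(...) and wraps
# the bounds back into KeyRange(begin=Key(...), end=Key(...)).
for r in checkpoint["key_ranges"]:
    begin, end = bytes.fromhex(r["begin"]), bytes.fromhex(r["end"])
    print(begin, end)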