PyPI - pyspiral - Versions diffs - 0.7.8__cp312-abi3-macosx_11_0_arm64.whl → 0.7.10__cp312-abi3-macosx_11_0_arm64.whl - Mend

pyspiral 0.7.8__cp312-abi3-macosx_11_0_arm64.whl → 0.7.10__cp312-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyspiral might be problematic. Click here for more details.

Files changed (9) hide show

{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/METADATA +1 -1
{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/RECORD +9 -9
spiral/_lib.abi3.so +0 -0
spiral/cli/tables.py +15 -0
spiral/core/table/__init__.pyi +11 -0
spiral/dataset.py +10 -1
spiral/enrichment.py +123 -14
{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/WHEEL +0 -0
{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/entry_points.txt +0 -0

{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyspiral
-Version: 0.7.8
+Version: 0.7.10
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python

{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
-pyspiral-0.7.8.dist-info/METADATA,sha256=tGCOA2CfvPk_EGKZ35MCaeZ0x6SIm-XZNnLMHd5ao-U,1874
-pyspiral-0.7.8.dist-info/WHEEL,sha256=KQvxBiy7GLcML6Ad3w_ZPrgSvER1uXd7aYb6wy6b44Y,103
-pyspiral-0.7.8.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
+pyspiral-0.7.10.dist-info/METADATA,sha256=rCgXce3dHmwg5oIvXTHJnnMMEX93NFLvTRbUY0Ns9-Y,1875
+pyspiral-0.7.10.dist-info/WHEEL,sha256=KQvxBiy7GLcML6Ad3w_ZPrgSvER1uXd7aYb6wy6b44Y,103
+pyspiral-0.7.10.dist-info/entry_points.txt,sha256=R96Y3FpYX6XbQu9qMPfUTgiCcf4qM9OBQQZTDdBkZwA,74
 spiral/__init__.py,sha256=PwaYBWFBtB7cYi7peMmhk_Lm5XzjRoLwOtLbUhc1ZDo,1449
-spiral/_lib.abi3.so,sha256=fSclMfCK-CsL8hz65fdTDqXCH9U4R3AZ3UpeNTLSgj0,70882208
+spiral/_lib.abi3.so,sha256=rkT4wXMbVHeuV_ika6WCBCV5ByzXHhzho67BdWq7VmM,71030800
 spiral/adbc.py,sha256=7IxfWIeQN-fh0W5OdN_PP2x3pzQYg6ZUOLsHg3jktqw,14842
 spiral/api/__init__.py,sha256=ULBlVq3PnfNOO6T5naE_ULmmii-83--qTuN2PpAUQN0,2241
 spiral/api/admin.py,sha256=A1iVR1XYJSObZivPAD5UzmPuMgupXc9kaHNYYa_kwfs,585
@@ -30,7 +30,7 @@ spiral/cli/orgs.py,sha256=fmOuLxpeIFfKqePRi292Gv9k-EF5pPn_tbKd2BLl2Ig,2869
 spiral/cli/printer.py,sha256=aosc763hDFgoXJGkiANmNyO3kAsecAS1JWgjEhn8GCM,1784
 spiral/cli/projects.py,sha256=1M1nGrBT-t0aY9RV5Cnmzy7YrhIvmHwdkpa3y9j8rG8,5756
 spiral/cli/state.py,sha256=10wTIVQ0SJkY67Z6-KQ1LFlt3aVIPmZhoHFdTwp4kNA,130
-spiral/cli/tables.py,sha256=Mv6M8zlgG_1i_GsguYIzU-CY2GXq2fMmKnabSMWE1qI,6402
+spiral/cli/tables.py,sha256=6vt6EBGt7I9b0kAQ6sQORbmWiKbRdH4ubQYjjuNBXEg,6900
 spiral/cli/telemetry.py,sha256=Uxo1Q1FkKJ6n6QNGOUmL3j_pRRWRx0qWIhoP-U9BuR0,589
 spiral/cli/text.py,sha256=DlWGe4JrkdERAiqyITNpk91Wqb63Re99rNYlIFsIamc,4031
 spiral/cli/types.py,sha256=XYzo1GgX7dBBItoBSrHI4vO5C2lLmS2sktb-2GnGH3E,1362
@@ -49,18 +49,18 @@ spiral/core/expr/struct_/__init__.pyi,sha256=MXckd98eV_x3X0RhEWvlkA3DcDXRtLs5pNn
 spiral/core/expr/text/__init__.pyi,sha256=ed83n1xcsGY7_QDhMmJGnSQ20UrJFXcdv1AveSEcS1c,175
 spiral/core/expr/udf/__init__.pyi,sha256=zsZs081KVhY3-1JidqTkWMW81Qd_ScoTGZvasIhIK-4,358
 spiral/core/expr/video/__init__.pyi,sha256=nQJEcSsigZuRpMjkI_O4EEtMK_n2zRvorcL_KEeD5vU,95
-spiral/core/table/__init__.pyi,sha256=zcf4GripPZtiwh6uHkPgVyDij1g2nYL1DogN83z5ISU,4037
+spiral/core/table/__init__.pyi,sha256=h84QDg6hLuPcmRpavx5zOZM77ZCi2-YwIlrrUZJp1sE,4374
 spiral/core/table/manifests/__init__.pyi,sha256=eVfDpmhYSjafIvvALqAkZe5baN3Y1HpKpxYEbjwd4gQ,1043
 spiral/core/table/metastore/__init__.pyi,sha256=rc3u9MwEKRvL2kxOc8lBorddFRnM8o_o1frqtae86a4,1697
 spiral/core/table/spec/__init__.pyi,sha256=fVuc2j3uoTdWfYNm720OfUIgrLYw9fRwj44maI5bgdY,5709
 spiral/dataloader.py,sha256=W9siY4BF4p_rwTTSS4KgsaQsPLxxza6XmQhrdBzzMJ8,10592
-spiral/dataset.py,sha256=PMLoXnXuEUciP6-NXqTmQLXu0UIH7OcC4-iZtY_iuO8,7973
+spiral/dataset.py,sha256=S8pdiBXIhwMxQiJYgF7UI_8HkN7pZO798UzlO1LNXy4,8409
 spiral/datetime_.py,sha256=elXaUWtZuuLVcu9E0aXnvYRPB9XWqZbLDToozQYQYjU,950
 spiral/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 spiral/debug/manifests.py,sha256=7f1O3ba9mrA5nXpOF9cEIQuUAteP5wiBkFy_diQJ7No,3216
 spiral/debug/metrics.py,sha256=XdRDcjggtsLNGCAjam6IxG9072pz_d2C8iLApNRFUtk,2044
 spiral/debug/scan.py,sha256=UEm_aRnql5pwDPTpZgakMLNjlzkKL4RurBFFqH_BLAQ,9526
-spiral/enrichment.py,sha256=w0MrZ93wDuvS4sazw_8dPmnhzkQ4SAU5A1CGE7WF-F8,7046
+spiral/enrichment.py,sha256=t3CFnidG1kGHeJk1zIMVyImKapUJQx1OXvGn88brOo4,11059
 spiral/expressions/__init__.py,sha256=ZsD8g7vB0G7xy19GUiH4m79kw7KEkTQRwJl5Gn1cgtw,8049
 spiral/expressions/base.py,sha256=PvhJkcUSsPSIaxirHVzM9zlqyBXiaiia1HXohXdOmL4,5377
 spiral/expressions/file.py,sha256=7D9jIENJcoT0KFharBLkzK9dZgO4DYn5K_KCt0twefg,518
@@ -106,4 +106,4 @@ spiral/table.py,sha256=p95AYv6b7e14F3t7j-B-r45k9CtG84ngikdlAhh9WxA,12260
 spiral/text_index.py,sha256=FQ9rgIEGLSJryS9lFdMhKtPFey18BXoWbPXyvZPJJ04,442
 spiral/transaction.py,sha256=bI5oqBAmPMSF0yOOYcPfGbV37Xc1-_V-wQNKw1xOlTA,4136
 spiral/types_.py,sha256=W_jyO7F6rpPiH69jhgSgV7OxQZbOlb1Ho3InpKUP6Eo,155
-pyspiral-0.7.8.dist-info/RECORD,,
+pyspiral-0.7.10.dist-info/RECORD,,

spiral/_lib.abi3.so CHANGED Viewed

Binary file

spiral/cli/tables.py CHANGED Viewed

@@ -60,6 +60,21 @@ def ls(
     CONSOLE.print(rich_table)
+@app.command(help="Show the leading rows of the table.")
+def head(
+    project: ProjectArg,
+    table: Annotated[str | None, Option(help="Table name.")] = None,
+    dataset: Annotated[str | None, Option(help="Dataset name.")] = None,
+    n: Annotated[int, Option("-n", help="Maximum number of rows to show. Defaults to 10.")] = 10,
+):
+    # CLI command: print at most `n` rows of the selected table to the console.
+    # polars is imported inside the command body rather than at module import time.
+    import polars as pl
+    # get_table returns a pair; only its second element (the table) is used here.
+    _, t = get_table(project, table, dataset)
+    # NOTE(review): tbl_rows=-1 presumably disables polars' row truncation so every
+    # returned row is printed - confirm against the polars Config documentation.
+    with pl.Config(tbl_rows=-1):
+        # limit(n) is applied before collect().
+        CONSOLE.print(t.to_polars().limit(n).collect())
 def validate_non_empty_str(text: str) -> bool | str:
     if len(text) > 0:
         return True

spiral/core/table/__init__.pyi CHANGED Viewed

@@ -60,6 +60,13 @@ class ScanState:
 class MaterializablePlan:
     pass
+# Item type yielded by EvaluatedPlanStream; this stub declares no members for it.
+class EvaluatedExecutablePlan:
+    pass
+# Iterator type: __next__ yields EvaluatedExecutablePlan, __iter__ returns an EvaluatedPlanStream.
+class EvaluatedPlanStream:
+    def __next__(self) -> EvaluatedExecutablePlan: ...
+    def __iter__(self) -> EvaluatedPlanStream: ...
 class Scan:
     def key_schema(self) -> Schema: ...
     def schema(self) -> Schema: ...
@@ -90,6 +97,10 @@ class Scan:
         # If `infinite` is True, shards are shuffled after exhausted but not before the first pass.
         # Otherwise, shards are not shuffled and the shuffle config is only used for the shuffle buffer.
         ...
+    # Returns an EvaluatedPlanStream. Both `key_table` and `batch_readahead` are optional
+    # and default to None.
+    def evaluate_analyze(
+        self, key_table: pa.Table | pa.RecordBatch | None = None, batch_readahead: int | None = None
+    ) -> EvaluatedPlanStream: ...
     def metrics(self) -> dict[str, Any]: ...
 class KeySpaceState:

spiral/dataset.py CHANGED Viewed

@@ -226,7 +226,16 @@ class TableScanner(ds.Scanner):
     def head(self, num_rows: int):
         """Return the first `num_rows` rows of the dataset."""
-        reader = self.to_reader()
+        kwargs = {}
+        if num_rows <= 10_000:
+            # We are unlikely to need more than a couple batches
+            kwargs["batch_readahead"] = 1
+            # The progress bar length is the total number of splits in this dataset. We will likely
+            # stop streaming early. As a result, the progress bar is misleading.
+            kwargs["hide_progress_bar"] = True
+        reader = self._scan.to_record_batches(key_table=self.key_table, **kwargs)
         batches = []
         row_count = 0
         for batch in reader:

spiral/enrichment.py CHANGED Viewed

@@ -4,14 +4,14 @@ from functools import partial
 from typing import TYPE_CHECKING, Optional
 from spiral.core.client import Shard
-from spiral.core.table import Scan
-from spiral.core.table.spec import Operation
+from spiral.core.table import KeyRange
+from spiral.core.table.spec import Key, Operation
 from spiral.expressions import Expr
 if TYPE_CHECKING:
     import dask.distributed
-    from spiral import KeySpaceIndex, Table
+    from spiral import KeySpaceIndex, Scan, Table
 logger = logging.getLogger(__name__)
@@ -50,7 +50,7 @@ class Enrichment:
         """The filter expression."""
         return self._where
-    def _scan(self) -> Scan:
+    def _scan(self) -> "Scan":
+        # Scan this enrichment's projection on the table, filtered by its `where` expression.
+        # The return annotation is a string because Scan is imported only under TYPE_CHECKING.
         return self._table.spiral.scan(self._projection, where=self._where)
     def apply(
@@ -90,6 +90,7 @@ class Enrichment:
         index: Optional["KeySpaceIndex"] = None,
         partition_size_bytes: int | None = None,
         tx_dump: str | None = None,
+        checkpoint_dump: str | None = None,
         client: Optional["dask.distributed.Client"] = None,
         **kwargs,
     ) -> None:
@@ -109,6 +110,7 @@ class Enrichment:
             partition_size_bytes: The maximum partition size in bytes.
                 If not provided, the default partition size is used.
             tx_dump: Optional path to dump the transaction JSON for debugging.
+            checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
             client: Optional Dask distributed client. If not provided, a new client will be created
             **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
                 such as `address` to connect to an existing cluster.
@@ -126,11 +128,23 @@ class Enrichment:
         tx = self._table.txn()
         plan_scan = self._scan()
-        # Determine the "tasks". Use the index if provided.
-        shards = plan_scan.shards()
-        if index is not None:
+        # Determine the "tasks".
+        shards = None
+        # Use checkpoint, if provided.
+        if checkpoint_dump is not None:
+            checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
+            if checkpoint is None:
+                logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
+            else:
+                logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
+                shards = [Shard(kr, None) for kr in checkpoint]
+        # Fallback to index-based sharding.
+        if shards is None and index is not None:
             # TODO(marko): This will use index's asof automatically.
             shards = self._table.spiral.internal.compute_shards(index.core)
+        # Fallback to default sharding.
+        if shards is None:
+            shards = plan_scan.shards()
         # Partially bind the enrichment function.
         _compute = partial(
@@ -139,13 +153,60 @@ class Enrichment:
             state_json=plan_scan.core.plan_state().to_json(),
             output_table_id=self._table.table_id,
             partition_size_bytes=partition_size_bytes,
+            incremental=checkpoint_dump is not None,
         )
         enrichments = client.map(_compute, shards)
         logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
-        for result in client.gather(enrichments):
-            result: EnrichmentTaskResult
-            tx.include(result.ops)
+        failed_ranges = []
+        try:
+            for result, shard in zip(client.gather(enrichments), shards):
+                result: EnrichmentTaskResult
+                if result.error is not None:
+                    logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
+                    failed_ranges.append(shard.key_range)
+                    continue
+                tx.include(result.ops)
+        except Exception as e:
+            # If not incremental, re-raise the exception.
+            if checkpoint_dump is None:
+                raise e
+            # Handle worker failures (e.g., KilledWorker from Dask)
+            from dask.distributed import KilledWorker
+            if not isinstance(e, KilledWorker):
+                # Re-raise other exceptions
+                raise e
+            logger.error(f"Dask worker was killed during enrichment: {e}")
+            # Try to gather partial results and mark remaining tasks as failed
+            for future, shard in zip(enrichments, shards):
+                if future.done() and not future.exception():
+                    try:
+                        result = future.result()
+                        if result.error is not None:
+                            logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
+                            failed_ranges.append(shard.key_range)
+                            continue
+                        tx.include(result.ops)
+                    except Exception:
+                        # Task failed or incomplete, add to failed ranges
+                        failed_ranges.append(shard.key_range)
+                else:
+                    # Task didn't complete, add to failed ranges
+                    failed_ranges.append(shard.key_range)
+        # Dump checkpoint of failed ranges, if any.
+        if checkpoint_dump is not None:
+            logger.info(f"Dumping checkpoint with {len(failed_ranges)} failed ranges to {checkpoint_dump}.")
+            _checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)
         if tx.is_empty():
             logger.warning("Transaction not committed. No rows were read for enrichment.")
@@ -155,20 +216,58 @@ class Enrichment:
         tx.commit(compact=True, tx_dump=tx_dump)
+def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
+    """Load the key ranges stored in the checkpoint file at `checkpoint_dump`.
+    Returns None when no file exists at that path. Otherwise the file is parsed as JSON and
+    each entry under "key_ranges" is decoded from its hex-encoded "begin" and "end" strings;
+    a document without "key_ranges" yields an empty list.
+    """
+    import json
+    import os
+    if not os.path.exists(checkpoint_dump):
+        return None
+    with open(checkpoint_dump) as f:
+        payload = json.load(f)
+    key_ranges = []
+    for entry in payload.get("key_ranges", []):
+        begin = Key(bytes.fromhex(entry["begin"]))
+        end = Key(bytes.fromhex(entry["end"]))
+        key_ranges.append(KeyRange(begin=begin, end=end))
+    return key_ranges
+def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
+    """Write `ranges` to the checkpoint file at `checkpoint_dump`.
+    The file is a JSON object {"key_ranges": [{"begin": <hex>, "end": <hex>}, ...]} where
+    each value is the hex encoding of bytes(range.begin) / bytes(range.end). The parent
+    directory is created when the path has one; an existing file is overwritten.
+    """
+    import json
+    import os
+    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
+    # FileNotFoundError, so only create the parent directory when the path names one.
+    parent = os.path.dirname(checkpoint_dump)
+    if parent:
+        os.makedirs(parent, exist_ok=True)
+    with open(checkpoint_dump, "w") as f:
+        json.dump(
+            {"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
+            f,
+        )
 @dataclasses.dataclass
 class EnrichmentTaskResult:
+    # Operations produced by the task; pickled via to_json() / Operation.from_json().
     ops: list[Operation]
+    # Error text recorded for a failed task; None (the default) when no error was recorded.
+    error: str | None = None
     def __getstate__(self):
-        return {"ops": [op.to_json() for op in self.ops]}
+        # Custom pickle state: the operations as JSON plus the error field.
+        return {
+            "ops": [op.to_json() for op in self.ops],
+            "error": self.error,
+        }
     def __setstate__(self, state):
         self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+        # NOTE(review): state["error"] raises KeyError for a state dict without "error"
+        # (the shape produced by the removed __getstate__ line above) - confirm no such
+        # state can reach this method, or read it with state.get("error").
+        self.error = state["error"]
 # NOTE(marko): This function must be picklable!
 def _enrichment_task(
-    shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+    shard: Shard,
+    *,
+    settings_dict,
+    state_json,
+    output_table_id,
+    partition_size_bytes: int | None,
+    incremental: bool,
 ) -> EnrichmentTaskResult:
     # Returns operations that can be included in a transaction.
     from spiral import Scan, Spiral
@@ -182,5 +281,15 @@ def _enrichment_task(
     table = sp.table(output_table_id)
     task_tx = table.txn()
-    task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
-    return EnrichmentTaskResult(ops=task_tx.take())
+    try:
+        task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+        return EnrichmentTaskResult(ops=task_tx.take())
+    except Exception as e:
+        task_tx.abort()
+        if incremental:
+            return EnrichmentTaskResult(ops=[], error=str(e))
+        logger.error(f"Enrichment task failed for shard {shard}: {e}")
+        raise e

{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/WHEEL RENAMED Viewed

File without changes

{pyspiral-0.7.8.dist-info → pyspiral-0.7.10.dist-info}/entry_points.txt RENAMED Viewed

File without changes