PyPI - pyspiral - Versions diffs - 0.6.9__cp312-abi3-macosx_11_0_arm64.whl → 0.7.12__cp312-abi3-macosx_11_0_arm64.whl - Mend

pyspiral 0.6.9__cp312-abi3-macosx_11_0_arm64.whl → 0.7.12__cp312-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{pyspiral-0.6.9.dist-info → pyspiral-0.7.12.dist-info}/METADATA +9 -8
{pyspiral-0.6.9.dist-info → pyspiral-0.7.12.dist-info}/RECORD +53 -45
{pyspiral-0.6.9.dist-info → pyspiral-0.7.12.dist-info}/entry_points.txt +1 -0
spiral/__init__.py +20 -0
spiral/_lib.abi3.so +0 -0
spiral/api/__init__.py +1 -1
spiral/api/client.py +1 -1
spiral/api/types.py +1 -0
spiral/cli/admin.py +2 -2
spiral/cli/app.py +8 -4
spiral/cli/fs.py +4 -4
spiral/cli/iceberg.py +1 -1
spiral/cli/key_spaces.py +15 -1
spiral/cli/login.py +4 -3
spiral/cli/orgs.py +8 -7
spiral/cli/projects.py +4 -4
spiral/cli/state.py +5 -3
spiral/cli/tables.py +59 -36
spiral/cli/telemetry.py +1 -1
spiral/cli/types.py +2 -2
spiral/cli/workloads.py +3 -3
spiral/client.py +69 -22
spiral/core/client/__init__.pyi +48 -13
spiral/core/config/__init__.pyi +47 -0
spiral/core/expr/__init__.pyi +15 -0
spiral/core/expr/images/__init__.pyi +3 -0
spiral/core/expr/list_/__init__.pyi +4 -0
spiral/core/expr/refs/__init__.pyi +4 -0
spiral/core/expr/str_/__init__.pyi +3 -0
spiral/core/expr/struct_/__init__.pyi +6 -0
spiral/core/expr/text/__init__.pyi +5 -0
spiral/core/expr/udf/__init__.pyi +14 -0
spiral/core/expr/video/__init__.pyi +3 -0
spiral/core/table/__init__.pyi +37 -2
spiral/core/table/spec/__init__.pyi +6 -4
spiral/dataloader.py +52 -38
spiral/dataset.py +10 -1
spiral/enrichment.py +304 -0
spiral/expressions/__init__.py +21 -23
spiral/expressions/base.py +9 -4
spiral/expressions/file.py +17 -0
spiral/expressions/http.py +11 -80
spiral/expressions/s3.py +16 -0
spiral/expressions/tiff.py +2 -3
spiral/expressions/udf.py +38 -24
spiral/iceberg.py +3 -3
spiral/project.py +34 -6
spiral/scan.py +80 -33
spiral/settings.py +19 -97
spiral/streaming_/stream.py +1 -1
spiral/table.py +40 -10
spiral/transaction.py +99 -2
spiral/expressions/io.py +0 -100
spiral/expressions/mp4.py +0 -62
spiral/expressions/png.py +0 -18
spiral/expressions/qoi.py +0 -18
spiral/expressions/refs.py +0 -58
{pyspiral-0.6.9.dist-info → pyspiral-0.7.12.dist-info}/WHEEL +0 -0

spiral/dataloader.py CHANGED Viewed

@@ -88,22 +88,24 @@ class SpiralDataLoader:
     - map_workers for parallel post-processing (tokenization, decoding, etc.)
     - Built-in checkpoint support via skip_samples
     - Explicit shard-based architecture for distributed training
-    """
-    # Example usage:
-    #
-    # Simple usage:
-    #   loader = SpiralDataLoader(scan, batch_size=32)
-    #   for batch in loader:
-    #       train_step(batch)
-    #
-    # With parallel transforms:
-    #   loader = SpiralDataLoader(
-    #       scan,
-    #       batch_size=32,
-    #       transform_fn=tokenize_batch,
-    #       map_workers=4,
-    #   )
+    Simple usage:
+    ```python
+    loader = SpiralDataLoader(scan, batch_size=32)
+    for batch in loader:
+        train_step(batch)
+    ```
+    With parallel transforms:
+    ```python
+    loader = SpiralDataLoader(
+        scan,
+        batch_size=32,
+        transform_fn=tokenize_batch,
+        map_workers=4,
+    )
+    ```
+    """
     def __init__(
         self,
@@ -119,6 +121,7 @@ class SpiralDataLoader:
         # TODO(os): accept vortex arrays here instead of Arrow
         transform_fn: Callable[[pa.RecordBatch], Any] | None = None,
         map_workers: int = 0,
+        infinite: bool = False,
     ):
         """Initialize SpiralDataLoader.
@@ -143,6 +146,9 @@ class SpiralDataLoader:
             map_workers: Number of worker processes for parallel transform_fn
                 application. 0 means single-process (no parallelism). Use this for
                 CPU-bound transforms like tokenization or audio decoding.
+            infinite: Whether to cycle through the dataset infinitely. If True,
+                the dataloader will repeat the dataset indefinitely. If False,
+                the dataloader will stop after going through the dataset once.
         """
         self.scan = scan
         self.shards = shards if shards is not None else scan.shards()
@@ -155,6 +161,7 @@ class SpiralDataLoader:
         self.batch_readahead = batch_readahead
         self.transform_fn = transform_fn
         self.map_workers = map_workers
+        self.infinite = infinite
         self._samples_yielded = 0
@@ -174,7 +181,7 @@ class SpiralDataLoader:
             shuffle=shuffle,
             max_batch_size=self.batch_size,
             batch_readahead=self.batch_readahead,
-            infinite=False,
+            infinite=self.infinite,
         )
         if self.skip_samples > 0:
@@ -220,16 +227,21 @@ class SpiralDataLoader:
         Returns:
             Dictionary containing samples_yielded, seed, and shards.
+        Example checkpoint:
+        ```python
+        loader = SpiralDataLoader(scan, batch_size=32, seed=42)
+        for i, batch in enumerate(loader):
+            if i == 10:
+                checkpoint = loader.state_dict()
+                break
+        ```
+        Example resume:
+        ```python
+        loader = SpiralDataLoader.from_state_dict(scan, checkpoint, batch_size=32)
+        ```
         """
-        # Example usage:
-        #   loader = SpiralDataLoader(scan, batch_size=32, seed=42)
-        #   for i, batch in enumerate(loader):
-        #       if i == 10:
-        #           checkpoint = loader.state_dict()
-        #           break
-        #
-        #   # Resume later with exact same shards
-        #   loader = SpiralDataLoader.from_state_dict(scan, checkpoint, batch_size=32)
         return {
             "samples_yielded": self._samples_yielded,
             "seed": self.seed,
@@ -257,20 +269,22 @@ class SpiralDataLoader:
         Returns:
             New SpiralDataLoader instance configured to resume from the checkpoint.
+        Save checkpoint during training:
+        ```python
+        loader = scan.to_distributed_data_loader(scan, batch_size=32, seed=42)
+        checkpoint = loader.state_dict()
+        ```
+        Resume later using the same shards from checkpoint:
+        ```python
+        resumed_loader = SpiralDataLoader.from_state_dict(
+            scan,
+            checkpoint,
+            batch_size=32,
+            transform_fn=my_transform,
+        )
+        ```
         """
-        # Example usage:
-        #
-        # Save checkpoint during training:
-        #   loader = scan.to_distributed_data_loader(scan, batch_size=32, seed=42)
-        #   checkpoint = loader.state_dict()
-        #
-        # Resume later using the same shards from checkpoint:
-        #   resumed_loader = SpiralDataLoader.from_state_dict(
-        #       scan,
-        #       checkpoint,
-        #       batch_size=32,
-        #       transform_fn=my_transform,
-        #   )
         # Extract resume parameters from state
         seed = state.get("seed", 42)

spiral/dataset.py CHANGED Viewed

@@ -226,7 +226,16 @@ class TableScanner(ds.Scanner):
     def head(self, num_rows: int):
         """Return the first `num_rows` rows of the dataset."""
-        reader = self.to_reader()
+        kwargs = {}
+        if num_rows <= 10_000:
+            # We are unlikely to need more than a couple batches
+            kwargs["batch_readahead"] = 1
+            # The progress bar length is the total number of splits in this dataset. We will likely
+            # stop streaming early. As a result, the progress bar is misleading.
+            kwargs["hide_progress_bar"] = True
+        reader = self._scan.to_record_batches(key_table=self.key_table, **kwargs)
         batches = []
         row_count = 0
         for batch in reader:

spiral/enrichment.py ADDED Viewed

@@ -0,0 +1,304 @@
+from __future__ import annotations
+import dataclasses
+import logging
+from functools import partial
+from typing import TYPE_CHECKING
+from spiral.core.client import KeyColumns, Shard
+from spiral.core.table import KeyRange
+from spiral.core.table.spec import Key, Operation
+from spiral.expressions import Expr
+if TYPE_CHECKING:
+    import dask.distributed
+    from spiral import KeySpaceIndex, Scan, Table
+logger = logging.getLogger(__name__)
+class Enrichment:
+    """
+    An enrichment is used to derive new columns from the existing ones, such as fetching data from object storage
+    with `se.s3.get` or computing embeddings. With the column-group design supporting hundreds of thousands of columns,
+    horizontally expanding tables are a powerful primitive.
+    NOTE: Spiral aims to optimize enrichments where source and destination table are the same.
+    """
+    def __init__(
+        self,
+        table: Table,
+        projection: Expr,
+        where: Expr | None,
+    ):
+        self._table = table
+        self._projection = projection
+        self._where = where
+    @property
+    def table(self) -> Table:
+        """The table to write back into."""
+        return self._table
+    @property
+    def projection(self) -> Expr:
+        """The projection expression."""
+        return self._projection
+    @property
+    def where(self) -> Expr | None:
+        """The filter expression."""
+        return self._where
+    def _scan(self) -> Scan:
+        return self._table.spiral.scan(self._projection, where=self._where, _key_columns=KeyColumns.Included)
+    def apply(
+        self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None, tx_dump: str | None = None
+    ) -> None:
+        """Apply the enrichment onto the table in a streaming fashion.
+        For large tables, consider using `apply_dask` for distributed execution.
+        Args:
+            batch_readahead: Optional batch read-ahead value, forwarded unchanged to the
+                transaction's `writeback` call. Defaults to None.
+            partition_size_bytes: The maximum partition size in bytes.
+                If not provided, the default partition size is used.
+            tx_dump: Optional path to dump the transaction JSON for debugging.
+        """
+        txn = self._table.txn()
+        txn.writeback(
+            self._scan(),
+            partition_size_bytes=partition_size_bytes,
+            batch_readahead=batch_readahead,
+        )
+        if txn.is_empty():
+            logger.warning("Transaction not committed. No rows were read for enrichment.")
+            return
+        txn.commit(tx_dump=tx_dump)
+    # TODO(marko): Need to figure out this sharding with key space index in places.
+    #   We could compute on-demand instead of requiring a resource.
+    def apply_dask(
+        self,
+        *,
+        index: KeySpaceIndex | None = None,
+        partition_size_bytes: int | None = None,
+        tx_dump: str | None = None,
+        checkpoint_dump: str | None = None,
+        client: dask.distributed.Client | None = None,
+        **kwargs,
+    ) -> None:
+        """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+        If "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.
+        IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+        encountering such issues, consider splitting the enrichment into UDF-only derivation that will be
+        executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
+        If that is not possible, please reach out to support for assistance.
+        Args:
+            index: Optional key space index to use for sharding the enrichment.
+                If not provided, the table's default sharding will be used.
+            partition_size_bytes: The maximum partition size in bytes.
+                If not provided, the default partition size is used.
+            tx_dump: Optional path to dump the transaction JSON for debugging.
+            checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
+            client: Optional Dask distributed client. If not provided, a new client will be created.
+            **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`
+                such as `address` to connect to an existing cluster.
+        """
+        if client is None:
+            try:
+                from dask.distributed import Client
+            except ImportError:
+                raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+            # Connect before doing any work.
+            client = Client(**kwargs)
+        # Start a transaction BEFORE the planning scan.
+        tx = self._table.txn()
+        plan_scan = self._scan()
+        # Determine the "tasks".
+        shards = None
+        # Use checkpoint, if provided.
+        if checkpoint_dump is not None:
+            checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
+            if checkpoint is None:
+                logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
+            else:
+                logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
+                shards = [Shard(kr, None) for kr in checkpoint]
+        # Fallback to index-based sharding.
+        if shards is None and index is not None:
+            # TODO(marko): This will use index's asof automatically.
+            shards = self._table.spiral.internal.compute_shards(index.core)
+        # Fallback to default sharding.
+        if shards is None:
+            shards = plan_scan.shards()
+        # TODO(marko): This is temporary workaround. Passing token is a bad idea.
+        #   Token can expire during long-running enrichments.
+        #   Maybe if device code is used, we can pass something.
+        token = self._table.spiral.authn.token()
+        if token is None:
+            raise ValueError("Spiral client is not authenticated.")
+        config = self._table.spiral.config
+        config.token = token
+        # Partially bind the enrichment function.
+        _compute = partial(
+            _enrichment_task,
+            settings_json=config.to_json(),
+            state_json=plan_scan.core.plan_state().to_json(),
+            output_table_id=self._table.table_id,
+            partition_size_bytes=partition_size_bytes,
+            incremental=checkpoint_dump is not None,
+        )
+        enrichments = client.map(_compute, shards)
+        logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {client.dashboard_link}")
+        failed_ranges = []
+        try:
+            for result, shard in zip(client.gather(enrichments), shards):
+                result: EnrichmentTaskResult
+                if result.error is not None:
+                    logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
+                    failed_ranges.append(shard.key_range)
+                    continue
+                tx.include(result.ops)
+        except Exception as e:
+            # If not incremental, re-raise the exception.
+            if checkpoint_dump is None:
+                raise e
+            # Handle worker failures (e.g., KilledWorker from Dask)
+            from dask.distributed import KilledWorker
+            if isinstance(e, KilledWorker):
+                logger.error(f"Dask worker was killed during enrichment: {e}")
+            # Try to gather partial results and mark remaining tasks as failed
+            for future, shard in zip(enrichments, shards):
+                if future.done() and not future.exception():
+                    try:
+                        result = future.result()
+                        if result.error is not None:
+                            logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
+                            failed_ranges.append(shard.key_range)
+                            continue
+                        tx.include(result.ops)
+                    except Exception:
+                        # Task failed or incomplete, add to failed ranges
+                        failed_ranges.append(shard.key_range)
+                else:
+                    # Task didn't complete, add to failed ranges
+                    failed_ranges.append(shard.key_range)
+        # Dump checkpoint of failed ranges, if any.
+        if checkpoint_dump is not None:
+            logger.info(
+                f"Dumping checkpoint with failed {len(failed_ranges)}/{len(shards)} ranges to {checkpoint_dump}."
+            )
+            _checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)
+        if tx.is_empty():
+            logger.warning("Transaction not committed. No rows were read for enrichment.")
+            return
+        # Always compact in distributed enrichment.
+        tx.commit(compact=True, tx_dump=tx_dump)
+def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
+    import json
+    import os
+    if not os.path.exists(checkpoint_dump):
+        return None
+    with open(checkpoint_dump) as f:
+        data = json.load(f)
+        return [
+            KeyRange(begin=Key(bytes.fromhex(r["begin"])), end=Key(bytes.fromhex(r["end"])))
+            for r in data.get("key_ranges", [])
+        ]
+def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
+    import json
+    import os
+    os.makedirs(os.path.dirname(checkpoint_dump), exist_ok=True)
+    with open(checkpoint_dump, "w") as f:
+        json.dump(
+            {"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
+            f,
+        )
+@dataclasses.dataclass
+class EnrichmentTaskResult:
+    ops: list[Operation]
+    error: str | None = None
+    def __getstate__(self):
+        return {
+            "ops": [op.to_json() for op in self.ops],
+            "error": self.error,
+        }
+    def __setstate__(self, state):
+        self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+        self.error = state["error"]
+# NOTE(marko): This function must be picklable!
+def _enrichment_task(
+    shard: Shard,
+    *,
+    settings_json: str,
+    state_json: str,
+    output_table_id,
+    partition_size_bytes: int | None,
+    incremental: bool,
+) -> EnrichmentTaskResult:
+    # Returns operations that can be included in a transaction.
+    from spiral import Scan, Spiral
+    from spiral.core.table import ScanState
+    from spiral.settings import ClientSettings
+    settings = ClientSettings.from_json(settings_json)
+    sp = Spiral(config=settings)
+    state = ScanState.from_json(state_json)
+    task_scan = Scan(sp, sp.core.load_scan(state))
+    table = sp.table(output_table_id)
+    task_tx = table.txn()
+    try:
+        task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+        return EnrichmentTaskResult(ops=task_tx.take())
+    except Exception as e:
+        task_tx.abort()
+        if incremental:
+            return EnrichmentTaskResult(ops=[], error=str(e))
+        logger.error(f"Enrichment task failed for shard {shard}: {e}")
+        raise e

spiral/expressions/__init__.py CHANGED Viewed

@@ -8,31 +8,25 @@ import pyarrow as pa
 from spiral import _lib, arrow_
+from . import file as file
 from . import http as http
-from . import io as io
 from . import list_ as list
-from . import mp4 as mp4
-from . import png as png
-from . import qoi as qoi
-from . import refs as refs
+from . import s3 as s3
 from . import str_ as str
 from . import struct as struct
 from . import text as text
-from . import tiff as tiff
 from .base import Expr, ExprLike, NativeExpr
+from .udf import UDF
 __all__ = [
     "Expr",
     "add",
     "and_",
-    "deref",
     "divide",
     "eq",
     "getitem",
     "gt",
     "gte",
-    "http",
-    "io",
     "is_not_null",
     "is_null",
     "lift",
@@ -48,19 +42,17 @@ __all__ = [
     "or_",
     "pack",
     "aux",
-    "ref",
-    "refs",
     "scalar",
     "select",
     "str",
     "struct",
     "subtract",
-    "tiff",
     "xor",
-    "png",
-    "qoi",
-    "mp4",
     "text",
+    "s3",
+    "http",
+    "file",
+    "UDF",
 ]
 # Inline some of the struct expressions since they're so common
@@ -68,8 +60,6 @@ getitem = struct.getitem
 merge = struct.merge
 pack = struct.pack
 select = struct.select
-ref = refs.ref
-deref = refs.deref
 def lift(expr: ExprLike) -> Expr:
@@ -127,16 +117,24 @@ def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
         return pa.RecordBatchReader.from_batches(expr.schema, [expr])
     if isinstance(expr, pa.StructArray):
         return pa.Table.from_struct_array(expr).to_reader()
     if isinstance(expr, pa.ChunkedArray):
-        # TODO(marko): We shouldn't need to combine chunks here._
-        return evaluate(expr.combine_chunks())
+        if not pa.types.is_struct(expr.type):
+            raise ValueError("Arrow chunked array must be a struct type.")
+        def _iter_batches():
+            for chunk in expr.chunks:
+                yield pa.RecordBatch.from_struct_array(chunk)
+        return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
     if isinstance(expr, pa.Array):
         raise ValueError("Arrow array must be a struct array.")
-    if isinstance(expr, Expr):
-        raise NotImplementedError("cannot evaluate an Expr")
-    if isinstance(expr, NativeExpr):
-        raise NotImplementedError("cannot evaluate a NativeExpr")
+    if isinstance(expr, Expr) or isinstance(expr, NativeExpr):
+        raise NotImplementedError(
+            "Expr evaluation not supported yet. Use Arrow to write instead. Reach out if you require this feature."
+        )
     if isinstance(expr, dict):
         # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if

spiral/expressions/base.py CHANGED Viewed

@@ -1,6 +1,5 @@
-import builtins
 import datetime
-from typing import TypeAlias
+from typing import TypeAlias, Union
 import pyarrow as pa
@@ -153,5 +152,11 @@ class Expr:
 ScalarLike: TypeAlias = bool | int | float | str | list["ScalarLike"] | datetime.datetime | None
-ArrowLike: TypeAlias = pa.Array | pa.ChunkedArray | pa.Scalar | pa.RecordBatch | pa.Table
-ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | builtins.list | ArrowLike | ScalarLike
+ArrowLike: TypeAlias = Union[
+    pa.RecordBatch,
+    "pa.Array[pa.Scalar[pa.DataType]]",
+    "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
+    "pa.Scalar[pa.DataType]",
+    pa.Table,
+]
+ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike

spiral/expressions/file.py ADDED Viewed

@@ -0,0 +1,17 @@
+from spiral import _lib
+from spiral.expressions.base import Expr, ExprLike
+def get(expr: ExprLike, abort_on_error: bool = False) -> Expr:
+    """Read data from the local filesystem via `file://` URLs.
+    Args:
+        expr: URLs of the data that needs to be read.
+        abort_on_error: Should the expression abort on errors or just collect them.
+    """
+    from spiral import expressions as se
+    expr = se.lift(expr)
+    # This just works :)
+    return Expr(_lib.expr.s3.get(expr.__expr__, abort_on_error))