pyspiral 0.6.11__cp312-abi3-manylinux_2_28_aarch64.whl → 0.6.13__cp312-abi3-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (41)
  1. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/METADATA +8 -5
  2. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/RECORD +36 -30
  3. spiral/__init__.py +7 -0
  4. spiral/_lib.abi3.so +0 -0
  5. spiral/cli/iceberg.py +1 -1
  6. spiral/cli/key_spaces.py +15 -1
  7. spiral/cli/tables.py +3 -3
  8. spiral/client.py +12 -11
  9. spiral/core/client/__init__.pyi +8 -8
  10. spiral/core/expr/__init__.pyi +15 -0
  11. spiral/core/expr/images/__init__.pyi +3 -0
  12. spiral/core/expr/list_/__init__.pyi +4 -0
  13. spiral/core/expr/refs/__init__.pyi +4 -0
  14. spiral/core/expr/str_/__init__.pyi +3 -0
  15. spiral/core/expr/struct_/__init__.pyi +6 -0
  16. spiral/core/expr/text/__init__.pyi +5 -0
  17. spiral/core/expr/udf/__init__.pyi +14 -0
  18. spiral/core/expr/video/__init__.pyi +3 -0
  19. spiral/core/table/__init__.pyi +19 -1
  20. spiral/core/table/spec/__init__.pyi +6 -0
  21. spiral/dataloader.py +52 -38
  22. spiral/enrichment.py +153 -0
  23. spiral/expressions/__init__.py +15 -19
  24. spiral/expressions/base.py +9 -4
  25. spiral/expressions/http.py +10 -80
  26. spiral/expressions/s3.py +15 -0
  27. spiral/expressions/tiff.py +2 -3
  28. spiral/expressions/udf.py +38 -24
  29. spiral/project.py +6 -6
  30. spiral/scan.py +76 -33
  31. spiral/settings.py +9 -6
  32. spiral/streaming_/stream.py +1 -1
  33. spiral/table.py +41 -9
  34. spiral/transaction.py +42 -0
  35. spiral/expressions/io.py +0 -100
  36. spiral/expressions/mp4.py +0 -62
  37. spiral/expressions/png.py +0 -18
  38. spiral/expressions/qoi.py +0 -18
  39. spiral/expressions/refs.py +0 -58
  40. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/WHEEL +0 -0
  41. {pyspiral-0.6.11.dist-info → pyspiral-0.6.13.dist-info}/entry_points.txt +0 -0
spiral/enrichment.py ADDED
@@ -0,0 +1,153 @@
+ import dataclasses
+ import logging
+ from functools import partial
+ from typing import TYPE_CHECKING, Optional
+
+ from spiral.core.client import Shard
+ from spiral.core.table.spec import Operation
+ from spiral.expressions import Expr
+
+ if TYPE_CHECKING:
+     from spiral import KeySpaceIndex, Table
+
+ logger = logging.getLogger(__name__)
+
+
+ class Enrichment:
+     """
+     An enrichment derives new columns from existing ones, such as fetching data from object storage
+     with `se.s3.get` or computing embeddings. With the column groups design supporting hundreds of
+     thousands of columns, horizontally expanding tables are a powerful primitive.
+
+     NOTE: Spiral aims to optimize enrichments where the source and destination tables are the same.
+     """
+
+     def __init__(
+         self,
+         table: "Table",
+         projection: Expr,
+         where: Expr | None,
+     ):
+         self._table = table
+         self._projection = projection
+         self._where = where
+
+     @property
+     def table(self) -> "Table":
+         """The table to write back into."""
+         return self._table
+
+     @property
+     def projection(self) -> Expr:
+         """The projection expression."""
+         return self._projection
+
+     @property
+     def where(self) -> Expr | None:
+         """The filter expression."""
+         return self._where
+
+     def apply(self, *, batch_readahead: int | None = None, partition_size_bytes: int | None = None) -> None:
+         """Apply the enrichment onto the table in a streaming fashion.
+
+         For large tables, consider using `apply_dask` for distributed execution.
+         """
+         scan = self._table.spiral.scan(self._projection, where=self._where)
+
+         with self._table.txn() as txn:
+             txn.writeback(
+                 scan,
+                 partition_size_bytes=partition_size_bytes,
+                 batch_readahead=batch_readahead,
+             )
+
+     # TODO(marko): Need to figure out this sharding with key space index in places.
+     # We could compute on-demand instead of requiring a resource.
+     def apply_dask(
+         self, *, index: Optional["KeySpaceIndex"] = None, partition_size_bytes: int | None = None, **kwargs
+     ) -> None:
+         """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.
+
+         If the "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster is created.
+
+         IMPORTANT: Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
+         usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
+         encountering such issues, consider splitting the enrichment into a UDF-only derivation executed in a
+         streaming fashion, followed by a Dask enrichment for the rest of the computation.
+         If that is not possible, please reach out to support for assistance.
+
+         Args:
+             index: Optional key space index to use for sharding the enrichment.
+                 If not provided, the table's default sharding will be used.
+             **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`,
+                 such as `address` to connect to an existing cluster.
+         """
+         try:
+             from dask.distributed import Client
+         except ImportError:
+             raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")
+
+         # Connect before doing any work.
+         dask_client = Client(**kwargs)
+
+         # Start a transaction BEFORE the planning scan.
+         tx = self._table.txn()
+         plan_scan = self._table.spiral.scan(self._projection, where=self._where)
+
+         # Determine the "tasks". Use the index if provided.
+         shards = plan_scan.shards()
+         if index is not None:
+             # TODO(marko): This will use index's asof automatically.
+             shards = self._table.spiral.internal.compute_shards(index.core)
+
+         # Partially bind the enrichment function.
+         _compute = partial(
+             _enrichment_task,
+             settings_dict=self._table.spiral.config.model_dump(),
+             state_json=plan_scan.core.scan_state().to_json(),
+             output_table_id=self._table.table_id,
+             partition_size_bytes=partition_size_bytes,
+         )
+         enrichments = dask_client.map(_compute, shards)
+
+         logger.info(f"Applying enrichment with {len(shards)} shards. Follow progress at {dask_client.dashboard_link}")
+         for result in dask_client.gather(enrichments):
+             result: EnrichmentTaskResult
+             tx.include(result.ops)
+
+         if tx.is_empty():
+             logger.warning("Transaction not committed. No rows were read for enrichment.")
+             return
+
+         tx.commit()
+
+
+ @dataclasses.dataclass
+ class EnrichmentTaskResult:
+     ops: list[Operation]
+
+     def __getstate__(self):
+         return {"ops": [op.to_json() for op in self.ops]}
+
+     def __setstate__(self, state):
+         self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
+
+
+ # NOTE(marko): This function must be picklable!
+ def _enrichment_task(
+     shard: Shard, *, settings_dict, state_json, output_table_id, partition_size_bytes: int | None
+ ) -> EnrichmentTaskResult:
+     # Returns operations that can be included in a transaction.
+     from spiral import Scan, Spiral
+     from spiral.core.table import ScanState
+     from spiral.settings import Settings
+
+     settings: Settings = Settings.model_validate(settings_dict)
+     sp = Spiral(config=settings)
+     state = ScanState.from_json(state_json)
+     task_scan = Scan(sp, sp.core.load_scan(state))
+     table = sp.table(output_table_id)
+
+     task_tx = table.txn()
+     task_tx.writeback(task_scan, key_range=shard.key_range, partition_size_bytes=partition_size_bytes)
+     return EnrichmentTaskResult(ops=task_tx.take())
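A minimal usage sketch of the new `Enrichment` API (the client construction, table identifier, and `uri` column below are hypothetical; the constructor and `apply` signature come from the code above):

```python
from spiral import Spiral
from spiral import expressions as se
from spiral.enrichment import Enrichment

sp = Spiral()  # assumes default settings resolve to a configured client
table = sp.table("my-project.my-dataset.my-table")  # hypothetical table id

# Hypothetical projection: fetch each row's object bytes from storage
# and write them back as a new "blob" column.
projection = se.lift({"blob": se.s3.get(table["uri"])})

enrichment = Enrichment(table, projection, where=None)
enrichment.apply()  # streaming; apply_dask() distributes the same work
```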
spiral/expressions/__init__.py CHANGED
@@ -9,30 +9,23 @@ import pyarrow as pa
  from spiral import _lib, arrow_
 
  from . import http as http
- from . import io as io
  from . import list_ as list
- from . import mp4 as mp4
- from . import png as png
- from . import qoi as qoi
- from . import refs as refs
+ from . import s3 as s3
  from . import str_ as str
  from . import struct as struct
  from . import text as text
- from . import tiff as tiff
  from .base import Expr, ExprLike, NativeExpr
+ from .udf import UDF
 
  __all__ = [
      "Expr",
      "add",
      "and_",
-     "deref",
      "divide",
      "eq",
      "getitem",
      "gt",
      "gte",
-     "http",
-     "io",
      "is_not_null",
      "is_null",
      "lift",
@@ -48,19 +41,16 @@ __all__ = [
      "or_",
      "pack",
      "aux",
-     "ref",
-     "refs",
      "scalar",
      "select",
      "str",
      "struct",
      "subtract",
-     "tiff",
      "xor",
-     "png",
-     "qoi",
-     "mp4",
      "text",
+     "s3",
+     "http",
+     "UDF",
  ]
 
  # Inline some of the struct expressions since they're so common
@@ -68,8 +58,6 @@ getitem = struct.getitem
  merge = struct.merge
  pack = struct.pack
  select = struct.select
- ref = refs.ref
- deref = refs.deref
 
 
  def lift(expr: ExprLike) -> Expr:
@@ -127,9 +115,17 @@ def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
          return pa.RecordBatchReader.from_batches(expr.schema, [expr])
      if isinstance(expr, pa.StructArray):
          return pa.Table.from_struct_array(expr).to_reader()
+
      if isinstance(expr, pa.ChunkedArray):
-         # TODO(marko): We shouldn't need to combine chunks here.
-         return evaluate(expr.combine_chunks())
+         if not pa.types.is_struct(expr.type):
+             raise ValueError("Arrow chunked array must be a struct type.")
+
+         def _iter_batches():
+             for chunk in expr.chunks:
+                 yield pa.RecordBatch.from_struct_array(chunk)
+
+         return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
+
      if isinstance(expr, pa.Array):
          raise ValueError("Arrow array must be a struct array.")
 
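The new `ChunkedArray` branch of `evaluate` streams chunk-by-chunk instead of calling `combine_chunks()`. A small sketch of the behavior, using only the public `se.evaluate` shown above:

```python
import pyarrow as pa
from spiral import expressions as se

dtype = pa.struct([("a", pa.int64())])
chunked = pa.chunked_array([
    pa.array([{"a": 1}, {"a": 2}], type=dtype),
    pa.array([{"a": 3}], type=dtype),
])

# Each struct chunk is surfaced as its own RecordBatch, so no
# contiguous copy of the whole column is materialized.
reader = se.evaluate(chunked)
assert [batch.num_rows for batch in reader] == [2, 1]
```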
spiral/expressions/base.py CHANGED
@@ -1,6 +1,5 @@
- import builtins
  import datetime
- from typing import TypeAlias
+ from typing import TypeAlias, Union
 
  import pyarrow as pa
 
@@ -153,5 +152,11 @@ class Expr:
 
 
  ScalarLike: TypeAlias = bool | int | float | str | list["ScalarLike"] | datetime.datetime | None
- ArrowLike: TypeAlias = pa.Array | pa.ChunkedArray | pa.Scalar | pa.RecordBatch | pa.Table
- ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | builtins.list | ArrowLike | ScalarLike
+ ArrowLike: TypeAlias = Union[
+     pa.RecordBatch,
+     "pa.Array[pa.Scalar[pa.DataType]]",
+     "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
+     "pa.Scalar[pa.DataType]",
+     pa.Table,
+ ]
+ ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike
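For illustration, the kinds of values the revised `ExprLike` alias admits, each passed through `lift` (a sketch; the recursive `list["ExprLike"]` replaces the old untyped `builtins.list`, and whether every branch lifts cleanly depends on the native implementation):

```python
import pyarrow as pa
from spiral import expressions as se

se.lift({"a": 1, "b": "x"})       # dict[str, ExprLike]
se.lift([1, 2, 3])                # list["ExprLike"] / a ScalarLike list
se.lift(pa.table({"a": [1, 2]}))  # ArrowLike (pa.Table)
se.lift(3.14)                     # ScalarLike
```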
spiral/expressions/http.py CHANGED
@@ -1,86 +1,16 @@
- import hishel
- import httpx
- import pyarrow as pa
-
+ from spiral import _lib
  from spiral.expressions.base import Expr, ExprLike
- from spiral.expressions.struct import pack
- from spiral.expressions.udf import UDF
- from spiral.settings import APP_DIR
-
-
- def get(url: ExprLike, headers: ExprLike = None, force_cache: bool = False) -> Expr:
-     """Submit a GET request to either a scalar or vector of URLs."""
-     to_pack = {"url": url}
-     if headers is not None:
-         to_pack["headers"] = headers
-     return HttpGet(force_cache)(pack(to_pack))
-
-
- class HttpGet(UDF):
-     RES_DTYPE: pa.DataType = pa.struct(
-         [
-             pa.field("bytes", pa.large_binary()),
-             pa.field("status", pa.int32()),
-             pa.field("headers", pa.map_(pa.string(), pa.string())),
-         ]
-     )
-
-     def __init__(self, force_cache: bool = False):
-         super().__init__("http.get")
-         self._force_cache = force_cache
-
-     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-         return HttpGet.RES_DTYPE
-
-     def invoke(self, *input_args: pa.Array) -> pa.Array:
-         if len(input_args) != 1:
-             raise ValueError(f"Expected 1 argument, got {len(input_args)}")
-         result = _http_request(input_args[0], self._force_cache)
-         if isinstance(result, pa.ChunkedArray):
-             result = result.combine_chunks()
-         return result
-
-
- def _http_request(arg: pa.Array, force_cache: bool) -> pa.Array:
-     client = _HttpClient()
-
-     if isinstance(arg, pa.StructArray):
-         # We assume a vector of requests, but with potentially many arguments
-         return pa.array(
-             [
-                 _response_dict(
-                     client.request(
-                         req.get("method", "GET").upper(),
-                         req["url"],
-                         headers=req.get("headers", {}),
-                         extensions={"force_cache": force_cache},
-                     )
-                 )
-                 for req in arg.to_pylist()
-             ],
-             type=HttpGet.RES_DTYPE,
-         )
-
-     raise TypeError(f"Unsupported argument: {arg} ({type(arg)})")
-
-
- def _response_dict(response: httpx.Response) -> dict:
-     if response.status_code != 200:
-         raise ValueError(f"Request failed with status {response.status_code}")
-     return {
-         "bytes": response.read(),
-         "status": response.status_code,
-         "headers": dict(response.headers),
-     }
-
-
- class _HttpClient(hishel.CacheClient):
-     _instance: "_HttpClient" = None
-
-     def __new__(cls, *args, **kwargs):
-         if not cls._instance:
-             cls._instance = super().__new__(cls)
-         return cls._instance
-
-     def __init__(self):
-         super().__init__(storage=hishel.FileStorage(base_path=APP_DIR / "http.cache", ttl=3600))
+
+
+ def get(expr: ExprLike) -> Expr:
+     """Read data from the URL.
+
+     Args:
+         expr: URLs of the data that needs to be read.
+     """
+     from spiral import expressions as se
+
+     expr = se.lift(expr)
+
+     # This just works :)
+     return Expr(_lib.expr.s3.get(expr.__expr__))
spiral/expressions/s3.py ADDED
@@ -0,0 +1,15 @@
+ from spiral import _lib
+ from spiral.expressions.base import Expr, ExprLike
+
+
+ def get(expr: ExprLike) -> Expr:
+     """Read data from object storage by the object's URL.
+
+     Args:
+         expr: URLs of the data that needs to be read from object storage.
+     """
+     from spiral import expressions as se
+
+     expr = se.lift(expr)
+
+     return Expr(_lib.expr.s3.get(expr.__expr__))
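Both the rewritten `http.get` and the new `s3.get` delegate to the same native `_lib.expr.s3.get` reader. A usage sketch (the table handle and its `uri` column are hypothetical; `sp.scan` is the client scan used elsewhere in this diff):

```python
from spiral import expressions as se

# Hypothetical: fetch object bytes for each row's "uri" value and scan
# them alongside the original URL.
blobs = se.s3.get(table["uri"])
scan = sp.scan(se.lift({"uri": table["uri"], "blob": blobs}))
```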
spiral/expressions/tiff.py CHANGED
@@ -2,7 +2,6 @@ import numpy as np
  import pyarrow as pa
 
  from spiral.expressions.base import Expr, ExprLike
- from spiral.expressions.udf import RefUDF
 
  _TIFF_RES_DTYPE: pa.DataType = pa.struct(
      [
@@ -78,7 +77,7 @@ def select(
      return TiffSelectUDF()(expr, shape, indexes)
 
 
- class TiffReadUDF(RefUDF):
+ class TiffReadUDF:
      def __init__(self):
          super().__init__("tiff.read")
 
@@ -122,7 +121,7 @@ class TiffReadUDF(RefUDF):
          return _return_result(result, indexes)
 
 
- class TiffSelectUDF(RefUDF):
+ class TiffSelectUDF:
      def __init__(self):
          super().__init__("tiff.select")
spiral/expressions/udf.py CHANGED
@@ -3,44 +3,58 @@ import abc
  import pyarrow as pa
 
  from spiral import _lib
- from spiral.expressions.base import Expr
+ from spiral.expressions.base import Expr, ExprLike
 
 
- class BaseUDF:
-     def __init__(self, udf):
-         self._udf = udf
-
-     def __call__(self, *args) -> Expr:
-         """Create an expression that calls this UDF with the given arguments."""
-         from spiral import expressions as se
-
-         args = [se.lift(arg).__expr__ for arg in args]
-         return Expr(self._udf(args))
-
-     @abc.abstractmethod
-     def return_type(self, *input_types: pa.DataType) -> pa.DataType: ...
-
-
- class UDF(BaseUDF):
-     """A User-Defined Function (UDF)."""
-
-     def __init__(self, name: str):
-         super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
-
-     @abc.abstractmethod
-     def invoke(self, *input_args: pa.Array) -> pa.Array: ...
-
-
- class RefUDF(BaseUDF):
-     """A UDF over a single ref cell, and therefore can access the file object."""
-
-     def __init__(self, name: str):
-         super().__init__(_lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
-
-     @abc.abstractmethod
-     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
-         """Invoke the UDF with the given arguments.
-
-         NOTE: The first argument is always the ref cell. All array input args will be sliced to the appropriate row.
-         """
-         ...
+ class UDF(abc.ABC):
+     """A User-Defined Function (UDF). This class should be subclassed to define custom UDFs.
+
+     Example:
+
+     ```python
+     from spiral import expressions as se
+     import pyarrow as pa
+
+     class MyAdd(se.UDF):
+         def __init__(self):
+             super().__init__("my_add")
+
+         def return_type(self, scope: pa.DataType):
+             if not isinstance(scope, pa.StructType):
+                 raise ValueError("Expected struct type as input")
+             return scope.field(0).type
+
+         def invoke(self, scope: pa.Array):
+             if not isinstance(scope, pa.StructArray):
+                 raise ValueError("Expected struct array as input")
+             return pa.compute.add(scope.field(0), scope.field(1))
+
+     my_add = MyAdd()
+
+     expr = my_add(table.select("first_arg", "second_arg"))
+     ```
+     """
+
+     def __init__(self, name: str):
+         self._udf = _lib.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke)
+
+     def __call__(self, scope: ExprLike) -> Expr:
+         """Create an expression that calls this UDF with the given arguments."""
+         from spiral import expressions as se
+
+         return Expr(self._udf(se.lift(scope).__expr__))
+
+     @abc.abstractmethod
+     def return_type(self, scope: pa.DataType) -> pa.DataType:
+         """Must return the return type of the UDF given the input scope type.
+
+         IMPORTANT: All expressions in Spiral must return nullable (Arrow-default) types,
+         including nested structs, meaning that all fields in structs must also be nullable,
+         and if those fields are structs, their fields must also be nullable, and so on.
+         """
+         ...
+
+     @abc.abstractmethod
+     def invoke(self, scope: pa.Array) -> pa.Array:
+         """Must implement the UDF logic given the input scope array."""
+         ...
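The nullability contract in `return_type` deserves a concrete illustration (a sketch; Arrow fields are nullable by default, so only an explicit `nullable=False` violates it):

```python
import pyarrow as pa

# Conforming: Arrow-default, all fields nullable.
ok = pa.struct([pa.field("x", pa.int64()), pa.field("y", pa.string())])

# Non-conforming per the docstring above: a non-nullable field, even
# nested inside a struct, breaks the all-fields-nullable requirement.
bad = pa.struct(
    [pa.field("inner", pa.struct([pa.field("x", pa.int64(), nullable=False)]))]
)
```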
spiral/project.py CHANGED
@@ -53,7 +53,7 @@ class Project:
          res = res[0]
 
          return Table(
-             self._spiral, self._spiral._core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
+             self._spiral, self._spiral.core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
          )
 
      def create_table(
@@ -78,7 +78,7 @@ class Project:
          key_schema = pa.schema(key_schema)
          key_schema = Schema.from_arrow(key_schema)
 
-         core_table = self._spiral._core.create_table(
+         core_table = self._spiral.core.create_table(
              project_id=self._id,
              dataset=dataset,
              table=table,
@@ -105,7 +105,7 @@ class Project:
              raise ValueError(f"Index not found: {name}")
          res = res[0]
 
-         return TextIndex(self._spiral._core.text_index(res.id), name=name)
+         return TextIndex(self._spiral.core.text_index(res.id), name=name)
 
      def create_text_index(
          self,
@@ -135,7 +135,7 @@ class Project:
          if where is not None:
              where = se.lift(where)
 
-         core_index = self._spiral._core.create_text_index(
+         core_index = self._spiral.core.create_text_index(
              project_id=self._id,
              name=name,
              projection=projection.__expr__,
@@ -154,7 +154,7 @@ class Project:
              raise ValueError(f"Index not found: {name}")
          res = res[0]
 
-         return KeySpaceIndex(self._spiral._core.key_space_index(res.id), name=name)
+         return KeySpaceIndex(self._spiral.core.key_space_index(res.id), name=name)
 
      def create_key_space_index(
          self,
@@ -185,7 +185,7 @@ class Project:
          if where is not None:
              where = se.lift(where)
 
-         core_index = self._spiral._core.create_key_space_index(
+         core_index = self._spiral.core.create_key_space_index(
              project_id=self._id,
              name=name,
              granularity=granularity,