pyspiral-0.2.5-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.3.1-cp310-abi3-macosx_11_0_arm64.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
spiral/dataset.py CHANGED
@@ -1,22 +1,23 @@
-from typing import TYPE_CHECKING, Any
+from typing import Any

 import pyarrow as pa
 import pyarrow.compute as pc
-
-if TYPE_CHECKING:
-    import pyarrow.dataset
+import pyarrow.dataset as ds

 from spiral import Scan, Table


-class TableDataset(pa.dataset.Dataset):
+class TableDataset(ds.Dataset):
     def __init__(self, table: Table):
         self._table = table
-        self._schema: pa.Schema = table.scan().schema.to_arrow()
+        # Once the table is converted to a dataset, use a pinned snapshot.
+        self._asof = table.last_modified_at
+        self._schema: pa.Schema = table._table.get_schema(asof=self._asof).to_arrow()

         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
         # super().__init__()
+        self._last_scan = None

     @property
     def schema(self) -> pa.Schema:
@@ -28,7 +29,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -58,11 +59,11 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
-        self.scanner(
+        return self.scanner(
             columns,
             filter,
             batch_size,
@@ -99,7 +100,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ) -> "TableScanner":
@@ -111,11 +112,18 @@ class TableDataset(pa.dataset.Dataset):
                 filter.to_substrait(self._schema, allow_arrow_extensions=True),
             )

-        scan = self._table.scan(
-            {c: self._table[c] for c in columns} if columns else self._table,
-            where=filter,
-            exclude_keys=True,
+        scan = (
+            self._table.scan(
+                {c: self._table[c] for c in columns},
+                where=filter,
+                exclude_keys=True,
+                asof=self._asof,
+            )
+            if columns
+            else self._table.scan(where=filter, asof=self._asof)
         )
+        self._last_scan = scan
+
         return TableScanner(scan)

     def sort_by(self, sorting, **kwargs):
@@ -129,7 +137,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -151,7 +159,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -173,7 +181,7 @@ class TableDataset(pa.dataset.Dataset):
         batch_size: int | None = None,
         batch_readahead: int | None = None,
         fragment_readahead: int | None = None,
-        fragment_scan_options: pa.dataset.FragmentScanOptions | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ):
@@ -189,12 +197,17 @@ class TableDataset(pa.dataset.Dataset):
         ).to_table()


-class TableScanner(pa.dataset.Scanner):
+class TableScanner(ds.Scanner):
     """A PyArrow Dataset Scanner that reads from a Spiral Table."""

-    def __init__(self, scan: Scan):
+    def __init__(
+        self,
+        scan: Scan,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ):
         self._scan = scan
         self._schema = scan.schema
+        self.key_table = key_table

         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
@@ -233,7 +246,195 @@ class TableScanner(pa.dataset.Scanner):
         return self.to_reader()

     def to_reader(self):
-        return self._scan.to_record_batches()
+        return self._scan.to_record_batches(key_table=self.key_table)

     def to_table(self):
         return self.to_reader().read_all()
+
+
+class ScanDataset(ds.Dataset):
+    def __init__(
+        self,
+        scan: Scan,
+        key_table: pa.Table | pa.RecordBatchReader | None = None,
+    ):
+        self._scan = scan
+        self._schema: pa.Schema = scan.schema.to_arrow()
+        self._key_table = key_table
+
+        # We don't actually initialize a Dataset, we just implement enough of the API
+        # to fool both DuckDB and Polars.
+        # super().__init__()
+
+    @property
+    def schema(self) -> pa.Schema:
+        return self._schema
+
+    def count_rows(
+        self,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            None,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).count_rows()
+
+    def filter(self, expression: pc.Expression) -> "ScanDataset":
+        raise NotImplementedError("filter not implemented")
+
+    def get_fragments(self, filter: pc.Expression | None = None):
+        """TODO(ngates): perhaps we should return ranges as per our split API?"""
+        raise NotImplementedError("get_fragments not implemented")
+
+    def head(
+        self,
+        num_rows: int,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).head(num_rows)
+
+    def join(
+        self,
+        right_dataset,
+        keys,
+        right_keys=None,
+        join_type=None,
+        left_suffix=None,
+        right_suffix=None,
+        coalesce_keys=True,
+        use_threads=True,
+    ):
+        raise NotImplementedError("join not implemented")
+
+    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
+        raise NotImplementedError("join_asof not implemented")
+
+    def replace_schema(self, schema: pa.Schema) -> "ScanDataset":
+        raise NotImplementedError("replace_schema not implemented")
+
+    def scanner(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ) -> "TableScanner":
+        if columns is not None:
+            columns = set(columns)
+            names = set(self.schema.names)
+            if len(columns - names) != 0 or len(names - columns) != 0:
+                raise NotImplementedError("columns", columns, self.schema)
+        if filter is not None:
+            raise NotImplementedError("filter")
+        if batch_size is not None:
+            raise NotImplementedError("batch_size")
+        if batch_readahead is not None:
+            raise NotImplementedError("batch_readahead")
+        if fragment_readahead is not None:
+            raise NotImplementedError("fragment_readahead")
+        if fragment_scan_options is not None:
+            raise NotImplementedError("fragment_scan_options")
+
+        return TableScanner(self._scan, key_table=self._key_table)
+
+    def sort_by(self, sorting, **kwargs):
+        raise NotImplementedError("sort_by not implemented")
+
+    def take(
+        self,
+        indices: pa.Array | Any,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).take(indices)
+
+    def to_batches(
+        self,
+        columns: list[str] | None = None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).to_batches()
+
+    def to_table(
+        self,
+        columns=None,
+        filter: pc.Expression | None = None,
+        batch_size: int | None = None,
+        batch_readahead: int | None = None,
+        fragment_readahead: int | None = None,
+        fragment_scan_options: ds.FragmentScanOptions | None = None,
+        use_threads: bool = True,
+        memory_pool: pa.MemoryPool = None,
+    ):
+        return self.scanner(
+            columns,
+            filter,
+            batch_size,
+            batch_readahead,
+            fragment_readahead,
+            fragment_scan_options,
+            use_threads,
+            memory_pool,
+        ).to_table()
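
For orientation, here is a minimal, hedged sketch of how these duck-typed datasets might be consumed. The client setup below (`import spiral as sp`, `sp.Spiral()`, the table name) is hypothetical and not part of this diff; what the diff does establish is that TableDataset and ScanDataset implement just enough of the PyArrow Dataset API that DuckDB and Polars accept them as-is.

    # A hedged sketch, not part of the package: the client calls are assumptions.
    import duckdb
    import polars as pl
    import spiral as sp  # hypothetical client entry point
    from spiral.dataset import ScanDataset

    table = sp.Spiral().table("my-project.my-table")  # hypothetical handle
    dataset = ScanDataset(table.scan())

    # Both engines treat the object as a regular PyArrow dataset.
    duckdb.sql("SELECT count(*) FROM dataset").show()
    print(pl.scan_pyarrow_dataset(dataset).head(5).collect())
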
spiral/expressions/__init__.py CHANGED
@@ -10,6 +10,9 @@ from spiral import _lib, arrow
 from . import http as http
 from . import io as io
 from . import list_ as list
+from . import mp4 as mp4
+from . import png as png
+from . import qoi as qoi
 from . import refs as refs
 from . import str_ as str
 from . import struct as struct
@@ -42,6 +45,7 @@ __all__ = [
     "not_",
     "or_",
     "pack",
+    "keyed",
     "ref",
     "refs",
     "scalar",
@@ -52,6 +56,9 @@ __all__ = [
     "tiff",
     "var",
     "xor",
+    "png",
+    "qoi",
+    "mp4",
 ]

 # Inline some of the struct expressions since they're so common
@@ -88,6 +95,10 @@ def lift(expr: ExprLike) -> Expr:

     # If the value is struct-like, we un-nest any dot-separated field names
     if isinstance(expr, pa.StructArray | pa.StructScalar):
+        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
+            raise ValueError("lift: cannot lift a struct array with nulls.")
+        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
+            raise ValueError("lift: cannot lift a struct scalar with nulls.")
         return lift(arrow.nest_structs(expr))

     if isinstance(expr, pa.Array):
@@ -97,9 +108,13 @@
     return scalar(expr)


-def var(name: builtins.str) -> Expr:
-    """Create a variable expression."""
-    return Expr(_lib.spql.expr.var(name))
+def key(name: builtins.str) -> Expr:
+    """Create a variable expression referencing a key column.
+
+    Args:
+        name: variable name
+    """
+    return Expr(_lib.spql.expr.keyed(name))


 def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
@@ -112,7 +127,7 @@ def keyed(name: builtins.str, dtype: pa.DataType) -> Expr:
         name: variable name
         dtype: must match dtype of the column in the key table.
     """
-    return Expr(_lib.spql.expr.keyed(f"#{name}", dtype))
+    return Expr(_lib.spql.expr.keyed(name, dtype))


 def scalar(value: Any) -> Expr:
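
Two behavioral changes fall out of these hunks: `var` is replaced by `key`, and `keyed` no longer mangles the variable name with a `#` prefix before handing it to `_lib.spql.expr.keyed`. A hedged sketch of how a keyed expression lines up with a key table (the `frames` column name is illustrative, not from this diff):

    # Illustrative only: the "frames" column name is an assumption.
    import pyarrow as pa
    import spiral.expressions as se

    # After this change the name is passed through verbatim; previously it
    # would have been registered as "#frames".
    frames = se.keyed("frames", pa.list_(pa.uint32()))

    # The dtype must match the corresponding column of the key table supplied
    # at scan time (e.g. via TableScanner's new key_table argument).
    key_table = pa.table({"frames": pa.array([[0, 10]], type=pa.list_(pa.uint32()))})
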
spiral/expressions/mp4.py ADDED
@@ -0,0 +1,69 @@
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+
+from spiral.expressions.base import Expr, ExprLike
+
+if TYPE_CHECKING:
+    from spiral import Table
+
+_MP4_RES_DTYPE: pa.DataType = pa.struct(
+    [
+        pa.field("pixels", pa.large_binary()),
+        pa.field("height", pa.uint32()),
+        pa.field("width", pa.uint32()),
+        pa.field("frames", pa.uint32()),
+    ]
+)
+
+
+# TODO(marko): Support optional range and crop.
+# IMPORTANT: Frames is currently broken and defaults to full.
+def read(expr: ExprLike | str, frames: ExprLike | str, crop: ExprLike | str, *, table: "Table" = None):
+    """
+    Read the referenced cells as `MP4` video. Requires `ffmpeg`.
+
+    Args:
+        expr: The referenced `MP4` bytes.
+            A str is assumed to be the `se.keyed` expression.
+        frames: The range of frames to read. Each element must be a list of two uint32,
+            frame start and frame end, or null / empty list to read all frames.
+            A str is assumed to be the `se.keyed` expression.
+        crop: The crop of the frames to read. Each element must be a list of four uint32,
+            x, y, width, height, or null / empty list to read full frames.
+            A str is assumed to be the `se.keyed` expression.
+        table (optional): The table to de-reference from, if not available in the input expression.
+
+    Returns:
+        An array where each element is a decoded cropped video with fields:
+            pixels: RGB8 bytes, frames * width * height * 3.
+            width: Width of the image with type `pa.uint32()`.
+            height: Height of the image with type `pa.uint32()`.
+            frames: Number of frames with type `pa.uint32()`.
+    """
+    from spiral import _lib
+    from spiral.expressions import keyed, lift
+
+    if isinstance(expr, str):
+        expr = keyed(
+            expr,
+            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
+        )
+    if isinstance(frames, str):
+        frames = keyed(frames, pa.list_(pa.uint32()))
+    if isinstance(crop, str):
+        crop = keyed(crop, pa.list_(pa.uint32()))
+
+    expr = lift(expr)
+    frames = lift(frames)
+    crop = lift(crop)
+
+    return Expr(
+        _lib.spql.expr.video.read(
+            expr.__expr__,
+            frames.__expr__,
+            crop.__expr__,
+            format="mp4",
+            table=table._table if table is not None else None,
+        )
+    )
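
A hedged usage sketch for this new module: string arguments are promoted to `se.keyed` expressions, so the frame range and crop can be driven per row by a key table. The column names below are illustrative, and note the TODO above: the frames range is currently broken and defaults to the full video.

    # Illustrative column names; assumes a key table carrying "frames"/"crop".
    import spiral.expressions as se

    videos = se.mp4.read("video", "frames", "crop")
    # Each result element is a struct: pixels (RGB8 bytes), width, height, frames.
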
spiral/expressions/png.py ADDED
@@ -0,0 +1,18 @@
+from spiral.expressions.base import Expr, ExprLike
+
+
+def encode(expr: ExprLike) -> Expr:
+    """Encode the given expression as a PNG image.
+
+    Args:
+        expr: The expression to encode.
+            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
+
+    Returns:
+        The encoded PNG images.
+    """
+    from spiral import _lib
+    from spiral.expressions import lift
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.img.encode(expr.__expr__, format="png"))
spiral/expressions/qoi.py ADDED
@@ -0,0 +1,18 @@
+from spiral.expressions.base import Expr, ExprLike
+
+
+def encode(expr: ExprLike) -> Expr:
+    """Encode the given expression as a QOI image.
+
+    Args:
+        expr: The expression to encode.
+            Expects a struct with `pixels`, `width`, `height`, `channels`, `channel_bit_depth` fields.
+
+    Returns:
+        The encoded QOI images.
+    """
+    from spiral import _lib
+    from spiral.expressions import lift
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.img.encode(expr.__expr__, format="qoi"))
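
`png.encode` and `qoi.encode` share the same shape, both lowering to `_lib.spql.expr.img.encode` with a different `format`. A hedged sketch of the input their docstrings describe (the field values are illustrative, and this assumes `lift` accepts a null-free struct array, as the `lift` changes above suggest):

    # Illustrative 2x2 RGB image; 8-bit channels are an assumption.
    import pyarrow as pa
    import spiral.expressions as se

    images = pa.array(
        [
            {
                "pixels": b"\x00" * (2 * 2 * 3),
                "width": 2,
                "height": 2,
                "channels": 3,
                "channel_bit_depth": 8,
            }
        ]
    )
    png_bytes = se.png.encode(images)
    qoi_bytes = se.qoi.encode(images)
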
spiral/expressions/refs.py CHANGED
@@ -1,5 +1,7 @@
 from typing import TYPE_CHECKING

+import pyarrow as pa
+
 from spiral.expressions.base import Expr, ExprLike

 if TYPE_CHECKING:
@@ -25,20 +27,38 @@ def ref(expr: ExprLike, field: str | None = None) -> Expr:
     return Expr(_lib.spql.expr.ref(expr.__expr__, field))


-def deref(expr: ExprLike, field: str | None = None, table: "Table" = None) -> Expr:
+def deref(expr: ExprLike | str, field: str | None = None, *, table: "Table" = None) -> Expr:
     """De-reference referenced values.

     See `ref` for more information on Spiral's reference values. This expression is used to de-reference referenced
     columns back into their original form, e.g. binary.

     Args:
-        expr: The expression to de-reference.
+        expr: The expression to de-reference. A str is assumed to be the `se.keyed` expression.
         field: If the expr evaluates into a struct, the field name of that struct that should be de-referenced.
             If `None`, the expr must evaluate into a reference type.
         table (optional): The table to de-reference from, if not available in the input expression.
     """
     from spiral import _lib
+    from spiral.expressions import keyed, lift
+
+    if isinstance(expr, str):
+        expr = keyed(
+            expr,
+            pa.struct([("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]),
+        )
+
+    expr = lift(expr)
+    return Expr(_lib.spql.expr.deref(expr.__expr__, field=field, table=table._table if table is not None else None))
+
+
+def nbytes(expr: ExprLike) -> Expr:
+    """Return the number of bytes in a reference.
+
+    Args:
+        expr: The ref expression to get the number of bytes from.
+    """
     from spiral.expressions import lift

     expr = lift(expr)
-    return Expr(_lib.spql.expr.deref(expr.__expr__, field, table._table if table is not None else None))
+    return expr["__ref__"]["end"] - expr["__ref__"]["begin"]
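
Finally, a hedged sketch tying the refs changes together: `deref` now promotes a bare string to a keyed `__ref__` expression, and `nbytes` is plain expression arithmetic over the ref's `begin`/`end` offsets rather than a native call. The column name `video` is illustrative:

    # Illustrative column name; the __ref__ dtype mirrors what deref builds.
    import pyarrow as pa
    import spiral.expressions as se

    payload = se.deref("video")  # str promotes to an se.keyed ref expression

    ref_dtype = pa.struct(
        [("__ref__", pa.struct([("id", pa.string()), ("begin", pa.uint64()), ("end", pa.uint64())]))]
    )
    sizes = se.refs.nbytes(se.keyed("video", ref_dtype))  # end - begin, per row
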