pyspiral-0.8.9-cp311-abi3-macosx_11_0_arm64.whl → pyspiral-0.9.9-cp311-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/METADATA +4 -2
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/RECORD +39 -34
- spiral/__init__.py +3 -2
- spiral/_lib.abi3.so +0 -0
- spiral/api/__init__.py +7 -0
- spiral/api/client.py +86 -8
- spiral/api/projects.py +4 -2
- spiral/api/tables.py +77 -0
- spiral/arrow_.py +4 -155
- spiral/cli/app.py +10 -4
- spiral/cli/chooser.py +30 -0
- spiral/cli/fs.py +3 -2
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +4 -4
- spiral/cli/orgs.py +1 -1
- spiral/cli/projects.py +2 -2
- spiral/cli/tables.py +47 -20
- spiral/cli/telemetry.py +13 -6
- spiral/cli/text.py +4 -4
- spiral/cli/transactions.py +84 -0
- spiral/cli/{types.py → types_.py} +6 -6
- spiral/cli/workloads.py +4 -4
- spiral/client.py +70 -8
- spiral/core/client/__init__.pyi +25 -16
- spiral/core/table/__init__.pyi +24 -22
- spiral/debug/manifests.py +21 -9
- spiral/debug/scan.py +4 -6
- spiral/demo.py +145 -38
- spiral/enrichment.py +18 -23
- spiral/expressions/__init__.py +3 -75
- spiral/expressions/base.py +5 -10
- spiral/huggingface.py +456 -0
- spiral/input.py +131 -0
- spiral/ray_.py +75 -0
- spiral/scan.py +218 -64
- spiral/table.py +5 -4
- spiral/transaction.py +95 -15
- spiral/iterable_dataset.py +0 -106
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/WHEEL +0 -0
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/entry_points.txt +0 -0
spiral/transaction.py
CHANGED

@@ -1,14 +1,23 @@
+from __future__ import annotations
+
 import logging
 from pathlib import Path
+from typing import TYPE_CHECKING

 from spiral.core.client import Shard
 from spiral.core.table import Transaction as CoreTransaction
 from spiral.core.table.spec import Operation
-from spiral.
+from spiral.input import TableLike, evaluate
 from spiral.scan import Scan
+from spiral.types_ import Timestamp

 logger = logging.getLogger(__name__)

+if TYPE_CHECKING:
+    import ray.data
+
+    from spiral.table import Table
+

 class Transaction:
     """Spiral table transaction.
@@ -17,14 +26,20 @@ class Transaction:
     it is important that the primary key columns are unique within the transaction.
     """

-    def __init__(self, core: CoreTransaction):
+    def __init__(self, table: Table, core: CoreTransaction):
         self._core = core
+        self._table = table

     @property
     def status(self) -> str:
         """The status of the transaction."""
         return self._core.status

+    @property
+    def table(self) -> Table:
+        """The table associated with this transaction."""
+        return self._table
+
     def is_empty(self) -> bool:
         """Check if the transaction has no operations."""
         return self._core.is_empty()
@@ -38,22 +53,21 @@ class Transaction:
         else:
             self._core.abort()

-    def write(self,
+    def write(self, table: TableLike, push_down_nulls: bool = False):
         """Write an item to the table inside a single transaction.

         :param push_down_nulls: Whether to push down nullable structs down its children. E.g. `[{"a": 1}, null]` would
             become `[{"a": 1}, {"a": null}]`. SpiralDB doesn't allow struct-level nullability, so use this option if your
             data contains nullable structs.

-        :param
+        :param table: The table to write.
         """
-
+        record_batch_reader = evaluate(table)

-        record_batches = se.evaluate(expr)
         if push_down_nulls:
-            self._core.write_push_down(
+            self._core.write_push_down(record_batch_reader)
         else:
-            self._core.write(
+            self._core.write(record_batch_reader)

     def writeback(
         self,
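
The shape change that `push_down_nulls` describes can be reproduced with plain pyarrow (a sketch of the transformation only; the actual rewrite happens inside the core writer):

import pyarrow as pa

struct_type = pa.struct([pa.field("a", pa.int64())])
structs = pa.array([{"a": 1}, None], type=struct_type)

# flatten() pushes the parent struct's validity down into each child, so
# rebuilding the struct from the flattened children yields a non-null
# struct column whose children carry the nulls instead.
pushed = pa.StructArray.from_arrays(structs.flatten(), fields=list(struct_type))

print(structs.to_pylist())  # [{'a': 1}, None]
print(pushed.to_pylist())   # [{'a': 1}, {'a': None}]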
@@ -70,6 +84,12 @@ class Transaction:
         """
         self._core.writeback(scan.core, shards=shards)

+    def to_ray_datasink(self) -> ray.data.Datasink:
+        """Returns a Ray Datasink which writes into this transaction."""
+        from spiral.ray_ import Datasink
+
+        return Datasink(self)
+
     def drop_columns(self, column_paths: list[str]):
         """
         Drops the specified columns from the table.
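
`to_ray_datasink` plugs the transaction into Ray Data's generic sink API (the `Datasink` comes from the new `spiral/ray_.py` in this release). A minimal sketch of the intended flow; how the `txn` Transaction is obtained is an assumption, since the diff only shows that `Table` constructs it:

import ray

ds = ray.data.from_items([{"id": i, "value": i * i} for i in range(100)])

# `txn` is a spiral Transaction obtained from a Table (construction not
# shown in this diff). Ray workers write batches into the transaction,
# and commit() then publishes them atomically.
ds.write_datasink(txn.to_ray_datasink())
txn.commit()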
@@ -83,21 +103,29 @@ class Transaction:
         """Compact the key space of the table."""
         self._core.compact_key_space()

-    def take(self) ->
+    def take(self) -> TransactionOps:
         """Take the operations from the transaction

         Transaction can no longer be committed or aborted after calling this method.
         ."""
-        return self._core.take()
+        return TransactionOps(self._core.snapshot().asof, self._core.take())

-    def include(self, ops:
+    def include(self, ops: TransactionOps):
         """Include the given operations in the transaction.

         Checks for conflicts between the included operations and any existing operations.
-        """
-        self._core.include(ops)

-
+        IMPORTANT: The `self` transaction must be started at or before the timestamp of the included operations.
+        """
+        self_asof = self._core.snapshot().asof
+        if ops.timestamp < self_asof:
+            raise ValueError(
+                f"Cannot include operations created against an out-of-date state of the table: {ops.timestamp}. "
+                f"This transaction's asof is {self_asof}."
+            )
+        self._core.include(ops.operations)
+
+    def commit(self, *, txn_dump: str | None = None):
         """Commit the transaction."""
         if txn_dump is not None:
             try:
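
Together, `take`, `include`, and the timestamp guard enable a fan-out/fan-in commit pattern: workers stage operations, and one coordinator merges and commits them. A hedged sketch (how the two transactions are created is not shown in this diff and is assumed):

# Worker: stage writes, then detach the operations instead of committing.
ops = worker_txn.take()  # worker_txn can no longer commit or abort

# Coordinator: its transaction must have started at or before
# ops.timestamp, otherwise include() raises ValueError.
coordinator_txn.include(ops)
coordinator_txn.commit()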
@@ -114,7 +142,7 @@ class Transaction:
             except Exception as e:
                 logger.error(f"Failed to dump transaction to {txn_dump}: {e}")

-        self._core.commit(
+        self._core.commit()

     @staticmethod
     def load_dumps(*txn_dump: str) -> list[Operation]:
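
The `txn_dump` keyword pairs with `load_dumps`: commit first tries to persist the transaction's operations to the given destination (failures are only logged, so the commit itself still proceeds), and `load_dumps` reads such dumps back as `Operation`s. A sketch with an illustrative path:

txn.commit(txn_dump="/tmp/spiral_txn.dump")  # best-effort dump, then commit

# Later, e.g. when debugging a commit:
ops = Transaction.load_dumps("/tmp/spiral_txn.dump")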
@@ -154,3 +182,55 @@ class Transaction:
     def abort(self):
         """Abort the transaction."""
         self._core.abort()
+
+
+class TransactionOps:
+    """
+    Operations taken from a transaction.
+
+    Operations are timestamped and can only be included in transactions
+    that are started at or before the timestamp of the operations.
+    """
+
+    def __init__(self, timestamp: Timestamp, operations: list[Operation]):
+        self._timestamp = timestamp
+        self._operations = operations
+
+    @property
+    def timestamp(self) -> Timestamp:
+        """The timestamp of the operations.
+
+        These operations can only be included in transactions started at or before this timestamp.
+        """
+        return self._timestamp
+
+    @property
+    def operations(self) -> list[Operation]:
+        """The list of operations."""
+        return self._operations
+
+    def to_json(self):
+        """Serialize the TransactionOps to JSON."""
+        import json
+
+        return json.dumps(
+            {
+                "timestamp": self.timestamp,
+                "operations": [op.to_json() for op in self.operations],
+            }
+        )
+
+    @classmethod
+    def from_json(cls, json_str: str) -> TransactionOps:
+        """Deserialize the TransactionOps from JSON."""
+        import json
+
+        data = json.loads(json_str)
+        return TransactionOps(
+            timestamp=data["timestamp"],
+            operations=[Operation.from_json(op_json) for op_json in data["operations"]],
+        )
+
+    def __reduce__(self):
+        """Support pickle protocol by using JSON serialization."""
+        return (self.__class__.from_json, (self.to_json(),))
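
Because `__reduce__` delegates to the JSON codec, `TransactionOps` also survives plain pickling, which is what lets staged operations travel across process boundaries (e.g. from Ray workers back to a driver). A quick sketch, reusing `ops` from the take/include example above:

import pickle

restored = pickle.loads(pickle.dumps(ops))  # round-trips via to_json/from_json
assert restored.timestamp == ops.timestamp
assert len(restored.operations) == len(ops.operations)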
spiral/iterable_dataset.py
DELETED

@@ -1,106 +0,0 @@
-from collections.abc import Callable, Iterator
-from typing import TYPE_CHECKING
-
-import pyarrow as pa
-
-if TYPE_CHECKING:
-    import datasets.iterable_dataset as hf  # noqa
-    import streaming  # noqa
-    import torch.utils.data as torchdata  # noqa
-
-
-def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
-    """
-    Replace string-view and binary-view columns in the schema with strings/binary.
-    Recursively handles nested types (struct, list, etc).
-    We use this converted schema as Features in the returned Dataset.
-    Remove this method once we have https://github.com/huggingface/datasets/pull/7718
-    """
-
-    def _convert_type(dtype: pa.DataType) -> pa.DataType:
-        if dtype == pa.string_view():
-            return pa.string()
-        elif dtype == pa.binary_view():
-            return pa.binary()
-        elif pa.types.is_struct(dtype):
-            new_fields = [
-                pa.field(field.name, _convert_type(field.type), nullable=field.nullable, metadata=field.metadata)
-                for field in dtype
-            ]
-            return pa.struct(new_fields)
-        elif pa.types.is_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type))
-        elif pa.types.is_large_list(dtype):
-            return pa.large_list(_convert_type(dtype.value_type))
-        elif pa.types.is_fixed_size_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type), dtype.list_size)
-        elif pa.types.is_map(dtype):
-            return pa.map_(_convert_type(dtype.key_type), _convert_type(dtype.item_type))
-        else:
-            return dtype
-
-    new_fields = []
-    for field in schema:
-        new_type = _convert_type(field.type)
-        new_fields.append(pa.field(field.name, new_type, nullable=field.nullable, metadata=field.metadata))
-
-    return pa.schema(new_fields)
-
-
-def to_iterable_dataset(stream: pa.RecordBatchReader) -> "hf.IterableDataset":
-    from datasets import DatasetInfo, Features
-    from datasets.builder import ArrowExamplesIterable
-    from datasets.iterable_dataset import IterableDataset
-
-    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-        # This key is unused when training with IterableDataset.
-        # Default implementation returns shard id, e.g. parquet row group id.
-        for i, rb in enumerate(stream):
-            yield i, pa.Table.from_batches([rb], stream.schema)
-
-    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
-    class _IterableDataset(IterableDataset):
-        # Diff with datasets.iterable_dataset.IterableDataset:
-        # - Removes torch handling which attempts to handle worker processes.
-        # - Assumes arrow iterator.
-        def __iter__(self):
-            from datasets.formatting import get_formatter
-
-            prepared_ex_iterable = self._prepare_ex_iterable_for_iteration()
-            if self._formatting and (prepared_ex_iterable.iter_arrow or self._formatting.is_table):
-                formatter = get_formatter(self._formatting.format_type, features=self.features)
-                iterator = prepared_ex_iterable.iter_arrow()
-                for key, pa_table in iterator:
-                    yield formatter.format_row(pa_table)
-                return
-
-            for key, example in prepared_ex_iterable:
-                # no need to format thanks to FormattedExamplesIterable
-                yield example
-
-        def map(self, *args, **kwargs):
-            # Map constructs a new IterableDataset, so we need to "patch" it
-            base = super().map(*args, **kwargs)
-            if isinstance(base, IterableDataset):
-                # Patch __iter__ to avoid torch handling
-                base.__class__ = _IterableDataset  # type: ignore
-            return base
-
-    class _ArrowExamplesIterable(ArrowExamplesIterable):
-        def __init__(self, generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]], features: Features):
-            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
-            self._features = features
-
-        @property
-        def is_typed(self) -> bool:
-            return True
-
-        @property
-        def features(self) -> Features:
-            return self._features
-
-    target_features = Features.from_arrow_schema(_hf_compatible_schema(stream.schema))
-    ex_iterable = _ArrowExamplesIterable(_generate_tables, target_features)
-    info = DatasetInfo(features=target_features)
-    return _IterableDataset(ex_iterable=ex_iterable, info=info)
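
The deleted module's main subtlety was `_hf_compatible_schema`: `datasets.Features` cannot yet represent Arrow's view types (see the linked huggingface/datasets PR #7718), so view columns were downcast before building the `IterableDataset`. The non-recursive core of that idea in plain pyarrow (the new `spiral/huggingface.py` presumably absorbs this responsibility, though this diff does not show its contents):

import pyarrow as pa

def _downcast_views(schema: pa.Schema) -> pa.Schema:
    # Top-level only; the deleted helper also recursed into structs,
    # lists, and maps.
    def convert(t: pa.DataType) -> pa.DataType:
        if t == pa.string_view():
            return pa.string()
        if t == pa.binary_view():
            return pa.binary()
        return t

    return pa.schema([f.with_type(convert(f.type)) for f in schema])

print(_downcast_views(pa.schema([pa.field("text", pa.string_view())])))
# text: string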