pyspiral 0.6.3__cp310-abi3-macosx_11_0_arm64.whl → 0.6.5__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spiral/scan.py CHANGED
@@ -1,4 +1,3 @@
- from collections.abc import Iterator
  from typing import TYPE_CHECKING, Any

  import pyarrow as pa
@@ -120,8 +119,11 @@ class Scan:
  self,
  shuffle: ShuffleStrategy | None = None,
  batch_readahead: int | None = None,
+ num_workers: int | None = None,
+ worker_id: int | None = None,
+ infinite: bool = False,
  ) -> "hf.IterableDataset":
- """Returns an Huggingface's IterableDataset.
+ """Returns a Huggingface's IterableDataset.

  Requires `datasets` package to be installed.

@@ -130,39 +132,25 @@ class Scan:
  batch_readahead: Controls how many batches to read ahead concurrently.
  If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
  Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
+ num_workers: If not None, shards the scan across multiple workers.
+ Must be used together with worker_id.
+ worker_id: If not None, the id of the current worker.
+ Scan will only return a subset of the data corresponding to the worker_id.
+ infinite: If True, the returned IterableDataset will loop infinitely over the data,
+ re-shuffling ranges after exhausting all data.
  """
- from datasets import DatasetInfo, Features
- from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
-
- def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
- stream = self.core.to_shuffled_record_batches(
- shuffle,
- batch_readahead,
- )
-
- # This key is unused when training with IterableDataset.
- # Default implementation returns shard id, e.g. parquet row group id.
- for i, rb in enumerate(stream):
- yield i, pa.Table.from_batches([rb], stream.schema)
-
- def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
- """
- Replace string-view columns in the schema with strings. We do use this converted schema
- as Features in the returned Dataset.
- Remove this method once we have https://github.com/huggingface/datasets/pull/7718
- """
- new_fields = [
- pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
- if field.type == pa.string_view()
- else field
- for field in schema
- ]
- return pa.schema(new_fields)
-
- # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
- ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={}) # type: ignore
- info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
- return IterableDataset(ex_iterable=ex_iterable, info=info)
+
+ stream = self.core.to_shuffled_record_batches(
+ shuffle,
+ batch_readahead,
+ num_workers,
+ worker_id,
+ infinite,
+ )
+
+ from spiral.iterable_dataset import to_iterable_dataset
+
+ return to_iterable_dataset(stream)

  def _splits(self) -> list[KeyRange]:
  # Splits the scan into a set of key ranges.
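
For orientation, a minimal sketch of how the new sharding options might be used from a training script. It assumes the Scan method shown above is named `to_iterable_dataset` (its `def` line falls outside this hunk) and that `scan` is an already-constructed Scan; only the `shuffle`, `batch_readahead`, `num_workers`, `worker_id`, and `infinite` parameters come from this diff.

def make_worker_dataset(scan, worker_id: int, num_workers: int):
    # Each worker reads only its shard of the key space; with infinite=True the
    # dataset re-shuffles ranges and keeps looping after the data is exhausted.
    return scan.to_iterable_dataset(
        batch_readahead=2,        # keep low unless decoding/transform work follows the read
        num_workers=num_workers,  # total number of workers sharing the scan
        worker_id=worker_id,      # this worker's id; must be set together with num_workers
        infinite=True,
    )
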
spiral/settings.py CHANGED
@@ -24,6 +24,8 @@ CI = "GITHUB_ACTIONS" in os.environ
  APP_DIR = Path(typer.get_app_dir("pyspiral"))
  LOG_DIR = APP_DIR / "logs"

+ PACKAGE_NAME = "pyspiral"
+

  def validate_token(v, handler: ValidatorFunctionWrapHandler):
  if isinstance(v, str):
@@ -89,7 +91,7 @@ class Settings(BaseSettings):
  def authn(self):
  if self.spiraldb.token:
  return Authn.from_token(self.spiraldb.token)
- return Authn.from_fallback()
+ return Authn.from_fallback(self.spiraldb.uri)

  @functools.cached_property
  def device_code_auth(self) -> DeviceCodeAuth:
spiral/snapshot.py CHANGED
@@ -1,5 +1,6 @@
  from typing import TYPE_CHECKING

+ from spiral import ShuffleStrategy
  from spiral.core.table import Snapshot as CoreSnapshot
  from spiral.core.table.spec import Schema
  from spiral.types_ import Timestamp
@@ -8,6 +9,7 @@ if TYPE_CHECKING:
  import duckdb
  import polars as pl
  import pyarrow.dataset as ds
+ import torch.utils.data as torchdata # noqa

  from spiral.table import Table

@@ -53,3 +55,17 @@ class Snapshot:
  import duckdb

  return duckdb.from_arrow(self.to_dataset())
+
+ def to_iterable_dataset(
+ self,
+ *,
+ shuffle: ShuffleStrategy | None = None,
+ batch_readahead: int | None = None,
+ infinite: bool = False,
+ ) -> "torchdata.IterableDataset":
+ """Returns an iterable dataset compatible with `torch.IterableDataset`.
+
+ See `Table` docs for details on the parameters.
+ """
+ # TODO(marko): WIP.
+ raise NotImplementedError
@@ -25,12 +25,16 @@ class SpiralStream:
  """

  def __init__(
- self, scan: CoreScan, shards: list[Shard], cache_dir: str | None = None, shard_row_block_size: int = 8192
+ self,
+ scan: CoreScan,
+ shards: list[Shard],
+ cache_dir: str | None = None,
+ shard_row_block_size: int | None = None,
  ):
  self._scan = scan
  # TODO(marko): Read shards only on world.is_local_leader in `get_shards` and materialize on disk.
  self._shards = shards
- self.shard_row_block_size = shard_row_block_size
+ self._shard_row_block_size = shard_row_block_size or 8192

  if cache_dir is not None:
  if not os.path.exists(cache_dir):
@@ -99,7 +103,7 @@ class SpiralStream:
  shard_path,
  shard.shard.key_range,
  expected_cardinality=shard.shard.cardinality,
- shard_row_block_size=self.shard_row_block_size,
+ shard_row_block_size=self._shard_row_block_size,
  )

  # Get the size of the file on disk.
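
Note that `shard_row_block_size` is now optional and resolved inside the constructor, which lets callers such as `Table.to_streaming` (below) forward `None` without knowing the default. A tiny sketch of the resolution logic, using a hypothetical standalone helper name:

def resolve_shard_row_block_size(shard_row_block_size: int | None = None) -> int:
    # None (or omitting the argument) falls back to the previous hard-coded default.
    # Note: because `or` is used, an explicit 0 also resolves to 8192.
    return shard_row_block_size or 8192

assert resolve_shard_row_block_size() == 8192
assert resolve_shard_row_block_size(2048) == 2048
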
spiral/table.py CHANGED
@@ -1,6 +1,7 @@
  from datetime import datetime
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional

+ from spiral import ShuffleStrategy
  from spiral.core.table import Table as CoreTable
  from spiral.core.table.spec import Schema
  from spiral.expressions.base import Expr, ExprLike
@@ -115,7 +116,7 @@ class Table(Expr):


  :param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
- All columns must exist, if a a column doesn't exist the function will return an error.
+ All columns must exist, if a column doesn't exist the function will return an error.
  """
  with self.txn() as txn:
  txn.drop_columns(column_paths)
@@ -126,13 +127,16 @@ class Table(Expr):
  asof = int(asof.timestamp() * 1_000_000)
  return Snapshot(self, self.core.get_snapshot(asof=asof))

- def txn(self) -> Transaction:
+ def txn(self, retries: int | None = 3) -> Transaction:
  """Begins a new transaction. Transaction must be committed for writes to become visible.

+ :param retries: Maximum number of retry attempts on conflict (default: 3). Set to None for a single attempt.
+
  IMPORTANT: While transaction can be used to atomically write data to the table,
  it is important that the primary key columns are unique within the transaction.
+ The behavior is undefined if this is not the case.
  """
- return Transaction(self.spiral._core.transaction(self.core, settings().file_format))
+ return Transaction(self.spiral._core.transaction(self.core, settings().file_format, retries=retries))

  def to_dataset(self) -> "ds.Dataset":
  """Returns a PyArrow Dataset representing the table."""
@@ -146,108 +150,58 @@ class Table(Expr):
  """Returns a DuckDB relation for the Spiral table."""
  return self.snapshot().to_duckdb()

- def to_data_loader(self, *, index: "KeySpaceIndex", **kwargs) -> "torchdata.DataLoader":
- """Returns a PyTorch DataLoader.
+ def to_iterable_dataset(
+ self,
+ *,
+ index: Optional["KeySpaceIndex"] = None,
+ shuffle: ShuffleStrategy | None = None,
+ batch_readahead: int | None = None,
+ infinite: bool = False,
+ ) -> "torchdata.IterableDataset":
+ """Returns an iterable dataset compatible with `torch.IterableDataset`. It can be used for training
+ in local or distributed settings.
+
+ Supports sharding, shuffling, and compatible for multiprocessing with `num_workers`. If projections and
+ filtering are needed, you must create a key space index and pass it when creating the stream.

- Requires `torch` and `streaming` package to be installed.
+ Requires `torch` package to be installed.

  Args:
- index: See `streaming` method.
+ shuffle: Controls sample shuffling. If None, no shuffling is performed.
+ batch_readahead: Controls how many batches to read ahead concurrently.
+ If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+ Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
+ asof: If provided, only data written before the given timestamp will be returned.
+ If `index` is provided, it must not be used. The index's `asof` will be used instead.
+ infinite: If True, the returned IterableDataset will loop infinitely over the data,
+ re-shuffling ranges after exhausting all data.
+ index: Optional prebuilt KeysIndex to use when creating the stream.
+ The index's `asof` will be used when scanning.
  **kwargs: Additional arguments passed to the PyTorch DataLoader constructor.
-
  """
- from streaming import StreamingDataLoader
-
- dataset_kwargs = {}
- if "batch_size" in kwargs:
- # Keep it in kwargs for DataLoader
- dataset_kwargs["batch_size"] = kwargs["batch_size"]
- if "cache_limit" in kwargs:
- dataset_kwargs["cache_limit"] = kwargs.pop("cache_limit")
- if "sampling_method" in kwargs:
- dataset_kwargs["sampling_method"] = kwargs.pop("sampling_method")
- if "sampling_granularity" in kwargs:
- dataset_kwargs["sampling_granularity"] = kwargs.pop("sampling_granularity")
- if "partition_algo" in kwargs:
- dataset_kwargs["partition_algo"] = kwargs.pop("partition_algo")
- if "num_canonical_nodes" in kwargs:
- dataset_kwargs["num_canonical_nodes"] = kwargs.pop("num_canonical_nodes")
- if "shuffle" in kwargs:
- dataset_kwargs["shuffle"] = kwargs.pop("shuffle")
- if "shuffle_algo" in kwargs:
- dataset_kwargs["shuffle_algo"] = kwargs.pop("shuffle_algo")
- if "shuffle_seed" in kwargs:
- dataset_kwargs["shuffle_seed"] = kwargs.pop("shuffle_seed")
- if "shuffle_block_size" in kwargs:
- dataset_kwargs["shuffle_block_size"] = kwargs.pop("shuffle_block_size")
- if "batching_method" in kwargs:
- dataset_kwargs["batching_method"] = kwargs.pop("batching_method")
- if "replication" in kwargs:
- dataset_kwargs["replication"] = kwargs.pop("replication")
-
- dataset = self.to_streaming_dataset(index=index, **dataset_kwargs)
-
- return StreamingDataLoader(dataset=dataset, **kwargs)
-
- def to_streaming_dataset(
+ # TODO(marko): WIP.
+ raise NotImplementedError
+
+ def to_streaming(
  self,
- *,
  index: "KeySpaceIndex",
- batch_size: int | None = None,
+ *,
+ projection: Expr | None = None,
  cache_dir: str | None = None,
- cache_limit: int | str | None = None,
- predownload: int | None = None,
- sampling_method: str = "balanced",
- sampling_granularity: int = 1,
- partition_algo: str = "relaxed",
- num_canonical_nodes: int | None = None,
- shuffle: bool = False,
- shuffle_algo: str = "py1e",
- shuffle_seed: int = 9176,
- shuffle_block_size: int | None = None,
- batching_method: str = "random",
- replication: int | None = None,
- ) -> "streaming.StreamingDataset":
- """Returns a MosaicML's StreamingDataset that can be used for distributed training.
-
- Requires `streaming` package to be installed.
-
- Args:
- See `streaming` method for `index` arg.
- See MosaicML's `StreamingDataset` for other args.
-
- This is a helper method to construct a single stream dataset from the scan. When multiple streams are combined,
- use `to_stream` to get the SpiralStream and construct the StreamingDataset manually using a `streams` arg.
- """
- from streaming import StreamingDataset
-
- stream = self.to_streaming(index=index, cache_dir=cache_dir)
-
- return StreamingDataset(
- streams=[stream],
- batch_size=batch_size,
- cache_limit=cache_limit,
- predownload=predownload,
- sampling_method=sampling_method,
- sampling_granularity=sampling_granularity,
- partition_algo=partition_algo,
- num_canonical_nodes=num_canonical_nodes,
- shuffle=shuffle,
- shuffle_algo=shuffle_algo,
- shuffle_seed=shuffle_seed,
- shuffle_block_size=shuffle_block_size,
- batching_method=batching_method,
- replication=replication,
- )
-
- def to_streaming(self, index: "KeySpaceIndex", *, cache_dir: str | None = None) -> "streaming.Stream":
+ shard_row_block_size: int | None = None,
+ ) -> "streaming.Stream":
  """Returns a stream to be used with MosaicML's StreamingDataset.

  Requires `streaming` package to be installed.

  Args:
- index: Prebuilt KeysIndex to use when creating the stream. The index's `asof` will be used when scanning.
+ index: Prebuilt KeysIndex to use when creating the stream.
+ The index's `asof` will be used when scanning.
+ projection: Optional projection to use when scanning the table if index's projection is not used.
+ Projection must be compatible with the index's projection for correctness.
  cache_dir: Directory to use for caching data. If None, a temporary directory will be used.
+ shard_row_block_size: Number of rows per segment of a shard file. Defaults to 8192.
+ Value should be set to lower for larger rows.
  """
  from spiral.streaming_ import SpiralStream

@@ -258,7 +212,7 @@ class Table(Expr):

  # We know table from projection is in the session cause this method is on it.
  scan = self.spiral.scan(
- index.projection,
+ projection if projection is not None else index.projection,
  where=index.filter,
  asof=index.asof,
  # TODO(marko): This should be configurable?
@@ -269,4 +223,9 @@ class Table(Expr):
  # We have a world there and can compute shards only on leader.
  shards = self.spiral._core._ops().compute_shards(index=index.core)

- return SpiralStream(scan=scan.core, shards=shards, cache_dir=cache_dir) # type: ignore[return-value]
+ return SpiralStream(
+ scan=scan.core,
+ shards=shards,
+ cache_dir=cache_dir,
+ shard_row_block_size=shard_row_block_size,
+ ) # type: ignore[return-value]
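
To show how the reworked `to_streaming` signature composes with MosaicML streaming, a hedged sketch follows. `table` and `key_index` (a prebuilt KeySpaceIndex) are placeholders, and the `StreamingDataset` arguments mirror the helper that was removed above; `projection` and `shard_row_block_size` are the parameters introduced in this diff.

from streaming import StreamingDataset

stream = table.to_streaming(
    key_index,
    projection=None,            # or a projection compatible with the index's projection
    cache_dir="/tmp/spiral-cache",
    shard_row_block_size=2048,  # below the 8192 default, e.g. for larger rows
)

# A single-stream dataset, roughly what the removed `to_streaming_dataset` helper built.
dataset = StreamingDataset(streams=[stream], batch_size=32, shuffle=True)
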
spiral/transaction.py CHANGED
@@ -45,7 +45,7 @@ class Transaction:


  :param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
- All columns must exist, if a a column doesn't exist the function will return an error.
+ All columns must exist, if a column doesn't exist the function will return an error.
  """
  self._core.drop_columns(column_paths)