pyspiral-0.6.3-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.4-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.6.3.dist-info → pyspiral-0.6.4.dist-info}/METADATA +3 -3
- {pyspiral-0.6.3.dist-info → pyspiral-0.6.4.dist-info}/RECORD +29 -27
- {pyspiral-0.6.3.dist-info → pyspiral-0.6.4.dist-info}/WHEEL +1 -1
- spiral/_lib.abi3.so +0 -0
- spiral/api/client.py +1 -1
- spiral/api/filesystems.py +9 -40
- spiral/cli/app.py +42 -6
- spiral/cli/fs.py +25 -60
- spiral/cli/login.py +3 -2
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/client/__init__.pyi +12 -1
- spiral/core/table/__init__.pyi +3 -0
- spiral/debug/manifests.py +26 -18
- spiral/expressions/__init__.py +2 -2
- spiral/expressions/base.py +9 -3
- spiral/iterable_dataset.py +106 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1 -1
- spiral/protogen/_/google/protobuf/__init__.py +121 -1
- spiral/protogen/_/scandal/__init__.py +1 -1
- spiral/protogen/_/spfs/__init__.py +1 -1
- spiral/protogen/_/spql/__init__.py +1 -1
- spiral/protogen/_/substrait/__init__.py +1 -1
- spiral/protogen/_/substrait/extensions/__init__.py +1 -1
- spiral/scan.py +22 -34
- spiral/settings.py +2 -0
- spiral/snapshot.py +16 -0
- spiral/streaming_/stream.py +7 -3
- spiral/table.py +48 -91
- {pyspiral-0.6.3.dist-info → pyspiral-0.6.4.dist-info}/entry_points.txt +0 -0
spiral/table.py
CHANGED
@@ -1,6 +1,7 @@
 from datetime import datetime
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
+from spiral import ShuffleStrategy
 from spiral.core.table import Table as CoreTable
 from spiral.core.table.spec import Schema
 from spiral.expressions.base import Expr, ExprLike
@@ -131,6 +132,7 @@ class Table(Expr):
 
         IMPORTANT: While transaction can be used to atomically write data to the table,
         it is important that the primary key columns are unique within the transaction.
+        The behavior is undefined if this is not the case.
         """
         return Transaction(self.spiral._core.transaction(self.core, settings().file_format))
 
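The added sentence is the whole change here: duplicate primary keys inside one transaction are now explicitly undefined behavior. A minimal sketch of the constraint, assuming a pyarrow-based write method on `Transaction` (`txn.write(...)` and `txn.commit()` are hypothetical names; only `Table.transaction()` and the uniqueness rule appear in this diff):

```python
import pyarrow as pa

# `table` is a spiral.Table obtained elsewhere; the write/commit calls below
# are hypothetical stand-ins for whatever the Transaction API actually exposes.
txn = table.transaction()

# Fine: the primary key "id" is unique across everything written in this txn.
txn.write(pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]}))

# Undefined behavior per the new docstring: "id" == 3 repeats within this txn.
txn.write(pa.table({"id": [3, 4], "value": ["c2", "d"]}))

txn.commit()
```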
@@ -146,108 +148,58 @@ class Table(Expr):
         """Returns a DuckDB relation for the Spiral table."""
         return self.snapshot().to_duckdb()
 
-    def
-
+    def to_iterable_dataset(
+        self,
+        *,
+        index: Optional["KeySpaceIndex"] = None,
+        shuffle: ShuffleStrategy | None = None,
+        batch_readahead: int | None = None,
+        infinite: bool = False,
+    ) -> "torchdata.IterableDataset":
+        """Returns an iterable dataset compatible with `torch.IterableDataset`. It can be used for training
+        in local or distributed settings.
 
-
+        Supports sharding, shuffling, and compatible for multiprocessing with `num_workers`. If projections and
+        filtering are needed, you must create a key space index and pass it when creating the stream.
+
+        Requires `torch` package to be installed.
 
         Args:
-
+            shuffle: Controls sample shuffling. If None, no shuffling is performed.
+            batch_readahead: Controls how many batches to read ahead concurrently.
+                If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+                Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
+            asof: If provided, only data written before the given timestamp will be returned.
+                If `index` is provided, it must not be used. The index's `asof` will be used instead.
+            infinite: If True, the returned IterableDataset will loop infinitely over the data,
+                re-shuffling ranges after exhausting all data.
+            index: Optional prebuilt KeysIndex to use when creating the stream.
+                The index's `asof` will be used when scanning.
             **kwargs: Additional arguments passed to the PyTorch DataLoader constructor.
-
         """
-
-
-
-
-        # Keep it in kwargs for DataLoader
-        dataset_kwargs["batch_size"] = kwargs["batch_size"]
-        if "cache_limit" in kwargs:
-            dataset_kwargs["cache_limit"] = kwargs.pop("cache_limit")
-        if "sampling_method" in kwargs:
-            dataset_kwargs["sampling_method"] = kwargs.pop("sampling_method")
-        if "sampling_granularity" in kwargs:
-            dataset_kwargs["sampling_granularity"] = kwargs.pop("sampling_granularity")
-        if "partition_algo" in kwargs:
-            dataset_kwargs["partition_algo"] = kwargs.pop("partition_algo")
-        if "num_canonical_nodes" in kwargs:
-            dataset_kwargs["num_canonical_nodes"] = kwargs.pop("num_canonical_nodes")
-        if "shuffle" in kwargs:
-            dataset_kwargs["shuffle"] = kwargs.pop("shuffle")
-        if "shuffle_algo" in kwargs:
-            dataset_kwargs["shuffle_algo"] = kwargs.pop("shuffle_algo")
-        if "shuffle_seed" in kwargs:
-            dataset_kwargs["shuffle_seed"] = kwargs.pop("shuffle_seed")
-        if "shuffle_block_size" in kwargs:
-            dataset_kwargs["shuffle_block_size"] = kwargs.pop("shuffle_block_size")
-        if "batching_method" in kwargs:
-            dataset_kwargs["batching_method"] = kwargs.pop("batching_method")
-        if "replication" in kwargs:
-            dataset_kwargs["replication"] = kwargs.pop("replication")
-
-        dataset = self.to_streaming_dataset(index=index, **dataset_kwargs)
-
-        return StreamingDataLoader(dataset=dataset, **kwargs)
-
-    def to_streaming_dataset(
+        # TODO(marko): WIP.
+        raise NotImplementedError
+
+    def to_streaming(
         self,
-        *,
         index: "KeySpaceIndex",
-
+        *,
+        projection: Expr | None = None,
         cache_dir: str | None = None,
-
-
-        sampling_method: str = "balanced",
-        sampling_granularity: int = 1,
-        partition_algo: str = "relaxed",
-        num_canonical_nodes: int | None = None,
-        shuffle: bool = False,
-        shuffle_algo: str = "py1e",
-        shuffle_seed: int = 9176,
-        shuffle_block_size: int | None = None,
-        batching_method: str = "random",
-        replication: int | None = None,
-    ) -> "streaming.StreamingDataset":
-        """Returns a MosaicML's StreamingDataset that can be used for distributed training.
-
-        Requires `streaming` package to be installed.
-
-        Args:
-            See `streaming` method for `index` arg.
-            See MosaicML's `StreamingDataset` for other args.
-
-        This is a helper method to construct a single stream dataset from the scan. When multiple streams are combined,
-        use `to_stream` to get the SpiralStream and construct the StreamingDataset manually using a `streams` arg.
-        """
-        from streaming import StreamingDataset
-
-        stream = self.to_streaming(index=index, cache_dir=cache_dir)
-
-        return StreamingDataset(
-            streams=[stream],
-            batch_size=batch_size,
-            cache_limit=cache_limit,
-            predownload=predownload,
-            sampling_method=sampling_method,
-            sampling_granularity=sampling_granularity,
-            partition_algo=partition_algo,
-            num_canonical_nodes=num_canonical_nodes,
-            shuffle=shuffle,
-            shuffle_algo=shuffle_algo,
-            shuffle_seed=shuffle_seed,
-            shuffle_block_size=shuffle_block_size,
-            batching_method=batching_method,
-            replication=replication,
-        )
-
-    def to_streaming(self, index: "KeySpaceIndex", *, cache_dir: str | None = None) -> "streaming.Stream":
+        shard_row_block_size: int | None = None,
+    ) -> "streaming.Stream":
         """Returns a stream to be used with MosaicML's StreamingDataset.
 
         Requires `streaming` package to be installed.
 
         Args:
-            index: Prebuilt KeysIndex to use when creating the stream.
+            index: Prebuilt KeysIndex to use when creating the stream.
+                The index's `asof` will be used when scanning.
+            projection: Optional projection to use when scanning the table if index's projection is not used.
+                Projection must be compatible with the index's projection for correctness.
             cache_dir: Directory to use for caching data. If None, a temporary directory will be used.
+            shard_row_block_size: Number of rows per segment of a shard file. Defaults to 8192.
+                Value should be set to lower for larger rows.
         """
        from spiral.streaming_ import SpiralStream
 
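In 0.6.4 the new `to_iterable_dataset` body is still a stub (`# TODO(marko): WIP.` followed by `raise NotImplementedError`), but the signature and docstring, together with the new `spiral/iterable_dataset.py` module in the RECORD, pin down the intended API. A sketch of the target call shape, assuming the eventual return value behaves like a standard `torch.utils.data.IterableDataset` (`table` and `my_index` are illustrative names):

```python
from torch.utils.data import DataLoader

# Target call shape per the new signature; calling this in 0.6.4 raises
# NotImplementedError, so this is a sketch of the intended API only.
ds = table.to_iterable_dataset(
    index=my_index,      # optional KeySpaceIndex; its `asof` is used when scanning
    shuffle=None,        # a ShuffleStrategy, or None for no shuffling
    batch_readahead=2,   # documented default; raise it if decode/transform work follows
    infinite=True,       # loop over the data forever, re-shuffling ranges per pass
)

# The docstring promises multiprocessing compatibility via `num_workers`,
# i.e. the usual DataLoader wiring for an IterableDataset.
loader = DataLoader(ds, batch_size=None, num_workers=4)
```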
@@ -258,7 +210,7 @@ class Table(Expr):
 
         # We know table from projection is in the session cause this method is on it.
         scan = self.spiral.scan(
-            index.projection,
+            projection if projection is not None else index.projection,
             where=index.filter,
             asof=index.asof,
             # TODO(marko): This should be configurable?
@@ -269,4 +221,9 @@ class Table(Expr):
         # We have a world there and can compute shards only on leader.
         shards = self.spiral._core._ops().compute_shards(index=index.core)
 
-        return SpiralStream(
+        return SpiralStream(
+            scan=scan.core,
+            shards=shards,
+            cache_dir=cache_dir,
+            shard_row_block_size=shard_row_block_size,
+        )  # type: ignore[return-value]
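`to_streaming` now takes `index` positionally, with keyword-only `projection` (which must stay compatible with the index's projection) and `shard_row_block_size` (rows per shard-file segment, default 8192). With `to_streaming_dataset` and the DataLoader wrapper removed, the path the old docstring already recommended, constructing MosaicML's `StreamingDataset` manually from a `streams` list, is now the only one. A sketch assuming a prebuilt `KeySpaceIndex` named `my_index` (illustrative) and the `streaming` package installed:

```python
from streaming import StreamingDataset

# Build a SpiralStream; arguments mirror the new 0.6.4 signature.
stream = table.to_streaming(
    my_index,
    projection=None,            # fall back to the index's own projection
    cache_dir="/tmp/spiral",    # None would use a temporary directory
    shard_row_block_size=1024,  # below the 8192 default, e.g. for large rows
)

# Combine one or more streams into a StreamingDataset for distributed training.
dataset = StreamingDataset(streams=[stream], batch_size=32, shuffle=True)
```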