pyspiral-0.2.4-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.3.1-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,44 @@
  import numpy as np
  import pyarrow as pa

- from spiral.expressions.base import ExprLike
+ from spiral.expressions.base import Expr, ExprLike
  from spiral.expressions.udf import RefUDF

+ _TIFF_RES_DTYPE: pa.DataType = pa.struct(
+     [
+         pa.field("pixels", pa.large_binary()),
+         pa.field("height", pa.uint32()),
+         pa.field("width", pa.uint32()),
+         pa.field("channels", pa.uint8()),
+         pa.field("channel_bit_depth", pa.uint8()),
+     ]
+ )
+

  def read(
      expr: ExprLike,
-     indexes: ExprLike | int | list[int] | None = None,
+     indexes: ExprLike | int | None = None,
      window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
      boundless: ExprLike | bool | None = None,
- ):
+ ) -> Expr:
      """
      Read referenced cell in a `TIFF` format. Requires `rasterio` to be installed.

      Args:
          expr: The referenced `TIFF` bytes.
-         indexes: The band indexes to read. Defaults to first band. The first dimension of the result's `shape` field
-             is either 1 or the number of indexes.
+         indexes: The band indexes to read. Defaults to all.
          window: The window to read. In format (row_range_tuple, col_range_tuple). Defaults to full window.
          boundless: If `True`, windows that extend beyond the dataset's extent
              are permitted and partially or completely filled arrays will be returned as appropriate.

      Returns:
-         An array where each element is a NumPy array represented as a struct with fields:
-             bytes: Array bytes with type `pa.large_binary()`.
-             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
-             dtype: String representation of NumPy dtype with type `pa.string()`.
-
-     Example:
-         A way to get the i-th element in the result as NumPy array:
-
-         ```
-         array: np.ndarray = np.frombuffer(
-             result["bytes"][i].as_py(),
-             dtype=np.dtype(result["dtype"][i].as_py()),
-         ).reshape(tuple(result["shape"][i].as_py()))
-         ```
+         An array where each element is a decoded image with fields:
+             pixels: bytes of shape (channels, width, height).
+             width: Width of the image with type `pa.uint32()`.
+             height: Height of the image with type `pa.uint32()`.
+             channels: Number of channels of the image with type `pa.uint8()`.
+                 If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
+             channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.
      """
      try:
          import rasterio  # noqa: F401
@@ -46,55 +48,42 @@ def read(
      return TiffReadUDF()(expr, indexes, window, boundless)


- def crop(
+ def select(
      expr: ExprLike,
-     shape: ExprLike,
- ):
+     shape: ExprLike | dict,
+     indexes: ExprLike | int | None = None,
+ ) -> Expr:
      """
-     Crop shapes out of the referenced cell in a `TIFF` format. Requires `rasterio` to be installed.
+     Select the shape out of the referenced cell in a `TIFF` format. Requires `rasterio` to be installed.

      Args:
          expr: The referenced `TIFF` bytes.
          shape: [GeoJSON-like](https://geojson.org/) shape.
+         indexes: The band indexes to read. Defaults to all.

      Returns:
-         An array where each element is a NumPy array represented as a struct with fields:
-             bytes: Array bytes with type `pa.large_binary()`.
-             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
-             dtype: String representation of NumPy dtype with type `pa.string()`.
-
-     Example:
-         A way to get the i-th element in the result as NumPy array:
-
-         ```
-         array: np.ndarray = np.frombuffer(
-             result["bytes"][i].as_py(),
-             dtype=np.dtype(result["dtype"][i].as_py()),
-         ).reshape(tuple(result["shape"][i].as_py()))
-         ```
+         An array where each element is a decoded image with fields:
+             pixels: bytes of shape (len(indexes) or 1, width, height).
+             width: Width of the image with type `pa.uint32()`.
+             height: Height of the image with type `pa.uint32()`.
+             channels: Number of channels of the image with type `pa.uint8()`.
+                 If `indexes` is not None, this is the length of `indexes` or 1 if `indexes` is an int.
+             channel_bit_depth: Bit depth of the channel with type `pa.uint8()`.
      """
      try:
          import rasterio  # noqa: F401
      except ImportError:
-         raise ImportError("`rasterio` is required for tiff.crop")
+         raise ImportError("`rasterio` is required for tiff.select")

-     return TiffCropUDF()(expr, shape)
+     return TiffSelectUDF()(expr, shape, indexes)


  class TiffReadUDF(RefUDF):
-     RES_DTYPE: pa.DataType = pa.struct(
-         [
-             pa.field("bytes", pa.large_binary()),
-             pa.field("shape", pa.list_(pa.uint32(), 3)),
-             pa.field("dtype", pa.string()),
-         ]
-     )
-
      def __init__(self):
          super().__init__("tiff.read")

      def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-         return TiffReadUDF.RES_DTYPE
+         return _TIFF_RES_DTYPE

      def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
          try:
@@ -130,65 +119,76 @@ class TiffReadUDF(RefUDF):
              # This matters more if we want to rewrite this function to work with multiple inputs at once, in which
              # case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
              result: np.ndarray = src.read(indexes=indexes, window=window)
-             return pa.array(
-                 [
-                     {
-                         "bytes": result.tobytes(),
-                         "shape": list(result.shape),
-                         "dtype": str(result.dtype),
-                     }
-                 ],
-                 type=TiffReadUDF.RES_DTYPE,
-             )
-
-
- class TiffCropUDF(RefUDF):
-     RES_DTYPE: pa.DataType = pa.struct(
-         [
-             pa.field("bytes", pa.large_binary()),
-             pa.field("shape", pa.list_(pa.uint32()), 3),
-             pa.field("dtype", pa.string()),
-         ]
-     )
+             return _return_result(result, indexes)
+

+ class TiffSelectUDF(RefUDF):
      def __init__(self):
-         super().__init__("tiff.crop")
+         super().__init__("tiff.select")

      def return_type(self, *input_types: pa.DataType) -> pa.DataType:
-         return TiffCropUDF.RES_DTYPE
+         return _TIFF_RES_DTYPE

      def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
          try:
              import rasterio
          except ImportError:
-             raise ImportError("`rasterio` is required for tiff.crop")
+             raise ImportError("`rasterio` is required for tiff.select")

-         from rasterio.mask import mask as rio_mask
+         from rasterio.mask import raster_geometry_mask

-         if len(input_args) != 2:
-             raise ValueError("tiff.crop expects exactly 2 arguments: expr, shape")
+         if len(input_args) != 3:
+             raise ValueError("tiff.select expects exactly 3 arguments: expr, shape, indexes")

-         _, shape = input_args
+         _, shape, indexes = input_args

          shape = shape[0].as_py()
          if shape is None:
-             raise ValueError("tiff.crop expects shape to be a GeoJSON-like shape")
+             raise ValueError("tiff.select expects shape to be a GeoJSON-like shape")
+
+         indexes = indexes[0].as_py()
+         if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
+             raise ValueError(f"tiff.select expects indexes to be None or an int or a list, got {indexes}")

          opener = _VsiOpener(fp)
          with rasterio.open("ref", opener=opener) as src:
              src: rasterio.DatasetReader
-             result, _ = rio_mask(src, shapes=[shape], crop=True)
-             result: np.ndarray
-             return pa.array(
-                 [
-                     {
-                         "bytes": result.tobytes(),
-                         "shape": list(result.shape),
-                         "dtype": str(result.dtype),
-                     }
-                 ],
-                 type=TiffCropUDF.RES_DTYPE,
-             )
+
+             shape_mask, _, window = raster_geometry_mask(src, [shape], crop=True)
+             out_shape = (src.count,) + shape_mask.shape
+
+             result: np.ndarray = src.read(window=window, indexes=indexes, out_shape=out_shape, masked=True)
+             return _return_result(result, indexes)
+
+
+ def _return_result(result: np.ndarray, indexes) -> pa.Array:
+     channels = result.shape[0]
+     if indexes is None:
+         pass
+     elif isinstance(indexes, int):
+         assert channels == 1, f"Expected 1 channel, got {channels}"
+     else:
+         assert channels == len(indexes), f"Expected {len(indexes)} channels, got {channels}"
+
+     if result.dtype == np.uint8:
+         channel_bit_depth = 8
+     elif result.dtype == np.uint16:
+         channel_bit_depth = 16
+     else:
+         raise ValueError(f"Unsupported bit width: {result.dtype}")
+
+     return pa.array(
+         [
+             {
+                 "pixels": result.tobytes(),
+                 "height": result.shape[1],
+                 "width": result.shape[2],
+                 "channels": channels,
+                 "channel_bit_depth": channel_bit_depth,
+             }
+         ],
+         type=_TIFF_RES_DTYPE,
+     )


  class _VsiOpener:
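The old docstrings carried an example for rebuilding a NumPy array from the `bytes`/`shape`/`dtype` struct; the new struct has no equivalent. Below is a sketch of the same round trip against the new fields, following `_return_result` above (which stores `height = result.shape[1]`, `width = result.shape[2]`, and a `channel_bit_depth` of 8 or 16); the helper name is illustrative, not part of the package.

```
import numpy as np

def to_numpy(result, i: int) -> np.ndarray:
    # result is the pa.Array returned by tiff.read() / tiff.select().
    row = result[i].as_py()
    dtype = np.uint8 if row["channel_bit_depth"] == 8 else np.uint16
    return np.frombuffer(row["pixels"], dtype=dtype).reshape(
        (row["channels"], row["height"], row["width"])
    )
```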
spiral/maintenance.py ADDED
@@ -0,0 +1,12 @@
+ from spiral.core.core import TableMaintenance
+
+
+ class Maintenance:
+     """Spiral table maintenance."""
+
+     def __init__(self, maintenance: TableMaintenance):
+         self._maintenance = maintenance
+
+     def flush_wal(self):
+         """Flush the write-ahead log."""
+         self._maintenance.flush_wal()
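A one-line usage sketch, assuming a `TableMaintenance` handle is available from the core bindings (how it is obtained is not shown in this diff; the variable name is hypothetical):

```
maintenance = Maintenance(core_table_maintenance)  # core_table_maintenance is hypothetical
maintenance.flush_wal()  # flush the write-ahead log
```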
@@ -30,6 +30,11 @@ class Source(betterproto.Message):
      parquet: "MetadataParquet" = betterproto.message_field(10, group="metadata")


+ @dataclass(eq=False, repr=False)
+ class Sink(betterproto.Message):
+     url: str = betterproto.string_field(1)
+
+
  @dataclass(eq=False, repr=False)
  class Fetch(betterproto.Message):
      """Let's make "fetch" happen."""
@@ -39,15 +44,24 @@ class Fetch(betterproto.Message):


  @dataclass(eq=False, repr=False)
  class FetchRequest(betterproto.Message):
+     """TODO(ngates): include projection expression."""
+
      uri: str = betterproto.string_field(1)
      """
-     A signed request to read an spfs://<fsid>/path?token=<jwt> URI.
-     * Declares the MIME types the client can read directly.
-     * Declares whether the client has connectivity to the FileSystem.
+     A signed request to read an
+     spfs://<fsid>/path?token=<jwt> URI.
      """

-     connectivity: "Connectivity" = betterproto.message_field(2)
-     accepts: List[str] = betterproto.string_field(3)
+     headers: Dict[str, str] = betterproto.map_field(
+         2, betterproto.TYPE_STRING, betterproto.TYPE_STRING
+     )
+     """Custom headers to sign into the request."""
+
+     connectivity: "Connectivity" = betterproto.message_field(3)
+     """Declares whether the client has connectivity to the FileSystem."""
+
+     accepts: List[str] = betterproto.string_field(4)
+     """Declares the MIME types the client can read directly."""


  @dataclass(eq=False, repr=False)
@@ -59,11 +73,6 @@ class FetchResponse(betterproto.Message):
      """


- @dataclass(eq=False, repr=False)
- class Sink(betterproto.Message):
-     url: str = betterproto.string_field(1)
-
-
  @dataclass(eq=False, repr=False)
  class Put(betterproto.Message):
      pass
@@ -72,7 +81,10 @@ class Put(betterproto.Message):
  @dataclass(eq=False, repr=False)
  class PutRequest(betterproto.Message):
      uri: str = betterproto.string_field(1)
-     connectivity: "Connectivity" = betterproto.message_field(2)
+     headers: Dict[str, str] = betterproto.map_field(
+         2, betterproto.TYPE_STRING, betterproto.TYPE_STRING
+     )
+     connectivity: "Connectivity" = betterproto.message_field(3)


  @dataclass(eq=False, repr=False)
@@ -80,6 +92,25 @@ class PutResponse(betterproto.Message):
      sinks: List["Sink"] = betterproto.message_field(1)


+ @dataclass(eq=False, repr=False)
+ class Head(betterproto.Message):
+     pass
+
+
+ @dataclass(eq=False, repr=False)
+ class HeadRequest(betterproto.Message):
+     uri: str = betterproto.string_field(1)
+     headers: Dict[str, str] = betterproto.map_field(
+         2, betterproto.TYPE_STRING, betterproto.TYPE_STRING
+     )
+
+
+ @dataclass(eq=False, repr=False)
+ class HeadResponse(betterproto.Message):
+     url: str = betterproto.string_field(1)
+     """Returns signed URL to head the resource."""
+
+
  @dataclass(eq=False, repr=False)
  class Delete(betterproto.Message):
      pass
@@ -88,6 +119,9 @@ class Delete(betterproto.Message):
  @dataclass(eq=False, repr=False)
  class DeleteRequest(betterproto.Message):
      uri: str = betterproto.string_field(1)
+     headers: Dict[str, str] = betterproto.map_field(
+         2, betterproto.TYPE_STRING, betterproto.TYPE_STRING
+     )


  @dataclass(eq=False, repr=False)
@@ -151,6 +185,23 @@ class ScandalServiceStub(betterproto.ServiceStub):
              metadata=metadata,
          )

+     async def head(
+         self,
+         head_request: "HeadRequest",
+         *,
+         timeout: Optional[float] = None,
+         deadline: Optional["Deadline"] = None,
+         metadata: Optional["MetadataLike"] = None
+     ) -> "HeadResponse":
+         return await self._unary_unary(
+             "/scandal.ScandalService/Head",
+             head_request,
+             HeadResponse,
+             timeout=timeout,
+             deadline=deadline,
+             metadata=metadata,
+         )
+
      async def delete(
          self,
          delete_request: "DeleteRequest",
@@ -176,6 +227,9 @@ class ScandalServiceBase(ServiceBase):
      async def put(self, put_request: "PutRequest") -> "PutResponse":
          raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED)

+     async def head(self, head_request: "HeadRequest") -> "HeadResponse":
+         raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED)
+
      async def delete(self, delete_request: "DeleteRequest") -> "DeleteResponse":
          raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED)

@@ -193,6 +247,13 @@ class ScandalServiceBase(ServiceBase):
          response = await self.put(request)
          await stream.send_message(response)

+     async def __rpc_head(
+         self, stream: "grpclib.server.Stream[HeadRequest, HeadResponse]"
+     ) -> None:
+         request = await stream.recv_message()
+         response = await self.head(request)
+         await stream.send_message(response)
+
      async def __rpc_delete(
          self, stream: "grpclib.server.Stream[DeleteRequest, DeleteResponse]"
      ) -> None:
@@ -214,6 +275,12 @@ class ScandalServiceBase(ServiceBase):
                  PutRequest,
                  PutResponse,
              ),
+             "/scandal.ScandalService/Head": grpclib.const.Handler(
+                 self.__rpc_head,
+                 grpclib.const.Cardinality.UNARY_UNARY,
+                 HeadRequest,
+                 HeadResponse,
+             ),
              "/scandal.ScandalService/Delete": grpclib.const.Handler(
                  self.__rpc_delete,
                  grpclib.const.Cardinality.UNARY_UNARY,
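For context, the generated stub gains a unary `Head` RPC alongside `Put` and `Delete`. A minimal client sketch using grpclib, assuming the generated classes above are importable and using placeholder host, port, and URI values:

```
import asyncio

from grpclib.client import Channel

async def head_example() -> None:
    # Placeholder endpoint; the real address comes from SpiralDBSettings.uri.
    channel = Channel(host="scandal.example.com", port=443, ssl=True)
    try:
        stub = ScandalServiceStub(channel)
        response = await stub.head(
            HeadRequest(uri="spfs://<fsid>/path?token=<jwt>", headers={"x-example": "1"})
        )
        print(response.url)  # signed URL to HEAD the resource
    finally:
        channel.close()

asyncio.run(head_example())
```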
@@ -152,6 +152,12 @@ class FragmentSetWriteOp(betterproto.Message):
      key_span: "KeySpan" = betterproto.message_field(5)
      key_extent: "KeyExtent" = betterproto.message_field(6)
      column_ids: List[str] = betterproto.string_field(7)
+     data_ts: Optional[int] = betterproto.uint64_field(8, optional=True)
+     """
+     Timestamp of the data in the fragments.
+     Used as committed_ts for files in the manifest.
+     If not present, timestamp of the operation is used.
+     """


  @dataclass(eq=False, repr=False)
@@ -175,8 +181,53 @@ class SchemaBreakOp(betterproto.Message):

  @dataclass(eq=False, repr=False)
  class CompactKeySpaceOp(betterproto.Message):
-     from_ks_ids: List[str] = betterproto.string_field(1)
-     into_ks_ids: List[str] = betterproto.string_field(2)
+     results: List["CompactKeySpaceResult"] = betterproto.message_field(1)
+
+
+ @dataclass(eq=False, repr=False)
+ class CompactKeySpaceResult(betterproto.Message):
+     """
+     TODO(marko): Do we really need to know all of this? UpdateKeySpaceOp?
+     """
+
+     ks_id: str = betterproto.string_field(1)
+     compacted: "CompactKeySpaceResultCompacted" = betterproto.message_field(
+         2, group="action"
+     )
+     """Key space has been compacted."""
+
+     created: "CompactKeySpaceResultCreated" = betterproto.message_field(
+         3, group="action"
+     )
+     """New output key space has been created."""
+
+     moved: "CompactKeySpaceResultMoved" = betterproto.message_field(4, group="action")
+     """Key space has been promoted to L1."""
+
+     extended: "CompactKeySpaceResultExtended" = betterproto.message_field(
+         5, group="action"
+     )
+     """Key space has been extended with new key files."""
+
+
+ @dataclass(eq=False, repr=False)
+ class CompactKeySpaceResultCompacted(betterproto.Message):
+     pass
+
+
+ @dataclass(eq=False, repr=False)
+ class CompactKeySpaceResultCreated(betterproto.Message):
+     pass
+
+
+ @dataclass(eq=False, repr=False)
+ class CompactKeySpaceResultMoved(betterproto.Message):
+     pass
+
+
+ @dataclass(eq=False, repr=False)
+ class CompactKeySpaceResultExtended(betterproto.Message):
+     pass


  @dataclass(eq=False, repr=False)
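The reworked `CompactKeySpaceOp` now carries a list of per-key-space results, each tagged with one member of the `action` oneof group. A small sketch of building and inspecting one with betterproto, assuming the generated classes above are imported from this module (the `ks_id` values are placeholders):

```
import betterproto

op = CompactKeySpaceOp(
    results=[
        CompactKeySpaceResult(ks_id="ks-123", moved=CompactKeySpaceResultMoved()),
        CompactKeySpaceResult(ks_id="ks-456", extended=CompactKeySpaceResultExtended()),
    ]
)

# betterproto tracks which member of the "action" oneof group is set.
for result in op.results:
    action_name, _ = betterproto.which_one_of(result, "action")
    print(result.ks_id, action_name)  # e.g. "ks-123 moved"
```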
spiral/scan_.py CHANGED
@@ -13,6 +13,8 @@ if TYPE_CHECKING:
      import dask.dataframe as dd
      import pandas as pd
      import polars as pl
+     import pyarrow
+     import pyarrow.dataset
      from datasets import iterable_dataset

  tracer = trace.get_tracer("pyspiral.client.scan")
@@ -23,8 +25,6 @@ def scan(
      where: ExprLike | None = None,
      asof: datetime | int | str = None,
      exclude_keys: bool = False,
-     # TODO(marko): Support config.
-     # config: Config | None = None,
  ) -> "Scan":
      """Starts a read transaction on the spiral.

@@ -33,6 +33,7 @@
          where: a query expression to apply to the data.
          asof: only data written before the given timestamp will be returned, caveats around compaction.
          exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
+             Note that if a projection includes a key column, it will be included in the result.
      """
      from spiral import expressions as se

@@ -58,8 +59,6 @@ class Scan:
      def __init__(
          self,
          scan: TableScan,
-         # TODO(marko): Support config.
-         # config: Config | None = None,
      ):
          # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
          # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
@@ -84,27 +83,57 @@ class Scan:
          """
          return self._scan.is_empty()

-     def to_record_batches(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> pa.RecordBatchReader:
+     def to_dataset(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+     ) -> "pyarrow.dataset.Dataset":
+         """Returns a PyArrow Dataset representing the scan.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+         """
+         from .dataset import ScanDataset
+
+         return ScanDataset(self, key_table=key_table)
+
+     def to_record_batches(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+         batch_size: int | None = None,
+         batch_readahead: int | None = None,
+     ) -> pa.RecordBatchReader:
          """Read as a stream of RecordBatches.

          Args:
              key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+             batch_size: the maximum number of rows per returned batch.
+                 IMPORTANT: This is currently only respected when the key_table is used. If key table is a
+                 RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
+             batch_readahead: the number of batches to prefetch in the background.
          """
          if isinstance(key_table, pa.RecordBatchReader):
-             raise NotImplementedError("RecordBatchReader is not supported as key_table")
+             if batch_size is not None:
+                 raise ValueError(
+                     "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
+                 )
+         elif isinstance(key_table, pa.Table):
+             key_table = key_table.to_reader(max_chunksize=batch_size)

-         # Prefix non-key columns in the key table with # (auxiliary) to avoid conflicts with the scan schema.
-         if key_table is not None:
-             key_columns = list(self._scan.key_schema().to_arrow().names)
-             key_table = key_table.rename_columns(
-                 {name: f"#{name}" if name not in key_columns else name for name in key_table.schema.names}
-             )
+         return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)

-         return self._scan.to_record_batches(aux_table=key_table)
+     def to_table(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+     ) -> pa.Table:
+         """Read into a single PyArrow Table.

-     def to_table(self) -> pa.Table:
-         """Read into a single PyArrow Table."""
-         return self.to_record_batches().read_all()
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+         """
+         return self.to_record_batches(key_table=key_table).read_all()

      def to_dask(self) -> "dd.DataFrame":
          """Read into a Dask DataFrame.
@@ -121,32 +150,54 @@ class Scan:
          # Fetch a set of partition ranges
          return dd.from_map(_read_key_range, self.split())

-     def to_pandas(self) -> "pd.DataFrame":
+     def to_pandas(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+     ) -> "pd.DataFrame":
          """Read into a Pandas DataFrame.

          Requires the `pandas` package to be installed.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
          """
-         return self.to_table().to_pandas()
+         return self.to_table(key_table=key_table).to_pandas()

-     def to_polars(self) -> "pl.DataFrame":
-         """Read into a Polars DataFrame.
+     def to_polars(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> "pl.LazyFrame":
+         """Read into a Polars LazyFrame.

          Requires the `polars` package to be installed.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
          """
          import polars as pl

-         # TODO(ngates): PR PyArrow to support lazy datasets
-         return pl.from_arrow(self.to_record_batches())
+         return pl.scan_pyarrow_dataset(self.to_dataset(key_table=key_table))

-     def to_pytorch(self) -> "iterable_dataset.IterableDataset":
+     def to_pytorch(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+         batch_readahead: int | None = None,
+     ) -> "iterable_dataset.IterableDataset":
          """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.

          Requires the `datasets` package to be installed.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+             batch_readahead: the number of batches to prefetch in the background.
          """
          from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

          def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-             stream = self.to_record_batches()
+             # Use batch size 1 when iterating samples, unless batch reader is already used.
+             stream = self.to_record_batches(
+                 key_table, batch_size=1 if isinstance(key_table, pa.Table) else None, batch_readahead=batch_readahead
+             )

              # This key is unused when training with IterableDataset.
              # Default implementation returns shard id, e.g. parquet row group id.
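For reference, the reworked `Scan` API threads `key_table` through every reader and routes Polars through the new dataset wrapper. A minimal usage sketch, assuming `sp_scan` is a `Scan` returned by `scan(...)` and that `"key"` matches the table's key schema (both placeholders):

```
import pyarrow as pa

# Hypothetical key table; extra non-key columns act as auxiliary
# (cell-push-down) columns.
keys = pa.table({"key": [1, 2, 3]})

# Stream record batches, re-chunking the key table into 512-row batches.
reader = sp_scan.to_record_batches(key_table=keys, batch_size=512, batch_readahead=2)

# to_polars() now builds on the PyArrow Dataset wrapper and returns a
# LazyFrame, so collect() is needed to materialize the result.
df = sp_scan.to_polars(key_table=keys).collect()
```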
spiral/settings.py CHANGED
@@ -29,6 +29,8 @@ from spiral.authn.github_ import GitHubActionsProvider
  from spiral.authn.modal_ import ModalProvider

  DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
+ FILE_FORMAT = os.environ.get("SPIRAL_FILE_FORMAT", "parquet")
+
  APP_DIR = Path(typer.get_app_dir("pyspiral"))
  LOG_DIR = APP_DIR / "logs"
  CONFIG_FILE = APP_DIR / "config.toml"
@@ -67,6 +69,10 @@ class SpiralDBSettings(BaseSettings):
          # TODO(marko): Scandal will be a different service. For now, gRPC API is hosted on the SpiralDB service.
          return f"{'grpc+tls' if self.ssl else 'grpc'}://{self.host}:{self.port}"

+     @property
+     def uri_iceberg(self) -> str:
+         return self.uri + "/iceberg"
+
      def device_auth(self) -> DeviceAuth:
          auth_file = (
              APP_DIR / hashlib.md5(f"{self.auth.domain}/{self.auth.client_id}".encode()).hexdigest() / "auth.json"
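A small sketch of the new settings surface: `FILE_FORMAT` is read from `SPIRAL_FILE_FORMAT` once at import time (defaulting to "parquet"), and `uri_iceberg` derives from the existing `uri` property. This assumes `SpiralDBSettings` accepts `host`, `port`, and `ssl` as constructor fields; the values below are placeholders.

```
import os

# Unset, SPIRAL_FILE_FORMAT falls back to "parquet" at import time.
print(os.environ.get("SPIRAL_FILE_FORMAT", "parquet"))

settings = SpiralDBSettings(host="spiraldb.example.com", port=443, ssl=True)
print(settings.uri)          # grpc+tls://spiraldb.example.com:443
print(settings.uri_iceberg)  # grpc+tls://spiraldb.example.com:443/iceberg
```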