pyspiral-0.3.1-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.4.1-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/METADATA +9 -13
  2. pyspiral-0.4.1.dist-info/RECORD +98 -0
  3. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +6 -9
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +21 -14
  7. spiral/api/__init__.py +14 -175
  8. spiral/api/admin.py +12 -26
  9. spiral/api/client.py +160 -0
  10. spiral/api/filesystems.py +100 -72
  11. spiral/api/organizations.py +45 -58
  12. spiral/api/projects.py +171 -134
  13. spiral/api/telemetry.py +19 -0
  14. spiral/api/types.py +20 -0
  15. spiral/api/workloads.py +32 -25
  16. spiral/{arrow.py → arrow_.py} +12 -0
  17. spiral/cli/__init__.py +2 -5
  18. spiral/cli/admin.py +7 -12
  19. spiral/cli/app.py +23 -6
  20. spiral/cli/console.py +1 -1
  21. spiral/cli/fs.py +82 -17
  22. spiral/cli/iceberg/__init__.py +7 -0
  23. spiral/cli/iceberg/namespaces.py +47 -0
  24. spiral/cli/iceberg/tables.py +60 -0
  25. spiral/cli/indexes/__init__.py +19 -0
  26. spiral/cli/login.py +14 -5
  27. spiral/cli/orgs.py +90 -0
  28. spiral/cli/printer.py +9 -1
  29. spiral/cli/projects.py +136 -0
  30. spiral/cli/state.py +2 -0
  31. spiral/cli/tables/__init__.py +121 -0
  32. spiral/cli/telemetry.py +18 -0
  33. spiral/cli/types.py +8 -10
  34. spiral/cli/{workload.py → workloads.py} +11 -11
  35. spiral/{catalog.py → client.py} +23 -37
  36. spiral/core/client/__init__.pyi +117 -0
  37. spiral/core/index/__init__.pyi +15 -0
  38. spiral/core/{core → table}/__init__.pyi +44 -17
  39. spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
  40. spiral/core/table/metastore/__init__.pyi +62 -0
  41. spiral/core/{spec → table/spec}/__init__.pyi +41 -66
  42. spiral/datetime_.py +27 -0
  43. spiral/expressions/__init__.py +26 -18
  44. spiral/expressions/base.py +5 -5
  45. spiral/expressions/list_.py +1 -1
  46. spiral/expressions/mp4.py +2 -9
  47. spiral/expressions/png.py +1 -1
  48. spiral/expressions/qoi.py +1 -1
  49. spiral/expressions/refs.py +3 -9
  50. spiral/expressions/struct.py +7 -5
  51. spiral/expressions/text.py +62 -0
  52. spiral/expressions/udf.py +3 -3
  53. spiral/iceberg/__init__.py +3 -0
  54. spiral/iceberg/client.py +33 -0
  55. spiral/indexes/__init__.py +5 -0
  56. spiral/indexes/client.py +137 -0
  57. spiral/indexes/index.py +34 -0
  58. spiral/indexes/scan.py +22 -0
  59. spiral/project.py +19 -110
  60. spiral/{proto → protogen}/_/scandal/__init__.py +23 -135
  61. spiral/protogen/_/spiral/table/__init__.py +22 -0
  62. spiral/protogen/substrait/__init__.py +3399 -0
  63. spiral/protogen/substrait/extensions/__init__.py +115 -0
  64. spiral/server.py +17 -0
  65. spiral/settings.py +29 -91
  66. spiral/substrait_.py +9 -5
  67. spiral/tables/__init__.py +12 -0
  68. spiral/tables/client.py +130 -0
  69. spiral/{dataset.py → tables/dataset.py} +9 -199
  70. spiral/tables/debug/manifests.py +70 -0
  71. spiral/tables/debug/metrics.py +56 -0
  72. spiral/{debug.py → tables/debug/scan.py} +6 -9
  73. spiral/{maintenance.py → tables/maintenance.py} +1 -1
  74. spiral/{scan_.py → tables/scan.py} +63 -89
  75. spiral/tables/snapshot.py +78 -0
  76. spiral/{table.py → tables/table.py} +59 -73
  77. spiral/{txn.py → tables/transaction.py} +7 -3
  78. pyspiral-0.3.1.dist-info/RECORD +0 -85
  79. spiral/api/tables.py +0 -91
  80. spiral/api/tokens.py +0 -56
  81. spiral/authn/authn.py +0 -89
  82. spiral/authn/device.py +0 -206
  83. spiral/authn/github_.py +0 -33
  84. spiral/authn/modal_.py +0 -18
  85. spiral/cli/org.py +0 -90
  86. spiral/cli/project.py +0 -109
  87. spiral/cli/table.py +0 -20
  88. spiral/cli/token.py +0 -27
  89. spiral/core/metastore/__init__.pyi +0 -91
  90. spiral/proto/_/spfs/__init__.py +0 -36
  91. spiral/proto/_/spiral/table/__init__.py +0 -276
  92. spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
  93. spiral/proto/__init__.py +0 -0
  94. spiral/proto/scandal/__init__.py +0 -45
  95. spiral/proto/spiral/__init__.py +0 -0
  96. spiral/proto/spiral/table/__init__.py +0 -96
  97. {pyspiral-0.3.1.dist-info → pyspiral-0.4.1.dist-info}/entry_points.txt +0 -0
  98. /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
  99. /spiral/{core → protogen/_}/__init__.py +0 -0
  100. /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
  101. /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
  102. /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
  103. /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
  104. /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
  105. /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
  106. /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
  107. /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
  108. /spiral/{proto → protogen}/util.py +0 -0
  109. /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
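
Most of this release is a package reorganization: the table-related modules move under `spiral.tables`, the Rust bindings move from `spiral.core.core`/`spiral.core.spec`/`spiral.core.manifests` to `spiral.core.table.*`, and generated protobuf code moves from `spiral/proto` to `spiral/protogen`. As a quick orientation, the import paths shift roughly as follows; this is a sketch assembled from the renames above and the import hunks below, not an exhaustive map of the 0.4.1 public API.

# pyspiral 0.3.1 (old layout)
# from spiral import Scan, Table
# from spiral.core.core import TableScan
# from spiral.core.spec import Key, KeyRange

# pyspiral 0.4.1 (new layout, as used by the hunks below)
from spiral.tables import Scan, Snapshot       # spiral/tables/scan.py, spiral/tables/snapshot.py
from spiral.core.table import TableScan        # was spiral.core.core
from spiral.core.table.spec import Key         # was spiral.core.spec
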
spiral/{dataset.py → tables/dataset.py}
@@ -4,15 +4,14 @@ import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.dataset as ds
 
-from spiral import Scan, Table
+from spiral.tables import Scan, Snapshot
 
 
 class TableDataset(ds.Dataset):
-    def __init__(self, table: Table):
-        self._table = table
-        # Once table is converted to a dataset, used pinned snapshot.
-        self._asof = table.last_modified_at
-        self._schema: pa.Schema = table._table.get_schema(asof=self._asof).to_arrow()
+    def __init__(self, snapshot: Snapshot):
+        self._snapshot = snapshot
+        self._table = snapshot.table
+        self._schema: pa.Schema = self._snapshot._snapshot.table.get_schema(asof=self._snapshot.asof).to_arrow()
 
         # We don't actually initialize a Dataset, we just implement enough of the API
         # to fool both DuckDB and Polars.
@@ -104,23 +103,22 @@ class TableDataset(ds.Dataset):
         use_threads: bool = True,
         memory_pool: pa.MemoryPool = None,
     ) -> "TableScanner":
-        from .substrait_ import SubstraitConverter
+        from spiral.substrait_ import SubstraitConverter
 
         # Extract the substrait expression so we can convert it to a Spiral expression
         if filter is not None:
-            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema).convert(
+            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema.to_arrow()).convert(
                 filter.to_substrait(self._schema, allow_arrow_extensions=True),
             )
 
         scan = (
-            self._table.scan(
+            self._snapshot.scan(
                 {c: self._table[c] for c in columns},
                 where=filter,
                 exclude_keys=True,
-                asof=self._asof,
             )
             if columns
-            else self._table.scan(where=filter, asof=self._asof)
+            else self._snapshot.scan(where=filter)
         )
         self._last_scan = scan
 
@@ -250,191 +248,3 @@ class TableScanner(ds.Scanner):
 
     def to_table(self):
         return self.to_reader().read_all()
-
-
-class ScanDataset(ds.Dataset):
-    def __init__(
-        self,
-        scan: Scan,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ):
-        self._scan = scan
-        self._schema: pa.Schema = scan.schema.to_arrow()
-        self._key_table = key_table
-
-        # We don't actually initialize a Dataset, we just implement enough of the API
-        # to fool both DuckDB and Polars.
-        # super().__init__()
-
-    @property
-    def schema(self) -> pa.Schema:
-        return self._schema
-
-    def count_rows(
-        self,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            None,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).count_rows()
-
-    def filter(self, expression: pc.Expression) -> "TableDataset":
-        raise NotImplementedError("filter not implemented")
-
-    def get_fragments(self, filter: pc.Expression | None = None):
-        """TODO(ngates): perhaps we should return ranges as per our split API?"""
-        raise NotImplementedError("get_fragments not implemented")
-
-    def head(
-        self,
-        num_rows: int,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).head(num_rows)
-
-    def join(
-        self,
-        right_dataset,
-        keys,
-        right_keys=None,
-        join_type=None,
-        left_suffix=None,
-        right_suffix=None,
-        coalesce_keys=True,
-        use_threads=True,
-    ):
-        raise NotImplementedError("join not implemented")
-
-    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
-        raise NotImplementedError("join_asof not implemented")
-
-    def replace_schema(self, schema: pa.Schema) -> "TableDataset":
-        raise NotImplementedError("replace_schema not implemented")
-
-    def scanner(
-        self,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ) -> "TableScanner":
-        if columns is not None:
-            columns = set(columns)
-            names = set(self.schema.names)
-            if len(columns - names) != 0 or len(names - columns) != 0:
-                raise NotImplementedError("columns", columns, self.schema)
-        if filter is not None:
-            raise NotImplementedError("filter")
-        if batch_size is not None:
-            raise NotImplementedError("batch_size")
-        if batch_readahead is not None:
-            raise NotImplementedError("batch_readahead")
-        if fragment_readahead is not None:
-            raise NotImplementedError("fragment_readahead")
-        if fragment_scan_options is not None:
-            raise NotImplementedError("fragment_scan_options")
-
-        return TableScanner(self._scan, key_table=self._key_table)
-
-    def sort_by(self, sorting, **kwargs):
-        raise NotImplementedError("sort_by not implemented")
-
-    def take(
-        self,
-        indices: pa.Array | Any,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).take(indices)
-
-    def to_batches(
-        self,
-        columns: list[str] | None = None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).to_batches()
-
-    def to_table(
-        self,
-        columns=None,
-        filter: pc.Expression | None = None,
-        batch_size: int | None = None,
-        batch_readahead: int | None = None,
-        fragment_readahead: int | None = None,
-        fragment_scan_options: ds.FragmentScanOptions | None = None,
-        use_threads: bool = True,
-        memory_pool: pa.MemoryPool = None,
-    ):
-        return self.scanner(
-            columns,
-            filter,
-            batch_size,
-            batch_readahead,
-            fragment_readahead,
-            fragment_scan_options,
-            use_threads,
-            memory_pool,
-        ).to_table()
spiral/tables/debug/manifests.py (new file)
@@ -0,0 +1,70 @@
+from spiral import datetime_
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentManifest
+from spiral.tables.debug.metrics import _format_bytes
+
+
+def display_manifests(scan: TableScan):
+    """Display all manifests in a scan."""
+    if len(scan.table_ids()) != 1:
+        raise NotImplementedError("Multiple table scans are not supported.")
+    table_id = scan.table_ids()[0]
+
+    key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
+    _table_of_fragments(
+        key_space_manifest,
+        title="Key Space manifest",
+    )
+
+    for column_group in scan.column_groups():
+        column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
+        _table_of_fragments(
+            column_group_manifest,
+            title=f"Column Group manifest for {str(column_group)}",
+        )
+
+
+def _table_of_fragments(manifest: FragmentManifest, title: str):
+    """Display fragments in a formatted table."""
+    # Calculate summary statistics
+    total_size = sum(fragment.size_bytes for fragment in manifest)
+    total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
+    fragment_count = len(manifest)
+    avg_size = total_size / fragment_count if fragment_count > 0 else 0
+
+    # Print title and summary
+    print(f"\n\n{title}")
+    print(
+        f"{fragment_count} fragments, "
+        f"total: {_format_bytes(total_size)}, "
+        f"avg: {_format_bytes(int(avg_size))}, "
+        f"metadata: {_format_bytes(total_metadata_size)}"
+    )
+    print("=" * 120)
+
+    # Print header
+    print(
+        f"{'ID':<30} {'Size (Metadata)':<20} {'Format':<10} {'Key Span':<10} "
+        f"{'Level':<5} {'Committed At':<20} {'Compacted At':<20}"
+    )
+    print("=" * 120)
+
+    # Print each fragment
+    for fragment in manifest:
+        committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
+        compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"
+
+        size_with_metadata = (
+            f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
+        )
+        key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"
+
+        print(
+            f"{fragment.id:<30} "
+            f"{size_with_metadata:<20} "
+            f"{str(fragment.format):<10} "
+            f"{key_span:<10} "
+            f"{str(fragment.level):<5} "
+            f"{committed_str:<20} "
+            f"{compacted_str:<20}"
+        )
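
The new `display_manifests` helper takes the core `TableScan` handle rather than the high-level `Scan` wrapper; later in this diff, `Scan._dump_manifests()` is added as the convenience entry point that does exactly this. A minimal sketch of calling it directly (the wrapper function name here is illustrative, not part of the package):

from spiral.core.table import TableScan
from spiral.tables.debug.manifests import display_manifests


def dump_manifests(core_scan: TableScan) -> None:
    # Mirrors what Scan._dump_manifests() does further down in this diff:
    # prints the key-space manifest plus one table per column-group manifest.
    display_manifests(core_scan)
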
spiral/tables/debug/metrics.py (new file)
@@ -0,0 +1,56 @@
+from typing import Any
+
+
+def display_metrics(metrics: dict[str, Any]) -> None:
+    """Display metrics in a formatted table."""
+    print(
+        f"{'Metric':<40} {'Type':<10} {'Count':<8} {'Avg':<12} {'Min':<12} "
+        f"{'Max':<12} {'P95':<12} {'P99':<12} {'StdDev':<12}"
+    )
+    print("=" * 140)
+
+    for metric_name, data in sorted(metrics.items()):
+        metric_type = data["type"]
+        count = data["count"]
+        avg = _format_value(data["avg"], metric_type, metric_name)
+        min_val = _format_value(data["min"], metric_type, metric_name)
+        max_val = _format_value(data["max"], metric_type, metric_name)
+        p95 = _format_value(data["p95"], metric_type, metric_name)
+        p99 = _format_value(data["p99"], metric_type, metric_name)
+        stddev = _format_value(data["stddev"], metric_type, metric_name)
+
+        print(
+            f"{metric_name:<40} {metric_type:<10} {count:<8} {avg:<12} {min_val:<12} "
+            f"{max_val:<12} {p95:<12} {p99:<12} {stddev:<12}"
+        )
+
+
+def _format_duration(nanoseconds: float) -> str:
+    """Convert nanoseconds to human-readable duration."""
+    if nanoseconds >= 1_000_000_000:
+        return f"{nanoseconds / 1_000_000_000:.2f}s"
+    elif nanoseconds >= 1_000_000:
+        return f"{nanoseconds / 1_000_000:.2f}ms"
+    elif nanoseconds >= 1_000:
+        return f"{nanoseconds / 1_000:.2f}μs"
+    else:
+        return f"{nanoseconds:.0f}ns"
+
+
+def _format_bytes(bytes_value: float) -> str:
+    """Convert bytes to human-readable size."""
+    for unit in ["B", "KB", "MB", "GB"]:
+        if bytes_value < 1024:
+            return f"{bytes_value:.1f}{unit}"
+        bytes_value /= 1024
+    return f"{bytes_value:.1f}TB"
+
+
+def _format_value(value: float, metric_type: str, metric_name: str) -> str:
+    """Format a value based on metric type and name."""
+    if metric_type == "timer" or "duration" in metric_name:
+        return _format_duration(value)
+    elif "bytes" in metric_name:
+        return _format_bytes(value)
+    else:
+        return f"{value:,.0f}"
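
For reference, `display_metrics` expects a mapping from metric name to a summary dict; the key set below (`type`, `count`, `avg`, `min`, `max`, `p95`, `p99`, `stddev`) is inferred from how the function indexes each entry, and the metric names are made up for illustration. In practice the dict comes from `Scan.metrics`, which the new `Scan._dump_metrics()` helper (added at the end of this diff) feeds into this function.

from spiral.tables.debug.metrics import display_metrics

# Hypothetical payload: timer values are in nanoseconds, "*bytes*" values in bytes.
metrics = {
    "scan.read.duration": {
        "type": "timer", "count": 12, "avg": 3_200_000, "min": 900_000,
        "max": 9_800_000, "p95": 8_100_000, "p99": 9_500_000, "stddev": 2_400_000,
    },
    "scan.fragment.bytes": {
        "type": "histogram", "count": 12, "avg": 4_200_000, "min": 1_100_000,
        "max": 9_900_000, "p95": 9_000_000, "p99": 9_700_000, "stddev": 2_500_000,
    },
}

# Timer-like metrics render as durations (e.g. 3.20ms); names containing "bytes" render as sizes (e.g. 4.0MB).
display_metrics(metrics)
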
spiral/{debug.py → tables/debug/scan.py}
@@ -1,8 +1,8 @@
 from datetime import datetime
 
-from spiral.core.core import TableScan
-from spiral.core.manifests import FragmentFile, FragmentManifest
-from spiral.core.spec import Key, KeyRange
+from spiral.core.table import TableScan
+from spiral.core.table.manifests import FragmentFile, FragmentManifest
+from spiral.core.table.spec import Key
 from spiral.types_ import Timestamp
 
 
@@ -30,7 +30,7 @@ def show_scan(scan: TableScan):
         for i in range(len(cg_manifest)):
             fragment_file = cg_manifest[i]
             key_points.add(fragment_file.key_extent.min)
-            key_points.add(fragment_file.key_extent.max)
+            key_points.add(fragment_file.key_extent.max)
 
         # Make sure split points exist in all key points.
         for s in splits[:-1]:  # Don't take the last end.
@@ -44,9 +44,7 @@ def show_scan(scan: TableScan):
         show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)
 
 
-def show_manifest(
-    manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list[KeyRange] = None
-):
+def show_manifest(manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list = None):
     try:
         import matplotlib.patches as patches
         import matplotlib.pyplot as plt
@@ -157,10 +155,9 @@ def _get_fragment_legend(manifest_file: FragmentFile):
            f"key_min: {manifest_file.key_extent.min}",
            f"key_max: {manifest_file.key_extent.max}",
            f"format: {manifest_file.format}",
-            f"level: {manifest_file.fs_level}",
+            f"level: {manifest_file.level}",
            f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
            f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
-            f"fs_id: {manifest_file.fs_id}",
            f"ks_id: {manifest_file.ks_id}",
        ]
    )
spiral/{maintenance.py → tables/maintenance.py}
@@ -1,4 +1,4 @@
-from spiral.core.core import TableMaintenance
+from spiral.core.table import TableMaintenance
 
 
 class Maintenance:
spiral/{scan_.py → tables/scan.py}
@@ -1,57 +1,19 @@
 from collections.abc import Iterator
-from datetime import datetime
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-from opentelemetry import trace
+from datasets import DatasetInfo, Features
 
-from spiral.core.core import TableScan
-from spiral.core.spec import KeyRange, Schema
-from spiral.expressions.base import ExprLike
+from spiral.core.table import KeyRange, TableScan
+from spiral.core.table.spec import Schema
+from spiral.settings import CI, DEV
 
 if TYPE_CHECKING:
     import dask.dataframe as dd
     import pandas as pd
     import polars as pl
-    import pyarrow
-    import pyarrow.dataset
     from datasets import iterable_dataset
 
-tracer = trace.get_tracer("pyspiral.client.scan")
-
-
-def scan(
-    *projections: ExprLike,
-    where: ExprLike | None = None,
-    asof: datetime | int | str = None,
-    exclude_keys: bool = False,
-) -> "Scan":
-    """Starts a read transaction on the spiral.
-
-    Args:
-        projections: a set of expressions that return struct arrays.
-        where: a query expression to apply to the data.
-        asof: only data written before the given timestamp will be returned, caveats around compaction.
-        exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
-            Note that if a projection includes a key column, it will be included in the result.
-    """
-    from spiral import expressions as se
-
-    # Combine all projections into a single struct.
-    projection = se.merge(*projections)
-    if where is not None:
-        where = se.lift(where)
-
-    return Scan(
-        TableScan(
-            projection.__expr__,
-            filter=where.__expr__ if where else None,
-            asof=asof,
-            exclude_keys=exclude_keys,
-        ),
-        # config=config,
-    )
-
 
 
 class Scan:
     """Scan object."""
@@ -83,20 +45,6 @@ class Scan:
         """
         return self._scan.is_empty()
 
-    def to_dataset(
-        self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ) -> "pyarrow.dataset.Dataset":
-        """Returns a PyArrow Dataset representing the scan.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
-        """
-        from .dataset import ScanDataset
-
-        return ScanDataset(self, key_table=key_table)
-
     def to_record_batches(
         self,
         key_table: pa.Table | pa.RecordBatchReader | None = None,
@@ -133,6 +81,11 @@ class Scan:
             key_table: a table of keys to "take" (including aux columns for cell-push-down).
                 If None, the scan will be executed without a key table.
         """
+        # NOTE: Evaluates fully on Rust side which improved debuggability.
+        if DEV and not CI and key_table is None:
+            rb = self._scan.to_record_batch()
+            return pa.Table.from_batches([rb])
+
         return self.to_record_batches(key_table=key_table).read_all()
 
     def to_dask(self) -> "dd.DataFrame":
@@ -150,70 +103,91 @@ class Scan:
         # Fetch a set of partition ranges
         return dd.from_map(_read_key_range, self.split())
 
-    def to_pandas(
-        self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
-    ) -> "pd.DataFrame":
+    def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
 
         Requires the `pandas` package to be installed.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
         """
-        return self.to_table(key_table=key_table).to_pandas()
+        return self.to_table().to_pandas()
 
-    def to_polars(self, key_table: pa.Table | pa.RecordBatchReader | None = None) -> "pl.LazyFrame":
-        """Read into a Polars LazyFrame.
+    def to_polars(self) -> "pl.DataFrame":
+        """Read into a Polars DataFrame.
 
         Requires the `polars` package to be installed.
-
-        Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
         """
         import polars as pl
 
-        return pl.scan_pyarrow_dataset(self.to_dataset(key_table=key_table))
+        # TODO(marko): This should support lazy dataframe.
+        return pl.from_arrow(self.to_record_batches())
 
     def to_pytorch(
         self,
-        key_table: pa.Table | pa.RecordBatchReader | None = None,
         batch_readahead: int | None = None,
+        shuffle_batch_size: int | None = None,
+        shuffle_pool_num_rows: int | None = None,
     ) -> "iterable_dataset.IterableDataset":
-        """Returns an iterable dataset that can be used to build a `pytorch.DataLoader`.
-
-        Requires the `datasets` package to be installed.
+        """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
 
         Args:
-            key_table: a table of keys to "take" (including aux columns for cell-push-down).
-                If None, the scan will be executed without a key table.
-            batch_readahead: the number of batches to prefetch in the background.
+            batch_readahead: Number of batches to prefetch in the background.
+            shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
+                None along with shuffle_pool_num_rows=None, shuffling is disabled.
+            shuffle_pool_num_rows: Pool size for shuffling batches.
         """
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
 
         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-            # Use batch size 1 when iterating samples, unless batch reader is already used.
-            stream = self.to_record_batches(
-                key_table, batch_size=1 if isinstance(key_table, pa.Table) else None, batch_readahead=batch_readahead
-            )
+            if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                stream = self.to_record_batches(
+                    batch_readahead=batch_readahead,
+                )
+            else:
+                stream = self._scan.to_shuffled_record_batches(
+                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                )
 
             # This key is unused when training with IterableDataset.
             # Default implementation returns shard id, e.g. parquet row group id.
             for i, rb in enumerate(stream):
                 yield i, pa.Table.from_batches([rb], stream.schema)
 
-        # NOTE: Type annotation Callable[..., tuple[str, pa.Table]] is wrong. The return value must be iterable.
+        def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+            """
+            Replace string-view columns in the schema with strings. We do use this converted schema
+            as Features in the returned Dataset.
+            Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+            """
+            new_fields = [
+                pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                if field.type == pa.string_view()
+                else field
+                for field in schema
+            ]
+            return pa.schema(new_fields)
+
+        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
-        return IterableDataset(ex_iterable=ex_iterable)
+        info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+        return IterableDataset(ex_iterable=ex_iterable, info=info)
 
-    def split(self) -> list[KeyRange]:
+    def _split(self) -> list[KeyRange]:
+        # Splits the scan into a set of key ranges.
         return self._scan.split()
 
-    def debug(self):
+    def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        # NOTE: This is not part of the API and may disappear at any moment.
-        from spiral.debug import show_scan
+        from spiral.tables.debug.scan import show_scan
 
         show_scan(self._scan)
+
+    def _dump_manifests(self):
+        # Print manifests in a human-readable format.
+        from spiral.tables.debug.manifests import display_manifests
+
+        display_manifests(self._scan)
+
+    def _dump_metrics(self):
+        # Print metrics in a human-readable format.
+        from spiral.tables.debug.metrics import display_metrics
+
+        display_metrics(self.metrics)
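
Taken together, the `Scan` read surface in 0.4.1 drops the `key_table` argument from `to_pandas`, `to_polars`, and `to_pytorch`, returns an eager Polars `DataFrame` instead of a `LazyFrame`, and adds opt-in shuffling for PyTorch training via the new `to_shuffled_record_batches` path. A minimal sketch of the new PyTorch usage (assumes the `torch` and `datasets` packages are installed; how the `Scan` itself is constructed is outside this diff):

from torch.utils.data import DataLoader

from spiral.tables import Scan


def make_loader(scan: Scan) -> DataLoader:
    # With both shuffle_* arguments set, the iterator uses the shuffled stream;
    # leaving them as None keeps the plain to_record_batches() path.
    ds = scan.to_pytorch(
        batch_readahead=4,
        shuffle_batch_size=1_024,      # rows per shuffled read
        shuffle_pool_num_rows=65_536,  # size of the in-memory shuffle pool
    )
    return DataLoader(ds, batch_size=32)
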