pyspiral-0.4.4-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.0-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/METADATA +10 -5
- pyspiral-0.6.0.dist-info/RECORD +99 -0
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/WHEEL +1 -1
- spiral/__init__.py +10 -3
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +29 -11
- spiral/api/__init__.py +14 -0
- spiral/api/client.py +5 -1
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/projects.py +17 -2
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +2 -0
- spiral/api/workers.py +40 -0
- spiral/cli/__init__.py +15 -6
- spiral/cli/admin.py +2 -4
- spiral/cli/app.py +4 -2
- spiral/cli/fs.py +5 -6
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +68 -0
- spiral/cli/login.py +6 -7
- spiral/cli/orgs.py +7 -8
- spiral/cli/printer.py +3 -3
- spiral/cli/projects.py +5 -6
- spiral/cli/tables.py +131 -0
- spiral/cli/telemetry.py +3 -4
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +3 -4
- spiral/cli/workloads.py +7 -8
- spiral/client.py +111 -8
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +135 -63
- spiral/core/table/__init__.pyi +36 -26
- spiral/core/table/metastore/__init__.pyi +0 -4
- spiral/core/table/spec/__init__.pyi +0 -2
- spiral/{tables/dataset.py → dataset.py} +13 -7
- spiral/{tables/debug → debug}/manifests.py +17 -6
- spiral/{tables/debug → debug}/scan.py +7 -7
- spiral/expressions/base.py +3 -3
- spiral/expressions/udf.py +1 -1
- spiral/{iceberg/client.py → iceberg.py} +1 -3
- spiral/key_space_index.py +44 -0
- spiral/project.py +171 -18
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
- spiral/protogen/_/google/protobuf/__init__.py +2190 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +138 -126
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +5256 -2459
- spiral/protogen/_/substrait/extensions/__init__.py +103 -49
- spiral/{tables/scan.py → scan.py} +37 -44
- spiral/settings.py +14 -3
- spiral/snapshot.py +55 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +117 -0
- spiral/streaming_/stream.py +146 -0
- spiral/substrait_.py +9 -9
- spiral/table.py +257 -0
- spiral/text_index.py +17 -0
- spiral/{tables/transaction.py → transaction.py} +11 -15
- pyspiral-0.4.4.dist-info/RECORD +0 -98
- spiral/cli/iceberg/__init__.py +0 -7
- spiral/cli/iceberg/namespaces.py +0 -47
- spiral/cli/iceberg/tables.py +0 -60
- spiral/cli/indexes/__init__.py +0 -19
- spiral/cli/tables/__init__.py +0 -121
- spiral/core/index/__init__.pyi +0 -15
- spiral/iceberg/__init__.py +0 -3
- spiral/indexes/__init__.py +0 -5
- spiral/indexes/client.py +0 -137
- spiral/indexes/index.py +0 -34
- spiral/indexes/scan.py +0 -22
- spiral/protogen/_/spiral/table/__init__.py +0 -22
- spiral/protogen/substrait/__init__.py +0 -3399
- spiral/protogen/substrait/extensions/__init__.py +0 -115
- spiral/tables/__init__.py +0 -12
- spiral/tables/client.py +0 -130
- spiral/tables/maintenance.py +0 -12
- spiral/tables/snapshot.py +0 -78
- spiral/tables/table.py +0 -145
- {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/entry_points.txt +0 -0
- /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
- /spiral/{tables/debug → debug}/metrics.py +0 -0
- /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
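The rename entries above flatten the 0.4.4 `spiral.tables` package into top-level modules (`spiral/tables/scan.py → spiral/scan.py`, `spiral/tables/dataset.py → spiral/dataset.py`, and so on). A minimal migration sketch for downstream imports; the 0.4.4 paths are inferred from the old layout shown in this list:

# pyspiral 0.4.4 (inferred old layout)
from spiral.tables.scan import Scan
from spiral.tables.dataset import Dataset

# pyspiral 0.6.0 (flattened layout per the renames above)
from spiral.scan import Scan
from spiral.dataset import Dataset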
spiral/protogen/_/substrait/extensions/__init__.py
CHANGED
@@ -1,115 +1,169 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # sources: substrait/extensions/extensions.proto
-# plugin: python-betterproto
+# plugin: python-betterproto2
 # This file has been @generated
 
+__all__ = (
+    "AdvancedExtension",
+    "SimpleExtensionDeclaration",
+    "SimpleExtensionDeclarationExtensionFunction",
+    "SimpleExtensionDeclarationExtensionType",
+    "SimpleExtensionDeclarationExtensionTypeVariation",
+    "SimpleExtensionUri",
+)
+
 from dataclasses import dataclass
-from typing import List
 
-import betterproto
-import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
+import betterproto2
+
+from ...message_pool import default_message_pool
+
+_COMPILER_VERSION = "0.8.0"
+betterproto2.check_compiler_version(_COMPILER_VERSION)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionUri(betterproto.Message):
-    extension_uri_anchor: int = betterproto.uint32_field(1)
+class AdvancedExtension(betterproto2.Message):
     """
-    A surrogate key used in the context of a single plan used to reference the
-    URI associated with an extension.
+    A generic object that can be used to embed additional extension information
+    into the serialized substrait plan.
     """
 
-    uri: str = betterproto.string_field(2)
+    optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
     """
-    The URI where this extension YAML can be retrieved. This is the "namespace"
-    of this extension.
+    An optimization is helpful information that don't influence semantics. May
+    be ignored by a consumer.
+    """
+
+    enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
+    """
+    An enhancement alter semantics. Cannot be ignored by a consumer.
     """
 
 
+default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
+
+
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclaration(betterproto.Message):
+class SimpleExtensionDeclaration(betterproto2.Message):
     """
     Describes a mapping between a specific extension entity and the uri where
-    that extension can be found.
+    that extension can be found.
+
+    Oneofs:
+        - mapping_type:
     """
 
-    extension_type: "SimpleExtensionDeclarationExtensionType" = (
-        betterproto.message_field(1, group="mapping_type")
+    extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
+        1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
-        betterproto.message_field(2, group="mapping_type")
+
+    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
+        2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
-        betterproto.message_field(3, group="mapping_type")
+
+    extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
+        3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
 
 
-@dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionType(betterproto.Message):
-    """Describes a Type"""
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
+
 
-    extension_uri_reference: int = betterproto.uint32_field(1)
+@dataclass(eq=False, repr=False)
+class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    type_anchor: int = betterproto.uint32_field(2)
+    function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific extension type
+    specific function
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
     """
+    A function signature compound name
+    """
+
 
-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
+)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
+    """
+    Describes a Type
+    """
+
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    type_variation_anchor: int = betterproto.uint32_field(2)
+    type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific type variation
+    specific extension type
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """
 
-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
+)
 
 
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """
 
-    function_anchor: int = betterproto.uint32_field(2)
+    type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific function
+    specific type variation
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """
 
-    name: str = betterproto.string_field(3)
-    """A function signature compound name"""
+
+default_message_pool.register_message(
+    "substrait.extensions",
+    "SimpleExtensionDeclaration.ExtensionTypeVariation",
+    SimpleExtensionDeclarationExtensionTypeVariation,
+)
 
 
 @dataclass(eq=False, repr=False)
-class AdvancedExtension(betterproto.Message):
+class SimpleExtensionUri(betterproto2.Message):
+    extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
-    A generic object that can be used to embed additional extension information
-    into the serialized substrait plan.
+    A surrogate key used in the context of a single plan used to reference the
+    URI associated with an extension.
     """
 
-    optimization: List[
-        "betterproto_lib_google_protobuf.Any"
-    ] = betterproto.message_field(1)
+    uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
     """
-    An optimization is helpful information that don't influence semantics. May
-    be ignored by a consumer.
+    The URI where this extension YAML can be retrieved. This is the "namespace"
+    of this extension.
     """
 
-    enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
-    """An enhancement alter semantics. Cannot be ignored by a consumer."""
+
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
+
+
+from ...google import protobuf as __google__protobuf__
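The hunk above is the betterproto → betterproto2 regeneration: fields now carry explicit wire types (`betterproto2.field(1, betterproto2.TYPE_UINT32)`), oneof members become `optional` fields sharing a `group`, and every message registers itself in `default_message_pool`. A hedged round-trip sketch with the regenerated classes; the URI value is made up, and the `bytes()`/`parse()` round-trip assumes betterproto2 keeps betterproto's serialization API, which this diff does not itself show:

from spiral.protogen._.substrait.extensions import SimpleExtensionUri

# Anchor 1 can be referenced later by SimpleExtensionDeclaration entries.
ext = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/extension.yaml")
wire = bytes(ext)                         # serialize to protobuf wire format
back = SimpleExtensionUri().parse(wire)   # betterproto-style parse; assumed unchanged
assert back.uri == ext.uri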
spiral/{tables/scan.py → scan.py}
RENAMED
@@ -2,40 +2,36 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-from datasets import DatasetInfo, Features
 
-from spiral.core.table import KeyRange, TableScan
+from spiral.core.table import KeyRange, ShuffleStrategy
+from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV
 
 if TYPE_CHECKING:
     import dask.dataframe as dd
+    import datasets.iterable_dataset as hf  # noqa
     import pandas as pd
     import polars as pl
-
+    import streaming  # noqa
+    import torch.utils.data as torchdata  # noqa
 
 
 class Scan:
     """Scan object."""
 
-    def __init__(
-        self,
-        scan: TableScan,
-    ):
-        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-        self._scan = scan
+    def __init__(self, core: CoreScan):
+        self.core = core
 
     @property
     def metrics(self) -> dict[str, Any]:
         """Returns metrics about the scan."""
-        return self._scan.metrics()
+        return self.core.metrics()
 
     @property
     def schema(self) -> Schema:
         """Returns the schema of the scan."""
-        return self._scan.schema()
+        return self.core.schema()
 
     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
@@ -43,7 +39,7 @@ class Scan:
         **IMPORTANT**: False negatives are possible, but false positives are not,
         i.e. is_empty can return False and scan can return zero rows.
         """
-        return self._scan.is_empty()
+        return self.core.is_empty()
 
     def to_record_batches(
         self,
@@ -69,7 +65,7 @@ class Scan:
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)
 
-        return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
 
     def to_table(
         self,
@@ -83,7 +79,7 @@ class Scan:
         """
         # NOTE: Evaluates fully on Rust side which improved debuggability.
         if DEV and not CI and key_table is None:
-            rb = self._scan.to_record_batch()
+            rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])
 
         return self.to_record_batches(key_table=key_table).read_all()
@@ -97,11 +93,11 @@ class Scan:
         import pandas as pd
 
         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs?
+            # TODO(ngates): we need a way to preserve the existing asofs?
            raise NotImplementedError()
 
         # Fetch a set of partition ranges
-        return dd.from_map(_read_key_range, self.splits())
+        return dd.from_map(_read_key_range, self._splits())
 
     def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
@@ -117,34 +113,31 @@ class Scan:
         """
         import polars as pl
 
-        # TODO(marko): This should support lazy dataframe.
         return pl.from_arrow(self.to_record_batches())
 
-    def to_iterable_dataset(
+    def to_iterable_dataset(
         self,
+        shuffle: ShuffleStrategy | None = None,
         batch_readahead: int | None = None,
-        shuffle_batch_size: int | None = None,
-        shuffle_pool_num_rows: int | None = None,
-    ) -> "IterableDataset":
-        """Returns an Huggingface's IterableDataset.
+    ) -> "hf.IterableDataset":
+        """Returns an Huggingface's IterableDataset.
+
+        Requires `datasets` package to be installed.
 
         Args:
-
-
-
-
+            shuffle: Controls sample shuffling. If None, no shuffling is performed.
+            batch_readahead: Controls how many batches to read ahead concurrently.
+                If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+                Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
         """
+        from datasets import DatasetInfo, Features
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
 
         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-
-
-
-
-            else:
-                stream = self._scan.to_shuffled_record_batches(
-                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
-                )
+            stream = self.core.to_shuffled_record_batches(
+                shuffle,
+                batch_readahead,
+            )
 
         # This key is unused when training with IterableDataset.
         # Default implementation returns shard id, e.g. parquet row group id.
@@ -166,28 +159,28 @@ class Scan:
            return pa.schema(new_fields)
 
        # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})  # type: ignore
         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
         return IterableDataset(ex_iterable=ex_iterable, info=info)
 
-    def splits(self) -> list[KeyRange]:
+    def _splits(self) -> list[KeyRange]:
         # Splits the scan into a set of key ranges.
-        return self._scan.splits()
+        return self.core.splits()
 
     def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        from spiral.tables.debug.scan import show_scan
+        from spiral.debug.scan import show_scan
 
-        show_scan(self._scan)
+        show_scan(self.core)
 
     def _dump_manifests(self):
         # Print manifests in a human-readable format.
-        from spiral.tables.debug.manifests import display_scan_manifests
+        from spiral.debug.manifests import display_scan_manifests
 
-        display_scan_manifests(self._scan)
+        display_scan_manifests(self._core)
 
     def _dump_metrics(self):
         # Print metrics in a human-readable format.
-        from spiral.tables.debug.metrics import display_metrics
+        from spiral.debug.metrics import display_metrics
 
         display_metrics(self.metrics)
spiral/settings.py
CHANGED
@@ -1,7 +1,7 @@
 import functools
 import os
 from pathlib import Path
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated
 
 import typer
 from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
@@ -12,8 +12,11 @@ from pydantic_settings import (
     SettingsConfigDict,
 )
 
-from spiral.api import SpiralAPI
-from spiral.core.client import Authn, DeviceCodeAuth, Token
+from spiral.core.authn import Authn, DeviceCodeAuth, Token
+from spiral.core.client import Spiral
+
+if TYPE_CHECKING:
+    from spiral.api import SpiralAPI
 
 DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
 CI = "GITHUB_ACTIONS" in os.environ
@@ -74,6 +77,14 @@ class Settings(BaseSettings):
 
         return SpiralAPI(self.authn, base_url=self.spiraldb.uri)
 
+    @functools.cached_property
+    def core(self) -> Spiral:
+        return Spiral(
+            api_url=self.spiraldb.uri,
+            spfs_url=self.spfs.uri,
+            authn=self.authn,
+        )
+
     @functools.cached_property
     def authn(self):
         if self.spiraldb.token:
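The new `Settings.core` property builds the Rust-backed `Spiral` client from the same settings that back the HTTP `SpiralAPI` client, and caches it per Settings instance. A minimal sketch; constructing `Settings()` directly is an assumption, in practice values come from env/config:

from spiral.settings import Settings

s = Settings()         # hypothetical direct construction
core = s.core          # Spiral(api_url=s.spiraldb.uri, spfs_url=s.spfs.uri, authn=s.authn)
assert core is s.core  # functools.cached_property: built once per Settings instance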
spiral/snapshot.py
ADDED
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Snapshot as CoreSnapshot
+from spiral.core.table.spec import Schema
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset as ds
+
+    from spiral.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, table: "Table", core: CoreSnapshot):
+        self.core = core
+        self._table = table
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self.core.asof
+
+    def schema(self) -> Schema:
+        """Returns the schema of the snapshot."""
+        return self.core.table.get_schema(asof=self.asof)
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        return self._table
+
+    def to_dataset(self) -> "ds.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from spiral.dataset import Dataset
+
+        return Dataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
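`Snapshot` fans one point-in-time view out to three engines, all layered over `to_dataset()`. A hedged sketch; how the snapshot is obtained from a `Table` is not shown in this file:

# `snap` is a spiral.snapshot.Snapshot obtained from a Table (assumed).
print(snap.asof)        # the timestamp this view is pinned to

lf = snap.to_polars()   # LazyFrame via pl.scan_pyarrow_dataset(...)
df = lf.collect()       # executes against the PyArrow dataset

rel = snap.to_duckdb()  # relation via duckdb.from_arrow(...)
rel.limit(10).show()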
spiral/streaming_/reader.py
ADDED
@@ -0,0 +1,117 @@
+import dataclasses
+import functools
+import os
+from typing import Any
+
+import vortex as vx
+
+from spiral.core.client import Shard
+
+
+# Fake streaming.base.format.base.reader.FileInfo
+# Dataset manages decompression instead of the Stream in MDS.
+# So we return our own fake FileInfo that has None for compressed file.
+@dataclasses.dataclass
+class FileInfo:
+    basename: str
+    hashes: dict[str, str] = dataclasses.field(default_factory=dict)
+
+    @property
+    def bytes(self):
+        raise NotImplementedError("FileInfo.bytes should NOT be called.")
+
+
+class SpiralReader:
+    """
+    An MDS (streaming) compatible Reader.
+    """
+
+    def __init__(self, shard: Shard, basepath):
+        self._shard = shard
+        self._cardinality = shard.cardinality
+        self._basepath = basepath
+        self._scan: vx.RepeatedScan | None = None
+
+    @property
+    def shard(self) -> Shard:
+        return self._shard
+
+    @property
+    def size(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def samples(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    def __len__(self) -> int:
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def file_pairs(self) -> list[tuple[FileInfo, FileInfo | None]]:
+        """Get the infos from raw and compressed file.
+
+        MDS uses this because dataset manages decompression of the shards, not stream...
+        """
+        return [(FileInfo(basename=self.filename), None)]
+
+    @functools.cached_property
+    def filename(self) -> str:
+        """Used by SpiralStream to identify shard's file-on-disk, if it exists."""
+        # TODO(marko): This might be too long...
+        return (
+            bytes(self._shard.key_range.begin).hex()
+            + "_"
+            + bytes(self._shard.key_range.end).hex()
+            + "_"
+            + str(self._shard.cardinality)
+            + ".vortex"
+        )
+
+    @functools.cached_property
+    def filepath(self) -> str:
+        """Full path to the shard's file-on-disk, if it exists."""
+        return os.path.join(self._basepath, self.filename)
+
+    def evict(self) -> int:
+        """Remove all files belonging to this shard."""
+
+        # Clean up the scan handle first. This will make sure memory is freed.
+        self._scan = None
+
+        # Try to evict file.
+        try:
+            stat = os.stat(self.filepath)
+            os.remove(self.filepath)
+            return stat.st_size
+        except FileNotFoundError:
+            # Nothing to evict.
+            return 0
+
+    def __getitem__(self, item):
+        return self.get_item(item)
+
+    def get_item(self, idx: int) -> dict[str, Any]:
+        if self._scan is None:
+            # TODO(marko): vx.open should throw FileNotFoundError instead of
+            # ValueError: No such file or directory (os error 2)
+            # Check if shard is ready on disk. This must throw FileNotFoundError.
+            if not os.path.exists(self.filepath):
+                raise FileNotFoundError(f"Shard not found: {self.filepath}")
+            self._scan = vx.open(self.filepath, without_segment_cache=True).to_repeated_scan()
+        return self._scan.scalar_at(idx).as_py()
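`SpiralReader` implements just enough of the MDS (mosaicml-streaming) shard-reader surface: sample counts via `size`/`samples`/`__len__`, on-disk bookkeeping via `file_pairs`/`filename`/`evict`, and per-sample access via `get_item`, where `FileNotFoundError` signals that the shard's `.vortex` file has not been materialized yet. A hedged sketch of that contract; a real `Shard` comes from `spiral.core.client` and is a stand-in here:

import tempfile

# `shard` is a spiral.core.client.Shard handed out by the stream (assumed).
reader = SpiralReader(shard, basepath=tempfile.gettempdir())

try:
    sample = reader[0]        # dict decoded from the shard via Vortex scalar_at(0)
except FileNotFoundError:
    pass                      # shard not on disk yet; the stream downloads it first

freed = reader.evict()        # drops the scan handle and removes the .vortex file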