pyspiral 0.7.18__cp312-abi3-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. pyspiral-0.7.18.dist-info/METADATA +52 -0
  2. pyspiral-0.7.18.dist-info/RECORD +110 -0
  3. pyspiral-0.7.18.dist-info/WHEEL +4 -0
  4. pyspiral-0.7.18.dist-info/entry_points.txt +3 -0
  5. spiral/__init__.py +55 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +411 -0
  8. spiral/api/__init__.py +78 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +164 -0
  11. spiral/api/filesystems.py +134 -0
  12. spiral/api/key_space_indexes.py +23 -0
  13. spiral/api/organizations.py +77 -0
  14. spiral/api/projects.py +219 -0
  15. spiral/api/telemetry.py +19 -0
  16. spiral/api/text_indexes.py +56 -0
  17. spiral/api/types.py +23 -0
  18. spiral/api/workers.py +40 -0
  19. spiral/api/workloads.py +52 -0
  20. spiral/arrow_.py +216 -0
  21. spiral/cli/__init__.py +88 -0
  22. spiral/cli/__main__.py +4 -0
  23. spiral/cli/admin.py +14 -0
  24. spiral/cli/app.py +108 -0
  25. spiral/cli/console.py +95 -0
  26. spiral/cli/fs.py +76 -0
  27. spiral/cli/iceberg.py +97 -0
  28. spiral/cli/key_spaces.py +103 -0
  29. spiral/cli/login.py +25 -0
  30. spiral/cli/orgs.py +90 -0
  31. spiral/cli/printer.py +53 -0
  32. spiral/cli/projects.py +147 -0
  33. spiral/cli/state.py +7 -0
  34. spiral/cli/tables.py +197 -0
  35. spiral/cli/telemetry.py +17 -0
  36. spiral/cli/text.py +115 -0
  37. spiral/cli/types.py +50 -0
  38. spiral/cli/workloads.py +58 -0
  39. spiral/client.py +256 -0
  40. spiral/core/__init__.pyi +0 -0
  41. spiral/core/_tools/__init__.pyi +5 -0
  42. spiral/core/authn/__init__.pyi +21 -0
  43. spiral/core/client/__init__.pyi +285 -0
  44. spiral/core/config/__init__.pyi +35 -0
  45. spiral/core/expr/__init__.pyi +15 -0
  46. spiral/core/expr/images/__init__.pyi +3 -0
  47. spiral/core/expr/list_/__init__.pyi +4 -0
  48. spiral/core/expr/refs/__init__.pyi +4 -0
  49. spiral/core/expr/str_/__init__.pyi +3 -0
  50. spiral/core/expr/struct_/__init__.pyi +6 -0
  51. spiral/core/expr/text/__init__.pyi +5 -0
  52. spiral/core/expr/udf/__init__.pyi +14 -0
  53. spiral/core/expr/video/__init__.pyi +3 -0
  54. spiral/core/table/__init__.pyi +141 -0
  55. spiral/core/table/manifests/__init__.pyi +35 -0
  56. spiral/core/table/metastore/__init__.pyi +58 -0
  57. spiral/core/table/spec/__init__.pyi +215 -0
  58. spiral/dataloader.py +299 -0
  59. spiral/dataset.py +264 -0
  60. spiral/datetime_.py +27 -0
  61. spiral/debug/__init__.py +0 -0
  62. spiral/debug/manifests.py +87 -0
  63. spiral/debug/metrics.py +56 -0
  64. spiral/debug/scan.py +266 -0
  65. spiral/enrichment.py +306 -0
  66. spiral/expressions/__init__.py +274 -0
  67. spiral/expressions/base.py +167 -0
  68. spiral/expressions/file.py +17 -0
  69. spiral/expressions/http.py +17 -0
  70. spiral/expressions/list_.py +68 -0
  71. spiral/expressions/s3.py +16 -0
  72. spiral/expressions/str_.py +39 -0
  73. spiral/expressions/struct.py +59 -0
  74. spiral/expressions/text.py +62 -0
  75. spiral/expressions/tiff.py +222 -0
  76. spiral/expressions/udf.py +60 -0
  77. spiral/grpc_.py +32 -0
  78. spiral/iceberg.py +31 -0
  79. spiral/iterable_dataset.py +106 -0
  80. spiral/key_space_index.py +44 -0
  81. spiral/project.py +227 -0
  82. spiral/protogen/_/__init__.py +0 -0
  83. spiral/protogen/_/arrow/__init__.py +0 -0
  84. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  85. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  86. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
  87. spiral/protogen/_/google/__init__.py +0 -0
  88. spiral/protogen/_/google/protobuf/__init__.py +2310 -0
  89. spiral/protogen/_/message_pool.py +3 -0
  90. spiral/protogen/_/py.typed +0 -0
  91. spiral/protogen/_/scandal/__init__.py +190 -0
  92. spiral/protogen/_/spfs/__init__.py +72 -0
  93. spiral/protogen/_/spql/__init__.py +61 -0
  94. spiral/protogen/_/substrait/__init__.py +6196 -0
  95. spiral/protogen/_/substrait/extensions/__init__.py +169 -0
  96. spiral/protogen/__init__.py +0 -0
  97. spiral/protogen/util.py +41 -0
  98. spiral/py.typed +0 -0
  99. spiral/scan.py +363 -0
  100. spiral/server.py +17 -0
  101. spiral/settings.py +36 -0
  102. spiral/snapshot.py +56 -0
  103. spiral/streaming_/__init__.py +3 -0
  104. spiral/streaming_/reader.py +133 -0
  105. spiral/streaming_/stream.py +157 -0
  106. spiral/substrait_.py +274 -0
  107. spiral/table.py +224 -0
  108. spiral/text_index.py +17 -0
  109. spiral/transaction.py +155 -0
  110. spiral/types_.py +6 -0
@@ -0,0 +1,169 @@
1
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
2
+ # sources: substrait/extensions/extensions.proto
3
+ # plugin: python-betterproto2
4
+ # This file has been @generated
5
+
6
+ __all__ = (
7
+ "AdvancedExtension",
8
+ "SimpleExtensionDeclaration",
9
+ "SimpleExtensionDeclarationExtensionFunction",
10
+ "SimpleExtensionDeclarationExtensionType",
11
+ "SimpleExtensionDeclarationExtensionTypeVariation",
12
+ "SimpleExtensionUri",
13
+ )
14
+
15
+ from dataclasses import dataclass
16
+
17
+ import betterproto2
18
+
19
+ from ...message_pool import default_message_pool
20
+
21
+ _COMPILER_VERSION = "0.9.0"
22
+ betterproto2.check_compiler_version(_COMPILER_VERSION)
23
+
24
+
25
+ @dataclass(eq=False, repr=False)
26
+ class AdvancedExtension(betterproto2.Message):
27
+ """
28
+ A generic object that can be used to embed additional extension information
29
+ into the serialized substrait plan.
30
+ """
31
+
32
+ optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
33
+ """
34
+ An optimization is helpful information that doesn't influence semantics. May
35
+ be ignored by a consumer.
36
+ """
37
+
38
+ enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
39
+ """
40
+ An enhancement alters semantics. Cannot be ignored by a consumer.
41
+ """
42
+
43
+
44
+ default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
45
+
46
+
47
+ @dataclass(eq=False, repr=False)
48
+ class SimpleExtensionDeclaration(betterproto2.Message):
49
+ """
50
+ Describes a mapping between a specific extension entity and the uri where
51
+ that extension can be found.
52
+
53
+ Oneofs:
54
+ - mapping_type:
55
+ """
56
+
57
+ extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
58
+ 1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
59
+ )
60
+
61
+ extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
62
+ 2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
63
+ )
64
+
65
+ extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
66
+ 3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
67
+ )
68
+
69
+
70
+ default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
71
+
72
+
73
+ @dataclass(eq=False, repr=False)
74
+ class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
75
+ extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
76
+ """
77
+ references the extension_uri_anchor defined for a specific extension URI.
78
+ """
79
+
80
+ function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
81
+ """
82
+ A surrogate key used in the context of a single plan to reference a
83
+ specific function
84
+ """
85
+
86
+ name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
87
+ """
88
+ A function signature compound name
89
+ """
90
+
91
+
92
+ default_message_pool.register_message(
93
+ "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
94
+ )
95
+
96
+
97
+ @dataclass(eq=False, repr=False)
98
+ class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
99
+ """
100
+ Describes a Type
101
+ """
102
+
103
+ extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
104
+ """
105
+ references the extension_uri_anchor defined for a specific extension URI.
106
+ """
107
+
108
+ type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
109
+ """
110
+ A surrogate key used in the context of a single plan to reference a
111
+ specific extension type
112
+ """
113
+
114
+ name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
115
+ """
116
+ the name of the type in the defined extension YAML.
117
+ """
118
+
119
+
120
+ default_message_pool.register_message(
121
+ "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
122
+ )
123
+
124
+
125
+ @dataclass(eq=False, repr=False)
126
+ class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
127
+ extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
128
+ """
129
+ references the extension_uri_anchor defined for a specific extension URI.
130
+ """
131
+
132
+ type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
133
+ """
134
+ A surrogate key used in the context of a single plan to reference a
135
+ specific type variation
136
+ """
137
+
138
+ name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
139
+ """
140
+ the name of the type in the defined extension YAML.
141
+ """
142
+
143
+
144
+ default_message_pool.register_message(
145
+ "substrait.extensions",
146
+ "SimpleExtensionDeclaration.ExtensionTypeVariation",
147
+ SimpleExtensionDeclarationExtensionTypeVariation,
148
+ )
149
+
150
+
151
+ @dataclass(eq=False, repr=False)
152
+ class SimpleExtensionUri(betterproto2.Message):
153
+ extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
154
+ """
155
+ A surrogate key used in the context of a single plan used to reference the
156
+ URI associated with an extension.
157
+ """
158
+
159
+ uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
160
+ """
161
+ The URI where this extension YAML can be retrieved. This is the "namespace"
162
+ of this extension.
163
+ """
164
+
165
+
166
+ default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
167
+
168
+
169
+ from ...google import protobuf as __google__protobuf__
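The generated messages above are plain betterproto2 dataclasses, so they can be constructed directly with keyword arguments. A minimal sketch, not taken from the package docs; the URI and function name below are illustrative:

```python
# Minimal sketch: the generated classes are ordinary dataclasses, so an
# extension declaration can be built with keyword arguments.
from spiral.protogen._.substrait.extensions import (
    SimpleExtensionDeclaration,
    SimpleExtensionDeclarationExtensionFunction,
    SimpleExtensionUri,
)

# Anchor an (illustrative) extension YAML, then reference it from a function declaration.
uri = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/functions_math.yaml")
decl = SimpleExtensionDeclaration(
    extension_function=SimpleExtensionDeclarationExtensionFunction(
        extension_uri_reference=1,
        function_anchor=10,
        name="add:i64_i64",
    )
)
```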
File without changes
@@ -0,0 +1,41 @@
1
+ import betterproto
2
+ from betterproto.grpc.grpclib_server import ServiceBase
3
+
4
+
5
+ def patch_protos(proto_module, our_module_globals):
6
+ """Calculate __all__ to re-export protos from a module."""
7
+
8
+ betterproto_types = (betterproto.Message, betterproto.Enum, betterproto.ServiceStub, ServiceBase)
9
+
10
+ proto_overrides = {}
11
+ missing = set()
12
+ for ident in dir(proto_module):
13
+ var = getattr(proto_module, ident)
14
+ if isinstance(var, type) and issubclass(var, betterproto_types):
15
+ if ident in our_module_globals:
16
+ override = id(our_module_globals.get(ident)) != id(var)
17
+ else:
18
+ override = False
19
+ missing.add(ident)
20
+ proto_overrides[ident] = override
21
+
22
+ if missing:
23
+ print(f"from {proto_module.__name__} import (")
24
+ for ident, override in proto_overrides.items():
25
+ if override:
26
+ print(f" {ident} as {ident}_,")
27
+ else:
28
+ print(f" {ident},")
29
+ print(")")
30
+ print("\n")
31
+ print("__all__ = [")
32
+ for ident in proto_overrides:
33
+ print(f' "{ident}",')
34
+ print("]")
35
+
36
+ raise ValueError(f"Missing types that need to be re-exported: {missing}")
37
+
38
+ # Patch any local subclasses back into the original module so the gRPC client will use them
39
+ for ident, override in proto_overrides.items():
40
+ if override:
41
+ setattr(proto_module, ident, our_module_globals[ident])
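`patch_protos` is meant to be called from a hand-written wrapper module that re-exports the generated protos; if a generated type is missing from the wrapper it prints the import block to copy and raises `ValueError`, otherwise it patches any local subclasses back into the generated module. A hedged sketch of that calling pattern (the wrapper module itself is hypothetical):

```python
# Hedged sketch of the intended calling pattern; this wrapper module is hypothetical.
# It re-exports a generated module's types and then calls patch_protos so any local
# subclasses defined here replace the generated ones for gRPC use.
from spiral.protogen import util
from spiral.protogen._ import scandal as _scandal  # a generated module shipped in this wheel

from spiral.protogen._.scandal import *  # noqa: F403 - re-export the generated types

# ... local subclasses of selected messages could be defined here ...

util.patch_protos(_scandal, globals())
```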
spiral/py.typed ADDED
File without changes
spiral/scan.py ADDED
@@ -0,0 +1,363 @@
1
+ from functools import partial
2
+ from typing import TYPE_CHECKING, Any, Optional
3
+
4
+ import pyarrow as pa
5
+
6
+ from spiral.core.client import Shard, ShuffleConfig
7
+ from spiral.core.table import KeyRange
8
+ from spiral.core.table import Scan as CoreScan
9
+ from spiral.core.table.spec import Schema
10
+ from spiral.settings import CI, DEV
11
+
12
+ if TYPE_CHECKING:
13
+ import dask.dataframe as dd
14
+ import datasets.iterable_dataset as hf # noqa
15
+ import pandas as pd
16
+ import polars as pl
17
+ import streaming # noqa
18
+ import torch.utils.data as torchdata # noqa
19
+
20
+ from spiral.client import Spiral
21
+ from spiral.dataloader import SpiralDataLoader, World # noqa
22
+
23
+
24
+ class Scan:
25
+ """Scan object."""
26
+
27
+ def __init__(self, spiral: "Spiral", core: CoreScan):
28
+ self.spiral = spiral
29
+ self.core = core
30
+
31
+ @property
32
+ def metrics(self) -> dict[str, Any]:
33
+ """Returns metrics about the scan."""
34
+ return self.core.metrics()
35
+
36
+ @property
37
+ def schema(self) -> Schema:
38
+ """Returns the schema of the scan."""
39
+ return self.core.schema()
40
+
41
+ @property
42
+ def key_schema(self) -> Schema:
43
+ """Returns the key schema of the scan."""
44
+ return self.core.key_schema()
45
+
46
+ def is_empty(self) -> bool:
47
+ """Check if the Spiral is empty for the given key range.
48
+
49
+ False negatives are possible, but false positives are not,
50
+ i.e. is_empty can return False and scan can return zero rows.
51
+ """
52
+ return self.core.is_empty()
53
+
54
+ def to_record_batches(
55
+ self,
56
+ *,
57
+ key_range: KeyRange | None = None,
58
+ key_table: pa.Table | pa.RecordBatchReader | None = None,
59
+ batch_size: int | None = None,
60
+ batch_readahead: int | None = None,
61
+ hide_progress_bar: bool = False,
62
+ ) -> pa.RecordBatchReader:
63
+ """Read as a stream of RecordBatches.
64
+
65
+ Args:
66
+ key_range: Optional key range to filter the scan.
67
+ If provided, the scan will only return rows within the key range.
68
+ Only one of key_range or key_table can be provided.
69
+ key_table: a table of keys to "take" (including aux columns for cell-push-down).
70
+ If None, the scan will be executed without a key table.
71
+ batch_size: the maximum number of rows per returned batch.
72
+ This is currently only respected when the key_table is used. If the key_table is a
73
+ RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
74
+ batch_readahead: the number of batches to prefetch in the background.
75
+ hide_progress_bar: If True, disables the progress bar during reading.
76
+ """
77
+ if key_range is not None and key_table is not None:
78
+ raise ValueError("Only one of key_range or key_table can be provided.")
79
+
80
+ if isinstance(key_table, pa.RecordBatchReader):
81
+ if batch_size is not None:
82
+ raise ValueError(
83
+ "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
84
+ )
85
+ elif isinstance(key_table, pa.Table):
86
+ key_table = key_table.to_reader(max_chunksize=batch_size)
87
+
88
+ return self.core.to_record_batches(
89
+ key_range=key_range, key_table=key_table, batch_readahead=batch_readahead, progress=(not hide_progress_bar)
90
+ )
91
+
92
+ def to_unordered_record_batches(
93
+ self,
94
+ *,
95
+ key_table: pa.Table | pa.RecordBatchReader | None = None,
96
+ batch_size: int | None = None,
97
+ batch_readahead: int | None = None,
98
+ hide_progress_bar: bool = False,
99
+ ) -> pa.RecordBatchReader:
100
+ """Read as a stream of RecordBatches, NOT ordered by key.
101
+
102
+ Args:
103
+ key_table: a table of keys to "take" (including aux columns for cell-push-down).
104
+ If None, the scan will be executed without a key table.
105
+ batch_size: the maximum number of rows per returned batch.
106
+ This is currently only respected when the key_table is used. If the key_table is a
107
+ RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
108
+ batch_readahead: the number of batches to prefetch in the background.
109
+ hide_progress_bar: If True, disables the progress bar during reading.
110
+ """
111
+ if isinstance(key_table, pa.RecordBatchReader):
112
+ if batch_size is not None:
113
+ raise ValueError(
114
+ "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
115
+ )
116
+ elif isinstance(key_table, pa.Table):
117
+ key_table = key_table.to_reader(max_chunksize=batch_size)
118
+
119
+ return self.core.to_unordered_record_batches(
120
+ key_table=key_table, batch_readahead=batch_readahead, progress=(not hide_progress_bar)
121
+ )
122
+
123
+ def to_table(
124
+ self,
125
+ *,
126
+ key_range: KeyRange | None = None,
127
+ key_table: pa.Table | pa.RecordBatchReader | None = None,
128
+ ) -> pa.Table:
129
+ """Read into a single PyArrow Table.
130
+
131
+ Args:
132
+ key_range: Optional key range to filter the scan.
133
+ If provided, the scan will only return rows within the key range.
134
+ Only one of key_range or key_table can be provided.
135
+ key_table: a table of keys to "take" (including aux columns for cell-push-down).
136
+ If None, the scan will be executed without a key table.
137
+ """
138
+ # NOTE: Evaluates fully on the Rust side, which improves debuggability.
139
+ if DEV and not CI and key_table is None and key_range is None:
140
+ rb = self.core.to_record_batch()
141
+ return pa.Table.from_batches([rb])
142
+
143
+ return self.to_record_batches(key_range=key_range, key_table=key_table).read_all()
144
+
145
+ def to_dask(self) -> "dd.DataFrame":
146
+ """Read into a Dask DataFrame.
147
+
148
+ Requires the `dask` package to be installed.
149
+
150
+ Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
151
+ usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
152
+ encountering such issues, please reach out to support for assistance.
153
+ """
154
+ import dask.dataframe as dd
155
+
156
+ _read_shard = partial(
157
+ _read_shard_task,
158
+ settings_json=self.spiral.config.to_json(),
159
+ state_json=self.core.plan_state().to_json(),
160
+ )
161
+ return dd.from_map(_read_shard, self.shards())
162
+
163
+ def to_pandas(self, *, key_range: KeyRange | None = None) -> "pd.DataFrame":
164
+ """Read into a Pandas DataFrame.
165
+
166
+ Requires the `pandas` package to be installed.
167
+ """
168
+ return self.to_table(key_range=key_range).to_pandas()
169
+
170
+ def to_polars(self) -> "pl.DataFrame":
171
+ """Read into a Polars DataFrame.
172
+
173
+ Requires the `polars` package to be installed.
174
+ """
175
+ import polars as pl
176
+
177
+ return pl.from_arrow(self.to_record_batches())
178
+
179
+ def to_data_loader(
180
+ self, seed: int = 42, shuffle_buffer_size: int = 8192, batch_size: int = 32, **kwargs
181
+ ) -> "SpiralDataLoader":
182
+ """Read into a Torch-compatible DataLoader for single-node training.
183
+
184
+ Args:
185
+ seed: Random seed for reproducibility.
186
+ shuffle_buffer_size: Size of shuffle buffer.
187
+ batch_size: Batch size.
188
+ **kwargs: Additional arguments passed to SpiralDataLoader constructor.
189
+
190
+ Returns:
191
+ SpiralDataLoader with shuffled shards.
192
+ """
193
+ from spiral.dataloader import SpiralDataLoader
194
+
195
+ return SpiralDataLoader(
196
+ self, seed=seed, shuffle_buffer_size=shuffle_buffer_size, batch_size=batch_size, **kwargs
197
+ )
198
+
199
+ def to_distributed_data_loader(
200
+ self,
201
+ world: Optional["World"] = None,
202
+ shards: list[Shard] | None = None,
203
+ seed: int = 42,
204
+ shuffle_buffer_size: int = 8192,
205
+ batch_size: int = 32,
206
+ **kwargs,
207
+ ) -> "SpiralDataLoader":
208
+ """Read into a Torch-compatible DataLoader for distributed training.
209
+
210
+ Args:
211
+ world: World configuration with rank and world_size.
212
+ If None, auto-detects from torch.distributed.
213
+ shards: Optional sharding. Sharding is global, i.e. the world will be used to select
214
+ the shards for this rank. If None, uses scan's natural sharding.
215
+ seed: Random seed for reproducibility.
216
+ shuffle_buffer_size: Size of shuffle buffer.
217
+ Set to zero to disable the shuffle buffer.
218
+ batch_size: Batch size.
219
+ **kwargs: Additional arguments passed to SpiralDataLoader constructor.
220
+
221
+ Returns:
222
+ SpiralDataLoader with shards partitioned for this rank.
223
+
224
+ Auto-detect from PyTorch distributed:
225
+ ```python
226
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(batch_size=32)
227
+ ```
228
+
229
+ Explicit world configuration:
230
+ ```python
231
+ world = World(rank=0, world_size=4)
232
+ loader: SpiralDataLoader = scan.to_distributed_data_loader(world=world, batch_size=32)
233
+ ```
234
+ """
235
+ from spiral.dataloader import SpiralDataLoader, World
236
+
237
+ if world is None:
238
+ world = World.from_torch()
239
+
240
+ shards = shards or self.shards()
241
+ # Apply world partitioning to shards.
242
+ shards = world.shards(shards, seed)
243
+
244
+ return SpiralDataLoader(
245
+ self,
246
+ shards=shards,
247
+ shuffle_shards=False,  # Shards are shuffled before being selected for the world.
248
+ seed=seed,
249
+ shuffle_buffer_size=shuffle_buffer_size,
250
+ batch_size=batch_size,
251
+ **kwargs,
252
+ )
253
+
254
+ def resume_data_loader(self, state: dict[str, Any], **kwargs) -> "SpiralDataLoader":
255
+ """Create a DataLoader from checkpoint state, resuming from where it left off.
256
+
257
+ This is the recommended way to resume training from a checkpoint. It extracts
258
+ the seed, samples_yielded, and shards from the state dict and creates a new
259
+ DataLoader that will skip the already-processed samples.
260
+
261
+ Args:
262
+ state: Checkpoint state from state_dict().
263
+ **kwargs: Additional arguments to pass to SpiralDataLoader constructor.
264
+ These will override values in the state dict where applicable.
265
+
266
+ Returns:
267
+ New SpiralDataLoader instance configured to resume from the checkpoint.
268
+
269
+ Save checkpoint during training:
270
+ ```python
271
+ loader = scan.to_distributed_data_loader(batch_size=32, seed=42)
272
+ checkpoint = loader.state_dict()
273
+ ```
274
+
275
+ Resume later - uses same shards from checkpoint:
276
+ ```python
277
+ resumed_loader = scan.resume_data_loader(
278
+ checkpoint,
279
+ batch_size=32,
280
+ transform_fn=my_transform,
281
+ )
+ ```
282
+ """
283
+ from spiral.dataloader import SpiralDataLoader
284
+
285
+ return SpiralDataLoader.from_state_dict(self, state, **kwargs)
286
+
287
+ def to_iterable_dataset(
288
+ self,
289
+ shards: list[Shard] | None = None,
290
+ shuffle: ShuffleConfig | None = None,
291
+ batch_readahead: int | None = None,
292
+ infinite: bool = False,
293
+ ) -> "hf.IterableDataset":
294
+ """Returns a Huggingface's IterableDataset.
295
+
296
+ Requires `datasets` package to be installed.
297
+
298
+ Note: For new code, consider using SpiralDataLoader instead.
299
+
300
+ Args:
301
+ shards: Optional list of shards to read. If None, uses scan's natural sharding.
302
+ shuffle: Optional ShuffleConfig for configuring within-shard sample shuffling.
303
+ If None, no shuffling is performed.
304
+ batch_readahead: Controls how many batches to read ahead concurrently.
305
+ If the pipeline includes work after reading (e.g. decoding, transforming, ...), this can be set higher.
306
+ Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
307
+ infinite: If True, the returned IterableDataset will loop infinitely over the data,
308
+ re-shuffling ranges after exhausting all data.
309
+ """
310
+ stream = self.core.to_shuffled_record_batches(
311
+ shards=shards,
312
+ shuffle=shuffle,
313
+ batch_readahead=batch_readahead,
314
+ infinite=infinite,
315
+ )
316
+
317
+ from spiral.iterable_dataset import to_iterable_dataset
318
+
319
+ return to_iterable_dataset(stream)
320
+
321
+ def shards(self) -> list[Shard]:
322
+ """Get list of shards for this scan.
323
+
324
+ The shards are based on the scan's physical data layout (file fragments).
325
+ Each shard contains a key range and cardinality (set to None when unknown).
326
+
327
+ Returns:
328
+ List of Shard objects with key range and cardinality (if known).
329
+
330
+ """
331
+ return self.core.shards()
332
+
333
+ def _debug(self):
334
+ # Visualizes the scan, mainly for debugging purposes.
335
+ from spiral.debug.scan import show_scan
336
+
337
+ show_scan(self.core)
338
+
339
+ def _dump_manifests(self):
340
+ # Print manifests in a human-readable format.
341
+ from spiral.debug.manifests import display_scan_manifests
342
+
343
+ display_scan_manifests(self.core)
344
+
345
+ def _dump_metrics(self):
346
+ # Print metrics in a human-readable format.
347
+ from spiral.debug.metrics import display_metrics
348
+
349
+ display_metrics(self.metrics)
350
+
351
+
352
+ # NOTE(marko): This function must be picklable!
353
+ def _read_shard_task(shard: Shard, *, settings_json: str, state_json: str) -> "pd.DataFrame":
354
+ from spiral import Spiral
355
+ from spiral.core.table import ScanState
356
+ from spiral.settings import ClientSettings
357
+
358
+ settings = ClientSettings.from_json(settings_json)
359
+ sp = Spiral(config=settings)
360
+ state = ScanState.from_json(state_json)
361
+ task_scan = Scan(sp, sp.core.load_scan(state))
362
+
363
+ return task_scan.to_record_batches(key_range=shard.key_range, hide_progress_bar=True).read_all().to_pandas()
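Taken together, `Scan` exposes three read paths: full materialization via `to_table`, streaming via `to_record_batches`, and shard-level parallelism via `shards()` plus per-shard key ranges (the same pattern `_read_shard_task` uses for Dask). A hedged sketch, assuming `scan` is a `Scan` instance obtained from the Spiral client elsewhere:

```python
# Hedged sketch of the read paths on Scan; `scan` is assumed to be a Scan
# obtained from the Spiral client (its construction is outside scan.py).
import pyarrow as pa

# 1. Full materialization into a single Arrow table.
table: pa.Table = scan.to_table()

# 2. Streaming: iterate record batches without holding the full result in memory.
rows = 0
for batch in scan.to_record_batches(batch_readahead=2, hide_progress_bar=True):
    rows += batch.num_rows

# 3. Shard-level parallelism: each Shard carries a key range that can be read
#    independently, e.g. one task per shard.
parts = [scan.to_record_batches(key_range=s.key_range).read_all() for s in scan.shards()]
```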
spiral/server.py ADDED
@@ -0,0 +1,17 @@
1
+ import socket
2
+ import time
3
+
4
+
5
+ def wait_for_port(port: int, host: str = "localhost", timeout: float = 5.0):
6
+ """Wait until a port starts accepting TCP connections."""
7
+ start_time = time.time()
8
+ while True:
9
+ try:
10
+ with socket.create_connection((host, port), timeout=timeout):
11
+ break
12
+ except OSError as ex:
13
+ time.sleep(0.01)
14
+ if time.time() - start_time >= timeout:
15
+ raise TimeoutError(
16
+ f"Waited too long for the port {port} on host {host} to start accepting connections."
17
+ ) from ex
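A hedged usage example for `wait_for_port`, assuming some other process (e.g. a locally launched subprocess) is expected to start listening on the port:

```python
# Hedged example: block until the port starts accepting TCP connections.
from spiral.server import wait_for_port

wait_for_port(8080, host="localhost", timeout=10.0)  # raises TimeoutError if nothing accepts in time
```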
spiral/settings.py ADDED
@@ -0,0 +1,36 @@
1
+ """Configuration module using Rust ClientSettings via PyO3.
2
+
3
+ This module provides a simple settings() function that returns a cached
4
+ ClientSettings instance loaded from ~/.spiral.toml and environment variables.
5
+ """
6
+
7
+ import functools
8
+ import os
9
+ from pathlib import Path
10
+
11
+ import typer
12
+
13
+ from spiral.core.config import ClientSettings
14
+
15
+ DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
16
+ CI = "GITHUB_ACTIONS" in os.environ
17
+
18
+ APP_DIR = Path(typer.get_app_dir("pyspiral"))
19
+ LOG_DIR = APP_DIR / "logs"
20
+
21
+ PACKAGE_NAME = "pyspiral"
22
+
23
+
24
+ @functools.cache
25
+ def settings() -> ClientSettings:
26
+ """Get the global ClientSettings instance.
27
+
28
+ Configuration is loaded with the following priority (highest to lowest):
29
+ 1. Environment variables (SPIRAL__*)
30
+ 2. Config file (~/.spiral.toml)
31
+ 3. Default values
32
+
33
+ Returns:
34
+ ClientSettings: The global configuration instance
35
+ """
36
+ return ClientSettings.load()
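Because `settings()` is wrapped in `functools.cache`, configuration is read once per process, so environment overrides must be in place before the first call. A hedged sketch; the `SPIRAL__*` variable name below is hypothetical:

```python
# Hedged sketch: settings() loads ~/.spiral.toml with SPIRAL__* environment
# overrides taking precedence, and the result is cached for the process lifetime.
import os

os.environ["SPIRAL__SOME_OPTION"] = "value"  # hypothetical key; set before the first settings() call

from spiral.settings import settings

cfg = settings()
assert settings() is cfg  # cached by functools.cache, so repeated calls return the same object
```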
spiral/snapshot.py ADDED
@@ -0,0 +1,56 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ from spiral.core.table import Snapshot as CoreSnapshot
4
+ from spiral.core.table.spec import Schema
5
+ from spiral.types_ import Timestamp
6
+
7
+ if TYPE_CHECKING:
8
+ import duckdb
9
+ import polars as pl
10
+ import pyarrow.dataset as ds
11
+ import torch.utils.data as torchdata # noqa
12
+
13
+ from spiral.table import Table
14
+
15
+
16
+ class Snapshot:
17
+ """Spiral table snapshot.
18
+
19
+ A snapshot represents a point-in-time view of a table.
20
+ """
21
+
22
+ def __init__(self, table: "Table", core: CoreSnapshot):
23
+ self.core = core
24
+ self._table = table
25
+
26
+ @property
27
+ def asof(self) -> Timestamp:
28
+ """Returns the asof timestamp of the snapshot."""
29
+ return self.core.asof
30
+
31
+ def schema(self) -> Schema:
32
+ """Returns the schema of the snapshot."""
33
+ return self.core.table.get_schema(asof=self.asof)
34
+
35
+ @property
36
+ def table(self) -> "Table":
37
+ """Returns the table associated with the snapshot."""
38
+ return self._table
39
+
40
+ def to_dataset(self) -> "ds.Dataset":
41
+ """Returns a PyArrow Dataset representing the table."""
42
+ from spiral.dataset import Dataset
43
+
44
+ return Dataset(self)
45
+
46
+ def to_polars(self) -> "pl.LazyFrame":
47
+ """Returns a Polars LazyFrame for the Spiral table."""
48
+ import polars as pl
49
+
50
+ return pl.scan_pyarrow_dataset(self.to_dataset())
51
+
52
+ def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
53
+ """Returns a DuckDB relation for the Spiral table."""
54
+ import duckdb
55
+
56
+ return duckdb.from_arrow(self.to_dataset())
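A hedged sketch of the engine bridges `Snapshot` exposes, assuming `snap` is a `Snapshot` obtained from a `Table` elsewhere:

```python
# Hedged sketch of Snapshot's engine bridges; `snap` is assumed to be a Snapshot
# obtained from a Table (its construction is outside snapshot.py).
lazy = snap.to_polars()   # Polars LazyFrame over the snapshot's PyArrow dataset
rel = snap.to_duckdb()    # DuckDB relation over the same dataset

preview = rel.limit(10).fetchall()  # standard DuckDB relational API
frame = lazy.limit(10).collect()    # lazy query executed by Polars
```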
@@ -0,0 +1,3 @@
1
+ from .stream import SpiralStream
2
+
3
+ __all__ = ["SpiralStream"]