pyspiral 0.4.0__pp310-pypy310_pp73-macosx_10_12_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.4.0.dist-info/METADATA +46 -0
- pyspiral-0.4.0.dist-info/RECORD +98 -0
- pyspiral-0.4.0.dist-info/WHEEL +4 -0
- pyspiral-0.4.0.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +10 -0
- spiral/_lib.pypy310-pp73-darwin.so +0 -0
- spiral/adbc.py +393 -0
- spiral/api/__init__.py +64 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +160 -0
- spiral/api/filesystems.py +153 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +197 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/types.py +20 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +221 -0
- spiral/cli/__init__.py +79 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +16 -0
- spiral/cli/app.py +65 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +112 -0
- spiral/cli/iceberg/__init__.py +7 -0
- spiral/cli/iceberg/namespaces.py +47 -0
- spiral/cli/iceberg/tables.py +60 -0
- spiral/cli/indexes/__init__.py +19 -0
- spiral/cli/login.py +22 -0
- spiral/cli/orgs.py +90 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +136 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables/__init__.py +121 -0
- spiral/cli/telemetry.py +18 -0
- spiral/cli/types.py +51 -0
- spiral/cli/workloads.py +59 -0
- spiral/client.py +79 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/client/__init__.pyi +117 -0
- spiral/core/index/__init__.pyi +15 -0
- spiral/core/table/__init__.pyi +108 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +62 -0
- spiral/core/table/spec/__init__.pyi +214 -0
- spiral/datetime_.py +27 -0
- spiral/expressions/__init__.py +245 -0
- spiral/expressions/base.py +149 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg/__init__.py +3 -0
- spiral/iceberg/client.py +33 -0
- spiral/indexes/__init__.py +5 -0
- spiral/indexes/client.py +137 -0
- spiral/indexes/index.py +34 -0
- spiral/indexes/scan.py +22 -0
- spiral/project.py +46 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1990 -0
- spiral/protogen/_/scandal/__init__.py +178 -0
- spiral/protogen/_/spiral/__init__.py +0 -0
- spiral/protogen/_/spiral/table/__init__.py +22 -0
- spiral/protogen/_/substrait/__init__.py +3399 -0
- spiral/protogen/_/substrait/extensions/__init__.py +115 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/substrait/__init__.py +3399 -0
- spiral/protogen/substrait/extensions/__init__.py +115 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/server.py +17 -0
- spiral/settings.py +101 -0
- spiral/substrait_.py +279 -0
- spiral/tables/__init__.py +12 -0
- spiral/tables/client.py +130 -0
- spiral/tables/dataset.py +250 -0
- spiral/tables/debug/__init__.py +0 -0
- spiral/tables/debug/manifests.py +70 -0
- spiral/tables/debug/metrics.py +56 -0
- spiral/tables/debug/scan.py +248 -0
- spiral/tables/maintenance.py +12 -0
- spiral/tables/scan.py +193 -0
- spiral/tables/snapshot.py +78 -0
- spiral/tables/table.py +157 -0
- spiral/tables/transaction.py +52 -0
- spiral/types_.py +6 -0
spiral/tables/client.py
ADDED
@@ -0,0 +1,130 @@
from datetime import datetime
from typing import Any

import pyarrow as pa

from spiral.api import SpiralAPI
from spiral.api.projects import TableResource
from spiral.core.client import Spiral as CoreSpiral
from spiral.core.table.spec import Schema
from spiral.datetime_ import timestamp_micros
from spiral.expressions import ExprLike
from spiral.tables.scan import Scan
from spiral.tables.table import Table
from spiral.types_ import Uri


class Tables:
    """
    Spiral Tables are a powerful and flexible way of storing, analyzing,
    and querying massive and/or multimodal datasets.

    The data model will feel familiar to users of SQL- or DataFrame-style systems,
    yet is designed to be more flexible, more powerful, and more useful in the context
    of modern data processing. Tables are stored and queried directly from object storage.
    """

    def __init__(self, api: SpiralAPI, spiral: CoreSpiral, *, project_id: str | None = None):
        self._api = api
        self._spiral = spiral
        self._project_id = project_id

    def table(self, identifier: str) -> Table:
        """Open a table with a `dataset.table` identifier, or a `table` name using the `default` dataset."""
        project_id, dataset, table = self._parse_identifier(identifier)
        if project_id is None:
            raise ValueError("Must provide a fully qualified table identifier.")

        res = list(self._api.project.list_tables(project_id, dataset=dataset, table=table))
        if len(res) == 0:
            raise ValueError(f"Table not found: {project_id}.{dataset}.{table}")

        res = res[0]
        return Table(self, self._spiral.get_table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}")

    def list_tables(self) -> list[TableResource]:
        project_id = self._project_id
        if project_id is None:
            raise ValueError("Must provide a project ID to list tables.")
        return list(self._api.project.list_tables(project_id))

    def create_table(
        self,
        identifier: str,
        *,
        key_schema: pa.Schema | Any,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> Table:
        """Create a new table in the project.

        Args:
            identifier: The table identifier, in the form `project.dataset.table`, `dataset.table` or `table`.
            key_schema: The schema of the table's keys.
            root_uri: The root URI for the table.
            exist_ok: If True, do not raise an error if the table already exists.
        """
        project_id, dataset, table = self._parse_identifier(identifier)
        if project_id is None:
            raise ValueError("Must provide a fully qualified table identifier.")

        if not isinstance(key_schema, pa.Schema):
            key_schema = pa.schema(key_schema)
        key_schema = Schema.from_arrow(key_schema)

        core_table = self._spiral.create_table(
            project_id,
            dataset=dataset,
            table=table,
            key_schema=key_schema,
            root_uri=root_uri,
            exist_ok=exist_ok,
        )

        return Table(self, core_table, identifier=f"{project_id}.{dataset}.{table}")

    def _parse_identifier(self, identifier: str) -> tuple[str | None, str, str]:
        parts = identifier.split(".")
        if len(parts) == 1:
            return self._project_id, "default", parts[0]
        elif len(parts) == 2:
            return self._project_id, parts[0], parts[1]
        elif len(parts) == 3:
            return parts[0], parts[1], parts[2]
        else:
            raise ValueError(f"Invalid table identifier: {identifier}")

    def scan(
        self,
        *projections: ExprLike,
        where: ExprLike | None = None,
        asof: datetime | int | None = None,
        exclude_keys: bool = False,
    ) -> Scan:
        """Starts a read transaction on the Spiral.

        Args:
            projections: a set of expressions that return struct arrays.
            where: a query expression to apply to the data.
            asof: only data written before the given timestamp will be returned; note the caveats around compaction.
            exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
                Note that if a projection includes a key column, it will be included in the result.
        """
        from spiral import expressions as se

        if isinstance(asof, datetime):
            asof = timestamp_micros(asof)

        # Combine all projections into a single struct.
        projection = se.merge(*projections)
        if where is not None:
            where = se.lift(where)

        return Scan(
            self._spiral.open_table_scan(
                projection.__expr__,
                filter=where.__expr__ if where else None,
                asof=asof,
                exclude_keys=exclude_keys,
            ),
        )
spiral/tables/dataset.py
ADDED
@@ -0,0 +1,250 @@
from typing import Any

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

from spiral.tables import Scan, Snapshot


class TableDataset(ds.Dataset):
    def __init__(self, snapshot: Snapshot):
        self._snapshot = snapshot
        self._table = snapshot.table
        self._schema: pa.Schema = self._snapshot._snapshot.table.get_schema(asof=self._snapshot.asof).to_arrow()

        # We don't actually initialize a Dataset, we just implement enough of the API
        # to fool both DuckDB and Polars.
        # super().__init__()
        self._last_scan = None

    @property
    def schema(self) -> pa.Schema:
        return self._schema

    def count_rows(
        self,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ):
        return self.scanner(
            None,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).count_rows()

    def filter(self, expression: pc.Expression) -> "TableDataset":
        raise NotImplementedError("filter not implemented")

    def get_fragments(self, filter: pc.Expression | None = None):
        """TODO(ngates): perhaps we should return ranges as per our split API?"""
        raise NotImplementedError("get_fragments not implemented")

    def head(
        self,
        num_rows: int,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ):
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).head(num_rows)

    def join(
        self,
        right_dataset,
        keys,
        right_keys=None,
        join_type=None,
        left_suffix=None,
        right_suffix=None,
        coalesce_keys=True,
        use_threads=True,
    ):
        raise NotImplementedError("join not implemented")

    def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
        raise NotImplementedError("join_asof not implemented")

    def replace_schema(self, schema: pa.Schema) -> "TableDataset":
        raise NotImplementedError("replace_schema not implemented")

    def scanner(
        self,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ) -> "TableScanner":
        from spiral.substrait_ import SubstraitConverter

        # Extract the Substrait expression so we can convert it to a Spiral expression.
        if filter is not None:
            filter = SubstraitConverter(self._table, self._schema, self._table.key_schema.to_arrow()).convert(
                filter.to_substrait(self._schema, allow_arrow_extensions=True),
            )

        scan = (
            self._snapshot.scan(
                {c: self._table[c] for c in columns},
                where=filter,
                exclude_keys=True,
            )
            if columns
            else self._snapshot.scan(where=filter)
        )
        self._last_scan = scan

        return TableScanner(scan)

    def sort_by(self, sorting, **kwargs):
        raise NotImplementedError("sort_by not implemented")

    def take(
        self,
        indices: pa.Array | Any,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ):
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).take(indices)

    def to_batches(
        self,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ):
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).to_batches()

    def to_table(
        self,
        columns: list[str] | None = None,
        filter: pc.Expression | None = None,
        batch_size: int | None = None,
        batch_readahead: int | None = None,
        fragment_readahead: int | None = None,
        fragment_scan_options: ds.FragmentScanOptions | None = None,
        use_threads: bool = True,
        memory_pool: pa.MemoryPool | None = None,
    ):
        return self.scanner(
            columns,
            filter,
            batch_size,
            batch_readahead,
            fragment_readahead,
            fragment_scan_options,
            use_threads,
            memory_pool,
        ).to_table()


class TableScanner(ds.Scanner):
    """A PyArrow Dataset Scanner that reads from a Spiral Table."""

    def __init__(
        self,
        scan: Scan,
        key_table: pa.Table | pa.RecordBatchReader | None = None,
    ):
        self._scan = scan
        self._schema = scan.schema
        self.key_table = key_table

        # We don't actually initialize a Scanner, we just implement enough of the API
        # to fool both DuckDB and Polars.
        # super().__init__()

    @property
    def schema(self):
        return self._schema

    def count_rows(self):
        # TODO(ngates): is there a faster way to count rows?
        return sum(len(batch) for batch in self.to_reader())

    def head(self, num_rows: int):
        """Return the first `num_rows` rows of the dataset."""
        reader = self.to_reader()
        batches = []
        row_count = 0
        for batch in reader:
            if row_count + len(batch) > num_rows:
                batches.append(batch.slice(0, num_rows - row_count))
                break
            row_count += len(batch)
            batches.append(batch)
        return pa.Table.from_batches(batches, schema=reader.schema)

    def scan_batches(self):
        raise NotImplementedError("scan_batches not implemented")

    def take(self, indices):
        # TODO(ngates): can we defer take until after we've constructed the scan?
        # Or should we delay constructing the Spiral Table.scan?
        raise NotImplementedError("take not implemented")

    def to_batches(self):
        return self.to_reader()

    def to_reader(self):
        return self._scan.to_record_batches(key_table=self.key_table)

    def to_table(self):
        return self.to_reader().read_all()
spiral/tables/debug/__init__.py
File without changes
spiral/tables/debug/manifests.py
ADDED
@@ -0,0 +1,70 @@
from spiral import datetime_
from spiral.core.table import TableScan
from spiral.core.table.manifests import FragmentManifest
from spiral.tables.debug.metrics import _format_bytes


def display_manifests(scan: TableScan):
    """Display all manifests in a scan."""
    if len(scan.table_ids()) != 1:
        raise NotImplementedError("Multiple table scans are not supported.")
    table_id = scan.table_ids()[0]

    key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
    _table_of_fragments(
        key_space_manifest,
        title="Key Space manifest",
    )

    for column_group in scan.column_groups():
        column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
        _table_of_fragments(
            column_group_manifest,
            title=f"Column Group manifest for {str(column_group)}",
        )


def _table_of_fragments(manifest: FragmentManifest, title: str):
    """Display fragments in a formatted table."""
    # Calculate summary statistics
    total_size = sum(fragment.size_bytes for fragment in manifest)
    total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
    fragment_count = len(manifest)
    avg_size = total_size / fragment_count if fragment_count > 0 else 0

    # Print title and summary
    print(f"\n\n{title}")
    print(
        f"{fragment_count} fragments, "
        f"total: {_format_bytes(total_size)}, "
        f"avg: {_format_bytes(int(avg_size))}, "
        f"metadata: {_format_bytes(total_metadata_size)}"
    )
    print("=" * 120)

    # Print header
    print(
        f"{'ID':<30} {'Size (Metadata)':<20} {'Format':<10} {'Key Span':<10} "
        f"{'Level':<5} {'Committed At':<20} {'Compacted At':<20}"
    )
    print("=" * 120)

    # Print each fragment
    for fragment in manifest:
        committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
        compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"

        size_with_metadata = (
            f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
        )
        key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"

        print(
            f"{fragment.id:<30} "
            f"{size_with_metadata:<20} "
            f"{str(fragment.format):<10} "
            f"{key_span:<10} "
            f"{str(fragment.level):<5} "
            f"{committed_str:<20} "
            f"{compacted_str:<20}"
        )
spiral/tables/debug/metrics.py
ADDED
@@ -0,0 +1,56 @@
from typing import Any


def display_metrics(metrics: dict[str, Any]) -> None:
    """Display metrics in a formatted table."""
    print(
        f"{'Metric':<40} {'Type':<10} {'Count':<8} {'Avg':<12} {'Min':<12} "
        f"{'Max':<12} {'P95':<12} {'P99':<12} {'StdDev':<12}"
    )
    print("=" * 140)

    for metric_name, data in sorted(metrics.items()):
        metric_type = data["type"]
        count = data["count"]
        avg = _format_value(data["avg"], metric_type, metric_name)
        min_val = _format_value(data["min"], metric_type, metric_name)
        max_val = _format_value(data["max"], metric_type, metric_name)
        p95 = _format_value(data["p95"], metric_type, metric_name)
        p99 = _format_value(data["p99"], metric_type, metric_name)
        stddev = _format_value(data["stddev"], metric_type, metric_name)

        print(
            f"{metric_name:<40} {metric_type:<10} {count:<8} {avg:<12} {min_val:<12} "
            f"{max_val:<12} {p95:<12} {p99:<12} {stddev:<12}"
        )


def _format_duration(nanoseconds: float) -> str:
    """Convert nanoseconds to human-readable duration."""
    if nanoseconds >= 1_000_000_000:
        return f"{nanoseconds / 1_000_000_000:.2f}s"
    elif nanoseconds >= 1_000_000:
        return f"{nanoseconds / 1_000_000:.2f}ms"
    elif nanoseconds >= 1_000:
        return f"{nanoseconds / 1_000:.2f}μs"
    else:
        return f"{nanoseconds:.0f}ns"


def _format_bytes(bytes_value: float) -> str:
    """Convert bytes to human-readable size."""
    for unit in ["B", "KB", "MB", "GB"]:
        if bytes_value < 1024:
            return f"{bytes_value:.1f}{unit}"
        bytes_value /= 1024
    return f"{bytes_value:.1f}TB"


def _format_value(value: float, metric_type: str, metric_name: str) -> str:
    """Format a value based on metric type and name."""
    if metric_type == "timer" or "duration" in metric_name:
        return _format_duration(value)
    elif "bytes" in metric_name:
        return _format_bytes(value)
    else:
        return f"{value:,.0f}"