pyspiral 0.1.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
spiral/expressions/tiff.py ADDED
@@ -0,0 +1,223 @@
+ import numpy as np
+ import pyarrow as pa
+
+ from spiral.expressions.base import ExprLike
+ from spiral.expressions.udf import RefUDF
+
+
+ def read(
+     expr: ExprLike,
+     indexes: ExprLike | int | list[int] | None = None,
+     window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
+     boundless: ExprLike | bool | None = None,
+ ):
+     """
+     Read the referenced cell in `TIFF` format. Requires `rasterio` to be installed.
+
+     Args:
+         expr: The referenced `TIFF` bytes.
+         indexes: The band indexes to read. Defaults to the first band. The first dimension of the result's `shape`
+             field is either 1 or the number of indexes.
+         window: The window to read, in the format (row_range_tuple, col_range_tuple). Defaults to the full window.
+         boundless: If `True`, windows that extend beyond the dataset's extent are permitted, and partially or
+             completely filled arrays are returned as appropriate.
+
+     Returns:
+         An array where each element is a NumPy array represented as a struct with fields:
+             bytes: Array bytes with type `pa.large_binary()`.
+             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
+             dtype: String representation of the NumPy dtype with type `pa.string()`.
+
+     Example:
+         A way to get the i-th element in the result as a NumPy array:
+
+         ```
+         array: np.ndarray = np.frombuffer(
+             result["bytes"][i].as_py(),
+             dtype=np.dtype(result["dtype"][i].as_py()),
+         ).reshape(tuple(result["shape"][i].as_py()))
+         ```
+     """
+     try:
+         import rasterio  # noqa: F401
+     except ImportError:
+         raise ImportError("`rasterio` is required for tiff.read")
+
+     return TiffReadUDF()(expr, indexes, window, boundless)
+
+
+ def crop(
+     expr: ExprLike,
+     shape: ExprLike,
+ ):
+     """
+     Crop shapes out of the referenced cell in `TIFF` format. Requires `rasterio` to be installed.
+
+     Args:
+         expr: The referenced `TIFF` bytes.
+         shape: [GeoJSON-like](https://geojson.org/) shape.
+
+     Returns:
+         An array where each element is a NumPy array represented as a struct with fields:
+             bytes: Array bytes with type `pa.large_binary()`.
+             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
+             dtype: String representation of the NumPy dtype with type `pa.string()`.
+
+     Example:
+         A way to get the i-th element in the result as a NumPy array:
+
+         ```
+         array: np.ndarray = np.frombuffer(
+             result["bytes"][i].as_py(),
+             dtype=np.dtype(result["dtype"][i].as_py()),
+         ).reshape(tuple(result["shape"][i].as_py()))
+         ```
+     """
+     try:
+         import rasterio  # noqa: F401
+     except ImportError:
+         raise ImportError("`rasterio` is required for tiff.crop")
+
+     return TiffCropUDF()(expr, shape)
+
+
+ class TiffReadUDF(RefUDF):
+     RES_DTYPE: pa.DataType = pa.struct(
+         [
+             pa.field("bytes", pa.large_binary()),
+             pa.field("shape", pa.list_(pa.uint32(), 3)),
+             pa.field("dtype", pa.string()),
+         ]
+     )
+
+     def __init__(self):
+         super().__init__("tiff.read")
+
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
+         return TiffReadUDF.RES_DTYPE
+
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         try:
+             import rasterio
+         except ImportError:
+             raise ImportError("`rasterio` is required for tiff.read")
+
+         from rasterio.windows import Window
+
+         if len(input_args) != 4:
+             raise ValueError("tiff.read expects exactly 4 arguments: expr, indexes, window, boundless")
+
+         _, indexes, window, boundless = input_args
+
+         indexes = indexes[0].as_py()
+         if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
+             raise ValueError(f"tiff.read expects indexes to be None, an int, or a list, got {indexes}")
+
+         boundless = boundless[0].as_py()
+         if boundless is not None and not isinstance(boundless, bool):
+             raise ValueError(f"tiff.read expects boundless to be None or a bool, got {boundless}")
+
+         window = window[0].as_py()
+         if window is not None:
+             if len(window) != 2:
+                 raise ValueError(f"tiff.read window invalid, got {window}")
+             window = Window.from_slices(slice(*window[0]), slice(*window[1]), boundless=boundless or False)
+
+         opener = _VsiOpener(fp)
+         with rasterio.open("ref", opener=opener) as src:
+             src: rasterio.DatasetReader
+             # TODO(marko): We know the size and dtype so we should be able to preallocate the result and read into it.
+             #  This matters more if we want to rewrite this function to work with multiple inputs at once, in which
+             #  case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
+             result: np.ndarray = src.read(indexes=indexes, window=window)
+             return pa.array(
+                 [
+                     {
+                         "bytes": result.tobytes(),
+                         "shape": list(result.shape),
+                         "dtype": str(result.dtype),
+                     }
+                 ],
+                 type=TiffReadUDF.RES_DTYPE,
+             )
+
+
+ class TiffCropUDF(RefUDF):
+     RES_DTYPE: pa.DataType = pa.struct(
+         [
+             pa.field("bytes", pa.large_binary()),
+             pa.field("shape", pa.list_(pa.uint32(), 3)),
+             pa.field("dtype", pa.string()),
+         ]
+     )
+
+     def __init__(self):
+         super().__init__("tiff.crop")
+
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
+         return TiffCropUDF.RES_DTYPE
+
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         try:
+             import rasterio
+         except ImportError:
+             raise ImportError("`rasterio` is required for tiff.crop")
+
+         from rasterio.mask import mask as rio_mask
+
+         if len(input_args) != 2:
+             raise ValueError("tiff.crop expects exactly 2 arguments: expr, shape")
+
+         _, shape = input_args
+
+         shape = shape[0].as_py()
+         if shape is None:
+             raise ValueError("tiff.crop expects shape to be a GeoJSON-like shape")
+
+         opener = _VsiOpener(fp)
+         with rasterio.open("ref", opener=opener) as src:
+             src: rasterio.DatasetReader
+             result, _ = rio_mask(src, shapes=[shape], crop=True)
+             result: np.ndarray
+             return pa.array(
+                 [
+                     {
+                         "bytes": result.tobytes(),
+                         "shape": list(result.shape),
+                         "dtype": str(result.dtype),
+                     }
+                 ],
+                 type=TiffCropUDF.RES_DTYPE,
+             )
+
+
+ class _VsiOpener:
+     """
+     VSI file opener which returns a constant file-like object on open.
+
+     Must match the https://rasterio.readthedocs.io/en/stable/topics/vsi.html#python-file-and-filesystem-openers spec,
+     but only `open` is needed when going through rasterio.
+     """
+
+     def __init__(self, file_like):
+         self._file_like = file_like
+
+     def open(self, _path, mode):
+         if mode not in {"r", "rb"}:
+             raise ValueError(f"Unsupported mode: {mode}")
+         return self._file_like
+
+     def isdir(self, _):
+         return False
+
+     def isfile(self, _):
+         return False
+
+     def mtime(self, _):
+         return 0
+
+     def size(self, _):
+         return self._file_like.size()
+
+     def modified(self, _):
+         raise NotImplementedError
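To make the result encoding above concrete, here is a small self-contained sketch (not part of the package) that builds one element in the same `bytes`/`shape`/`dtype` struct layout the two UDFs emit, then decodes it back into a NumPy array using the pattern from the docstrings:

```python
import numpy as np
import pyarrow as pa

# Struct layout produced by TiffReadUDF/TiffCropUDF above.
RES_DTYPE = pa.struct(
    [
        pa.field("bytes", pa.large_binary()),
        pa.field("shape", pa.list_(pa.uint32(), 3)),
        pa.field("dtype", pa.string()),
    ]
)

# Stand-in for a UDF result: one 1x2x3 band stack.
data = np.arange(6, dtype=np.uint16).reshape(1, 2, 3)
result = pa.array(
    [{"bytes": data.tobytes(), "shape": list(data.shape), "dtype": str(data.dtype)}],
    type=RES_DTYPE,
)

# Decode the i-th element back into a NumPy array.
i = 0
decoded = np.frombuffer(
    result.field("bytes")[i].as_py(),
    dtype=np.dtype(result.field("dtype")[i].as_py()),
).reshape(tuple(result.field("shape")[i].as_py()))
assert np.array_equal(decoded, data)
```

When the struct is a column of a table or record batch, the `result["bytes"][i]` form from the docstrings applies directly; on a bare `StructArray`, `.field(...)` is used here instead.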
spiral/expressions/udf.py ADDED
@@ -0,0 +1,46 @@
+ import abc
+
+ import pyarrow as pa
+
+ from spiral import _lib
+ from spiral.expressions.base import Expr
+
+
+ class BaseUDF:
+     def __init__(self, udf):
+         self._udf = udf
+
+     def __call__(self, *args) -> Expr:
+         """Create an expression that calls this UDF with the given arguments."""
+         from spiral import expressions as se
+
+         args = [se.lift(arg).__expr__ for arg in args]
+         return Expr(self._udf(args))
+
+     @abc.abstractmethod
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType: ...
+
+
+ class UDF(BaseUDF):
+     """A User-Defined Function (UDF)."""
+
+     def __init__(self, name: str):
+         super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
+
+     @abc.abstractmethod
+     def invoke(self, *input_args: pa.Array) -> pa.Array: ...
+
+
+ class RefUDF(BaseUDF):
+     """A UDF over a single ref cell, which can therefore access the file object."""
+
+     def __init__(self, name: str):
+         super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
+
+     @abc.abstractmethod
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         """Invoke the UDF with the given arguments.
+
+         NOTE: The first argument is always the ref cell. All array input args are sliced to the appropriate row.
+         """
+         ...
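As a rough illustration of how the `UDF` base class above is meant to be extended, a subclass only has to provide `return_type` and `invoke`; the operation and the name below are hypothetical, not part of the package:

```python
import pyarrow as pa
import pyarrow.compute as pc

from spiral.expressions.udf import UDF


class Utf8ByteLenUDF(UDF):
    """Hypothetical element-wise UDF: byte length of each UTF-8 string."""

    def __init__(self):
        # The name is illustrative; registration happens in UDF.__init__ via _lib.
        super().__init__("str.byte_len")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        return pa.int32()

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        (strings,) = input_args
        return pc.binary_length(strings)


# Calling the instance lifts the arguments into expressions (see BaseUDF.__call__):
# lengths = Utf8ByteLenUDF()(some_string_expr)
```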
spiral/grpc_.py ADDED
@@ -0,0 +1,32 @@
+ from collections.abc import AsyncIterator, Awaitable, Callable
+ from typing import TypeVar
+
+ R = TypeVar("R")
+ T = TypeVar("T")
+
+
+ async def paged(stub_fn: Callable[[R], Awaitable[T]], request: R, page_size: int | None = None) -> AsyncIterator[T]:
+     """Page through a gRPC paged API.
+
+     Assumes fields exist as per https://cloud.google.com/apis/design/design_patterns#list_pagination
+     """
+     next_page_token: str | None = None
+     while True:
+         request.page_size = page_size
+         request.page_token = next_page_token
+         res = await stub_fn(request)
+         if not res.next_page_token:
+             # No more items.
+             yield res
+             break
+
+         next_page_token = res.next_page_token
+         yield res
+
+
+ async def paged_items(
+     stub_fn: Callable[[R], Awaitable[T]], request: R, collection_name: str, page_size: int | None = None
+ ) -> AsyncIterator:
+     async for page in paged(stub_fn, request, page_size=page_size):
+         for item in getattr(page, collection_name):
+             yield item
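For orientation, a minimal sketch of driving `paged_items` with stand-in request/response objects. Real callers would pass generated protobuf messages and a gRPC stub method; the classes and values below are made up to show the expected `page_size`/`page_token`/`next_page_token` contract:

```python
import asyncio
from dataclasses import dataclass, field

from spiral.grpc_ import paged_items


@dataclass
class ListReq:  # stand-in for a generated request message
    page_size: int | None = None
    page_token: str | None = None


@dataclass
class ListRes:  # stand-in for a generated response message
    items: list[str] = field(default_factory=list)
    next_page_token: str = ""


PAGES = {None: (["a", "b"], "t1"), "t1": (["c"], "")}  # token -> (items, next token)


async def fake_list(req: ListReq) -> ListRes:
    items, next_token = PAGES[req.page_token]
    return ListRes(items=items, next_page_token=next_token)


async def main():
    # collection_name must name the repeated field on the response ("items" here).
    print([x async for x in paged_items(fake_list, ListReq(), "items", page_size=2)])


asyncio.run(main())  # prints ['a', 'b', 'c']
```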
spiral/project.py ADDED
@@ -0,0 +1,137 @@
+ from typing import TYPE_CHECKING, Any
+
+ import pyarrow as pa
+
+ from spiral import Table
+ from spiral.api.tables import CreateTable, FindTable
+ from spiral.core.core import Table as CoreTable
+ from spiral.core.metastore import PyMetastore
+ from spiral.core.spec import Schema
+ from spiral.types_ import Uri
+
+ if TYPE_CHECKING:
+     from spiral.catalog import Spiral
+
+
+ class Project:
+     def __init__(self, spiral_db: "Spiral", id: str, name: str | None = None):
+         self._spiral_db = spiral_db
+         self._id = id
+         self._name = name
+
+         self._api = self._spiral_db.config.api
+
+     def __str__(self):
+         return self._id
+
+     def __repr__(self):
+         return f"Project(id={self._id}{', name=' + self._name if self._name else ''})"
+
+     @property
+     def id(self) -> str:
+         return self._id
+
+     @property
+     def name(self) -> str:
+         return self._name or self._id
+
+     def list_table_names(self) -> list[tuple[str, str]]:
+         """List tuples of (dataset, table) names in the project."""
+         return [(t.dataset, t.table) for t in self._api.table.list(FindTable.Request(project_id=self.id))]
+
+     def list_tables(self) -> list[Table]:
+         """List tables in the project."""
+         return [
+             Table(
+                 CoreTable(
+                     PyMetastore.http(
+                         table_id=t.id,
+                         root_uri=t.metadata.root_uri,
+                         key_schema=Schema.from_arrow(t.metadata.key_schema),
+                         base_url=self._api.base_url + "/metastore/",
+                         token_provider=self._spiral_db.config.authn.token,
+                     ),
+                 ),
+                 name=f"{self.id}.{t.dataset}.{t.table}",
+             )
+             for t in self._api.table.list(FindTable.Request(project_id=self.id))
+         ]
+
+     def create_table(
+         self,
+         identifier: str,
+         *,
+         key_schema: pa.Schema | Any,
+         uri: Uri | None = None,
+         exist_ok: bool = False,
+     ) -> Table:
+         """Create a new table in the project."""
+         dataset, table = self._parse_identifier(identifier)
+
+         if not isinstance(key_schema, pa.Schema):
+             key_schema = pa.schema(key_schema)
+
+         res = self._api.table.create(
+             CreateTable.Request(
+                 project_id=self.id,
+                 dataset=dataset,
+                 table=table,
+                 key_schema=key_schema,
+                 root_uri=uri,
+                 exist_ok=exist_ok,
+             )
+         )
+
+         # Must have the same schema as provided, even if the table already exists.
+         expected_key_schema = res.table.metadata.key_schema
+         if key_schema != expected_key_schema:
+             raise ValueError(f"Table already exists with different key schema: {expected_key_schema} != {key_schema}")
+         if uri and res.table.metadata.root_uri != uri:
+             raise ValueError(f"Table already exists with different root URI: {res.table.metadata.root_uri} != {uri}")
+
+         # Set up a metastore backed by SpiralDB.
+         metastore = PyMetastore.http(
+             table_id=res.table.id,
+             root_uri=res.table.metadata.root_uri,
+             key_schema=Schema.from_arrow(res.table.metadata.key_schema),
+             base_url=self._api.base_url + "/metastore/",
+             token_provider=self._spiral_db.config.authn.token,
+         )
+
+         return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")
+
+     def table(self, identifier: str) -> Table:
+         """Open a table with a `dataset.table` identifier, or a bare `table` name using the `default` dataset."""
+         dataset, table = self._parse_identifier(identifier)
+
+         # TODO(ngates): why does the client _need_ this information? Can we defer it?
+         res = self._api.table.find(
+             FindTable.Request(
+                 project_id=self.id,
+                 dataset=dataset,
+                 table=table,
+             )
+         )
+         if res.table is None:
+             raise ValueError(f"Table not found: {self.id}.{dataset}.{table}")
+
+         # Set up a metastore backed by SpiralDB.
+         metastore = PyMetastore.http(
+             table_id=res.table.id,
+             root_uri=res.table.metadata.root_uri,
+             key_schema=Schema.from_arrow(res.table.metadata.key_schema),
+             base_url=self._api.base_url + "/metastore/",
+             token_provider=self._spiral_db.config.authn.token,
+         )
+
+         return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")
+
+     @staticmethod
+     def _parse_identifier(identifier: str) -> tuple[str, str]:
+         parts = identifier.split(".")
+         if len(parts) == 1:
+             return "default", parts[0]
+         elif len(parts) == 2:
+             return parts[0], parts[1]
+         else:
+             raise ValueError(f"Invalid table identifier: {identifier}")
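Finally, a hedged usage sketch of the `Project` API above. How a `Project` instance is obtained from the catalog is not shown in this file, so the `project` variable and the table names below are illustrative only:

```python
import pyarrow as pa

# Assumed to come from the Spiral catalog client (see spiral/catalog.py); not constructed here.
project = ...  # type: Project

# "images" parses to ("default", "images"); "geo.tiles" would parse to ("geo", "tiles").
table = project.create_table(
    "images",
    key_schema=pa.schema([("id", pa.string())]),
    exist_ok=True,
)

# Re-open by identifier; raises ValueError if the table does not exist.
same_table = project.table("images")
print(project.list_table_names())
```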