pyspiral 0.1.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
spiral/expressions/tiff.py ADDED
@@ -0,0 +1,223 @@
+ import numpy as np
+ import pyarrow as pa
+
+ from spiral.expressions.base import ExprLike
+ from spiral.expressions.udf import RefUDF
+
+
+ def read(
+     expr: ExprLike,
+     indexes: ExprLike | int | list[int] | None = None,
+     window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
+     boundless: ExprLike | bool | None = None,
+ ):
+     """
+     Read the referenced cell in `TIFF` format. Requires `rasterio` to be installed.
+
+     Args:
+         expr: The referenced `TIFF` bytes.
+         indexes: The band indexes to read. Defaults to the first band. The first dimension of the result's `shape`
+             field is either 1 or the number of indexes.
+         window: The window to read, in the format (row_range_tuple, col_range_tuple). Defaults to the full window.
+         boundless: If `True`, windows that extend beyond the dataset's extent are permitted, and partially or
+             completely filled arrays are returned as appropriate.
+
+     Returns:
+         An array where each element is a NumPy array represented as a struct with fields:
+             bytes: Array bytes with type `pa.large_binary()`.
+             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
+             dtype: String representation of the NumPy dtype with type `pa.string()`.
+
+     Example:
+         A way to get the i-th element in the result as a NumPy array:
+
+         ```
+         array: np.ndarray = np.frombuffer(
+             result["bytes"][i].as_py(),
+             dtype=np.dtype(result["dtype"][i].as_py()),
+         ).reshape(tuple(result["shape"][i].as_py()))
+         ```
+     """
+     try:
+         import rasterio  # noqa: F401
+     except ImportError:
+         raise ImportError("`rasterio` is required for tiff.read")
+
+     return TiffReadUDF()(expr, indexes, window, boundless)
+
+
+ def crop(
+     expr: ExprLike,
+     shape: ExprLike,
+ ):
+     """
+     Crop shapes out of the referenced cell in `TIFF` format. Requires `rasterio` to be installed.
+
+     Args:
+         expr: The referenced `TIFF` bytes.
+         shape: [GeoJSON-like](https://geojson.org/) shape.
+
+     Returns:
+         An array where each element is a NumPy array represented as a struct with fields:
+             bytes: Array bytes with type `pa.large_binary()`.
+             shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
+             dtype: String representation of the NumPy dtype with type `pa.string()`.
+
+     Example:
+         A way to get the i-th element in the result as a NumPy array:
+
+         ```
+         array: np.ndarray = np.frombuffer(
+             result["bytes"][i].as_py(),
+             dtype=np.dtype(result["dtype"][i].as_py()),
+         ).reshape(tuple(result["shape"][i].as_py()))
+         ```
+     """
+     try:
+         import rasterio  # noqa: F401
+     except ImportError:
+         raise ImportError("`rasterio` is required for tiff.crop")
+
+     return TiffCropUDF()(expr, shape)
+
+
+ class TiffReadUDF(RefUDF):
+     RES_DTYPE: pa.DataType = pa.struct(
+         [
+             pa.field("bytes", pa.large_binary()),
+             pa.field("shape", pa.list_(pa.uint32(), 3)),
+             pa.field("dtype", pa.string()),
+         ]
+     )
+
+     def __init__(self):
+         super().__init__("tiff.read")
+
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
+         return TiffReadUDF.RES_DTYPE
+
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         try:
+             import rasterio
+         except ImportError:
+             raise ImportError("`rasterio` is required for tiff.read")
+
+         from rasterio.windows import Window
+
+         if len(input_args) != 4:
+             raise ValueError("tiff.read expects exactly 4 arguments: expr, indexes, window, boundless")
+
+         _, indexes, window, boundless = input_args
+
+         indexes = indexes[0].as_py()
+         if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
+             raise ValueError(f"tiff.read expects indexes to be None, an int, or a list, got {indexes}")
+
+         boundless = boundless[0].as_py()
+         if boundless is not None and not isinstance(boundless, bool):
+             raise ValueError(f"tiff.read expects boundless to be None or a bool, got {boundless}")
+
+         window = window[0].as_py()
+         if window is not None:
+             if len(window) != 2:
+                 raise ValueError(f"tiff.read window invalid, got {window}")
+             window = Window.from_slices(slice(*window[0]), slice(*window[1]), boundless=boundless or False)
+
+         opener = _VsiOpener(fp)
+         with rasterio.open("ref", opener=opener) as src:
+             src: rasterio.DatasetReader
+             # TODO(marko): We know the size and dtype so we should be able to preallocate the result and read into it.
+             #  This matters more if we want to rewrite this function to work with multiple inputs at once, in which
+             #  case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
+             result: np.ndarray = src.read(indexes=indexes, window=window)
+             return pa.array(
+                 [
+                     {
+                         "bytes": result.tobytes(),
+                         "shape": list(result.shape),
+                         "dtype": str(result.dtype),
+                     }
+                 ],
+                 type=TiffReadUDF.RES_DTYPE,
+             )
+
+
+ class TiffCropUDF(RefUDF):
+     RES_DTYPE: pa.DataType = pa.struct(
+         [
+             pa.field("bytes", pa.large_binary()),
+             pa.field("shape", pa.list_(pa.uint32(), 3)),
+             pa.field("dtype", pa.string()),
+         ]
+     )
+
+     def __init__(self):
+         super().__init__("tiff.crop")
+
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType:
+         return TiffCropUDF.RES_DTYPE
+
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         try:
+             import rasterio
+         except ImportError:
+             raise ImportError("`rasterio` is required for tiff.crop")
+
+         from rasterio.mask import mask as rio_mask
+
+         if len(input_args) != 2:
+             raise ValueError("tiff.crop expects exactly 2 arguments: expr, shape")
+
+         _, shape = input_args
+
+         shape = shape[0].as_py()
+         if shape is None:
+             raise ValueError("tiff.crop expects shape to be a GeoJSON-like shape")
+
+         opener = _VsiOpener(fp)
+         with rasterio.open("ref", opener=opener) as src:
+             src: rasterio.DatasetReader
+             result, _ = rio_mask(src, shapes=[shape], crop=True)
+             result: np.ndarray
+             return pa.array(
+                 [
+                     {
+                         "bytes": result.tobytes(),
+                         "shape": list(result.shape),
+                         "dtype": str(result.dtype),
+                     }
+                 ],
+                 type=TiffCropUDF.RES_DTYPE,
+             )
+
+
+ class _VsiOpener:
+     """
+     VSI file opener which returns a constant file-like object on open.
+
+     Must match the https://rasterio.readthedocs.io/en/stable/topics/vsi.html#python-file-and-filesystem-openers spec,
+     but only `open` is needed when going through rasterio.
+     """
+
+     def __init__(self, file_like):
+         self._file_like = file_like
+
+     def open(self, _path, mode):
+         if mode not in {"r", "rb"}:
+             raise ValueError(f"Unsupported mode: {mode}")
+         return self._file_like
+
+     def isdir(self, _):
+         return False
+
+     def isfile(self, _):
+         return False
+
+     def mtime(self, _):
+         return 0
+
+     def size(self, _):
+         return self._file_like.size()
+
+     def modified(self, _):
+         raise NotImplementedError
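To make the result encoding above concrete, here is a small self-contained sketch (not part of the package) that builds one element in the same `bytes`/`shape`/`dtype` struct layout the two UDFs emit, then decodes it back into a NumPy array using the pattern from the docstrings:

```python
import numpy as np
import pyarrow as pa

# Struct layout produced by TiffReadUDF/TiffCropUDF above.
RES_DTYPE = pa.struct(
    [
        pa.field("bytes", pa.large_binary()),
        pa.field("shape", pa.list_(pa.uint32(), 3)),
        pa.field("dtype", pa.string()),
    ]
)

# Stand-in for a UDF result: one 1x2x3 band stack.
data = np.arange(6, dtype=np.uint16).reshape(1, 2, 3)
result = pa.array(
    [{"bytes": data.tobytes(), "shape": list(data.shape), "dtype": str(data.dtype)}],
    type=RES_DTYPE,
)

# Decode the i-th element back into a NumPy array.
i = 0
decoded = np.frombuffer(
    result.field("bytes")[i].as_py(),
    dtype=np.dtype(result.field("dtype")[i].as_py()),
).reshape(tuple(result.field("shape")[i].as_py()))
assert np.array_equal(decoded, data)
```

When the struct is a column of a table or record batch, the `result["bytes"][i]` form from the docstrings applies directly; on a bare `StructArray`, `.field(...)` is used here instead.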
spiral/expressions/udf.py ADDED
@@ -0,0 +1,46 @@
+ import abc
+
+ import pyarrow as pa
+
+ from spiral import _lib
+ from spiral.expressions.base import Expr
+
+
+ class BaseUDF:
+     def __init__(self, udf):
+         self._udf = udf
+
+     def __call__(self, *args) -> Expr:
+         """Create an expression that calls this UDF with the given arguments."""
+         from spiral import expressions as se
+
+         args = [se.lift(arg).__expr__ for arg in args]
+         return Expr(self._udf(args))
+
+     @abc.abstractmethod
+     def return_type(self, *input_types: pa.DataType) -> pa.DataType: ...
+
+
+ class UDF(BaseUDF):
+     """A User-Defined Function (UDF)."""
+
+     def __init__(self, name: str):
+         super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))
+
+     @abc.abstractmethod
+     def invoke(self, *input_args: pa.Array) -> pa.Array: ...
+
+
+ class RefUDF(BaseUDF):
+     """A UDF over a single ref cell, which can therefore access the file object."""
+
+     def __init__(self, name: str):
+         super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))
+
+     @abc.abstractmethod
+     def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
+         """Invoke the UDF with the given arguments.
+
+         NOTE: The first argument is always the ref cell. All array input args are sliced to the appropriate row.
+         """
+         ...
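As a rough illustration of how the `UDF` base class above is meant to be extended, a subclass only has to provide `return_type` and `invoke`; the operation and the name below are hypothetical, not part of the package:

```python
import pyarrow as pa
import pyarrow.compute as pc

from spiral.expressions.udf import UDF


class Utf8ByteLenUDF(UDF):
    """Hypothetical element-wise UDF: byte length of each UTF-8 string."""

    def __init__(self):
        # The name is illustrative; registration happens in UDF.__init__ via _lib.
        super().__init__("str.byte_len")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        return pa.int32()

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        (strings,) = input_args
        return pc.binary_length(strings)


# Calling the instance lifts the arguments into expressions (see BaseUDF.__call__):
# lengths = Utf8ByteLenUDF()(some_string_expr)
```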
spiral/grpc_.py ADDED
@@ -0,0 +1,32 @@
+ from collections.abc import AsyncIterator, Awaitable, Callable
+ from typing import TypeVar
+
+ R = TypeVar("R")
+ T = TypeVar("T")
+
+
+ async def paged(stub_fn: Callable[[R], Awaitable[T]], request: R, page_size: int | None = None) -> AsyncIterator[T]:
+     """Page through a gRPC paged API.
+
+     Assumes fields exist as per https://cloud.google.com/apis/design/design_patterns#list_pagination
+     """
+     next_page_token: str | None = None
+     while True:
+         request.page_size = page_size
+         request.page_token = next_page_token
+         res = await stub_fn(request)
+         if not res.next_page_token:
+             # No more items.
+             yield res
+             break
+
+         next_page_token = res.next_page_token
+         yield res
+
+
+ async def paged_items(
+     stub_fn: Callable[[R], Awaitable[T]], request: R, collection_name: str, page_size: int | None = None
+ ) -> AsyncIterator:
+     async for page in paged(stub_fn, request, page_size=page_size):
+         for item in getattr(page, collection_name):
+             yield item
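For orientation, a minimal sketch of driving `paged_items` with stand-in request/response objects. Real callers would pass generated protobuf messages and a gRPC stub method; the classes and values below are made up to show the expected `page_size`/`page_token`/`next_page_token` contract:

```python
import asyncio
from dataclasses import dataclass, field

from spiral.grpc_ import paged_items


@dataclass
class ListReq:  # stand-in for a generated request message
    page_size: int | None = None
    page_token: str | None = None


@dataclass
class ListRes:  # stand-in for a generated response message
    items: list[str] = field(default_factory=list)
    next_page_token: str = ""


PAGES = {None: (["a", "b"], "t1"), "t1": (["c"], "")}  # token -> (items, next token)


async def fake_list(req: ListReq) -> ListRes:
    items, next_token = PAGES[req.page_token]
    return ListRes(items=items, next_page_token=next_token)


async def main():
    # collection_name must name the repeated field on the response ("items" here).
    print([x async for x in paged_items(fake_list, ListReq(), "items", page_size=2)])


asyncio.run(main())  # prints ['a', 'b', 'c']
```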
spiral/project.py ADDED
@@ -0,0 +1,137 @@
+ from typing import TYPE_CHECKING, Any
+
+ import pyarrow as pa
+
+ from spiral import Table
+ from spiral.api.tables import CreateTable, FindTable
+ from spiral.core.core import Table as CoreTable
+ from spiral.core.metastore import PyMetastore
+ from spiral.core.spec import Schema
+ from spiral.types_ import Uri
+
+ if TYPE_CHECKING:
+     from spiral.catalog import Spiral
+
+
+ class Project:
+     def __init__(self, spiral_db: "Spiral", id: str, name: str | None = None):
+         self._spiral_db = spiral_db
+         self._id = id
+         self._name = name
+
+         self._api = self._spiral_db.config.api
+
+     def __str__(self):
+         return self._id
+
+     def __repr__(self):
+         return f"Project(id={self._id}{', name=' + self._name if self._name else ''})"
+
+     @property
+     def id(self) -> str:
+         return self._id
+
+     @property
+     def name(self) -> str:
+         return self._name or self._id
+
+     def list_table_names(self) -> list[tuple[str, str]]:
+         """List tuples of (dataset, table) names in the project."""
+         return [(t.dataset, t.table) for t in self._api.table.list(FindTable.Request(project_id=self.id))]
+
+     def list_tables(self) -> list[Table]:
+         """List tables in the project."""
+         return [
+             Table(
+                 CoreTable(
+                     PyMetastore.http(
+                         table_id=t.id,
+                         root_uri=t.metadata.root_uri,
+                         key_schema=Schema.from_arrow(t.metadata.key_schema),
+                         base_url=self._api.base_url + "/metastore/",
+                         token_provider=self._spiral_db.config.authn.token,
+                     ),
+                 ),
+                 name=f"{self.id}.{t.dataset}.{t.table}",
+             )
+             for t in self._api.table.list(FindTable.Request(project_id=self.id))
+         ]
+
+     def create_table(
+         self,
+         identifier: str,
+         *,
+         key_schema: pa.Schema | Any,
+         uri: Uri | None = None,
+         exist_ok: bool = False,
+     ) -> Table:
+         """Create a new table in the project."""
+         dataset, table = self._parse_identifier(identifier)
+
+         if not isinstance(key_schema, pa.Schema):
+             key_schema = pa.schema(key_schema)
+
+         res = self._api.table.create(
+             CreateTable.Request(
+                 project_id=self.id,
+                 dataset=dataset,
+                 table=table,
+                 key_schema=key_schema,
+                 root_uri=uri,
+                 exist_ok=exist_ok,
+             )
+         )
+
+         # Must have the same schema as provided, even if the table already exists.
+         expected_key_schema = res.table.metadata.key_schema
+         if key_schema != expected_key_schema:
+             raise ValueError(f"Table already exists with different key schema: {expected_key_schema} != {key_schema}")
+         if uri and res.table.metadata.root_uri != uri:
+             raise ValueError(f"Table already exists with different root URI: {res.table.metadata.root_uri} != {uri}")
+
+         # Set up a metastore backed by SpiralDB.
+         metastore = PyMetastore.http(
+             table_id=res.table.id,
+             root_uri=res.table.metadata.root_uri,
+             key_schema=Schema.from_arrow(res.table.metadata.key_schema),
+             base_url=self._api.base_url + "/metastore/",
+             token_provider=self._spiral_db.config.authn.token,
+         )
+
+         return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")
+
+     def table(self, identifier: str) -> Table:
+         """Open a table with a `dataset.table` identifier, or a bare `table` name using the `default` dataset."""
+         dataset, table = self._parse_identifier(identifier)
+
+         # TODO(ngates): why does the client _need_ this information? Can we defer it?
+         res = self._api.table.find(
+             FindTable.Request(
+                 project_id=self.id,
+                 dataset=dataset,
+                 table=table,
+             )
+         )
+         if res.table is None:
+             raise ValueError(f"Table not found: {self.id}.{dataset}.{table}")
+
+         # Set up a metastore backed by SpiralDB.
+         metastore = PyMetastore.http(
+             table_id=res.table.id,
+             root_uri=res.table.metadata.root_uri,
+             key_schema=Schema.from_arrow(res.table.metadata.key_schema),
+             base_url=self._api.base_url + "/metastore/",
+             token_provider=self._spiral_db.config.authn.token,
+         )
+
+         return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")
+
+     @staticmethod
+     def _parse_identifier(identifier: str) -> tuple[str, str]:
+         parts = identifier.split(".")
+         if len(parts) == 1:
+             return "default", parts[0]
+         elif len(parts) == 2:
+             return parts[0], parts[1]
+         else:
+             raise ValueError(f"Invalid table identifier: {identifier}")
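Finally, a hedged usage sketch of the `Project` API above. How a `Project` instance is obtained from the catalog is not shown in this file, so the `project` variable and the table names below are illustrative only:

```python
import pyarrow as pa

# Assumed to come from the Spiral catalog client (see spiral/catalog.py); not constructed here.
project = ...  # type: Project

# "images" parses to ("default", "images"); "geo.tiles" would parse to ("geo", "tiles").
table = project.create_table(
    "images",
    key_schema=pa.schema([("id", pa.string())]),
    exist_ok=True,
)

# Re-open by identifier; raises ValueError if the table does not exist.
same_table = project.table("images")
print(project.list_table_names())
```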