pyspiral 0.1.0__cp310-abi3-macosx_11_0_arm64.whl

Files changed (81)
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
spiral/expressions/tiff.py ADDED
@@ -0,0 +1,223 @@
import numpy as np
import pyarrow as pa

from spiral.expressions.base import ExprLike
from spiral.expressions.udf import RefUDF


def read(
    expr: ExprLike,
    indexes: ExprLike | int | list[int] | None = None,
    window: ExprLike | tuple[tuple[int, int], tuple[int, int]] | None = None,
    boundless: ExprLike | bool | None = None,
):
    """
    Read the referenced cell in `TIFF` format. Requires `rasterio` to be installed.

    Args:
        expr: The referenced `TIFF` bytes.
        indexes: The band indexes to read. Defaults to the first band. The first dimension of the result's `shape`
            field is either 1 or the number of indexes.
        window: The window to read, in the format (row_range_tuple, col_range_tuple). Defaults to the full window.
        boundless: If `True`, windows that extend beyond the dataset's extent are permitted and partially or
            completely filled arrays will be returned as appropriate.

    Returns:
        An array where each element is a NumPy array represented as a struct with fields:
            bytes: Array bytes with type `pa.large_binary()`.
            shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
            dtype: String representation of the NumPy dtype with type `pa.string()`.

    Example:
        A way to get the i-th element in the result as a NumPy array:

        ```
        array: np.ndarray = np.frombuffer(
            result["bytes"][i].as_py(),
            dtype=np.dtype(result["dtype"][i].as_py()),
        ).reshape(tuple(result["shape"][i].as_py()))
        ```
    """
    try:
        import rasterio  # noqa: F401
    except ImportError:
        raise ImportError("`rasterio` is required for tiff.read")

    return TiffReadUDF()(expr, indexes, window, boundless)


def crop(
    expr: ExprLike,
    shape: ExprLike,
):
    """
    Crop shapes out of the referenced cell in `TIFF` format. Requires `rasterio` to be installed.

    Args:
        expr: The referenced `TIFF` bytes.
        shape: A [GeoJSON-like](https://geojson.org/) shape.

    Returns:
        An array where each element is a NumPy array represented as a struct with fields:
            bytes: Array bytes with type `pa.large_binary()`.
            shape: Array shape with type `pa.list_(pa.uint32(), 3)`.
            dtype: String representation of the NumPy dtype with type `pa.string()`.

    Example:
        A way to get the i-th element in the result as a NumPy array:

        ```
        array: np.ndarray = np.frombuffer(
            result["bytes"][i].as_py(),
            dtype=np.dtype(result["dtype"][i].as_py()),
        ).reshape(tuple(result["shape"][i].as_py()))
        ```
    """
    try:
        import rasterio  # noqa: F401
    except ImportError:
        raise ImportError("`rasterio` is required for tiff.crop")

    return TiffCropUDF()(expr, shape)


class TiffReadUDF(RefUDF):
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
            pa.field("shape", pa.list_(pa.uint32(), 3)),
            pa.field("dtype", pa.string()),
        ]
    )

    def __init__(self):
        super().__init__("tiff.read")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        return TiffReadUDF.RES_DTYPE

    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        try:
            import rasterio
        except ImportError:
            raise ImportError("`rasterio` is required for tiff.read")

        from rasterio.windows import Window

        if len(input_args) != 4:
            raise ValueError("tiff.read expects exactly 4 arguments: expr, indexes, window, boundless")

        _, indexes, window, boundless = input_args

        indexes = indexes[0].as_py()
        if indexes is not None and not isinstance(indexes, int) and not isinstance(indexes, list):
            raise ValueError(f"tiff.read expects indexes to be None or an int or a list, got {indexes}")

        boundless = boundless[0].as_py()
        if boundless is not None and not isinstance(boundless, bool):
            raise ValueError(f"tiff.read expects boundless to be None or a bool, got {boundless}")

        window = window[0].as_py()
        if window is not None:
            if len(window) != 2:
                raise ValueError(f"tiff.read window invalid, got {window}")
            window = Window.from_slices(slice(*window[0]), slice(*window[1]), boundless=boundless or False)

        opener = _VsiOpener(fp)
        with rasterio.open("ref", opener=opener) as src:
            src: rasterio.DatasetReader
            # TODO(marko): We know the size and dtype so we should be able to preallocate the result and read into it.
            #   This matters more if we want to rewrite this function to work with multiple inputs at once, in which
            #   case we should first consider using Rust GDAL bindings - I believe rasterio uses GDAL under the hood.
            result: np.ndarray = src.read(indexes=indexes, window=window)
            return pa.array(
                [
                    {
                        "bytes": result.tobytes(),
                        "shape": list(result.shape),
                        "dtype": str(result.dtype),
                    }
                ],
                type=TiffReadUDF.RES_DTYPE,
            )


class TiffCropUDF(RefUDF):
    RES_DTYPE: pa.DataType = pa.struct(
        [
            pa.field("bytes", pa.large_binary()),
            pa.field("shape", pa.list_(pa.uint32(), 3)),
            pa.field("dtype", pa.string()),
        ]
    )

    def __init__(self):
        super().__init__("tiff.crop")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        return TiffCropUDF.RES_DTYPE

    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        try:
            import rasterio
        except ImportError:
            raise ImportError("`rasterio` is required for tiff.crop")

        from rasterio.mask import mask as rio_mask

        if len(input_args) != 2:
            raise ValueError("tiff.crop expects exactly 2 arguments: expr, shape")

        _, shape = input_args

        shape = shape[0].as_py()
        if shape is None:
            raise ValueError("tiff.crop expects shape to be a GeoJSON-like shape")

        opener = _VsiOpener(fp)
        with rasterio.open("ref", opener=opener) as src:
            src: rasterio.DatasetReader
            result, _ = rio_mask(src, shapes=[shape], crop=True)
            result: np.ndarray
            return pa.array(
                [
                    {
                        "bytes": result.tobytes(),
                        "shape": list(result.shape),
                        "dtype": str(result.dtype),
                    }
                ],
                type=TiffCropUDF.RES_DTYPE,
            )


class _VsiOpener:
    """
    VSI file opener which returns a constant file-like on open.

    Must match https://rasterio.readthedocs.io/en/stable/topics/vsi.html#python-file-and-filesystem-openers spec but
    only `open` is needed when going through rasterio.
    """

    def __init__(self, file_like):
        self._file_like = file_like

    def open(self, _path, mode):
        if mode not in {"r", "rb"}:
            raise ValueError(f"Unsupported mode: {mode}")
        return self._file_like

    def isdir(self, _):
        return False

    def isfile(self, _):
        return False

    def mtime(self, _):
        return 0

    def size(self, _):
        return self._file_like.size()

    def modified(self, _):
        raise NotImplementedError
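
Since both UDFs above return the same `{bytes, shape, dtype}` struct, here is a small self-contained round-trip sketch (pure NumPy/pyarrow, no rasterio or spiral engine involved) showing how a consumer can rebuild the i-th element as an ndarray, following the docstring recipe; the sample `data` array is invented for illustration and is not part of the wheel.

```python
import numpy as np
import pyarrow as pa

# The {bytes, shape, dtype} struct layout used by TiffReadUDF / TiffCropUDF.
RES_DTYPE = pa.struct(
    [
        pa.field("bytes", pa.large_binary()),
        pa.field("shape", pa.list_(pa.uint32(), 3)),
        pa.field("dtype", pa.string()),
    ]
)

# Encode a (bands, rows, cols) array the same way invoke() does...
data = np.arange(2 * 3 * 4, dtype=np.uint16).reshape(2, 3, 4)
result = pa.array(
    [{"bytes": data.tobytes(), "shape": list(data.shape), "dtype": str(data.dtype)}],
    type=RES_DTYPE,
)

# ...and decode element i back into a NumPy array, as in the docstring example.
i = 0
decoded = np.frombuffer(
    result.field("bytes")[i].as_py(),
    dtype=np.dtype(result.field("dtype")[i].as_py()),
).reshape(tuple(result.field("shape")[i].as_py()))
assert np.array_equal(decoded, data)
```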
spiral/expressions/udf.py ADDED
@@ -0,0 +1,46 @@
import abc

import pyarrow as pa

from spiral import _lib
from spiral.expressions.base import Expr


class BaseUDF:
    def __init__(self, udf):
        self._udf = udf

    def __call__(self, *args) -> Expr:
        """Create an expression that calls this UDF with the given arguments."""
        from spiral import expressions as se

        args = [se.lift(arg).__expr__ for arg in args]
        return Expr(self._udf(args))

    @abc.abstractmethod
    def return_type(self, *input_types: pa.DataType) -> pa.DataType: ...


class UDF(BaseUDF):
    """A User-Defined Function (UDF)."""

    def __init__(self, name: str):
        super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke))

    @abc.abstractmethod
    def invoke(self, *input_args: pa.Array) -> pa.Array: ...


class RefUDF(BaseUDF):
    """A UDF over a single ref cell, which can therefore access the file object."""

    def __init__(self, name: str):
        super().__init__(_lib.spql.expr.udf.create(name, return_type=self.return_type, invoke=self.invoke, scope="ref"))

    @abc.abstractmethod
    def invoke(self, fp, *input_args: pa.Array) -> pa.Array:
        """Invoke the UDF with the given arguments.

        NOTE: The first argument is always the ref cell. All array input args will be sliced to the appropriate row.
        """
        ...
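
To illustrate how the `UDF` base class above is meant to be extended (mirroring the `TiffReadUDF` pattern), here is a minimal hedged sketch of a scalar UDF subclass; the `example.add_one` name and the arithmetic are illustrative, and whether the native `_lib` engine registers this exact UDF is an assumption, not something this diff confirms.

```python
import pyarrow as pa
import pyarrow.compute as pc

from spiral.expressions.udf import UDF


class AddOneUDF(UDF):
    """Illustrative scalar UDF that adds 1 to its single integer input column."""

    def __init__(self):
        # "example.add_one" is a hypothetical UDF name, not one shipped in the wheel.
        super().__init__("example.add_one")

    def return_type(self, *input_types: pa.DataType) -> pa.DataType:
        # Single input column; the result keeps the input's type.
        return input_types[0]

    def invoke(self, *input_args: pa.Array) -> pa.Array:
        (values,) = input_args
        return pc.add(values, 1)


# Usage sketch: calling the instance builds an expression over lifted arguments,
# e.g. expr = AddOneUDF()(some_column_expr), just as TiffReadUDF()(expr, ...) is used above.
```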
spiral/grpc_.py ADDED
@@ -0,0 +1,32 @@
from collections.abc import AsyncIterator, Awaitable, Callable
from typing import TypeVar

R = TypeVar("R")
T = TypeVar("T")


async def paged(stub_fn: Callable[[R], Awaitable[T]], request: R, page_size: int | None = None) -> AsyncIterator[T]:
    """Page through a gRPC paged API.

    Assumes fields exist as per https://cloud.google.com/apis/design/design_patterns#list_pagination
    """
    next_page_token: str | None = None
    while True:
        request.page_size = page_size
        request.page_token = next_page_token
        res = await stub_fn(request)
        if not res.next_page_token:
            # No more items
            yield res
            break

        next_page_token = res.next_page_token
        yield res


async def paged_items(
    stub_fn: Callable[[R], Awaitable[T]], request: R, collection_name: str, page_size: int | None = None
) -> AsyncIterator:
    async for page in paged(stub_fn, request, page_size=page_size):
        for item in getattr(page, collection_name):
            yield item
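
A self-contained sketch of how `paged_items` walks the pagination contract described in the docstring (page_size/page_token on the request, next_page_token plus a repeated collection field on the response); the `ListReq`/`ListRes` dataclasses and `fake_list` stub are stand-ins invented for the example, not spiral API types.

```python
import asyncio
from dataclasses import dataclass, field

from spiral.grpc_ import paged_items


@dataclass
class ListReq:  # hypothetical request with the expected pagination fields
    page_size: int | None = None
    page_token: str | None = None


@dataclass
class ListRes:  # hypothetical response: a repeated `items` field plus next_page_token
    items: list[str] = field(default_factory=list)
    next_page_token: str = ""


async def fake_list(req: ListReq) -> ListRes:
    # Two pages: the first returns a continuation token, the second ends the listing.
    pages = {None: (["a", "b"], "p2"), "p2": (["c"], "")}
    items, token = pages[req.page_token]
    return ListRes(items=items, next_page_token=token)


async def main():
    results = [item async for item in paged_items(fake_list, ListReq(), "items", page_size=2)]
    assert results == ["a", "b", "c"]


asyncio.run(main())
```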
spiral/project.py ADDED
@@ -0,0 +1,137 @@
from typing import TYPE_CHECKING, Any

import pyarrow as pa

from spiral import Table
from spiral.api.tables import CreateTable, FindTable
from spiral.core.core import Table as CoreTable
from spiral.core.metastore import PyMetastore
from spiral.core.spec import Schema
from spiral.types_ import Uri

if TYPE_CHECKING:
    from spiral.catalog import Spiral


class Project:
    def __init__(self, spiral_db: "Spiral", id: str, name: str | None = None):
        self._spiral_db = spiral_db
        self._id = id
        self._name = name

        self._api = self._spiral_db.config.api

    def __str__(self):
        return self._id

    def __repr__(self):
        return f"Project(id={self._id}{', name=' + self._name if self._name else ''})"

    @property
    def id(self) -> str:
        return self._id

    @property
    def name(self) -> str:
        return self._name or self._id

    def list_table_names(self) -> list[tuple[str, str]]:
        """List tuples of (dataset, table) names in the project."""
        return [(t.dataset, t.table) for t in self._api.table.list(FindTable.Request(project_id=self.id))]

    def list_tables(self) -> list[Table]:
        """List tables in the project."""
        return [
            Table(
                CoreTable(
                    PyMetastore.http(
                        table_id=t.id,
                        root_uri=t.metadata.root_uri,
                        key_schema=Schema.from_arrow(t.metadata.key_schema),
                        base_url=self._api.base_url + "/metastore/",
                        token_provider=self._spiral_db.config.authn.token,
                    ),
                ),
                name=f"{self.id}.{t.dataset}.{t.table}",
            )
            for t in self._api.table.list(FindTable.Request(project_id=self.id))
        ]

    def create_table(
        self,
        identifier: str,
        *,
        key_schema: pa.Schema | Any,
        uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> Table:
        """Create a new table in the project."""
        dataset, table = self._parse_identifier(identifier)

        if not isinstance(key_schema, pa.Schema):
            key_schema = pa.schema(key_schema)

        res = self._api.table.create(
            CreateTable.Request(
                project_id=self.id,
                dataset=dataset,
                table=table,
                key_schema=key_schema,
                root_uri=uri,
                exist_ok=exist_ok,
            )
        )

        # Must have the same schema as provided, even if the table already exists.
        expected_key_schema = res.table.metadata.key_schema
        if key_schema != expected_key_schema:
            raise ValueError(f"Table already exists with different key schema: {expected_key_schema} != {key_schema}")
        if uri and res.table.metadata.root_uri != uri:
            raise ValueError(f"Table already exists with different root URI: {res.table.metadata.root_uri} != {uri}")

        # Set up a metastore backed by SpiralDB
        metastore = PyMetastore.http(
            table_id=res.table.id,
            root_uri=res.table.metadata.root_uri,
            key_schema=Schema.from_arrow(res.table.metadata.key_schema),
            base_url=self._api.base_url + "/metastore/",
            token_provider=self._spiral_db.config.authn.token,
        )

        return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")

    def table(self, identifier: str) -> Table:
        """Open a table with a `dataset.table` identifier, or a `table` name using the `default` dataset."""
        dataset, table = self._parse_identifier(identifier)

        # TODO(ngates): why does the client _need_ this information? Can we defer it?
        res = self._api.table.find(
            FindTable.Request(
                project_id=self.id,
                dataset=dataset,
                table=table,
            )
        )
        if res.table is None:
            raise ValueError(f"Table not found: {self.id}.{dataset}.{table}")

        # Set up a metastore backed by SpiralDB
        metastore = PyMetastore.http(
            table_id=res.table.id,
            root_uri=res.table.metadata.root_uri,
            key_schema=Schema.from_arrow(res.table.metadata.key_schema),
            base_url=self._api.base_url + "/metastore/",
            token_provider=self._spiral_db.config.authn.token,
        )

        return Table(CoreTable(metastore), name=f"{self.id}.{res.table.dataset}.{res.table.table}")

    @staticmethod
    def _parse_identifier(identifier: str) -> tuple[str, str]:
        parts = identifier.split(".")
        if len(parts) == 1:
            return "default", parts[0]
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            raise ValueError(f"Invalid table identifier: {identifier}")