pyspiral 0.1.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
spiral/substrait_.py ADDED
@@ -0,0 +1,275 @@
1
+ import betterproto
2
+ import pyarrow as pa
3
+
4
+ import spiral.expressions as se
5
+ from spiral.expressions.base import Expr
6
+ from spiral.proto.substrait import (
7
+ Expression,
8
+ ExpressionFieldReference,
9
+ ExpressionLiteral,
10
+ ExpressionLiteralList,
11
+ ExpressionLiteralStruct,
12
+ ExpressionLiteralUserDefined,
13
+ ExpressionMaskExpression,
14
+ ExpressionReferenceSegment,
15
+ ExpressionReferenceSegmentListElement,
16
+ ExpressionReferenceSegmentStructField,
17
+ ExpressionScalarFunction,
18
+ ExtendedExpression,
19
+ )
20
+ from spiral.proto.substrait.extensions import (
21
+ SimpleExtensionDeclaration,
22
+ SimpleExtensionDeclarationExtensionFunction,
23
+ SimpleExtensionDeclarationExtensionType,
24
+ SimpleExtensionDeclarationExtensionTypeVariation,
25
+ )
26
+
27
+
28
+ class SubstraitConverter:
29
+ def __init__(self, scope: Expr, schema: pa.Schema, key_schema: pa.Schema):
30
+ self.scope = scope
31
+ self.schema = schema
32
+ self.key_names = set(key_schema.names)
33
+
34
+ # Extension URIs, keyed by extension URI anchor
35
+ self.extension_uris = {}
36
+
37
+ # Functions, keyed by function_anchor
38
+ self.functions = {}
39
+
40
+ # Types, keyed by type anchor
41
+ self.type_factories = {}
42
+
43
+ def convert(self, buffer: pa.Buffer) -> Expr:
44
+ """Convert a Substrait Extended Expression into a Spiral expression."""
45
+
46
+ expr: ExtendedExpression = ExtendedExpression().parse(buffer.to_pybytes())
47
+ assert len(expr.referred_expr) == 1, "Only one expression is supported"
48
+
49
+ # Parse the extension URIs from the plan.
50
+ for ext_uri in expr.extension_uris:
51
+ self.extension_uris[ext_uri.extension_uri_anchor] = ext_uri.uri
52
+
53
+ # Parse the extensions from the plan.
54
+ for ext in expr.extensions:
55
+ self._extension_declaration(ext)
56
+
57
+ # Convert the expression
58
+ return self._expr(expr.referred_expr[0].expression)
59
+
60
+ def _extension_declaration(self, ext: SimpleExtensionDeclaration):
61
+ match betterproto.which_one_of(ext, "mapping_type"):
62
+ case "extension_function", ext_func:
63
+ self._extension_function(ext_func)
64
+ case "extension_type", ext_type:
65
+ self._extension_type(ext_type)
66
+ case "extension_type_variation", ext_type_variation:
67
+ self._extension_type_variation(ext_type_variation)
68
+ case _:
69
+ raise AssertionError("Invalid substrait plan")
70
+
71
+ def _extension_function(self, ext: SimpleExtensionDeclarationExtensionFunction):
72
+ ext_uri: str = self.extension_uris[ext.extension_uri_reference]
73
+ match ext_uri:
74
+ case "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml":
75
+ match ext.name:
76
+ case "or":
77
+ self.functions[ext.function_anchor] = se.or_
78
+ case "and":
79
+ self.functions[ext.function_anchor] = se.and_
80
+ case "xor":
81
+ self.functions[ext.function_anchor] = se.xor
82
+ case "not":
83
+ self.functions[ext.function_anchor] = se.not_
84
+ case _:
85
+ raise NotImplementedError(f"Function name {ext.name} not supported")
86
+ case "https://github.com/substrait-io/substrait/blob/main/extensions/functions_comparison.yaml":
87
+ match ext.name:
88
+ case "equal":
89
+ self.functions[ext.function_anchor] = se.eq
90
+ case "not_equal":
91
+ self.functions[ext.function_anchor] = se.neq
92
+ case "lt":
93
+ self.functions[ext.function_anchor] = se.lt
94
+ case "lte":
95
+ self.functions[ext.function_anchor] = se.lte
96
+ case "gt":
97
+ self.functions[ext.function_anchor] = se.gt
98
+ case "gte":
99
+ self.functions[ext.function_anchor] = se.gte
100
+ case "is_null":
101
+ self.functions[ext.function_anchor] = se.is_null
102
+ case "is_not_null":
103
+ self.functions[ext.function_anchor] = se.is_not_null
104
+ case _:
105
+ raise NotImplementedError(f"Function name {ext.name} not supported")
106
+ case uri:
107
+ raise NotImplementedError(f"Function extension URI {uri} not supported")
108
+
109
+ def _extension_type(self, ext: SimpleExtensionDeclarationExtensionType):
110
+ ext_uri: str = self.extension_uris[ext.extension_uri_reference]
111
+ match ext_uri:
112
+ case "https://github.com/apache/arrow/blob/main/format/substrait/extension_types.yaml":
113
+ match ext.name:
114
+ case "null":
115
+ self.type_factories[ext.type_anchor] = pa.null
116
+ case "interval_month_day_nano":
117
+ self.type_factories[ext.type_anchor] = pa.month_day_nano_interval
118
+ case "u8":
119
+ self.type_factories[ext.type_anchor] = pa.uint8
120
+ case "u16":
121
+ self.type_factories[ext.type_anchor] = pa.uint16
122
+ case "u32":
123
+ self.type_factories[ext.type_anchor] = pa.uint32
124
+ case "u64":
125
+ self.type_factories[ext.type_anchor] = pa.uint64
126
+ case "fp16":
127
+ self.type_factories[ext.type_anchor] = pa.float16
128
+ case "date_millis":
129
+ self.type_factories[ext.type_anchor] = pa.date64
130
+ case "time_seconds":
131
+ self.type_factories[ext.type_anchor] = lambda: pa.time32("s")
132
+ case "time_millis":
133
+ self.type_factories[ext.type_anchor] = lambda: pa.time32("ms")
134
+ case "time_nanos":
135
+ self.type_factories[ext.type_anchor] = lambda: pa.time64("ns")
136
+ case "large_string":
137
+ self.type_factories[ext.type_anchor] = pa.large_string
138
+ case "large_binary":
139
+ self.type_factories[ext.type_anchor] = pa.large_binary
140
+ case "decimal256":
141
+ self.type_factories[ext.type_anchor] = pa.decimal256
142
+ case "large_list":
143
+ self.type_factories[ext.type_anchor] = pa.large_list
144
+ case "fixed_size_list":
145
+ self.type_factories[ext.type_anchor] = pa.list_
146
+ case "duration":
147
+ self.type_factories[ext.type_anchor] = pa.duration
148
+ case uri:
149
+ raise NotImplementedError(f"Type extension URI {uri} not support")
150
+
151
+ def _extension_type_variation(self, ext: SimpleExtensionDeclarationExtensionTypeVariation):
152
+ raise NotImplementedError()
153
+
154
+ def _expr(self, expr: Expression) -> Expr:
155
+ match betterproto.which_one_of(expr, "rex_type"):
156
+ case "literal", e:
157
+ return self._expr_literal(e)
158
+ case "selection", e:
159
+ return self._expr_selection(e)
160
+ case "scalar_function", e:
161
+ return self._expr_scalar_function(e)
162
+ case "window_function", _:
163
+ raise ValueError("Window functions are not supported in Spiral push-down")
164
+ case "if_then", e:
165
+ return self._expr_if_then(e)
166
+ case "switch", e:
167
+ return self._expr_switch(e)
168
+ case "singular_or_list", _:
169
+ raise ValueError("singular_or_list is not supported in Spiral push-down")
170
+ case "multi_or_list", _:
171
+ raise ValueError("multi_or_list is not supported in Spiral push-down")
172
+ case "cast", e:
173
+ return self._expr_cast(e)
174
+ case "subquery", _:
175
+ raise ValueError("Subqueries are not supported in Spiral push-down")
176
+ case "nested", e:
177
+ return self._expr_nested(e)
178
+ case _:
179
+ raise NotImplementedError(f"Expression type {expr.rex_type} not implemented")
180
+
181
+ def _expr_literal(self, expr: ExpressionLiteral):
182
+ # TODO(ngates): the Spiral literal expression is quite weakly typed...
183
+ # Maybe we can switch to Vortex?
184
+ simple = {
185
+ "boolean",
186
+ "i8",
187
+ "i16",
188
+ "i32",
189
+ "i64",
190
+ "fp32",
191
+ "fp64",
192
+ "string",
193
+ "binary",
194
+ "fixed_char",
195
+ "var_char",
196
+ "fixed_binary",
197
+ }
198
+
199
+ match betterproto.which_one_of(expr, "literal_type"):
200
+ case type_, v if type_ in simple:
201
+ return se.scalar(pa.scalar(v))
202
+ case "timestamp", v:
203
+ return se.scalar(pa.scalar(v, type=pa.timestamp("us")))
204
+ case "date", v:
205
+ return se.scalar(pa.scalar(v, type=pa.date32()))
206
+ case "time", v:
207
+ # Substrait time is us since midnight. PyArrow only supports ms.
208
+ v: int
209
+ v = int(v / 1000)
210
+ return se.scalar(pa.scalar(v, type=pa.time32("ms")))
211
+ case "null", _null_type:
212
+ # We need a typed null value
213
+ raise NotImplementedError()
214
+ case "struct", v:
215
+ v: ExpressionLiteralStruct
216
+ # Hmm, v has fields, but no field names. I guess we return a list and the type is applied later?
217
+ raise NotImplementedError()
218
+ case "list", v:
219
+ v: ExpressionLiteralList
220
+ return pa.scalar([self._expr_literal(e) for e in v.values])
221
+ case "user_defined", v:
222
+ v: ExpressionLiteralUserDefined
223
+ raise NotImplementedError()
224
+ case literal_type, _:
225
+ raise NotImplementedError(f"Literal type not supported: {literal_type}")
226
+
227
+ def _expr_selection(self, expr: ExpressionFieldReference):
228
+ match betterproto.which_one_of(expr, "root_type"):
229
+ case "root_reference", _:
230
+ # The reference is relative to the root
231
+ base_expr = self.scope
232
+ base_type = pa.struct(self.schema)
233
+ case _:
234
+ raise NotImplementedError("Only root_reference expressions are supported")
235
+
236
+ match betterproto.which_one_of(expr, "reference_type"):
237
+ case "direct_reference", direct_ref:
238
+ return self._expr_direct_reference(base_expr, base_type, direct_ref)
239
+ case "masked_reference", masked_ref:
240
+ return self._expr_masked_reference(base_expr, base_type, masked_ref)
241
+ case _:
242
+ raise NotImplementedError()
243
+
244
+ def _expr_direct_reference(self, scope: Expr, scope_type: pa.StructType, expr: ExpressionReferenceSegment):
245
+ match betterproto.which_one_of(expr, "reference_type"):
246
+ case "map_key", ref:
247
+ raise NotImplementedError("Map types not yet supported in Spiral")
248
+ case "struct_field", ref:
249
+ ref: ExpressionReferenceSegmentStructField
250
+ field_name = scope_type.field(ref.field).name
251
+
252
+ if field_name in self.key_names:
253
+ # This is a key column, so we need to select it from the scope.
254
+ return se.var(field_name)
255
+
256
+ scope = se.getitem(scope, field_name)
257
+ scope_type = scope_type.field(ref.field).type
258
+ return self._expr_direct_reference(scope, scope_type, ref.child) if ref.child else scope
259
+ case "list_element", ref:
260
+ ref: ExpressionReferenceSegmentListElement
261
+ scope = se.getitem(scope, ref.offset)
262
+ scope_type = scope_type.field(ref.field).type
263
+ return self._expr_direct_reference(scope, scope_type, ref.child) if ref.child else scope
264
+ case "", ref:
265
+ # Because Proto... we hit this case when we recurse into a child node and it's actually "None".
266
+ return scope
267
+ case _:
268
+ raise NotImplementedError()
269
+
270
+ def _expr_masked_reference(self, scope: Expr, scope_type: pa.StructType, expr: ExpressionMaskExpression):
271
+ raise NotImplementedError("Masked references are not yet supported in Spiral push-down")
272
+
273
+ def _expr_scalar_function(self, expr: ExpressionScalarFunction):
274
+ args = [self._expr(arg.value) for arg in expr.arguments]
275
+ return self.functions[expr.function_reference](*args)
spiral/table.py ADDED
@@ -0,0 +1,157 @@
1
+ from datetime import datetime
2
+ from typing import TYPE_CHECKING, Literal
3
+
4
+ import pyarrow as pa
5
+
6
+ from spiral import expressions as se
7
+ from spiral.config import FILE_FORMAT, Config
8
+ from spiral.core.core import Table as CoreTable
9
+ from spiral.core.core import flush_wal, write
10
+ from spiral.expressions.base import Expr, ExprLike
11
+
12
+ if TYPE_CHECKING:
13
+ import duckdb
14
+ import polars as pl
15
+ import pyarrow.dataset
16
+
17
+ from spiral.scan_ import Scan
18
+
19
+
20
class Table(Expr):
    """API for interacting with a SpiralDB's Table.

    Different catalog implementations should ultimately construct a Table object.
    """

    def __init__(
        self,
        table: CoreTable,
        name: str | None = None,
    ):
        """
        :param table: The core table handle this object wraps.
        :param name: Display name; falls back to the table id when omitted or empty.
        """
        super().__init__(table.__expr__)

        self._table = table
        self._name = name or self._table.id
        self._key_schema = self._table.key_schema.to_arrow()
        self._key_columns = set(self._key_schema.names)

    @property
    def table_id(self) -> str:
        """Returns the id of the underlying table."""
        return self._table.id

    @property
    def last_modified_at(self) -> int:
        """Returns ``last_modified_at`` of the table's current WAL."""
        return self._table.get_wal(asof=None).last_modified_at

    def __str__(self):
        return self._name

    def __repr__(self):
        return f'Table("{self._name}")'

    def __getitem__(self, item: str) -> Expr:
        """Returns a variable expression for key columns, otherwise defers to ``Expr.__getitem__``."""
        if item in self._key_columns:
            return se.var(name=item)

        return super().__getitem__(item)

    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
        """Select paths from the table, splitting the selection between keys and columns.

        :param paths: Paths to select. Key columns and other columns may be mixed.
        :param exclude: Paths to exclude. Must not name key columns.
        :raises ValueError: If ``exclude`` contains a key column.
        """
        # Override an expression select in the root column group to split between keys and columns.
        if exclude is not None and set(exclude) & self._key_columns:
            raise ValueError(
                "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
            )

        # De-duplicate while keeping the caller's order. Splitting through sets would make the
        # resulting field order depend on string hashing, which varies between interpreter runs.
        unique_paths = list(dict.fromkeys(paths))
        key_paths = [path for path in unique_paths if path in self._key_columns]
        if not key_paths:
            return super().select(*paths, exclude=exclude)

        other_paths = [path for path in unique_paths if path not in self._key_columns]
        return se.merge(se.pack({key: se.var(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))

    @property
    def key_schema(self) -> pa.Schema:
        """Returns the key schema of the table."""
        return self._key_schema

    @property
    def schema(self) -> pa.Schema:
        """Returns the FULL schema of the table.

        NOTE: This can be expensive for large tables.
        """
        return self._table.get_schema(asof=None)

    def to_dataset(self) -> "pyarrow.dataset.Dataset":
        """Returns a PyArrow Dataset representing the table."""
        from .dataset import TableDataset

        return TableDataset(self)

    def to_polars(self) -> "pl.LazyFrame":
        """Returns a Polars LazyFrame for the Spiral table."""
        import polars as pl

        return pl.scan_pyarrow_dataset(self.to_dataset())

    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
        """Returns a DuckDB relation for the Spiral table."""
        import duckdb

        return duckdb.from_arrow(self.to_dataset())

    def scan(
        self,
        *projections: ExprLike,
        where: ExprLike | None = None,
        asof: datetime | int | str | None = None,
        exclude_keys: bool = False,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ) -> "Scan":
        """Reads the table. If projections are not provided, the entire table is read.

        See `spiral.scan` for more information.
        """
        from spiral.scan_ import scan

        if not projections:
            projections = [self]

        return scan(
            *projections,
            where=where,
            asof=asof,
            exclude_keys=exclude_keys,
            # config=config,
        )

    # NOTE: "vortex" is valid format. We don't want that visible in the API docs.
    def write(
        self,
        expr: ExprLike,
        *,
        format: Literal["parquet"] | None = None,
        # TODO(joe): support group_by, and config
        config: Config | None = None,
    ) -> None:
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param format: the format to write the data in. Defaults to "parquet".
        :param config: The configuration to use for this write.
        """
        write(
            self._table,
            se.lift(expr).__expr__,
            format=format or FILE_FORMAT,
            partition_size=config.partition_file_min_size if config else None,
        )
        # Flush the WAL if configured.
        if config is not None and config.flush_wal_on_write:
            flush_wal(self._table, manifest_format=format or FILE_FORMAT)
spiral/types_.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import Annotated, TypeAlias
2
+
3
+ from pydantic import UrlConstraints
4
+
5
# String alias annotated with pydantic's ``UrlConstraints`` (constructed with its defaults).
Uri: TypeAlias = Annotated[str, UrlConstraints()]
# Integer alias for timestamps. NOTE(review): unit and epoch are not visible here — confirm against callers.
Timestamp: TypeAlias = int