pyspiral 0.1.0__cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. pyspiral-0.1.0.dist-info/METADATA +48 -0
  2. pyspiral-0.1.0.dist-info/RECORD +81 -0
  3. pyspiral-0.1.0.dist-info/WHEEL +4 -0
  4. pyspiral-0.1.0.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +11 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +386 -0
  8. spiral/api/__init__.py +221 -0
  9. spiral/api/admin.py +29 -0
  10. spiral/api/filesystems.py +125 -0
  11. spiral/api/organizations.py +90 -0
  12. spiral/api/projects.py +160 -0
  13. spiral/api/tables.py +94 -0
  14. spiral/api/tokens.py +56 -0
  15. spiral/api/workloads.py +45 -0
  16. spiral/arrow.py +209 -0
  17. spiral/authn/__init__.py +0 -0
  18. spiral/authn/authn.py +89 -0
  19. spiral/authn/device.py +206 -0
  20. spiral/authn/github_.py +33 -0
  21. spiral/authn/modal_.py +18 -0
  22. spiral/catalog.py +78 -0
  23. spiral/cli/__init__.py +82 -0
  24. spiral/cli/__main__.py +4 -0
  25. spiral/cli/admin.py +21 -0
  26. spiral/cli/app.py +48 -0
  27. spiral/cli/console.py +95 -0
  28. spiral/cli/fs.py +47 -0
  29. spiral/cli/login.py +13 -0
  30. spiral/cli/org.py +90 -0
  31. spiral/cli/printer.py +45 -0
  32. spiral/cli/project.py +107 -0
  33. spiral/cli/state.py +3 -0
  34. spiral/cli/table.py +20 -0
  35. spiral/cli/token.py +27 -0
  36. spiral/cli/types.py +53 -0
  37. spiral/cli/workload.py +59 -0
  38. spiral/config.py +26 -0
  39. spiral/core/__init__.py +0 -0
  40. spiral/core/core/__init__.pyi +53 -0
  41. spiral/core/manifests/__init__.pyi +53 -0
  42. spiral/core/metastore/__init__.pyi +91 -0
  43. spiral/core/spec/__init__.pyi +257 -0
  44. spiral/dataset.py +239 -0
  45. spiral/debug.py +251 -0
  46. spiral/expressions/__init__.py +222 -0
  47. spiral/expressions/base.py +149 -0
  48. spiral/expressions/http.py +86 -0
  49. spiral/expressions/io.py +100 -0
  50. spiral/expressions/list_.py +68 -0
  51. spiral/expressions/refs.py +44 -0
  52. spiral/expressions/str_.py +39 -0
  53. spiral/expressions/struct.py +57 -0
  54. spiral/expressions/tiff.py +223 -0
  55. spiral/expressions/udf.py +46 -0
  56. spiral/grpc_.py +32 -0
  57. spiral/project.py +137 -0
  58. spiral/proto/_/__init__.py +0 -0
  59. spiral/proto/_/arrow/__init__.py +0 -0
  60. spiral/proto/_/arrow/flight/__init__.py +0 -0
  61. spiral/proto/_/arrow/flight/protocol/__init__.py +0 -0
  62. spiral/proto/_/arrow/flight/protocol/sql/__init__.py +1990 -0
  63. spiral/proto/_/scandal/__init__.py +223 -0
  64. spiral/proto/_/spfs/__init__.py +36 -0
  65. spiral/proto/_/spiral/__init__.py +0 -0
  66. spiral/proto/_/spiral/table/__init__.py +225 -0
  67. spiral/proto/_/spiraldb/__init__.py +0 -0
  68. spiral/proto/_/spiraldb/metastore/__init__.py +499 -0
  69. spiral/proto/__init__.py +0 -0
  70. spiral/proto/scandal/__init__.py +45 -0
  71. spiral/proto/spiral/__init__.py +0 -0
  72. spiral/proto/spiral/table/__init__.py +96 -0
  73. spiral/proto/substrait/__init__.py +3399 -0
  74. spiral/proto/substrait/extensions/__init__.py +115 -0
  75. spiral/proto/util.py +41 -0
  76. spiral/py.typed +0 -0
  77. spiral/scan_.py +168 -0
  78. spiral/settings.py +157 -0
  79. spiral/substrait_.py +275 -0
  80. spiral/table.py +157 -0
  81. spiral/types_.py +6 -0
spiral/substrait_.py ADDED
@@ -0,0 +1,275 @@
1
+ import betterproto
2
+ import pyarrow as pa
3
+
4
+ import spiral.expressions as se
5
+ from spiral.expressions.base import Expr
6
+ from spiral.proto.substrait import (
7
+ Expression,
8
+ ExpressionFieldReference,
9
+ ExpressionLiteral,
10
+ ExpressionLiteralList,
11
+ ExpressionLiteralStruct,
12
+ ExpressionLiteralUserDefined,
13
+ ExpressionMaskExpression,
14
+ ExpressionReferenceSegment,
15
+ ExpressionReferenceSegmentListElement,
16
+ ExpressionReferenceSegmentStructField,
17
+ ExpressionScalarFunction,
18
+ ExtendedExpression,
19
+ )
20
+ from spiral.proto.substrait.extensions import (
21
+ SimpleExtensionDeclaration,
22
+ SimpleExtensionDeclarationExtensionFunction,
23
+ SimpleExtensionDeclarationExtensionType,
24
+ SimpleExtensionDeclarationExtensionTypeVariation,
25
+ )
26
+
27
+
28
+ class SubstraitConverter:
29
+ def __init__(self, scope: Expr, schema: pa.Schema, key_schema: pa.Schema):
30
+ self.scope = scope
31
+ self.schema = schema
32
+ self.key_names = set(key_schema.names)
33
+
34
+ # Extension URIs, keyed by extension URI anchor
35
+ self.extension_uris = {}
36
+
37
+ # Functions, keyed by function_anchor
38
+ self.functions = {}
39
+
40
+ # Types, keyed by type anchor
41
+ self.type_factories = {}
42
+
43
+ def convert(self, buffer: pa.Buffer) -> Expr:
44
+ """Convert a Substrait Extended Expression into a Spiral expression."""
45
+
46
+ expr: ExtendedExpression = ExtendedExpression().parse(buffer.to_pybytes())
47
+ assert len(expr.referred_expr) == 1, "Only one expression is supported"
48
+
49
+ # Parse the extension URIs from the plan.
50
+ for ext_uri in expr.extension_uris:
51
+ self.extension_uris[ext_uri.extension_uri_anchor] = ext_uri.uri
52
+
53
+ # Parse the extensions from the plan.
54
+ for ext in expr.extensions:
55
+ self._extension_declaration(ext)
56
+
57
+ # Convert the expression
58
+ return self._expr(expr.referred_expr[0].expression)
59
+
60
+ def _extension_declaration(self, ext: SimpleExtensionDeclaration):
61
+ match betterproto.which_one_of(ext, "mapping_type"):
62
+ case "extension_function", ext_func:
63
+ self._extension_function(ext_func)
64
+ case "extension_type", ext_type:
65
+ self._extension_type(ext_type)
66
+ case "extension_type_variation", ext_type_variation:
67
+ self._extension_type_variation(ext_type_variation)
68
+ case _:
69
+ raise AssertionError("Invalid substrait plan")
70
+
71
+ def _extension_function(self, ext: SimpleExtensionDeclarationExtensionFunction):
72
+ ext_uri: str = self.extension_uris[ext.extension_uri_reference]
73
+ match ext_uri:
74
+ case "https://github.com/substrait-io/substrait/blob/main/extensions/functions_boolean.yaml":
75
+ match ext.name:
76
+ case "or":
77
+ self.functions[ext.function_anchor] = se.or_
78
+ case "and":
79
+ self.functions[ext.function_anchor] = se.and_
80
+ case "xor":
81
+ self.functions[ext.function_anchor] = se.xor
82
+ case "not":
83
+ self.functions[ext.function_anchor] = se.not_
84
+ case _:
85
+ raise NotImplementedError(f"Function name {ext.name} not supported")
86
+ case "https://github.com/substrait-io/substrait/blob/main/extensions/functions_comparison.yaml":
87
+ match ext.name:
88
+ case "equal":
89
+ self.functions[ext.function_anchor] = se.eq
90
+ case "not_equal":
91
+ self.functions[ext.function_anchor] = se.neq
92
+ case "lt":
93
+ self.functions[ext.function_anchor] = se.lt
94
+ case "lte":
95
+ self.functions[ext.function_anchor] = se.lte
96
+ case "gt":
97
+ self.functions[ext.function_anchor] = se.gt
98
+ case "gte":
99
+ self.functions[ext.function_anchor] = se.gte
100
+ case "is_null":
101
+ self.functions[ext.function_anchor] = se.is_null
102
+ case "is_not_null":
103
+ self.functions[ext.function_anchor] = se.is_not_null
104
+ case _:
105
+ raise NotImplementedError(f"Function name {ext.name} not supported")
106
+ case uri:
107
+ raise NotImplementedError(f"Function extension URI {uri} not supported")
108
+
109
+ def _extension_type(self, ext: SimpleExtensionDeclarationExtensionType):
110
+ ext_uri: str = self.extension_uris[ext.extension_uri_reference]
111
+ match ext_uri:
112
+ case "https://github.com/apache/arrow/blob/main/format/substrait/extension_types.yaml":
113
+ match ext.name:
114
+ case "null":
115
+ self.type_factories[ext.type_anchor] = pa.null
116
+ case "interval_month_day_nano":
117
+ self.type_factories[ext.type_anchor] = pa.month_day_nano_interval
118
+ case "u8":
119
+ self.type_factories[ext.type_anchor] = pa.uint8
120
+ case "u16":
121
+ self.type_factories[ext.type_anchor] = pa.uint16
122
+ case "u32":
123
+ self.type_factories[ext.type_anchor] = pa.uint32
124
+ case "u64":
125
+ self.type_factories[ext.type_anchor] = pa.uint64
126
+ case "fp16":
127
+ self.type_factories[ext.type_anchor] = pa.float16
128
+ case "date_millis":
129
+ self.type_factories[ext.type_anchor] = pa.date64
130
+ case "time_seconds":
131
+ self.type_factories[ext.type_anchor] = lambda: pa.time32("s")
132
+ case "time_millis":
133
+ self.type_factories[ext.type_anchor] = lambda: pa.time32("ms")
134
+ case "time_nanos":
135
+ self.type_factories[ext.type_anchor] = lambda: pa.time64("ns")
136
+ case "large_string":
137
+ self.type_factories[ext.type_anchor] = pa.large_string
138
+ case "large_binary":
139
+ self.type_factories[ext.type_anchor] = pa.large_binary
140
+ case "decimal256":
141
+ self.type_factories[ext.type_anchor] = pa.decimal256
142
+ case "large_list":
143
+ self.type_factories[ext.type_anchor] = pa.large_list
144
+ case "fixed_size_list":
145
+ self.type_factories[ext.type_anchor] = pa.list_
146
+ case "duration":
147
+ self.type_factories[ext.type_anchor] = pa.duration
148
+ case uri:
149
+ raise NotImplementedError(f"Type extension URI {uri} not support")
150
+
151
+ def _extension_type_variation(self, ext: SimpleExtensionDeclarationExtensionTypeVariation):
152
+ raise NotImplementedError()
153
+
154
+ def _expr(self, expr: Expression) -> Expr:
155
+ match betterproto.which_one_of(expr, "rex_type"):
156
+ case "literal", e:
157
+ return self._expr_literal(e)
158
+ case "selection", e:
159
+ return self._expr_selection(e)
160
+ case "scalar_function", e:
161
+ return self._expr_scalar_function(e)
162
+ case "window_function", _:
163
+ raise ValueError("Window functions are not supported in Spiral push-down")
164
+ case "if_then", e:
165
+ return self._expr_if_then(e)
166
+ case "switch", e:
167
+ return self._expr_switch(e)
168
+ case "singular_or_list", _:
169
+ raise ValueError("singular_or_list is not supported in Spiral push-down")
170
+ case "multi_or_list", _:
171
+ raise ValueError("multi_or_list is not supported in Spiral push-down")
172
+ case "cast", e:
173
+ return self._expr_cast(e)
174
+ case "subquery", _:
175
+ raise ValueError("Subqueries are not supported in Spiral push-down")
176
+ case "nested", e:
177
+ return self._expr_nested(e)
178
+ case _:
179
+ raise NotImplementedError(f"Expression type {expr.rex_type} not implemented")
180
+
181
+ def _expr_literal(self, expr: ExpressionLiteral):
182
+ # TODO(ngates): the Spiral literal expression is quite weakly typed...
183
+ # Maybe we can switch to Vortex?
184
+ simple = {
185
+ "boolean",
186
+ "i8",
187
+ "i16",
188
+ "i32",
189
+ "i64",
190
+ "fp32",
191
+ "fp64",
192
+ "string",
193
+ "binary",
194
+ "fixed_char",
195
+ "var_char",
196
+ "fixed_binary",
197
+ }
198
+
199
+ match betterproto.which_one_of(expr, "literal_type"):
200
+ case type_, v if type_ in simple:
201
+ return se.scalar(pa.scalar(v))
202
+ case "timestamp", v:
203
+ return se.scalar(pa.scalar(v, type=pa.timestamp("us")))
204
+ case "date", v:
205
+ return se.scalar(pa.scalar(v, type=pa.date32()))
206
+ case "time", v:
207
+ # Substrait time is us since midnight. PyArrow only supports ms.
208
+ v: int
209
+ v = int(v / 1000)
210
+ return se.scalar(pa.scalar(v, type=pa.time32("ms")))
211
+ case "null", _null_type:
212
+ # We need a typed null value
213
+ raise NotImplementedError()
214
+ case "struct", v:
215
+ v: ExpressionLiteralStruct
216
+ # Hmm, v has fields, but no field names. I guess we return a list and the type is applied later?
217
+ raise NotImplementedError()
218
+ case "list", v:
219
+ v: ExpressionLiteralList
220
+ return pa.scalar([self._expr_literal(e) for e in v.values])
221
+ case "user_defined", v:
222
+ v: ExpressionLiteralUserDefined
223
+ raise NotImplementedError()
224
+ case literal_type, _:
225
+ raise NotImplementedError(f"Literal type not supported: {literal_type}")
226
+
227
+ def _expr_selection(self, expr: ExpressionFieldReference):
228
+ match betterproto.which_one_of(expr, "root_type"):
229
+ case "root_reference", _:
230
+ # The reference is relative to the root
231
+ base_expr = self.scope
232
+ base_type = pa.struct(self.schema)
233
+ case _:
234
+ raise NotImplementedError("Only root_reference expressions are supported")
235
+
236
+ match betterproto.which_one_of(expr, "reference_type"):
237
+ case "direct_reference", direct_ref:
238
+ return self._expr_direct_reference(base_expr, base_type, direct_ref)
239
+ case "masked_reference", masked_ref:
240
+ return self._expr_masked_reference(base_expr, base_type, masked_ref)
241
+ case _:
242
+ raise NotImplementedError()
243
+
244
+ def _expr_direct_reference(self, scope: Expr, scope_type: pa.StructType, expr: ExpressionReferenceSegment):
245
+ match betterproto.which_one_of(expr, "reference_type"):
246
+ case "map_key", ref:
247
+ raise NotImplementedError("Map types not yet supported in Spiral")
248
+ case "struct_field", ref:
249
+ ref: ExpressionReferenceSegmentStructField
250
+ field_name = scope_type.field(ref.field).name
251
+
252
+ if field_name in self.key_names:
253
+ # This is a key column, so we need to select it from the scope.
254
+ return se.var(field_name)
255
+
256
+ scope = se.getitem(scope, field_name)
257
+ scope_type = scope_type.field(ref.field).type
258
+ return self._expr_direct_reference(scope, scope_type, ref.child) if ref.child else scope
259
+ case "list_element", ref:
260
+ ref: ExpressionReferenceSegmentListElement
261
+ scope = se.getitem(scope, ref.offset)
262
+ scope_type = scope_type.field(ref.field).type
263
+ return self._expr_direct_reference(scope, scope_type, ref.child) if ref.child else scope
264
+ case "", ref:
265
+ # Because Proto... we hit this case when we recurse into a child node and it's actually "None".
266
+ return scope
267
+ case _:
268
+ raise NotImplementedError()
269
+
270
+ def _expr_masked_reference(self, scope: Expr, scope_type: pa.StructType, expr: ExpressionMaskExpression):
271
+ raise NotImplementedError("Masked references are not yet supported in Spiral push-down")
272
+
273
+ def _expr_scalar_function(self, expr: ExpressionScalarFunction):
274
+ args = [self._expr(arg.value) for arg in expr.arguments]
275
+ return self.functions[expr.function_reference](*args)
spiral/table.py ADDED
@@ -0,0 +1,157 @@
1
+ from datetime import datetime
2
+ from typing import TYPE_CHECKING, Literal
3
+
4
+ import pyarrow as pa
5
+
6
+ from spiral import expressions as se
7
+ from spiral.config import FILE_FORMAT, Config
8
+ from spiral.core.core import Table as CoreTable
9
+ from spiral.core.core import flush_wal, write
10
+ from spiral.expressions.base import Expr, ExprLike
11
+
12
+ if TYPE_CHECKING:
13
+ import duckdb
14
+ import polars as pl
15
+ import pyarrow.dataset
16
+
17
+ from spiral.scan_ import Scan
18
+
19
+
20
class Table(Expr):
    """API for interacting with a SpiralDB's Table.

    Different catalog implementations should ultimately construct a Table object.
    """

    def __init__(
        self,
        table: CoreTable,
        name: str | None = None,
    ):
        """
        :param table: the core table handle this object wraps.
        :param name: display name; falls back to the table id when empty or omitted.
        """
        super().__init__(table.__expr__)

        self._table = table
        self._name = name or self._table.id
        self._key_schema = self._table.key_schema.to_arrow()
        self._key_columns = set(self._key_schema.names)

    @property
    def table_id(self) -> str:
        """Returns the id of the underlying core table."""
        return self._table.id

    @property
    def last_modified_at(self) -> int:
        """Returns ``last_modified_at`` as reported by the table's WAL (``asof=None``)."""
        return self._table.get_wal(asof=None).last_modified_at

    def __str__(self):
        return self._name

    def __repr__(self):
        return f'Table("{self._name}")'

    def __getitem__(self, item: str) -> Expr:
        """Returns a variable for key columns, otherwise defers to ``Expr.__getitem__``."""
        if item in self._key_columns:
            return se.var(name=item)

        return super().__getitem__(item)

    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
        """Select paths from the table, splitting the selection between keys and columns.

        :param paths: the paths to select; key columns may be mixed with other columns.
        :param exclude: paths to exclude. Must not name a key column.
        :raises ValueError: if ``exclude`` contains a key column.
        """
        # Override an expression select in the root column group to split between keys and columns.
        if exclude is not None and set(exclude) & self._key_columns:
            raise ValueError(
                "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
            )

        # De-duplicate while keeping the caller's order. Partitioning through sets
        # made the projected order depend on string hashing, which differs between
        # interpreter runs.
        unique_paths = list(dict.fromkeys(paths))
        key_paths = [path for path in unique_paths if path in self._key_columns]
        if not key_paths:
            return super().select(*paths, exclude=exclude)

        other_paths = [path for path in unique_paths if path not in self._key_columns]
        return se.merge(se.pack({key: se.var(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))

    @property
    def key_schema(self) -> pa.Schema:
        """Returns the key schema of the table."""
        return self._key_schema

    @property
    def schema(self) -> pa.Schema:
        """Returns the FULL schema of the table.

        NOTE: This can be expensive for large tables.
        """
        return self._table.get_schema(asof=None)

    def to_dataset(self) -> "pyarrow.dataset.Dataset":
        """Returns a PyArrow Dataset representing the table."""
        from .dataset import TableDataset

        return TableDataset(self)

    def to_polars(self) -> "pl.LazyFrame":
        """Returns a Polars LazyFrame for the Spiral table."""
        import polars as pl

        return pl.scan_pyarrow_dataset(self.to_dataset())

    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
        """Returns a DuckDB relation for the Spiral table."""
        import duckdb

        return duckdb.from_arrow(self.to_dataset())

    def scan(
        self,
        *projections: ExprLike,
        where: ExprLike | None = None,
        asof: datetime | int | str | None = None,
        exclude_keys: bool = False,
        # TODO(marko): Support config.
        # config: Config | None = None,
    ) -> "Scan":
        """Reads the table. If projections are not provided, the entire table is read.

        See `spiral.scan` for more information.
        """
        from spiral.scan_ import scan

        if not projections:
            projections = (self,)

        return scan(
            *projections,
            where=where,
            asof=asof,
            exclude_keys=exclude_keys,
            # config=config,
        )

    # NOTE: "vortex" is valid format. We don't want that visible in the API docs.
    def write(
        self,
        expr: ExprLike,
        *,
        format: Literal["parquet"] | None = None,
        # TODO(joe): support group_by, and config
        config: Config | None = None,
    ) -> None:
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param format: the format to write the data in. Falls back to ``FILE_FORMAT`` when omitted.
        :param config: The configuration to use for this write.
        """
        file_format = format or FILE_FORMAT
        write(
            self._table,
            se.lift(expr).__expr__,
            format=file_format,
            partition_size=config.partition_file_min_size if config else None,
        )
        # Flush the WAL if configured.
        if config is not None and config.flush_wal_on_write:
            flush_wal(self._table, manifest_format=file_format)
spiral/types_.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import Annotated, TypeAlias
2
+
3
+ from pydantic import UrlConstraints
4
+
5
# Uri: a plain `str` annotated with pydantic `UrlConstraints()` (no constraint arguments given).
+ Uri: TypeAlias = Annotated[str, UrlConstraints()]
6
# Timestamp: alias for `int`; the unit/epoch is not stated here -- TODO confirm against callers.
+ Timestamp: TypeAlias = int