pyspiral 0.8.9__cp311-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. pyspiral-0.8.9.dist-info/METADATA +53 -0
  2. pyspiral-0.8.9.dist-info/RECORD +114 -0
  3. pyspiral-0.8.9.dist-info/WHEEL +4 -0
  4. pyspiral-0.8.9.dist-info/entry_points.txt +3 -0
  5. spiral/__init__.py +55 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +411 -0
  8. spiral/api/__init__.py +78 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +165 -0
  11. spiral/api/filesystems.py +152 -0
  12. spiral/api/key_space_indexes.py +23 -0
  13. spiral/api/organizations.py +78 -0
  14. spiral/api/projects.py +219 -0
  15. spiral/api/telemetry.py +19 -0
  16. spiral/api/text_indexes.py +56 -0
  17. spiral/api/types.py +23 -0
  18. spiral/api/workers.py +40 -0
  19. spiral/api/workloads.py +52 -0
  20. spiral/arrow_.py +202 -0
  21. spiral/cli/__init__.py +89 -0
  22. spiral/cli/__main__.py +4 -0
  23. spiral/cli/admin.py +33 -0
  24. spiral/cli/app.py +108 -0
  25. spiral/cli/console.py +95 -0
  26. spiral/cli/fs.py +109 -0
  27. spiral/cli/iceberg.py +97 -0
  28. spiral/cli/key_spaces.py +103 -0
  29. spiral/cli/login.py +25 -0
  30. spiral/cli/orgs.py +81 -0
  31. spiral/cli/printer.py +53 -0
  32. spiral/cli/projects.py +148 -0
  33. spiral/cli/state.py +7 -0
  34. spiral/cli/tables.py +225 -0
  35. spiral/cli/telemetry.py +17 -0
  36. spiral/cli/text.py +115 -0
  37. spiral/cli/types.py +50 -0
  38. spiral/cli/workloads.py +86 -0
  39. spiral/client.py +279 -0
  40. spiral/core/__init__.pyi +0 -0
  41. spiral/core/_tools/__init__.pyi +5 -0
  42. spiral/core/authn/__init__.pyi +21 -0
  43. spiral/core/client/__init__.pyi +270 -0
  44. spiral/core/config/__init__.pyi +35 -0
  45. spiral/core/expr/__init__.pyi +15 -0
  46. spiral/core/expr/images/__init__.pyi +3 -0
  47. spiral/core/expr/list_/__init__.pyi +4 -0
  48. spiral/core/expr/pushdown/__init__.pyi +3 -0
  49. spiral/core/expr/refs/__init__.pyi +4 -0
  50. spiral/core/expr/s3/__init__.pyi +3 -0
  51. spiral/core/expr/str_/__init__.pyi +3 -0
  52. spiral/core/expr/struct_/__init__.pyi +6 -0
  53. spiral/core/expr/text/__init__.pyi +5 -0
  54. spiral/core/expr/udf/__init__.pyi +14 -0
  55. spiral/core/expr/video/__init__.pyi +3 -0
  56. spiral/core/table/__init__.pyi +142 -0
  57. spiral/core/table/manifests/__init__.pyi +35 -0
  58. spiral/core/table/metastore/__init__.pyi +58 -0
  59. spiral/core/table/spec/__init__.pyi +214 -0
  60. spiral/dataloader.py +310 -0
  61. spiral/dataset.py +264 -0
  62. spiral/datetime_.py +27 -0
  63. spiral/debug/__init__.py +0 -0
  64. spiral/debug/manifests.py +103 -0
  65. spiral/debug/metrics.py +56 -0
  66. spiral/debug/scan.py +266 -0
  67. spiral/demo.py +100 -0
  68. spiral/enrichment.py +290 -0
  69. spiral/expressions/__init__.py +274 -0
  70. spiral/expressions/base.py +186 -0
  71. spiral/expressions/file.py +17 -0
  72. spiral/expressions/http.py +17 -0
  73. spiral/expressions/list_.py +77 -0
  74. spiral/expressions/pushdown.py +12 -0
  75. spiral/expressions/s3.py +16 -0
  76. spiral/expressions/str_.py +39 -0
  77. spiral/expressions/struct.py +59 -0
  78. spiral/expressions/text.py +62 -0
  79. spiral/expressions/tiff.py +225 -0
  80. spiral/expressions/udf.py +66 -0
  81. spiral/grpc_.py +32 -0
  82. spiral/iceberg.py +31 -0
  83. spiral/iterable_dataset.py +106 -0
  84. spiral/key_space_index.py +44 -0
  85. spiral/project.py +247 -0
  86. spiral/protogen/_/__init__.py +0 -0
  87. spiral/protogen/_/arrow/__init__.py +0 -0
  88. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  89. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  90. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
  91. spiral/protogen/_/google/__init__.py +0 -0
  92. spiral/protogen/_/google/protobuf/__init__.py +2310 -0
  93. spiral/protogen/_/message_pool.py +3 -0
  94. spiral/protogen/_/py.typed +0 -0
  95. spiral/protogen/_/scandal/__init__.py +190 -0
  96. spiral/protogen/_/spfs/__init__.py +72 -0
  97. spiral/protogen/_/spql/__init__.py +61 -0
  98. spiral/protogen/_/substrait/__init__.py +6196 -0
  99. spiral/protogen/_/substrait/extensions/__init__.py +169 -0
  100. spiral/protogen/__init__.py +0 -0
  101. spiral/protogen/util.py +41 -0
  102. spiral/py.typed +0 -0
  103. spiral/scan.py +383 -0
  104. spiral/server.py +37 -0
  105. spiral/settings.py +36 -0
  106. spiral/snapshot.py +61 -0
  107. spiral/streaming_/__init__.py +3 -0
  108. spiral/streaming_/reader.py +133 -0
  109. spiral/streaming_/stream.py +156 -0
  110. spiral/substrait_.py +274 -0
  111. spiral/table.py +216 -0
  112. spiral/text_index.py +17 -0
  113. spiral/transaction.py +156 -0
  114. spiral/types_.py +6 -0
spiral/table.py ADDED
@@ -0,0 +1,216 @@
1
+ from datetime import datetime
2
+ from typing import TYPE_CHECKING
3
+
4
+ from spiral.core.table import Table as CoreTable
5
+ from spiral.core.table.spec import Schema
6
+ from spiral.enrichment import Enrichment
7
+ from spiral.expressions.base import Expr, ExprLike
8
+ from spiral.snapshot import Snapshot
9
+ from spiral.transaction import Transaction
10
+
11
+ if TYPE_CHECKING:
12
+ import duckdb
13
+ import polars as pl
14
+ import pyarrow.dataset as ds
15
+
16
+ from spiral.client import Spiral
17
+ from spiral.key_space_index import KeySpaceIndex
18
+ from spiral.streaming_ import SpiralStream
19
+
20
+
21
class Table(Expr):
    """API for interacting with a SpiralDB's Table.

    Spiral Table is a powerful and flexible way for storing, analyzing,
    and querying massive and/or multimodal datasets. The data model will feel familiar
    to users of SQL- or DataFrame-style systems, yet is designed to be more flexible, more powerful,
    and more useful in the context of modern data processing.

    Tables are stored and queried directly from object storage.
    """

    def __init__(self, spiral: "Spiral", core: CoreTable, *, identifier: str | None = None):
        """Wrap a core table handle.

        :param spiral: The client this table belongs to; used to open transactions and scans.
        :param core: The core table handle. Its ``__expr__`` backs this object as an expression.
        :param identifier: Optional identifier of the table. The ``project``, ``dataset`` and
            ``name`` properties split it on ``"."`` into exactly three parts.
        """
        super().__init__(core.__expr__)

        self.spiral = spiral
        self.core = core

        self._key_schema = core.key_schema
        self._key_columns = set(self._key_schema.names)
        self._identifier = identifier

    @property
    def table_id(self) -> str:
        """Returns the ID of the underlying core table."""
        return self.core.id

    @property
    def identifier(self) -> str:
        """Returns the fully qualified identifier of the table.

        Falls back to the table ID when no identifier was given.
        """
        return self._identifier or self.table_id

    @property
    def project(self) -> str | None:
        """Returns the project of the table, or None when the table has no identifier.

        :raises ValueError: If the identifier does not consist of exactly three dot-separated parts.
        """
        if self._identifier is None:
            return None
        project, _, _ = self._identifier.split(".")
        return project

    @property
    def dataset(self) -> str | None:
        """Returns the dataset of the table, or None when the table has no identifier.

        :raises ValueError: If the identifier does not consist of exactly three dot-separated parts.
        """
        if self._identifier is None:
            return None
        _, dataset, _ = self._identifier.split(".")
        return dataset

    @property
    def name(self) -> str | None:
        """Returns the name of the table, or None when the table has no identifier.

        :raises ValueError: If the identifier does not consist of exactly three dot-separated parts.
        """
        if self._identifier is None:
            return None
        _, _, name = self._identifier.split(".")
        return name

    def last_modified_at(self) -> int:
        """Returns ``last_modified_at`` of the table's latest WAL (``asof=None``)."""
        return self.core.get_wal(asof=None).last_modified_at

    def __str__(self) -> str:
        return self.identifier

    def __repr__(self) -> str:
        return f'Table("{self.identifier}")'

    def __getitem__(self, item: str | int | list[str]) -> Expr:
        return super().__getitem__(item)

    def select(self, *paths: str, exclude: list[str] | None = None) -> "Expr":
        """Delegates to ``Expr.select`` with the given paths and exclusions."""
        return super().select(*paths, exclude=exclude)

    @property
    def key_schema(self) -> Schema:
        """Returns the key schema of the table."""
        return self._key_schema

    def schema(self) -> Schema:
        """Returns the FULL schema of the table.

        NOTE: This can be expensive for large tables.
        """
        return self.core.get_schema(asof=None)

    def write(self, expr: ExprLike, push_down_nulls: bool = False, **kwargs) -> None:
        """Write an item to the table inside a single transaction.

        The transaction is opened with :meth:`txn` and committed when the write succeeds.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param push_down_nulls: Whether to push down nullable structs down its children. E.g. `[{"a": 1}, null]` would
            become `[{"a": 1}, {"a": null}]`. SpiralDB doesn't allow struct-level nullability, so use this option if your
            data contains nullable structs.
        :param kwargs: Forwarded unchanged to :meth:`txn`.
        """
        with self.txn(**kwargs) as txn:
            txn.write(expr, push_down_nulls=push_down_nulls)

    def enrich(
        self,
        *projections: ExprLike,
        where: ExprLike | None = None,
    ) -> Enrichment:
        """Returns an Enrichment object that, when applied, produces new columns.

        Enrichment can be applied in different ways, e.g. distributed.

        :param projections: Projection expressions deriving new columns to write back.
            Expressions can be over multiple Spiral tables, but all tables including
            this one must share the same key schema.
        :param where: Optional filter expression to apply when reading the input tables.
        """
        # Imported lazily, as in the rest of this module's method-level imports.
        from spiral import expressions as se

        projection = se.merge(*projections)
        if where is not None:
            where = se.lift(where)

        return Enrichment(self, projection, where)

    def drop_columns(self, column_paths: list[str]) -> None:
        """
        Drops the specified columns from the table.

        The drop runs inside its own transaction.

        :param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
            All columns must exist, if a column doesn't exist the function will return an error.
        """
        with self.txn() as txn:
            txn.drop_columns(column_paths)

    def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
        """Returns a snapshot of the table at the given timestamp.

        :param asof: Point in time of the snapshot. A ``datetime`` is converted to integer
            microseconds since the Unix epoch; a naive ``datetime`` is interpreted in the system
            local timezone, exactly as ``datetime.timestamp()`` would. An ``int`` or ``None`` is
            passed through unchanged.
        """
        if isinstance(asof, datetime):
            from datetime import timedelta, timezone

            # Integer timedelta arithmetic is exact. Going through the float returned by
            # `timestamp()` and truncating with `int()` can be off by one microsecond.
            epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
            asof = (asof.astimezone(timezone.utc) - epoch) // timedelta(microseconds=1)
        return Snapshot(self, self.core.get_snapshot(asof=asof))

    def txn(self, **kwargs) -> Transaction:
        """Begins a new transaction. Transaction must be committed for writes to become visible.

        While transaction can be used to atomically write data to the table,
        it is important that the primary key columns are unique within the transaction.
        The behavior is undefined if this is not the case.

        :param kwargs: Forwarded unchanged to the core client's ``transaction`` call.
        """
        return Transaction(self.spiral.core.transaction(self.core, **kwargs))

    def to_arrow_dataset(self) -> "ds.Dataset":
        """Returns a PyArrow Dataset representing the table."""
        return self.snapshot().to_arrow_dataset()

    def to_polars_lazy_frame(self) -> "pl.LazyFrame":
        """Returns a Polars LazyFrame for the Spiral table."""
        return self.snapshot().to_polars_lazy_frame()

    def to_duckdb_relation(self) -> "duckdb.DuckDBPyRelation":
        """Returns a DuckDB relation for the Spiral table."""
        return self.snapshot().to_duckdb_relation()

    def to_streaming_stream(
        self,
        index: "KeySpaceIndex",
        *,
        projection: Expr | None = None,
        cache_dir: str | None = None,
        shard_row_block_size: int | None = None,
    ) -> "SpiralStream":
        """Returns a stream to be used with MosaicML's StreamingDataset.

        Requires `streaming` package to be installed.

        Args:
            index: Prebuilt KeySpaceIndex to use when creating the stream.
                The index's `asof` will be used when scanning.
            projection: Optional projection to use when scanning the table if index's projection is not used.
                Projection must be compatible with the index's projection for correctness.
            cache_dir: Directory to use for caching data. If None, a temporary directory will be used.
            shard_row_block_size: Number of rows per segment of a shard file. Defaults to 8192.
                Value should be set to lower for larger rows.

        Raises:
            ValueError: If the index was built on a different table, or if its `asof` is 0.
        """
        from spiral.streaming_ import SpiralStream

        if index.table_id != self.table_id:
            raise ValueError("Index must be built on the same table as the scan.")
        if index.asof == 0:
            raise ValueError("Index have to be synced before it can be used.")

        # We know table from projection is in the session cause this method is on it.
        scan = self.spiral.scan(
            projection if projection is not None else index.projection,
            where=index.filter,
            asof=index.asof,
        )
        shards = self.spiral.internal.key_space_index_shards(index=index.core)

        return SpiralStream(
            sp=self.spiral,
            scan=scan,
            shards=shards,
            cache_dir=cache_dir,
            shard_row_block_size=shard_row_block_size,
        )
spiral/text_index.py ADDED
@@ -0,0 +1,17 @@
1
+ from spiral.core.client import TextIndex as CoreTextIndex
2
+ from spiral.expressions import Expr
3
+
4
+
5
class TextIndex(Expr):
    """Expression backed by a core text index handle."""

    def __init__(self, core: CoreTextIndex, *, name: str | None = None):
        """Wrap ``core``; its ``__expr__`` backs this object as an expression.

        :param core: The core text index handle.
        :param name: Optional name of the index.
        """
        super().__init__(core.__expr__)
        self.core = core
        self._name = name

    @property
    def index_id(self) -> str:
        """Returns the ID of the underlying core index."""
        return self.core.id

    @property
    def name(self) -> str:
        """Returns the name of the index, falling back to its ID when no name is set."""
        if self._name:
            return self._name
        return self.index_id
spiral/transaction.py ADDED
@@ -0,0 +1,156 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from spiral.core.client import Shard
5
+ from spiral.core.table import Transaction as CoreTransaction
6
+ from spiral.core.table.spec import Operation
7
+ from spiral.expressions.base import ExprLike
8
+ from spiral.scan import Scan
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class Transaction:
    """Spiral table transaction.

    While transaction can be used to atomically write data to the table,
    it is important that the primary key columns are unique within the transaction.

    Can be used as a context manager: the transaction is committed when the block
    exits without an exception and aborted otherwise.
    """

    def __init__(self, core: CoreTransaction):
        self._core = core

    @property
    def status(self) -> str:
        """The status of the transaction."""
        return self._core.status

    def is_empty(self) -> bool:
        """Check if the transaction has no operations."""
        return self._core.is_empty()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Commit on a clean exit, abort if the block raised. Nothing is returned,
        # so an exception raised inside the block keeps propagating.
        if exc_type is None:
            self._core.commit()
        else:
            self._core.abort()

    def write(self, expr: ExprLike, push_down_nulls: bool = False):
        """Write an item to the table inside a single transaction.

        :param expr: The expression to write. Must evaluate to a struct array.
        :param push_down_nulls: Whether to push down nullable structs down its children. E.g. `[{"a": 1}, null]` would
            become `[{"a": 1}, {"a": null}]`. SpiralDB doesn't allow struct-level nullability, so use this option if your
            data contains nullable structs.
        """
        from spiral import expressions as se

        record_batches = se.evaluate(expr)
        if push_down_nulls:
            self._core.write_push_down(record_batches)
        else:
            self._core.write(record_batches)

    def writeback(
        self,
        scan: Scan,
        *,
        shards: list[Shard] | None = None,
    ):
        """Write back the results of a scan to the table.

        :param scan: The scan to write back.
            The scan does NOT need to be over the same table as transaction,
            but it does need to have the same key schema.
        :param shards: The shards to read from. If not provided, all shards are read.
        """
        self._core.writeback(scan.core, shards=shards)

    def drop_columns(self, column_paths: list[str]):
        """
        Drops the specified columns from the table.

        :param column_paths: Fully qualified column names. (e.g., "column_name" or "nested.field").
            All columns must exist, if a column doesn't exist the function will return an error.
        """
        self._core.drop_columns(column_paths)

    def compact_key_space(self):
        """Compact the key space of the table."""
        self._core.compact_key_space()

    def take(self) -> list[Operation]:
        """Take the operations from the transaction.

        Transaction can no longer be committed or aborted after calling this method.
        """
        return self._core.take()

    def include(self, ops: list[Operation]):
        """Include the given operations in the transaction.

        Checks for conflicts between the included operations and any existing operations.
        """
        self._core.include(ops)

    def commit(self, *, txn_dump: str | None = None, compact: bool = False):
        """Commit the transaction.

        :param txn_dump: Optional path of a JSONL file to dump the transaction's operations to
            (one JSON document per line) before committing. Parent directories are created.
            Dumping is best-effort: a failure is logged and the commit still proceeds.
        :param compact: Forwarded to the core transaction's ``commit``.
        """
        if txn_dump is not None:
            try:
                # Create parent directories if they don't exist
                dump_path = Path(txn_dump)
                dump_path.parent.mkdir(parents=True, exist_ok=True)

                # Write operations to a JSONL file. The encoding is pinned so that dumps
                # do not depend on the locale of the machine that wrote them.
                with open(dump_path, "w", encoding="utf-8") as f:
                    f.writelines(op.to_json() + "\n" for op in self._core.ops())

                logger.info("Transaction dumped to %s", txn_dump)
            except Exception as e:
                # Deliberately broad: a failed dump must not prevent the commit.
                # `exception` keeps the traceback that `error` would drop.
                logger.exception("Failed to dump transaction to %s: %s", txn_dump, e)

        self._core.commit(compact=compact)

    @staticmethod
    def load_dumps(*txn_dump: str) -> list[Operation]:
        """Load the operations stored in one or more transaction dump files.

        :param txn_dump: Paths of JSONL dump files, as written by ``commit(txn_dump=...)``.
        :return: The operations of all dumps, in file order and then line order.
        :raises ValueError: If a line contains text that cannot be parsed as JSON.
        """
        import json

        # A single decoder is stateless across calls and can be shared by every line.
        decoder = json.JSONDecoder()
        ops: list[Operation] = []

        for dump in txn_dump:
            with open(dump, encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()

                    # Each line may contain multiple JSON objects concatenated together
                    # This is due to a bug in the dump writing code.
                    # Use JSONDecoder to parse them one by one
                    # (blank lines never enter the loop below).
                    idx = 0
                    while idx < len(line):
                        try:
                            obj, idx = decoder.raw_decode(line, idx)
                        except json.JSONDecodeError as e:
                            raise ValueError(f"Failed to parse JSON at position {idx} in line: {line}") from e
                        ops.append(Operation.from_json(json.dumps(obj)))
                        # Skip whitespace between JSON objects
                        while idx < len(line) and line[idx].isspace():
                            idx += 1

        return ops

    def abort(self):
        """Abort the transaction."""
        self._core.abort()
spiral/types_.py ADDED
@@ -0,0 +1,6 @@
1
+ from typing import Annotated, TypeAlias
2
+
3
+ from pydantic import UrlConstraints
4
+
5
+ Uri: TypeAlias = Annotated[str, UrlConstraints()]  # a str annotated with pydantic UrlConstraints() built with no arguments
6
+ Timestamp: TypeAlias = int  # plain int alias; NOTE(review): unit is not stated here, presumably microseconds since epoch; confirm