pyspiral-0.8.9-cp311-abi3-macosx_11_0_arm64.whl → pyspiral-0.9.9-cp311-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/METADATA +4 -2
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/RECORD +39 -34
- spiral/__init__.py +3 -2
- spiral/_lib.abi3.so +0 -0
- spiral/api/__init__.py +7 -0
- spiral/api/client.py +86 -8
- spiral/api/projects.py +4 -2
- spiral/api/tables.py +77 -0
- spiral/arrow_.py +4 -155
- spiral/cli/app.py +10 -4
- spiral/cli/chooser.py +30 -0
- spiral/cli/fs.py +3 -2
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +4 -4
- spiral/cli/orgs.py +1 -1
- spiral/cli/projects.py +2 -2
- spiral/cli/tables.py +47 -20
- spiral/cli/telemetry.py +13 -6
- spiral/cli/text.py +4 -4
- spiral/cli/transactions.py +84 -0
- spiral/cli/{types.py → types_.py} +6 -6
- spiral/cli/workloads.py +4 -4
- spiral/client.py +70 -8
- spiral/core/client/__init__.pyi +25 -16
- spiral/core/table/__init__.pyi +24 -22
- spiral/debug/manifests.py +21 -9
- spiral/debug/scan.py +4 -6
- spiral/demo.py +145 -38
- spiral/enrichment.py +18 -23
- spiral/expressions/__init__.py +3 -75
- spiral/expressions/base.py +5 -10
- spiral/huggingface.py +456 -0
- spiral/input.py +131 -0
- spiral/ray_.py +75 -0
- spiral/scan.py +218 -64
- spiral/table.py +5 -4
- spiral/transaction.py +95 -15
- spiral/iterable_dataset.py +0 -106
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/WHEEL +0 -0
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/entry_points.txt +0 -0
spiral/transaction.py
CHANGED

@@ -1,14 +1,23 @@
+from __future__ import annotations
+
 import logging
 from pathlib import Path
+from typing import TYPE_CHECKING

 from spiral.core.client import Shard
 from spiral.core.table import Transaction as CoreTransaction
 from spiral.core.table.spec import Operation
-from spiral.
+from spiral.input import TableLike, evaluate
 from spiral.scan import Scan
+from spiral.types_ import Timestamp

 logger = logging.getLogger(__name__)

+if TYPE_CHECKING:
+    import ray.data
+
+    from spiral.table import Table
+

 class Transaction:
     """Spiral table transaction.
@@ -17,14 +26,20 @@ class Transaction:
     it is important that the primary key columns are unique within the transaction.
     """

-    def __init__(self, core: CoreTransaction):
+    def __init__(self, table: Table, core: CoreTransaction):
         self._core = core
+        self._table = table

     @property
     def status(self) -> str:
         """The status of the transaction."""
         return self._core.status

+    @property
+    def table(self) -> Table:
+        """The table associated with this transaction."""
+        return self._table
+
     def is_empty(self) -> bool:
         """Check if the transaction has no operations."""
         return self._core.is_empty()
@@ -38,22 +53,21 @@ class Transaction:
         else:
             self._core.abort()

-    def write(self,
+    def write(self, table: TableLike, push_down_nulls: bool = False):
         """Write an item to the table inside a single transaction.

         :param push_down_nulls: Whether to push down nullable structs down its children. E.g. `[{"a": 1}, null]` would
             become `[{"a": 1}, {"a": null}]`. SpiralDB doesn't allow struct-level nullability, so use this option if your
             data contains nullable structs.

-        :param
+        :param table: The table to write.
         """
-
+        record_batch_reader = evaluate(table)

-        record_batches = se.evaluate(expr)
         if push_down_nulls:
-            self._core.write_push_down(
+            self._core.write_push_down(record_batch_reader)
         else:
-            self._core.write(
+            self._core.write(record_batch_reader)

     def writeback(
         self,
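
The shape change that `push_down_nulls` describes can be reproduced with plain pyarrow (a sketch of the transformation only; the actual rewrite happens inside the core writer):

import pyarrow as pa

struct_type = pa.struct([pa.field("a", pa.int64())])
structs = pa.array([{"a": 1}, None], type=struct_type)

# flatten() pushes the parent struct's validity down into each child, so
# rebuilding the struct from the flattened children yields a non-null
# struct column whose children carry the nulls instead.
pushed = pa.StructArray.from_arrays(structs.flatten(), fields=list(struct_type))

print(structs.to_pylist())  # [{'a': 1}, None]
print(pushed.to_pylist())   # [{'a': 1}, {'a': None}]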
@@ -70,6 +84,12 @@ class Transaction:
         """
         self._core.writeback(scan.core, shards=shards)

+    def to_ray_datasink(self) -> ray.data.Datasink:
+        """Returns a Ray Datasink which writes into this transaction."""
+        from spiral.ray_ import Datasink
+
+        return Datasink(self)
+
     def drop_columns(self, column_paths: list[str]):
         """
         Drops the specified columns from the table.
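
`to_ray_datasink` plugs the transaction into Ray Data's generic sink API (the `Datasink` comes from the new `spiral/ray_.py` in this release). A minimal sketch of the intended flow; how the `txn` Transaction is obtained is an assumption, since the diff only shows that `Table` constructs it:

import ray

ds = ray.data.from_items([{"id": i, "value": i * i} for i in range(100)])

# `txn` is a spiral Transaction obtained from a Table (construction not
# shown in this diff). Ray workers write batches into the transaction,
# and commit() then publishes them atomically.
ds.write_datasink(txn.to_ray_datasink())
txn.commit()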
@@ -83,21 +103,29 @@ class Transaction:
         """Compact the key space of the table."""
         self._core.compact_key_space()

-    def take(self) ->
+    def take(self) -> TransactionOps:
         """Take the operations from the transaction

         Transaction can no longer be committed or aborted after calling this method.
         ."""
-        return self._core.take()
+        return TransactionOps(self._core.snapshot().asof, self._core.take())

-    def include(self, ops:
+    def include(self, ops: TransactionOps):
         """Include the given operations in the transaction.

         Checks for conflicts between the included operations and any existing operations.
-        """
-        self._core.include(ops)

-
+        IMPORTANT: The `self` transaction must be started at or before the timestamp of the included operations.
+        """
+        self_asof = self._core.snapshot().asof
+        if ops.timestamp < self_asof:
+            raise ValueError(
+                f"Cannot include operations created against an out-of-date state of the table: {ops.timestamp}. "
+                f"This transaction's asof is {self_asof}."
+            )
+        self._core.include(ops.operations)
+
+    def commit(self, *, txn_dump: str | None = None):
         """Commit the transaction."""
         if txn_dump is not None:
             try:
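
Together, `take`, `include`, and the timestamp guard enable a fan-out/fan-in commit pattern: workers stage operations, and one coordinator merges and commits them. A hedged sketch (how the two transactions are created is not shown in this diff and is assumed):

# Worker: stage writes, then detach the operations instead of committing.
ops = worker_txn.take()  # worker_txn can no longer commit or abort

# Coordinator: its transaction must have started at or before
# ops.timestamp, otherwise include() raises ValueError.
coordinator_txn.include(ops)
coordinator_txn.commit()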
@@ -114,7 +142,7 @@ class Transaction:
             except Exception as e:
                 logger.error(f"Failed to dump transaction to {txn_dump}: {e}")

-        self._core.commit(
+        self._core.commit()

     @staticmethod
     def load_dumps(*txn_dump: str) -> list[Operation]:
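
The `txn_dump` keyword pairs with `load_dumps`: commit first tries to persist the transaction's operations to the given destination (failures are only logged, so the commit itself still proceeds), and `load_dumps` reads such dumps back as `Operation`s. A sketch with an illustrative path:

txn.commit(txn_dump="/tmp/spiral_txn.dump")  # best-effort dump, then commit

# Later, e.g. when debugging a commit:
ops = Transaction.load_dumps("/tmp/spiral_txn.dump")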
@@ -154,3 +182,55 @@ class Transaction:
     def abort(self):
         """Abort the transaction."""
         self._core.abort()
+
+
+class TransactionOps:
+    """
+    Operations taken from a transaction.
+
+    Operations are timestamped and can only be included in transactions
+    that are started at or before the timestamp of the operations.
+    """
+
+    def __init__(self, timestamp: Timestamp, operations: list[Operation]):
+        self._timestamp = timestamp
+        self._operations = operations
+
+    @property
+    def timestamp(self) -> Timestamp:
+        """The timestamp of the operations.
+
+        These operations can only be included in transactions started at or before this timestamp.
+        """
+        return self._timestamp
+
+    @property
+    def operations(self) -> list[Operation]:
+        """The list of operations."""
+        return self._operations
+
+    def to_json(self):
+        """Serialize the TransactionOps to JSON."""
+        import json
+
+        return json.dumps(
+            {
+                "timestamp": self.timestamp,
+                "operations": [op.to_json() for op in self.operations],
+            }
+        )
+
+    @classmethod
+    def from_json(cls, json_str: str) -> TransactionOps:
+        """Deserialize the TransactionOps from JSON."""
+        import json
+
+        data = json.loads(json_str)
+        return TransactionOps(
+            timestamp=data["timestamp"],
+            operations=[Operation.from_json(op_json) for op_json in data["operations"]],
+        )
+
+    def __reduce__(self):
+        """Support pickle protocol by using JSON serialization."""
+        return (self.__class__.from_json, (self.to_json(),))
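
Because `__reduce__` delegates to the JSON codec, `TransactionOps` also survives plain pickling, which is what lets staged operations travel across process boundaries (e.g. from Ray workers back to a driver). A quick sketch, reusing `ops` from the take/include example above:

import pickle

restored = pickle.loads(pickle.dumps(ops))  # round-trips via to_json/from_json
assert restored.timestamp == ops.timestamp
assert len(restored.operations) == len(ops.operations)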
spiral/iterable_dataset.py
DELETED

@@ -1,106 +0,0 @@
-from collections.abc import Callable, Iterator
-from typing import TYPE_CHECKING
-
-import pyarrow as pa
-
-if TYPE_CHECKING:
-    import datasets.iterable_dataset as hf  # noqa
-    import streaming  # noqa
-    import torch.utils.data as torchdata  # noqa
-
-
-def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
-    """
-    Replace string-view and binary-view columns in the schema with strings/binary.
-    Recursively handles nested types (struct, list, etc).
-    We use this converted schema as Features in the returned Dataset.
-    Remove this method once we have https://github.com/huggingface/datasets/pull/7718
-    """
-
-    def _convert_type(dtype: pa.DataType) -> pa.DataType:
-        if dtype == pa.string_view():
-            return pa.string()
-        elif dtype == pa.binary_view():
-            return pa.binary()
-        elif pa.types.is_struct(dtype):
-            new_fields = [
-                pa.field(field.name, _convert_type(field.type), nullable=field.nullable, metadata=field.metadata)
-                for field in dtype
-            ]
-            return pa.struct(new_fields)
-        elif pa.types.is_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type))
-        elif pa.types.is_large_list(dtype):
-            return pa.large_list(_convert_type(dtype.value_type))
-        elif pa.types.is_fixed_size_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type), dtype.list_size)
-        elif pa.types.is_map(dtype):
-            return pa.map_(_convert_type(dtype.key_type), _convert_type(dtype.item_type))
-        else:
-            return dtype
-
-    new_fields = []
-    for field in schema:
-        new_type = _convert_type(field.type)
-        new_fields.append(pa.field(field.name, new_type, nullable=field.nullable, metadata=field.metadata))
-
-    return pa.schema(new_fields)
-
-
-def to_iterable_dataset(stream: pa.RecordBatchReader) -> "hf.IterableDataset":
-    from datasets import DatasetInfo, Features
-    from datasets.builder import ArrowExamplesIterable
-    from datasets.iterable_dataset import IterableDataset
-
-    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-        # This key is unused when training with IterableDataset.
-        # Default implementation returns shard id, e.g. parquet row group id.
-        for i, rb in enumerate(stream):
-            yield i, pa.Table.from_batches([rb], stream.schema)
-
-    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
-    class _IterableDataset(IterableDataset):
-        # Diff with datasets.iterable_dataset.IterableDataset:
-        # - Removes torch handling which attempts to handle worker processes.
-        # - Assumes arrow iterator.
-        def __iter__(self):
-            from datasets.formatting import get_formatter
-
-            prepared_ex_iterable = self._prepare_ex_iterable_for_iteration()
-            if self._formatting and (prepared_ex_iterable.iter_arrow or self._formatting.is_table):
-                formatter = get_formatter(self._formatting.format_type, features=self.features)
-                iterator = prepared_ex_iterable.iter_arrow()
-                for key, pa_table in iterator:
-                    yield formatter.format_row(pa_table)
-                return
-
-            for key, example in prepared_ex_iterable:
-                # no need to format thanks to FormattedExamplesIterable
-                yield example
-
-        def map(self, *args, **kwargs):
-            # Map constructs a new IterableDataset, so we need to "patch" it
-            base = super().map(*args, **kwargs)
-            if isinstance(base, IterableDataset):
-                # Patch __iter__ to avoid torch handling
-                base.__class__ = _IterableDataset  # type: ignore
-            return base
-
-    class _ArrowExamplesIterable(ArrowExamplesIterable):
-        def __init__(self, generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]], features: Features):
-            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
-            self._features = features
-
-        @property
-        def is_typed(self) -> bool:
-            return True
-
-        @property
-        def features(self) -> Features:
-            return self._features
-
-    target_features = Features.from_arrow_schema(_hf_compatible_schema(stream.schema))
-    ex_iterable = _ArrowExamplesIterable(_generate_tables, target_features)
-    info = DatasetInfo(features=target_features)
-    return _IterableDataset(ex_iterable=ex_iterable, info=info)
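
The deleted module's main subtlety was `_hf_compatible_schema`: `datasets.Features` cannot yet represent Arrow's view types (see the linked huggingface/datasets PR #7718), so view columns were downcast before building the `IterableDataset`. The non-recursive core of that idea in plain pyarrow (the new `spiral/huggingface.py` presumably absorbs this responsibility, though this diff does not show its contents):

import pyarrow as pa

def _downcast_views(schema: pa.Schema) -> pa.Schema:
    # Top-level only; the deleted helper also recursed into structs,
    # lists, and maps.
    def convert(t: pa.DataType) -> pa.DataType:
        if t == pa.string_view():
            return pa.string()
        if t == pa.binary_view():
            return pa.binary()
        return t

    return pa.schema([f.with_type(convert(f.type)) for f in schema])

print(_downcast_views(pa.schema([pa.field("text", pa.string_view())])))
# text: string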