pyspiral 0.8.9__cp311-abi3-macosx_11_0_arm64.whl → 0.9.9__cp311-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spiral/transaction.py CHANGED
@@ -1,14 +1,23 @@
+from __future__ import annotations
+
 import logging
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from spiral.core.client import Shard
 from spiral.core.table import Transaction as CoreTransaction
 from spiral.core.table.spec import Operation
-from spiral.expressions.base import ExprLike
+from spiral.input import TableLike, evaluate
 from spiral.scan import Scan
+from spiral.types_ import Timestamp
 
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    import ray.data
+
+    from spiral.table import Table
+
 
 class Transaction:
     """Spiral table transaction.
@@ -17,14 +26,20 @@ class Transaction:
     it is important that the primary key columns are unique within the transaction.
     """
 
-    def __init__(self, core: CoreTransaction):
+    def __init__(self, table: Table, core: CoreTransaction):
         self._core = core
+        self._table = table
 
     @property
     def status(self) -> str:
         """The status of the transaction."""
         return self._core.status
 
+    @property
+    def table(self) -> Table:
+        """The table associated with this transaction."""
+        return self._table
+
     def is_empty(self) -> bool:
         """Check if the transaction has no operations."""
         return self._core.is_empty()
@@ -38,22 +53,21 @@ class Transaction:
         else:
             self._core.abort()
 
-    def write(self, expr: ExprLike, push_down_nulls: bool = False):
+    def write(self, table: TableLike, push_down_nulls: bool = False):
         """Write an item to the table inside a single transaction.
 
         :param push_down_nulls: Whether to push down nullable structs down its children. E.g. `[{"a": 1}, null]` would
             become `[{"a": 1}, {"a": null}]`. SpiralDB doesn't allow struct-level nullability, so use this option if your
             data contains nullable structs.
 
-        :param expr: The expression to write. Must evaluate to a struct array.
+        :param table: The table to write.
         """
-        from spiral import expressions as se
+        record_batch_reader = evaluate(table)
 
-        record_batches = se.evaluate(expr)
         if push_down_nulls:
-            self._core.write_push_down(record_batches)
+            self._core.write_push_down(record_batch_reader)
         else:
-            self._core.write(record_batches)
+            self._core.write(record_batch_reader)
 
     def writeback(
         self,
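For context, a minimal usage sketch of the 0.9.x `write()` signature. It assumes `txn` is an open `Transaction` obtained from a spiral `Table` (how the transaction is opened is not shown in this diff), that a plain `pyarrow.Table` is accepted as a `TableLike`, and `write_example` is a hypothetical helper:

```python
import pyarrow as pa

from spiral.transaction import Transaction


def write_example(txn: Transaction) -> None:
    # write() now takes a table-like value instead of an expression; the new
    # spiral.input.evaluate() turns it into a RecordBatchReader before writing.
    data = pa.table({"id": [1, 2], "payload": [{"a": 1}, None]})

    # push_down_nulls pushes struct-level nulls down into the struct's children,
    # since SpiralDB does not allow struct-level nullability.
    txn.write(data, push_down_nulls=True)
    txn.commit()
```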
@@ -70,6 +84,12 @@ class Transaction:
         """
         self._core.writeback(scan.core, shards=shards)
 
+    def to_ray_datasink(self) -> ray.data.Datasink:
+        """Returns a Ray Datasink which writes into this transaction."""
+        from spiral.ray_ import Datasink
+
+        return Datasink(self)
+
     def drop_columns(self, column_paths: list[str]):
         """
         Drops the specified columns from the table.
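A hedged sketch of the new Ray integration. It assumes Ray Data is installed, that the dataset's schema matches the table, and that `write_with_ray` is a hypothetical helper; the Datasink returned by `to_ray_datasink()` is intended to plug into `ray.data.Dataset.write_datasink()`:

```python
import ray.data

from spiral.transaction import Transaction


def write_with_ray(txn: Transaction) -> None:
    # Example dataset; any ray.data.Dataset whose schema the table accepts would do.
    ds = ray.data.from_items([{"id": 1, "value": "a"}, {"id": 2, "value": "b"}])

    # Ray writes each block into this transaction via the spiral.ray_.Datasink wrapper.
    ds.write_datasink(txn.to_ray_datasink())
    txn.commit()
```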
@@ -83,21 +103,29 @@ class Transaction:
         """Compact the key space of the table."""
         self._core.compact_key_space()
 
-    def take(self) -> list[Operation]:
+    def take(self) -> TransactionOps:
         """Take the operations from the transaction
 
         Transaction can no longer be committed or aborted after calling this method.
         ."""
-        return self._core.take()
+        return TransactionOps(self._core.snapshot().asof, self._core.take())
 
-    def include(self, ops: list[Operation]):
+    def include(self, ops: TransactionOps):
         """Include the given operations in the transaction.
 
         Checks for conflicts between the included operations and any existing operations.
-        """
-        self._core.include(ops)
 
-    def commit(self, *, txn_dump: str | None = None, compact: bool = False):
+        IMPORTANT: The `self` transaction must be started at or before the timestamp of the included operations.
+        """
+        self_asof = self._core.snapshot().asof
+        if ops.timestamp < self_asof:
+            raise ValueError(
+                f"Cannot include operations created against an out-of-date state of the table: {ops.timestamp}. "
+                f"This transaction's asof is {self_asof}."
+            )
+        self._core.include(ops.operations)
+
+    def commit(self, *, txn_dump: str | None = None):
         """Commit the transaction."""
         if txn_dump is not None:
             try:
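A sketch of the `take()`/`include()` handshake under the new timestamp check. It assumes two transactions against the same table, where `main_txn` was started at or before `worker_txn` (opening the transactions is not shown in this diff), and `merge_worker_ops` is a hypothetical helper:

```python
from spiral.transaction import Transaction


def merge_worker_ops(main_txn: Transaction, worker_txn: Transaction) -> None:
    # take() now returns a TransactionOps bundle: the operations plus the asof
    # timestamp of the snapshot they were created against.
    ops = worker_txn.take()

    # include() raises ValueError if main_txn's snapshot is newer than ops.timestamp,
    # i.e. if the operations were built against an out-of-date state of the table.
    main_txn.include(ops)

    # Note: commit() no longer accepts a `compact` keyword in 0.9.x.
    main_txn.commit()
```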
@@ -114,7 +142,7 @@ class Transaction:
            except Exception as e:
                logger.error(f"Failed to dump transaction to {txn_dump}: {e}")
 
-        self._core.commit(compact=compact)
+        self._core.commit()
 
     @staticmethod
     def load_dumps(*txn_dump: str) -> list[Operation]:
@@ -154,3 +182,55 @@ class Transaction:
     def abort(self):
         """Abort the transaction."""
         self._core.abort()
+
+
+class TransactionOps:
+    """
+    Operations taken from a transaction.
+
+    Operations are timestamped and can only be included in transactions
+    that are started at or before the timestamp of the operations.
+    """
+
+    def __init__(self, timestamp: Timestamp, operations: list[Operation]):
+        self._timestamp = timestamp
+        self._operations = operations
+
+    @property
+    def timestamp(self) -> Timestamp:
+        """The timestamp of the operations.
+
+        These operations can only be included in transactions started at or before this timestamp.
+        """
+        return self._timestamp
+
+    @property
+    def operations(self) -> list[Operation]:
+        """The list of operations."""
+        return self._operations
+
+    def to_json(self):
+        """Serialize the TransactionOps to JSON."""
+        import json
+
+        return json.dumps(
+            {
+                "timestamp": self.timestamp,
+                "operations": [op.to_json() for op in self.operations],
+            }
+        )
+
+    @classmethod
+    def from_json(cls, json_str: str) -> TransactionOps:
+        """Deserialize the TransactionOps from JSON."""
+        import json
+
+        data = json.loads(json_str)
+        return TransactionOps(
+            timestamp=data["timestamp"],
+            operations=[Operation.from_json(op_json) for op_json in data["operations"]],
+        )
+
+    def __reduce__(self):
+        """Support pickle protocol by using JSON serialization."""
+        return (self.__class__.from_json, (self.to_json(),))
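Because `TransactionOps` round-trips through JSON and pickles via `__reduce__`, the bundle returned by `take()` can be shipped across process boundaries. A minimal sketch (`round_trip` is a hypothetical helper):

```python
import pickle

from spiral.transaction import TransactionOps


def round_trip(ops: TransactionOps) -> TransactionOps:
    # JSON round-trip: to_json() stores the timestamp plus each Operation's JSON form.
    restored = TransactionOps.from_json(ops.to_json())

    # Pickle reuses the same JSON path through __reduce__, so TransactionOps can be
    # returned from e.g. a Ray or multiprocessing worker and included on the driver.
    return pickle.loads(pickle.dumps(restored))
```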

@@ -1,106 +0,0 @@
-from collections.abc import Callable, Iterator
-from typing import TYPE_CHECKING
-
-import pyarrow as pa
-
-if TYPE_CHECKING:
-    import datasets.iterable_dataset as hf  # noqa
-    import streaming  # noqa
-    import torch.utils.data as torchdata  # noqa
-
-
-def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
-    """
-    Replace string-view and binary-view columns in the schema with strings/binary.
-    Recursively handles nested types (struct, list, etc).
-    We use this converted schema as Features in the returned Dataset.
-    Remove this method once we have https://github.com/huggingface/datasets/pull/7718
-    """
-
-    def _convert_type(dtype: pa.DataType) -> pa.DataType:
-        if dtype == pa.string_view():
-            return pa.string()
-        elif dtype == pa.binary_view():
-            return pa.binary()
-        elif pa.types.is_struct(dtype):
-            new_fields = [
-                pa.field(field.name, _convert_type(field.type), nullable=field.nullable, metadata=field.metadata)
-                for field in dtype
-            ]
-            return pa.struct(new_fields)
-        elif pa.types.is_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type))
-        elif pa.types.is_large_list(dtype):
-            return pa.large_list(_convert_type(dtype.value_type))
-        elif pa.types.is_fixed_size_list(dtype):
-            return pa.list_(_convert_type(dtype.value_type), dtype.list_size)
-        elif pa.types.is_map(dtype):
-            return pa.map_(_convert_type(dtype.key_type), _convert_type(dtype.item_type))
-        else:
-            return dtype
-
-    new_fields = []
-    for field in schema:
-        new_type = _convert_type(field.type)
-        new_fields.append(pa.field(field.name, new_type, nullable=field.nullable, metadata=field.metadata))
-
-    return pa.schema(new_fields)
-
-
-def to_iterable_dataset(stream: pa.RecordBatchReader) -> "hf.IterableDataset":
-    from datasets import DatasetInfo, Features
-    from datasets.builder import ArrowExamplesIterable
-    from datasets.iterable_dataset import IterableDataset
-
-    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-        # This key is unused when training with IterableDataset.
-        # Default implementation returns shard id, e.g. parquet row group id.
-        for i, rb in enumerate(stream):
-            yield i, pa.Table.from_batches([rb], stream.schema)
-
-    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
-    class _IterableDataset(IterableDataset):
-        # Diff with datasets.iterable_dataset.IterableDataset:
-        # - Removes torch handling which attempts to handle worker processes.
-        # - Assumes arrow iterator.
-        def __iter__(self):
-            from datasets.formatting import get_formatter
-
-            prepared_ex_iterable = self._prepare_ex_iterable_for_iteration()
-            if self._formatting and (prepared_ex_iterable.iter_arrow or self._formatting.is_table):
-                formatter = get_formatter(self._formatting.format_type, features=self.features)
-                iterator = prepared_ex_iterable.iter_arrow()
-                for key, pa_table in iterator:
-                    yield formatter.format_row(pa_table)
-                return
-
-            for key, example in prepared_ex_iterable:
-                # no need to format thanks to FormattedExamplesIterable
-                yield example
-
-        def map(self, *args, **kwargs):
-            # Map constructs a new IterableDataset, so we need to "patch" it
-            base = super().map(*args, **kwargs)
-            if isinstance(base, IterableDataset):
-                # Patch __iter__ to avoid torch handling
-                base.__class__ = _IterableDataset  # type: ignore
-            return base
-
-    class _ArrowExamplesIterable(ArrowExamplesIterable):
-        def __init__(self, generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]], features: Features):
-            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
-            self._features = features
-
-        @property
-        def is_typed(self) -> bool:
-            return True
-
-        @property
-        def features(self) -> Features:
-            return self._features
-
-    target_features = Features.from_arrow_schema(_hf_compatible_schema(stream.schema))
-    ex_iterable = _ArrowExamplesIterable(_generate_tables, target_features)
-    info = DatasetInfo(features=target_features)
-    return _IterableDataset(ex_iterable=ex_iterable, info=info)
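The deleted module exposed an Arrow stream as a huggingface `datasets.IterableDataset`; its `_hf_compatible_schema` helper rewrote Arrow view types because `datasets` Features cannot yet represent them (see the linked datasets PR 7718). A standalone sketch of that conversion for the flat and struct cases, using only pyarrow (a pyarrow version with view types is assumed; `hf_compatible` is a hypothetical name mirroring the removed helper):

```python
import pyarrow as pa


def hf_compatible(dtype: pa.DataType) -> pa.DataType:
    # Mirrors the removed helper for the common cases: view types become their
    # plain counterparts, and structs are converted recursively.
    if dtype == pa.string_view():
        return pa.string()
    if dtype == pa.binary_view():
        return pa.binary()
    if pa.types.is_struct(dtype):
        return pa.struct([pa.field(f.name, hf_compatible(f.type), nullable=f.nullable) for f in dtype])
    return dtype


schema = pa.schema([
    pa.field("text", pa.string_view()),
    pa.field("meta", pa.struct([pa.field("blob", pa.binary_view())])),
])
converted = pa.schema([pa.field(f.name, hf_compatible(f.type)) for f in schema])
# converted: text: string, meta: struct<blob: binary>
```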