pyspiral-0.8.9-cp311-abi3-macosx_11_0_arm64.whl → pyspiral-0.9.9-cp311-abi3-macosx_11_0_arm64.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,12 +1,12 @@
  import builtins
  import functools
  import operator
- import warnings
  from typing import Any

  import pyarrow as pa

- from spiral import _lib, arrow_
+ from spiral import _lib
+ from spiral.input import dot_separated_dict_to_nested

  from . import file as file
  from . import http as http
@@ -74,86 +74,14 @@ def lift(expr: ExprLike) -> Expr:
          # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
          # this is in fact a struct scalar, but the user can always create one of those manually.

-         # First we un-nest any dot-separated field names
-         expr: dict = arrow_.nest_structs(expr)
+         expr: dict = dot_separated_dict_to_nested(expr)

          return pack({k: lift(v) for k, v in expr.items()})

-     if isinstance(expr, builtins.list):
-         return lift(pa.array(expr))
-
-     # Unpack tables and chunked arrays
-     if isinstance(expr, pa.Table | pa.RecordBatch):
-         expr = expr.to_struct_array()
-     if isinstance(expr, pa.ChunkedArray):
-         expr = expr.combine_chunks()
-
-     # If the value is struct-like, we un-nest any dot-separated field names
-     if isinstance(expr, pa.StructArray | pa.StructScalar):
-         # TODO(marko): Figure out what to do with nullable struct arrays when unpacking them.
-         # We need to merge struct validity into the child validity?
-         if isinstance(expr, pa.StructArray) and expr.null_count != 0:
-             # raise ValueError("lift: cannot lift a struct array with nulls.")
-             warnings.warn("found a struct array with nulls", stacklevel=2)
-         if isinstance(expr, pa.StructScalar) and not expr.is_valid:
-             # raise ValueError("lift: cannot lift a struct scalar with nulls.")
-             warnings.warn("found a struct scalar with nulls", stacklevel=2)
-         return lift(arrow_.nest_structs(expr))
-
-     if isinstance(expr, pa.Array):
-         return Expr(_lib.expr.array_lit(expr))
-
      # Otherwise, assume it's a scalar.
      return scalar(expr)


- def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
-     # TODO(marko): This implementation is currently minimal and most ExprLike-s fail.
-     if isinstance(expr, pa.RecordBatchReader):
-         return expr
-     if isinstance(expr, pa.Table):
-         return expr.to_reader()
-     if isinstance(expr, pa.RecordBatch):
-         return pa.RecordBatchReader.from_batches(expr.schema, [expr])
-     if isinstance(expr, pa.StructArray):
-         return pa.Table.from_struct_array(expr).to_reader()
-
-     if isinstance(expr, pa.ChunkedArray):
-         if not pa.types.is_struct(expr.type):
-             raise ValueError("Arrow chunked array must be a struct type.")
-
-         def _iter_batches():
-             for chunk in expr.chunks:
-                 yield pa.RecordBatch.from_struct_array(chunk)
-
-         return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
-
-     if isinstance(expr, pa.Array):
-         raise ValueError("Arrow array must be a struct array.")
-
-     if isinstance(expr, Expr) or isinstance(expr, NativeExpr):
-         raise NotImplementedError(
-             "Expr evaluation not supported yet. Use Arrow to write instead. Reach out if you require this feature."
-         )
-
-     if isinstance(expr, dict):
-         # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
-         # this is in fact a struct scalar, but the user can always create one of those manually.
-
-         # First we un-nest any dot-separated field names
-         expr: dict = arrow_.nest_structs(expr)
-         return evaluate(arrow_.dict_to_table(expr))
-
-     if isinstance(expr, builtins.list):
-         return evaluate(pa.array(expr))
-
-     if isinstance(expr, pa.Scalar):
-         return evaluate(pa.array([expr]))
-
-     # Otherwise, try scalar.
-     return evaluate(scalar(expr))
-
-
  def aux(name: builtins.str, dtype: pa.DataType) -> Expr:
      """Create a variable expression referencing a column in the auxiliary table.

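The `lift` change above swaps `arrow_.nest_structs` for `spiral.input.dot_separated_dict_to_nested` and drops the Arrow-specific branches (lists, tables, chunked and struct arrays) along with the unused `evaluate` helper. Based on the removed comment ("First we un-nest any dot-separated field names"), the new helper is expected to fold dotted keys into nested dicts before `pack` is applied. A minimal illustrative sketch of that behavior; the name `nest_dotted_keys` and the exact semantics are assumptions, not the library implementation:

    def nest_dotted_keys(flat: dict) -> dict:
        # Illustrative stand-in for spiral.input.dot_separated_dict_to_nested:
        # fold "a.b"-style keys into nested dicts before lift() packs them.
        nested: dict = {}
        for key, value in flat.items():
            *parents, leaf = key.split(".")
            node = nested
            for part in parents:
                node = node.setdefault(part, {})
            node[leaf] = value
        return nested

    assert nest_dotted_keys({"user.name": "alice", "user.age": 30}) == {
        "user": {"name": "alice", "age": 30}
    }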
@@ -1,5 +1,5 @@
  import datetime
- from typing import TypeAlias, Union
+ from typing import TypeAlias

  import pyarrow as pa

@@ -175,12 +175,7 @@ class Expr:
          return Expr(_lib.expr.binary(op, self.__expr__, rhs.__expr__))


- ScalarLike: TypeAlias = bool | int | float | str | list["ScalarLike"] | datetime.datetime | None
- ArrowLike: TypeAlias = Union[
-     pa.RecordBatch,
-     "pa.Array[pa.Scalar[pa.DataType]]",
-     "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
-     "pa.Scalar[pa.DataType]",
-     pa.Table,
- ]
- ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike
+ ScalarLike: TypeAlias = (
+     bool | int | float | str | datetime.datetime | datetime.date | datetime.time | None | list["ScalarLike"]
+ )
+ ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | ScalarLike
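With this change, `ExprLike` no longer covers Arrow values (`pa.Table`, `pa.RecordBatch`, `pa.Array`, `pa.ChunkedArray`, `pa.Scalar`) or bare `list["ExprLike"]`; lists survive only inside `ScalarLike`, which now also admits `datetime.date` and `datetime.time`. A short illustrative sketch of what still matches the new aliases (`Expr` is omitted, so this is not the library definition verbatim):

    import datetime
    from typing import TypeAlias

    ScalarLike: TypeAlias = (
        bool | int | float | str | datetime.datetime | datetime.date | datetime.time | None | list["ScalarLike"]
    )

    # Still valid inputs: plain scalars, recursive lists of scalars, and (via
    # ExprLike) nested dicts of expressions. Arrow tables/arrays no longer match.
    value: ScalarLike = [datetime.date(2024, 1, 1), "a", 1.5, None]
    nested = {"user": {"name": "alice", "joined": datetime.datetime(2024, 1, 1, 12, 0)}}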
spiral/huggingface.py ADDED
@@ -0,0 +1,456 @@
+ """
+ This module provides utilities for bidirectional HuggingFace <-> Spiral conversion.
+
+ Ingesting HuggingFace datasets into Spiral:
+     from spiral.huggingface import ingest_dataset
+     from spiral import Spiral
+
+     sp = Spiral()
+     project = sp.project("my-project")
+
+     # Ingest a HuggingFace dataset
+     from datasets import load_dataset
+     hf_dataset = load_dataset("squad", split="train")
+
+     table = ingest_dataset(hf_dataset, project, "squad_train", key_columns="id")
+
+ Converting Spiral scans to HuggingFace IterableDataset:
+     # This is typically accessed via scan.to_iterable_dataset()
+     from spiral.huggingface import to_iterable_dataset
+
+ Requires the [huggingface] extra: pip install pyspiral[huggingface]
+ """
+
+ from __future__ import annotations
+
+ from collections.abc import Callable, Iterator, Sequence
+ from typing import TYPE_CHECKING
+
+ import pyarrow as pa
+
+ if TYPE_CHECKING:
+     from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+     from datasets.features import Features
+
+     from spiral import Project, Table
+
+ __all__ = ["ingest_dataset", "to_iterable_dataset"]
+
+ DEFAULT_ROW_INDEX_COLUMN = "__row_idx__"
+ DEFAULT_BATCH_SIZE = 100_000
+ DEFAULT_COMMIT_EVERY = 25
+
+
+ def _check_huggingface_installed() -> None:
+     """Raise ImportError with helpful message if datasets not installed."""
+     try:
+         import datasets  # noqa: F401
+     except ImportError:
+         raise ImportError(
+             "The 'datasets' package is required for HuggingFace integration. "
+             "Install it with: pip install 'pyspiral[huggingface]'"
+         ) from None
+
+
+ def _add_row_index_column(table: pa.Table, offset: int = 0) -> pa.Table:
+     """Add a __row_idx__ column as the first column."""
+     row_count = len(table)
+     row_idx = pa.array(range(offset, offset + row_count), type=pa.uint64())
+     return table.add_column(0, DEFAULT_ROW_INDEX_COLUMN, row_idx)
+
+
+ def _validate_key_columns(schema: pa.Schema, key_columns: str | Sequence[str]) -> list[str]:
+     """Validate that key columns exist in the schema and return as list."""
+     if isinstance(key_columns, str):
+         key_columns = [key_columns]
+     else:
+         key_columns = list(key_columns)
+
+     schema_names = set(schema.names)
+     for col in key_columns:
+         if col not in schema_names:
+             raise ValueError(f"Key column '{col}' not found in dataset schema. Available columns: {schema.names}")
+
+     return key_columns
+
+
+ def _extract_key_schema(schema: pa.Schema, key_columns: list[str]) -> pa.Schema:
+     """Extract key schema from existing columns."""
+     key_fields = [schema.field(col) for col in key_columns]
+     return pa.schema(key_fields)
+
+
+ def _reorder_columns_keys_first(table: pa.Table, key_columns: list[str]) -> pa.Table:
+     """Reorder table columns so key columns come first."""
+     non_key_columns = [name for name in table.column_names if name not in key_columns]
+     new_order = key_columns + non_key_columns
+     return table.select(new_order)
+
+
+ def _features_to_arrow_schema(features: Features) -> pa.Schema:
+     """Convert HuggingFace Features to Arrow schema."""
+     return features.arrow_schema
+
+
+ def _ingest_in_memory_dataset(
+     dataset: Dataset,
+     project: Project,
+     table_name: str,
+     key_columns: str | Sequence[str] | None,
+     push_down_nulls: bool,
+     exist_ok: bool,
+ ) -> Table:
+     """Ingest an in-memory HuggingFace Dataset."""
+     # Get Arrow table directly from HuggingFace
+     arrow_table = dataset.data.table
+
+     # Handle key columns
+     if key_columns is None:
+         # Auto-generate row index
+         arrow_table = _add_row_index_column(arrow_table)
+         key_schema = pa.schema([(DEFAULT_ROW_INDEX_COLUMN, pa.uint64())])
+     else:
+         key_cols = _validate_key_columns(arrow_table.schema, key_columns)
+         key_schema = _extract_key_schema(arrow_table.schema, key_cols)
+         arrow_table = _reorder_columns_keys_first(arrow_table, key_cols)
+
+     # Create table and write
+     table = project.create_table(table_name, key_schema=key_schema, exist_ok=exist_ok)
+     table.write(arrow_table, push_down_nulls=push_down_nulls)
+
+     return table
+
+
+ def _ingest_iterable_dataset(
+     dataset: IterableDataset,
+     project: Project,
+     table_name: str,
+     key_columns: str | Sequence[str] | None,
+     batch_size: int,
+     commit_every: int,
+     push_down_nulls: bool,
+     exist_ok: bool,
+ ) -> Table:
+     """Ingest a streaming HuggingFace IterableDataset."""
+     # Infer key schema from features
+     features = dataset.features
+     if features is None:
+         raise ValueError(
+             "Cannot infer schema from IterableDataset without features. "
+             "Consider materializing the dataset first with dataset.take(n) or provide features."
+         )
+
+     arrow_schema = _features_to_arrow_schema(features)
+
+     # Determine key schema
+     if key_columns is None:
+         key_schema = pa.schema([(DEFAULT_ROW_INDEX_COLUMN, pa.uint64())])
+     else:
+         key_cols = _validate_key_columns(arrow_schema, key_columns)
+         key_schema = _extract_key_schema(arrow_schema, key_cols)
+
+     # Create table
+     table = project.create_table(table_name, key_schema=key_schema, exist_ok=exist_ok)
+
+     # Iterate with batching and transactions
+     row_offset = 0
+     tx = table.txn()
+     tx_ops = 0
+     batch_buffer: list[dict] = []
+
+     for row in dataset:
+         batch_buffer.append(row)
+
+         if len(batch_buffer) >= batch_size:
+             arrow_batch = _rows_to_arrow_table(batch_buffer, features, row_offset, key_columns)
+             tx.write(arrow_batch, push_down_nulls=push_down_nulls)
+             tx_ops += 1
+             row_offset += len(batch_buffer)
+             batch_buffer = []
+
+             if tx_ops >= commit_every:
+                 tx.commit()
+                 tx = table.txn()
+                 tx_ops = 0
+
+     # Handle remaining rows
+     if batch_buffer:
+         arrow_batch = _rows_to_arrow_table(batch_buffer, features, row_offset, key_columns)
+         tx.write(arrow_batch, push_down_nulls=push_down_nulls)
+         tx_ops += 1
+
+     if tx_ops > 0:
+         tx.commit()
+
+     return table
+
+
+ def _rows_to_arrow_table(
+     rows: list[dict],
+     features: Features,
+     offset: int,
+     key_columns: str | Sequence[str] | None,
+ ) -> pa.Table:
+     """Convert a list of row dicts to an Arrow table with proper schema."""
+     from datasets import Dataset
+
+     # Create a temporary Dataset to leverage HF's Arrow conversion
+     temp_dataset = Dataset.from_list(rows, features=features)
+     arrow_table = temp_dataset.data.table
+
+     # Handle key columns
+     if key_columns is None:
+         arrow_table = _add_row_index_column(arrow_table, offset)
+     else:
+         key_cols = _validate_key_columns(arrow_table.schema, key_columns)
+         arrow_table = _reorder_columns_keys_first(arrow_table, key_cols)
+
+     return arrow_table
+
+
+ def _ingest_dataset_dict(
+     dataset_dict: DatasetDict,
+     project: Project,
+     table_name: str,
+     key_columns: str | Sequence[str] | None,
+     push_down_nulls: bool,
+     exist_ok: bool,
+ ) -> dict[str, Table]:
+     """Ingest a HuggingFace DatasetDict, creating one table per split."""
+     tables = {}
+     for split_name, dataset in dataset_dict.items():
+         split_table_name = f"{table_name}.{split_name}"
+         tables[split_name] = _ingest_in_memory_dataset(
+             dataset,
+             project,
+             split_table_name,
+             key_columns,
+             push_down_nulls,
+             exist_ok,
+         )
+     return tables
+
+
+ def _ingest_iterable_dataset_dict(
+     dataset_dict: IterableDatasetDict,
+     project: Project,
+     table_name: str,
+     key_columns: str | Sequence[str] | None,
+     batch_size: int,
+     commit_every: int,
+     push_down_nulls: bool,
+     exist_ok: bool,
+ ) -> dict[str, Table]:
+     """Ingest a HuggingFace IterableDatasetDict, creating one table per split."""
+     tables = {}
+     for split_name, dataset in dataset_dict.items():
+         split_table_name = f"{table_name}.{split_name}"
+         tables[split_name] = _ingest_iterable_dataset(
+             dataset,
+             project,
+             split_table_name,
+             key_columns,
+             batch_size,
+             commit_every,
+             push_down_nulls,
+             exist_ok,
+         )
+     return tables
+
+
+ def ingest_dataset(
+     dataset: Dataset | IterableDataset | DatasetDict | IterableDatasetDict,
+     project: Project,
+     table_name: str,
+     *,
+     key_columns: str | Sequence[str] | None = None,
+     batch_size: int = DEFAULT_BATCH_SIZE,
+     commit_every: int = DEFAULT_COMMIT_EVERY,
+     push_down_nulls: bool = True,
+     exist_ok: bool = False,
+ ) -> Table | dict[str, Table]:
+     """
+     Ingest a HuggingFace dataset into Spiral.
+
+     Args:
+         dataset: A HuggingFace Dataset, IterableDataset, DatasetDict, or IterableDatasetDict.
+         project: The Spiral project to create the table(s) in.
+         table_name: Base name for the table. For DatasetDict, tables are created as
+             `{table_name}.{split}` (e.g., `my_dataset.train`, `my_dataset.test`).
+         key_columns: Column(s) to use as the primary key. If None, a `__row_idx__`
+             column is auto-generated as a uint64 key.
+         batch_size: Number of rows to buffer before writing (for streaming datasets).
+             Default is 100,000 (matching fineweb.py pattern).
+         commit_every: Number of write operations before committing a transaction.
+             Default is 25 (matching fineweb.py pattern).
+         push_down_nulls: Whether to push down nullable structs to children.
+         exist_ok: If True, allow writing to existing tables.
+
+     Returns:
+         A single Table for Dataset/IterableDataset, or a dict mapping split names
+         to Tables for DatasetDict/IterableDatasetDict.
+
+     Raises:
+         ImportError: If `datasets` package is not installed.
+         ValueError: If key_columns don't exist in the dataset schema.
+         TypeError: If dataset is not a supported HuggingFace type.
+
+     Examples:
+         Basic ingestion with auto-generated key:
+
+         >>> from datasets import load_dataset
+         >>> from spiral import Spiral
+         >>> from spiral.huggingface import ingest_dataset
+         >>> sp = Spiral()
+         >>> project = sp.project("my-project")
+         >>> hf_ds = load_dataset("squad", split="train")
+         >>> table = ingest_dataset(hf_ds, project, "squad_train")
+
+         With custom key column:
+
+         >>> table = ingest_dataset(hf_ds, project, "squad_train", key_columns="id")
+
+         DatasetDict creates multiple tables:
+
+         >>> hf_dict = load_dataset("squad")  # Returns DatasetDict
+         >>> tables = ingest_dataset(hf_dict, project, "squad")
+         >>> tables["train"]  # squad.train table
+         >>> tables["validation"]  # squad.validation table
+     """
+     _check_huggingface_installed()
+
+     from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+
+     if isinstance(dataset, DatasetDict):
+         return _ingest_dataset_dict(
+             dataset,
+             project,
+             table_name,
+             key_columns,
+             push_down_nulls,
+             exist_ok,
+         )
+     elif isinstance(dataset, IterableDatasetDict):
+         return _ingest_iterable_dataset_dict(
+             dataset,
+             project,
+             table_name,
+             key_columns,
+             batch_size,
+             commit_every,
+             push_down_nulls,
+             exist_ok,
+         )
+     elif isinstance(dataset, Dataset):
+         return _ingest_in_memory_dataset(
+             dataset,
+             project,
+             table_name,
+             key_columns,
+             push_down_nulls,
+             exist_ok,
+         )
+     elif isinstance(dataset, IterableDataset):
+         return _ingest_iterable_dataset(
+             dataset,
+             project,
+             table_name,
+             key_columns,
+             batch_size,
+             commit_every,
+             push_down_nulls,
+             exist_ok,
+         )
+     else:
+         raise TypeError(
+             f"Unsupported dataset type: {type(dataset).__name__}. "
+             "Expected Dataset, IterableDataset, DatasetDict, or IterableDatasetDict."
+         )
+
+
+ # =============================================================================
+ # Spiral -> HuggingFace conversion
+ # =============================================================================
+
+
+ def to_iterable_dataset(stream: pa.RecordBatchReader) -> IterableDataset:
+     """
+     Convert a PyArrow RecordBatchReader to a HuggingFace IterableDataset.
+
+     This is typically accessed via scan.to_iterable_dataset() rather than directly.
+
+     Args:
+         stream: A PyArrow RecordBatchReader, typically from a Spiral scan.
+
+     Returns:
+         A HuggingFace IterableDataset that yields rows from the stream.
+
+     Example:
+         >>> from spiral import Spiral
+         >>> sp = Spiral()
+         >>> table = sp.project("my-project").table("my-table")
+         >>> scan = sp.scan(table)
+         >>> hf_dataset = scan.to_iterable_dataset()  # Uses this function internally
+     """
+     _check_huggingface_installed()
+
+     from datasets import DatasetInfo, Features
+     from datasets.builder import ArrowExamplesIterable
+     from datasets.iterable_dataset import IterableDataset as HFIterableDataset
+
+     def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+         # This key is unused when training with IterableDataset.
+         # Default implementation returns shard id, e.g. parquet row group id.
+         for i, rb in enumerate(stream):
+             yield i, pa.Table.from_batches([rb], stream.schema)
+
+     # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
+     class _IterableDataset(HFIterableDataset):
+         # Diff with datasets.iterable_dataset.IterableDataset:
+         # - Removes torch handling which attempts to handle worker processes.
+         # - Assumes arrow iterator.
+         def __iter__(self):
+             from datasets.formatting import get_formatter
+
+             prepared_ex_iterable = self._prepare_ex_iterable_for_iteration()
+             if self._formatting and (prepared_ex_iterable.iter_arrow or self._formatting.is_table):
+                 formatter = get_formatter(self._formatting.format_type, features=self.features)
+                 iterator = prepared_ex_iterable.iter_arrow()
+                 for key, pa_table in iterator:
+                     yield formatter.format_row(pa_table)
+                 return
+
+             for key, example in prepared_ex_iterable:
+                 # no need to format thanks to FormattedExamplesIterable
+                 yield example
+
+         def map(self, *args, **kwargs):
+             # Map constructs a new IterableDataset, so we need to "patch" it
+             base = super().map(*args, **kwargs)
+             if isinstance(base, HFIterableDataset):
+                 # Patch __iter__ to avoid torch handling
+                 base.__class__ = _IterableDataset  # type: ignore
+             return base
+
+     class _ArrowExamplesIterable(ArrowExamplesIterable):
+         def __init__(
+             self,
+             generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]],
+             features: Features,
+         ):
+             # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
+             super().__init__(generate_tables_fn, kwargs={})  # type: ignore
+             self._features = features
+
+         @property
+         def is_typed(self) -> bool:
+             return True
+
+         @property
+         def features(self) -> Features:
+             return self._features
+
+     target_features = Features.from_arrow_schema(stream.schema)
+     ex_iterable = _ArrowExamplesIterable(_generate_tables, target_features)
+     info = DatasetInfo(features=target_features)
+     return _IterableDataset(ex_iterable=ex_iterable, info=info)
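Taken together, the two entry points in this new module support a full round trip: ingest a HuggingFace dataset into a Spiral table, then stream it back out as an IterableDataset. A usage sketch assuming only the calls shown in the module's docstrings (Spiral(), sp.project(), sp.scan(), scan.to_iterable_dataset()); dataset and table names are placeholders:

    # Round-trip sketch: streaming ingest followed by reading back for training.
    from datasets import load_dataset

    from spiral import Spiral
    from spiral.huggingface import ingest_dataset

    sp = Spiral()
    project = sp.project("my-project")

    # Streaming ingest: rows are buffered into Arrow batches of `batch_size`
    # and committed in transactions every `commit_every` writes.
    hf_stream = load_dataset("squad", split="train", streaming=True)
    table = ingest_dataset(
        hf_stream, project, "squad_train", key_columns="id", batch_size=50_000, commit_every=10
    )

    # Read back as a HuggingFace IterableDataset for a training loop.
    for example in sp.scan(table).to_iterable_dataset():
        ...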