pyspiral-0.8.9-cp311-abi3-macosx_11_0_arm64.whl → pyspiral-0.9.9-cp311-abi3-macosx_11_0_arm64.whl
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/METADATA +4 -2
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/RECORD +39 -34
- spiral/__init__.py +3 -2
- spiral/_lib.abi3.so +0 -0
- spiral/api/__init__.py +7 -0
- spiral/api/client.py +86 -8
- spiral/api/projects.py +4 -2
- spiral/api/tables.py +77 -0
- spiral/arrow_.py +4 -155
- spiral/cli/app.py +10 -4
- spiral/cli/chooser.py +30 -0
- spiral/cli/fs.py +3 -2
- spiral/cli/iceberg.py +1 -1
- spiral/cli/key_spaces.py +4 -4
- spiral/cli/orgs.py +1 -1
- spiral/cli/projects.py +2 -2
- spiral/cli/tables.py +47 -20
- spiral/cli/telemetry.py +13 -6
- spiral/cli/text.py +4 -4
- spiral/cli/transactions.py +84 -0
- spiral/cli/{types.py → types_.py} +6 -6
- spiral/cli/workloads.py +4 -4
- spiral/client.py +70 -8
- spiral/core/client/__init__.pyi +25 -16
- spiral/core/table/__init__.pyi +24 -22
- spiral/debug/manifests.py +21 -9
- spiral/debug/scan.py +4 -6
- spiral/demo.py +145 -38
- spiral/enrichment.py +18 -23
- spiral/expressions/__init__.py +3 -75
- spiral/expressions/base.py +5 -10
- spiral/huggingface.py +456 -0
- spiral/input.py +131 -0
- spiral/ray_.py +75 -0
- spiral/scan.py +218 -64
- spiral/table.py +5 -4
- spiral/transaction.py +95 -15
- spiral/iterable_dataset.py +0 -106
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/WHEEL +0 -0
- {pyspiral-0.8.9.dist-info → pyspiral-0.9.9.dist-info}/entry_points.txt +0 -0
spiral/expressions/__init__.py
CHANGED

@@ -1,12 +1,12 @@
 import builtins
 import functools
 import operator
-import warnings
 from typing import Any
 
 import pyarrow as pa
 
-from spiral import _lib
+from spiral import _lib
+from spiral.input import dot_separated_dict_to_nested
 
 from . import file as file
 from . import http as http
@@ -74,86 +74,14 @@ def lift(expr: ExprLike) -> Expr:
         # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
         # this is in fact a struct scalar, but the user can always create one of those manually.
 
-
-        expr: dict = arrow_.nest_structs(expr)
+        expr: dict = dot_separated_dict_to_nested(expr)
 
         return pack({k: lift(v) for k, v in expr.items()})
 
-    if isinstance(expr, builtins.list):
-        return lift(pa.array(expr))
-
-    # Unpack tables and chunked arrays
-    if isinstance(expr, pa.Table | pa.RecordBatch):
-        expr = expr.to_struct_array()
-    if isinstance(expr, pa.ChunkedArray):
-        expr = expr.combine_chunks()
-
-    # If the value is struct-like, we un-nest any dot-separated field names
-    if isinstance(expr, pa.StructArray | pa.StructScalar):
-        # TODO(marko): Figure out what to do with nullable struct arrays when unpacking them.
-        # We need to merge struct validity into the child validity?
-        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
-            # raise ValueError("lift: cannot lift a struct array with nulls.")
-            warnings.warn("found a struct array with nulls", stacklevel=2)
-        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
-            # raise ValueError("lift: cannot lift a struct scalar with nulls.")
-            warnings.warn("found a struct scalar with nulls", stacklevel=2)
-        return lift(arrow_.nest_structs(expr))
-
-    if isinstance(expr, pa.Array):
-        return Expr(_lib.expr.array_lit(expr))
-
     # Otherwise, assume it's a scalar.
     return scalar(expr)
 
 
-def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
-    # TODO(marko): This implementation is currently minimal and most ExprLike-s fail.
-    if isinstance(expr, pa.RecordBatchReader):
-        return expr
-    if isinstance(expr, pa.Table):
-        return expr.to_reader()
-    if isinstance(expr, pa.RecordBatch):
-        return pa.RecordBatchReader.from_batches(expr.schema, [expr])
-    if isinstance(expr, pa.StructArray):
-        return pa.Table.from_struct_array(expr).to_reader()
-
-    if isinstance(expr, pa.ChunkedArray):
-        if not pa.types.is_struct(expr.type):
-            raise ValueError("Arrow chunked array must be a struct type.")
-
-        def _iter_batches():
-            for chunk in expr.chunks:
-                yield pa.RecordBatch.from_struct_array(chunk)
-
-        return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())
-
-    if isinstance(expr, pa.Array):
-        raise ValueError("Arrow array must be a struct array.")
-
-    if isinstance(expr, Expr) or isinstance(expr, NativeExpr):
-        raise NotImplementedError(
-            "Expr evaluation not supported yet. Use Arrow to write instead. Reach out if you require this feature."
-        )
-
-    if isinstance(expr, dict):
-        # NOTE: we assume this is a struct expression. We could be smarter and be context aware to determine if
-        # this is in fact a struct scalar, but the user can always create one of those manually.
-
-        # First we un-nest any dot-separated field names
-        expr: dict = arrow_.nest_structs(expr)
-        return evaluate(arrow_.dict_to_table(expr))
-
-    if isinstance(expr, builtins.list):
-        return evaluate(pa.array(expr))
-
-    if isinstance(expr, pa.Scalar):
-        return evaluate(pa.array([expr]))
-
-    # Otherwise, try scalar.
-    return evaluate(scalar(expr))
-
-
 def aux(name: builtins.str, dtype: pa.DataType) -> Expr:
     """Create a variable expression referencing a column in the auxiliary table.
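The lift() hunk above swaps arrow_.nest_structs for dot_separated_dict_to_nested, imported from the new spiral/input.py (+131 lines, not shown in this section). As a reading aid, here is a minimal sketch of the nesting behavior the call site implies; the function name matches the import, but the body is a hypothetical stand-in, not the actual spiral.input implementation:

def dot_separated_dict_to_nested(flat: dict) -> dict:
    # {"user.name": "ada"} -> {"user": {"name": "ada"}}; non-dotted keys pass through.
    nested: dict = {}
    for key, value in flat.items():
        *parents, leaf = key.split(".")
        node = nested
        for part in parents:
            node = node.setdefault(part, {})  # descend, creating struct levels as needed
        node[leaf] = value
    return nested

lift() then wraps the result with pack({k: lift(v) for k, v in expr.items()}), so each nesting level becomes a packed struct expression.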
spiral/expressions/base.py
CHANGED

@@ -1,5 +1,5 @@
 import datetime
-from typing import TypeAlias
+from typing import TypeAlias
 
 import pyarrow as pa
 
@@ -175,12 +175,7 @@ class Expr:
         return Expr(_lib.expr.binary(op, self.__expr__, rhs.__expr__))
 
 
-ScalarLike: TypeAlias =
-
-
-
-    "pa.ChunkedArray[pa.Scalar[pa.DataType]]",
-    "pa.Scalar[pa.DataType]",
-    pa.Table,
-]
-ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | list["ExprLike"] | ArrowLike | ScalarLike
+ScalarLike: TypeAlias = (
+    bool | int | float | str | datetime.datetime | datetime.date | datetime.time | None | list["ScalarLike"]
+)
+ExprLike: TypeAlias = Expr | dict[str, "ExprLike"] | ScalarLike
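The net effect of this hunk is a much narrower ExprLike: the Arrow-flavored members of the old union (pa.Table, the quoted pa.ChunkedArray/pa.Scalar forms, ArrowLike) are gone, matching the removal of the Arrow branches from lift() above. A small illustration of values that still satisfy the new aliases, assuming only what the diff shows:

import datetime

# All ScalarLike under 0.9.9:
flag = True
count = 7
stamp = datetime.date(2024, 1, 1)
tags = ["a", "b", None]          # list["ScalarLike"] is itself ScalarLike

# ExprLike additionally admits Expr and nested dicts of ExprLike:
record = {"user.name": "ada", "user.joined": stamp}  # dots un-nested by lift()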
spiral/huggingface.py
ADDED

@@ -0,0 +1,456 @@
+"""
+This module provides utilities for bidirectional HuggingFace <-> Spiral conversion.
+
+Ingesting HuggingFace datasets into Spiral:
+    from spiral.huggingface import ingest_dataset
+    from spiral import Spiral
+
+    sp = Spiral()
+    project = sp.project("my-project")
+
+    # Ingest a HuggingFace dataset
+    from datasets import load_dataset
+    hf_dataset = load_dataset("squad", split="train")
+
+    table = ingest_dataset(hf_dataset, project, "squad_train", key_columns="id")
+
+Converting Spiral scans to HuggingFace IterableDataset:
+    # This is typically accessed via scan.to_iterable_dataset()
+    from spiral.huggingface import to_iterable_dataset
+
+Requires the [huggingface] extra: pip install pyspiral[huggingface]
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Iterator, Sequence
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+
+if TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+    from datasets.features import Features
+
+    from spiral import Project, Table
+
+__all__ = ["ingest_dataset", "to_iterable_dataset"]
+
+DEFAULT_ROW_INDEX_COLUMN = "__row_idx__"
+DEFAULT_BATCH_SIZE = 100_000
+DEFAULT_COMMIT_EVERY = 25
+
+
+def _check_huggingface_installed() -> None:
+    """Raise ImportError with helpful message if datasets not installed."""
+    try:
+        import datasets  # noqa: F401
+    except ImportError:
+        raise ImportError(
+            "The 'datasets' package is required for HuggingFace integration. "
+            "Install it with: pip install 'pyspiral[huggingface]'"
+        ) from None
+
+
+def _add_row_index_column(table: pa.Table, offset: int = 0) -> pa.Table:
+    """Add a __row_idx__ column as the first column."""
+    row_count = len(table)
+    row_idx = pa.array(range(offset, offset + row_count), type=pa.uint64())
+    return table.add_column(0, DEFAULT_ROW_INDEX_COLUMN, row_idx)
+
+
+def _validate_key_columns(schema: pa.Schema, key_columns: str | Sequence[str]) -> list[str]:
+    """Validate that key columns exist in the schema and return as list."""
+    if isinstance(key_columns, str):
+        key_columns = [key_columns]
+    else:
+        key_columns = list(key_columns)
+
+    schema_names = set(schema.names)
+    for col in key_columns:
+        if col not in schema_names:
+            raise ValueError(f"Key column '{col}' not found in dataset schema. Available columns: {schema.names}")
+
+    return key_columns
+
+
+def _extract_key_schema(schema: pa.Schema, key_columns: list[str]) -> pa.Schema:
+    """Extract key schema from existing columns."""
+    key_fields = [schema.field(col) for col in key_columns]
+    return pa.schema(key_fields)
+
+
+def _reorder_columns_keys_first(table: pa.Table, key_columns: list[str]) -> pa.Table:
+    """Reorder table columns so key columns come first."""
+    non_key_columns = [name for name in table.column_names if name not in key_columns]
+    new_order = key_columns + non_key_columns
+    return table.select(new_order)
+
+
+def _features_to_arrow_schema(features: Features) -> pa.Schema:
+    """Convert HuggingFace Features to Arrow schema."""
+    return features.arrow_schema
+
+
+def _ingest_in_memory_dataset(
+    dataset: Dataset,
+    project: Project,
+    table_name: str,
+    key_columns: str | Sequence[str] | None,
+    push_down_nulls: bool,
+    exist_ok: bool,
+) -> Table:
+    """Ingest an in-memory HuggingFace Dataset."""
+    # Get Arrow table directly from HuggingFace
+    arrow_table = dataset.data.table
+
+    # Handle key columns
+    if key_columns is None:
+        # Auto-generate row index
+        arrow_table = _add_row_index_column(arrow_table)
+        key_schema = pa.schema([(DEFAULT_ROW_INDEX_COLUMN, pa.uint64())])
+    else:
+        key_cols = _validate_key_columns(arrow_table.schema, key_columns)
+        key_schema = _extract_key_schema(arrow_table.schema, key_cols)
+        arrow_table = _reorder_columns_keys_first(arrow_table, key_cols)
+
+    # Create table and write
+    table = project.create_table(table_name, key_schema=key_schema, exist_ok=exist_ok)
+    table.write(arrow_table, push_down_nulls=push_down_nulls)
+
+    return table
+
+
+def _ingest_iterable_dataset(
+    dataset: IterableDataset,
+    project: Project,
+    table_name: str,
+    key_columns: str | Sequence[str] | None,
+    batch_size: int,
+    commit_every: int,
+    push_down_nulls: bool,
+    exist_ok: bool,
+) -> Table:
+    """Ingest a streaming HuggingFace IterableDataset."""
+    # Infer key schema from features
+    features = dataset.features
+    if features is None:
+        raise ValueError(
+            "Cannot infer schema from IterableDataset without features. "
+            "Consider materializing the dataset first with dataset.take(n) or provide features."
+        )
+
+    arrow_schema = _features_to_arrow_schema(features)
+
+    # Determine key schema
+    if key_columns is None:
+        key_schema = pa.schema([(DEFAULT_ROW_INDEX_COLUMN, pa.uint64())])
+    else:
+        key_cols = _validate_key_columns(arrow_schema, key_columns)
+        key_schema = _extract_key_schema(arrow_schema, key_cols)
+
+    # Create table
+    table = project.create_table(table_name, key_schema=key_schema, exist_ok=exist_ok)
+
+    # Iterate with batching and transactions
+    row_offset = 0
+    tx = table.txn()
+    tx_ops = 0
+    batch_buffer: list[dict] = []
+
+    for row in dataset:
+        batch_buffer.append(row)
+
+        if len(batch_buffer) >= batch_size:
+            arrow_batch = _rows_to_arrow_table(batch_buffer, features, row_offset, key_columns)
+            tx.write(arrow_batch, push_down_nulls=push_down_nulls)
+            tx_ops += 1
+            row_offset += len(batch_buffer)
+            batch_buffer = []
+
+            if tx_ops >= commit_every:
+                tx.commit()
+                tx = table.txn()
+                tx_ops = 0
+
+    # Handle remaining rows
+    if batch_buffer:
+        arrow_batch = _rows_to_arrow_table(batch_buffer, features, row_offset, key_columns)
+        tx.write(arrow_batch, push_down_nulls=push_down_nulls)
+        tx_ops += 1
+
+    if tx_ops > 0:
+        tx.commit()
+
+    return table
+
+
+def _rows_to_arrow_table(
+    rows: list[dict],
+    features: Features,
+    offset: int,
+    key_columns: str | Sequence[str] | None,
+) -> pa.Table:
+    """Convert a list of row dicts to an Arrow table with proper schema."""
+    from datasets import Dataset
+
+    # Create a temporary Dataset to leverage HF's Arrow conversion
+    temp_dataset = Dataset.from_list(rows, features=features)
+    arrow_table = temp_dataset.data.table
+
+    # Handle key columns
+    if key_columns is None:
+        arrow_table = _add_row_index_column(arrow_table, offset)
+    else:
+        key_cols = _validate_key_columns(arrow_table.schema, key_columns)
+        arrow_table = _reorder_columns_keys_first(arrow_table, key_cols)
+
+    return arrow_table
+
+
+def _ingest_dataset_dict(
+    dataset_dict: DatasetDict,
+    project: Project,
+    table_name: str,
+    key_columns: str | Sequence[str] | None,
+    push_down_nulls: bool,
+    exist_ok: bool,
+) -> dict[str, Table]:
+    """Ingest a HuggingFace DatasetDict, creating one table per split."""
+    tables = {}
+    for split_name, dataset in dataset_dict.items():
+        split_table_name = f"{table_name}.{split_name}"
+        tables[split_name] = _ingest_in_memory_dataset(
+            dataset,
+            project,
+            split_table_name,
+            key_columns,
+            push_down_nulls,
+            exist_ok,
+        )
+    return tables
+
+
+def _ingest_iterable_dataset_dict(
+    dataset_dict: IterableDatasetDict,
+    project: Project,
+    table_name: str,
+    key_columns: str | Sequence[str] | None,
+    batch_size: int,
+    commit_every: int,
+    push_down_nulls: bool,
+    exist_ok: bool,
+) -> dict[str, Table]:
+    """Ingest a HuggingFace IterableDatasetDict, creating one table per split."""
+    tables = {}
+    for split_name, dataset in dataset_dict.items():
+        split_table_name = f"{table_name}.{split_name}"
+        tables[split_name] = _ingest_iterable_dataset(
+            dataset,
+            project,
+            split_table_name,
+            key_columns,
+            batch_size,
+            commit_every,
+            push_down_nulls,
+            exist_ok,
+        )
+    return tables
+
+
+def ingest_dataset(
+    dataset: Dataset | IterableDataset | DatasetDict | IterableDatasetDict,
+    project: Project,
+    table_name: str,
+    *,
+    key_columns: str | Sequence[str] | None = None,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+    commit_every: int = DEFAULT_COMMIT_EVERY,
+    push_down_nulls: bool = True,
+    exist_ok: bool = False,
+) -> Table | dict[str, Table]:
+    """
+    Ingest a HuggingFace dataset into Spiral.
+
+    Args:
+        dataset: A HuggingFace Dataset, IterableDataset, DatasetDict, or IterableDatasetDict.
+        project: The Spiral project to create the table(s) in.
+        table_name: Base name for the table. For DatasetDict, tables are created as
+            `{table_name}.{split}` (e.g., `my_dataset.train`, `my_dataset.test`).
+        key_columns: Column(s) to use as the primary key. If None, a `__row_idx__`
+            column is auto-generated as a uint64 key.
+        batch_size: Number of rows to buffer before writing (for streaming datasets).
+            Default is 100,000 (matching fineweb.py pattern).
+        commit_every: Number of write operations before committing a transaction.
+            Default is 25 (matching fineweb.py pattern).
+        push_down_nulls: Whether to push down nullable structs to children.
+        exist_ok: If True, allow writing to existing tables.
+
+    Returns:
+        A single Table for Dataset/IterableDataset, or a dict mapping split names
+        to Tables for DatasetDict/IterableDatasetDict.
+
+    Raises:
+        ImportError: If `datasets` package is not installed.
+        ValueError: If key_columns don't exist in the dataset schema.
+        TypeError: If dataset is not a supported HuggingFace type.
+
+    Examples:
+        Basic ingestion with auto-generated key:
+
+        >>> from datasets import load_dataset
+        >>> from spiral import Spiral
+        >>> from spiral.huggingface import ingest_dataset
+        >>> sp = Spiral()
+        >>> project = sp.project("my-project")
+        >>> hf_ds = load_dataset("squad", split="train")
+        >>> table = ingest_dataset(hf_ds, project, "squad_train")
+
+        With custom key column:
+
+        >>> table = ingest_dataset(hf_ds, project, "squad_train", key_columns="id")
+
+        DatasetDict creates multiple tables:
+
+        >>> hf_dict = load_dataset("squad")  # Returns DatasetDict
+        >>> tables = ingest_dataset(hf_dict, project, "squad")
+        >>> tables["train"]  # squad.train table
+        >>> tables["validation"]  # squad.validation table
+    """
+    _check_huggingface_installed()
+
+    from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+
+    if isinstance(dataset, DatasetDict):
+        return _ingest_dataset_dict(
+            dataset,
+            project,
+            table_name,
+            key_columns,
+            push_down_nulls,
+            exist_ok,
+        )
+    elif isinstance(dataset, IterableDatasetDict):
+        return _ingest_iterable_dataset_dict(
+            dataset,
+            project,
+            table_name,
+            key_columns,
+            batch_size,
+            commit_every,
+            push_down_nulls,
+            exist_ok,
+        )
+    elif isinstance(dataset, Dataset):
+        return _ingest_in_memory_dataset(
+            dataset,
+            project,
+            table_name,
+            key_columns,
+            push_down_nulls,
+            exist_ok,
+        )
+    elif isinstance(dataset, IterableDataset):
+        return _ingest_iterable_dataset(
+            dataset,
+            project,
+            table_name,
+            key_columns,
+            batch_size,
+            commit_every,
+            push_down_nulls,
+            exist_ok,
+        )
+    else:
+        raise TypeError(
+            f"Unsupported dataset type: {type(dataset).__name__}. "
+            "Expected Dataset, IterableDataset, DatasetDict, or IterableDatasetDict."
+        )
+
+
+# =============================================================================
+# Spiral -> HuggingFace conversion
+# =============================================================================
+
+
+def to_iterable_dataset(stream: pa.RecordBatchReader) -> IterableDataset:
+    """
+    Convert a PyArrow RecordBatchReader to a HuggingFace IterableDataset.
+
+    This is typically accessed via scan.to_iterable_dataset() rather than directly.
+
+    Args:
+        stream: A PyArrow RecordBatchReader, typically from a Spiral scan.
+
+    Returns:
+        A HuggingFace IterableDataset that yields rows from the stream.
+
+    Example:
+        >>> from spiral import Spiral
+        >>> sp = Spiral()
+        >>> table = sp.project("my-project").table("my-table")
+        >>> scan = sp.scan(table)
+        >>> hf_dataset = scan.to_iterable_dataset()  # Uses this function internally
+    """
+    _check_huggingface_installed()
+
+    from datasets import DatasetInfo, Features
+    from datasets.builder import ArrowExamplesIterable
+    from datasets.iterable_dataset import IterableDataset as HFIterableDataset
+
+    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+        # This key is unused when training with IterableDataset.
+        # Default implementation returns shard id, e.g. parquet row group id.
+        for i, rb in enumerate(stream):
+            yield i, pa.Table.from_batches([rb], stream.schema)
+
+    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
+    class _IterableDataset(HFIterableDataset):
+        # Diff with datasets.iterable_dataset.IterableDataset:
+        # - Removes torch handling which attempts to handle worker processes.
+        # - Assumes arrow iterator.
+        def __iter__(self):
+            from datasets.formatting import get_formatter
+
+            prepared_ex_iterable = self._prepare_ex_iterable_for_iteration()
+            if self._formatting and (prepared_ex_iterable.iter_arrow or self._formatting.is_table):
+                formatter = get_formatter(self._formatting.format_type, features=self.features)
+                iterator = prepared_ex_iterable.iter_arrow()
+                for key, pa_table in iterator:
+                    yield formatter.format_row(pa_table)
+                return
+
+            for key, example in prepared_ex_iterable:
+                # no need to format thanks to FormattedExamplesIterable
+                yield example
+
+        def map(self, *args, **kwargs):
+            # Map constructs a new IterableDataset, so we need to "patch" it
+            base = super().map(*args, **kwargs)
+            if isinstance(base, HFIterableDataset):
+                # Patch __iter__ to avoid torch handling
+                base.__class__ = _IterableDataset  # type: ignore
+            return base
+
+    class _ArrowExamplesIterable(ArrowExamplesIterable):
+        def __init__(
+            self,
+            generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]],
+            features: Features,
+        ):
+            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
+            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
+            self._features = features
+
+        @property
+        def is_typed(self) -> bool:
+            return True
+
+        @property
+        def features(self) -> Features:
+            return self._features
+
+    target_features = Features.from_arrow_schema(stream.schema)
+    ex_iterable = _ArrowExamplesIterable(_generate_tables, target_features)
+    info = DatasetInfo(features=target_features)
+    return _IterableDataset(ex_iterable=ex_iterable, info=info)
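Taken together, the new module supports a round trip between HuggingFace and Spiral. A hedged usage sketch based only on the docstrings above ("my-project" is a placeholder; sp.scan(table) and scan.to_iterable_dataset() are taken from the to_iterable_dataset example):

from datasets import load_dataset
from spiral import Spiral
from spiral.huggingface import ingest_dataset

sp = Spiral()
project = sp.project("my-project")  # placeholder project name

# Streaming ingest: with the defaults (batch_size=100_000, commit_every=25),
# a transaction commits roughly every 2.5 million rows.
streaming = load_dataset("squad", split="train", streaming=True)
table = ingest_dataset(streaming, project, "squad_train", key_columns="id")

# ...and back out as a HuggingFace IterableDataset for training.
scan = sp.scan(table)
for example in scan.to_iterable_dataset():
    break  # each example is a row dict formatted from the Arrow stream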