pyspiral-0.8.9-cp311-abi3-macosx_11_0_arm64.whl
This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- pyspiral-0.8.9.dist-info/METADATA +53 -0
- pyspiral-0.8.9.dist-info/RECORD +114 -0
- pyspiral-0.8.9.dist-info/WHEEL +4 -0
- pyspiral-0.8.9.dist-info/entry_points.txt +3 -0
- spiral/__init__.py +55 -0
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +411 -0
- spiral/api/__init__.py +78 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +165 -0
- spiral/api/filesystems.py +152 -0
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/organizations.py +78 -0
- spiral/api/projects.py +219 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +23 -0
- spiral/api/workers.py +40 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +202 -0
- spiral/cli/__init__.py +89 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +33 -0
- spiral/cli/app.py +108 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +109 -0
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +103 -0
- spiral/cli/login.py +25 -0
- spiral/cli/orgs.py +81 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +148 -0
- spiral/cli/state.py +7 -0
- spiral/cli/tables.py +225 -0
- spiral/cli/telemetry.py +17 -0
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +50 -0
- spiral/cli/workloads.py +86 -0
- spiral/client.py +279 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/authn/__init__.pyi +21 -0
- spiral/core/client/__init__.pyi +270 -0
- spiral/core/config/__init__.pyi +35 -0
- spiral/core/expr/__init__.pyi +15 -0
- spiral/core/expr/images/__init__.pyi +3 -0
- spiral/core/expr/list_/__init__.pyi +4 -0
- spiral/core/expr/pushdown/__init__.pyi +3 -0
- spiral/core/expr/refs/__init__.pyi +4 -0
- spiral/core/expr/s3/__init__.pyi +3 -0
- spiral/core/expr/str_/__init__.pyi +3 -0
- spiral/core/expr/struct_/__init__.pyi +6 -0
- spiral/core/expr/text/__init__.pyi +5 -0
- spiral/core/expr/udf/__init__.pyi +14 -0
- spiral/core/expr/video/__init__.pyi +3 -0
- spiral/core/table/__init__.pyi +142 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +58 -0
- spiral/core/table/spec/__init__.pyi +214 -0
- spiral/dataloader.py +310 -0
- spiral/dataset.py +264 -0
- spiral/datetime_.py +27 -0
- spiral/debug/__init__.py +0 -0
- spiral/debug/manifests.py +103 -0
- spiral/debug/metrics.py +56 -0
- spiral/debug/scan.py +266 -0
- spiral/demo.py +100 -0
- spiral/enrichment.py +290 -0
- spiral/expressions/__init__.py +274 -0
- spiral/expressions/base.py +186 -0
- spiral/expressions/file.py +17 -0
- spiral/expressions/http.py +17 -0
- spiral/expressions/list_.py +77 -0
- spiral/expressions/pushdown.py +12 -0
- spiral/expressions/s3.py +16 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +225 -0
- spiral/expressions/udf.py +66 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg.py +31 -0
- spiral/iterable_dataset.py +106 -0
- spiral/key_space_index.py +44 -0
- spiral/project.py +247 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
- spiral/protogen/_/google/__init__.py +0 -0
- spiral/protogen/_/google/protobuf/__init__.py +2310 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +190 -0
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +6196 -0
- spiral/protogen/_/substrait/extensions/__init__.py +169 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/scan.py +383 -0
- spiral/server.py +37 -0
- spiral/settings.py +36 -0
- spiral/snapshot.py +61 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +133 -0
- spiral/streaming_/stream.py +156 -0
- spiral/substrait_.py +274 -0
- spiral/table.py +216 -0
- spiral/text_index.py +17 -0
- spiral/transaction.py +156 -0
- spiral/types_.py +6 -0
spiral/enrichment.py
ADDED
@@ -0,0 +1,290 @@
from __future__ import annotations

import dataclasses
import logging
from functools import partial
from typing import TYPE_CHECKING

from spiral.core.client import Shard
from spiral.core.table import KeyRange
from spiral.core.table.spec import Key, Operation
from spiral.expressions import Expr

if TYPE_CHECKING:
    import dask.distributed

    from spiral import Scan, Table

logger = logging.getLogger(__name__)


class Enrichment:
    """
    An enrichment is used to derive new columns from the existing ones, such as fetching data from object
    storage with `se.s3.get` or computing embeddings. With the column group design supporting hundreds of
    thousands of columns, horizontally expanding tables are a powerful primitive.

    NOTE: Spiral aims to optimize enrichments where the source and destination table are the same.
    """

    def __init__(
        self,
        table: Table,
        projection: Expr,
        where: Expr | None,
    ):
        self._table = table
        self._projection = projection
        self._where = where

    @property
    def table(self) -> Table:
        """The table to write back into."""
        return self._table

    @property
    def projection(self) -> Expr:
        """The projection expression."""
        return self._projection

    @property
    def where(self) -> Expr | None:
        """The filter expression."""
        return self._where

    def _scan(self) -> Scan:
        return self._table.spiral.scan(self._projection, where=self._where)

    def apply(
        self,
        *,
        txn_dump: str | None = None,
    ) -> None:
        """Apply the enrichment onto the table in a streaming fashion.

        For large tables, consider using `apply_dask` for distributed execution.

        Args:
            txn_dump: Optional path to dump the transaction JSON for debugging.
        """

        txn = self._table.txn()

        txn.writeback(self._scan())

        if txn.is_empty():
            logger.warning("Transaction not committed. No rows were read for enrichment.")
            return

        txn.commit(txn_dump=txn_dump)

    def apply_dask(
        self,
        *,
        max_task_size: int | None = None,
        checkpoint_dump: str | None = None,
        shards: list[Shard] | None = None,
        txn_dump: str | None = None,
        client: dask.distributed.Client | None = None,
        **kwargs,
    ) -> None:
        """Use distributed Dask to apply the enrichment. Requires `dask[distributed]` to be installed.

        If the "address" of an existing Dask cluster is not provided in `kwargs`, a local cluster will be created.

        Dask execution has some limitations, e.g. UDFs are not currently supported. These limitations
        usually manifest as serialization errors when Dask workers attempt to serialize the state. If you are
        encountering such issues, consider splitting the enrichment into a UDF-only derivation that is
        executed in a streaming fashion, followed by a Dask enrichment for the rest of the computation.
        If that is not possible, please reach out to support for assistance.

        How shards are determined:
        - If `shards` is provided, those will be used directly.
        - Else, if `checkpoint_dump` is provided, shards will be loaded from the checkpoint.
        - Else, if `max_task_size` is provided, shards will be created based on the task size.
        - Else, the scan's default sharding will be used.

        Args:
            max_task_size: Optional task size limit, in number of rows. Used for sharding.
                If provided and a checkpoint is present, the checkpoint shards will be used instead.
                If not provided, the scan's default sharding will be used.
            checkpoint_dump: Optional path to dump intermediate checkpoints for incremental progress.
            shards: Optional list of shards to process.
                If provided, `max_task_size` and `checkpoint_dump` are ignored.
            txn_dump: Optional path to dump the transaction JSON for debugging.
            client: Optional Dask distributed client. If not provided, a new client will be created.
            **kwargs: Additional keyword arguments to pass to `dask.distributed.Client`,
                such as `address` to connect to an existing cluster.
        """
        if client is None:
            try:
                from dask.distributed import Client
            except ImportError:
                raise ImportError("dask is not installed, please install dask[distributed] to use this feature.")

            # Connect before doing any work.
            client = Client(**kwargs)

        # Start a transaction BEFORE the planning scan.
        tx = self._table.txn()
        plan_scan = self._scan()

        # Determine the "tasks". Start from provided shards.
        task_shards = shards
        # If shards are not provided, try loading from checkpoint.
        if task_shards is None and checkpoint_dump is not None:
            checkpoint: list[KeyRange] | None = _checkpoint_load_key_ranges(checkpoint_dump)
            if checkpoint is None:
                logger.info(f"No existing checkpoint found at {checkpoint_dump}. Starting from scratch.")
            else:
                logger.info(f"Resuming enrichment from checkpoint at {checkpoint_dump} with {len(checkpoint)} ranges.")
                task_shards = [Shard(kr, None) for kr in checkpoint]
        # If still no shards, try creating from max task size.
        if task_shards is None and max_task_size is not None:
            task_shards = self._table.spiral.compute_shards(max_task_size, self.projection, self.where)
        # Fall back to the scan's default sharding.
        if task_shards is None:
            task_shards = plan_scan.shards()

        # Partially bind the enrichment function.
        _compute = partial(
            _enrichment_task,
            config_json=self._table.spiral.config.to_json(),
            state_json=plan_scan.core.plan_state().to_json(),
            output_table_id=self._table.table_id,
            incremental=checkpoint_dump is not None,
        )
        enrichments = client.map(_compute, task_shards)

        logger.info(f"Applying enrichment with {len(task_shards)} shards. Follow progress at {client.dashboard_link}")

        failed_ranges = []
        try:
            for result, shard in zip(client.gather(enrichments), task_shards):
                result: EnrichmentTaskResult

                if result.error is not None:
                    logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
                    failed_ranges.append(shard.key_range)
                    continue

                tx.include(result.ops)
        except Exception as e:
            # If not incremental, re-raise the exception.
            if checkpoint_dump is None:
                raise e

            # Handle worker failures (e.g., KilledWorker from Dask).
            from dask.distributed import KilledWorker

            if isinstance(e, KilledWorker):
                logger.error(f"Dask worker was killed during enrichment: {e}")

            # Try to gather partial results and mark remaining tasks as failed.
            for future, shard in zip(enrichments, task_shards):
                if future.done() and not future.exception():
                    try:
                        result = future.result()

                        if result.error is not None:
                            logger.error(f"Enrichment task failed for range {shard.key_range}: {result.error}")
                            failed_ranges.append(shard.key_range)
                            continue

                        tx.include(result.ops)
                    except Exception:
                        # Task failed or incomplete, add to failed ranges.
                        failed_ranges.append(shard.key_range)
                else:
                    # Task didn't complete, add to failed ranges.
                    failed_ranges.append(shard.key_range)

        # Dump a checkpoint of failed ranges, if any.
        if checkpoint_dump is not None:
            logger.info(
                f"Dumping checkpoint with failed {len(failed_ranges)}/{len(task_shards)} ranges to {checkpoint_dump}."
            )
            _checkpoint_dump_key_ranges(checkpoint_dump, failed_ranges)

        if tx.is_empty():
            logger.warning("Transaction not committed. No rows were read for enrichment.")
            return

        # Always compact in distributed enrichment.
        tx.commit(compact=True, txn_dump=txn_dump)


def _checkpoint_load_key_ranges(checkpoint_dump: str) -> list[KeyRange] | None:
    import json
    import os

    if not os.path.exists(checkpoint_dump):
        return None

    with open(checkpoint_dump) as f:
        data = json.load(f)
        return [
            KeyRange(begin=Key(bytes.fromhex(r["begin"])), end=Key(bytes.fromhex(r["end"])))
            for r in data.get("key_ranges", [])
        ]


def _checkpoint_dump_key_ranges(checkpoint_dump: str, ranges: list[KeyRange]):
    import json
    import os

    os.makedirs(os.path.dirname(checkpoint_dump), exist_ok=True)
    with open(checkpoint_dump, "w") as f:
        json.dump(
            {"key_ranges": [{"begin": bytes(r.begin).hex(), "end": bytes(r.end).hex()} for r in ranges]},
            f,
        )


@dataclasses.dataclass
class EnrichmentTaskResult:
    ops: list[Operation]
    error: str | None = None

    def __getstate__(self):
        return {
            "ops": [op.to_json() for op in self.ops],
            "error": self.error,
        }

    def __setstate__(self, state):
        self.ops = [Operation.from_json(op_json) for op_json in state["ops"]]
        self.error = state["error"]


# NOTE(marko): This function must be picklable!
def _enrichment_task(
    shard: Shard,
    *,
    config_json: str,
    state_json: str,
    output_table_id,
    incremental: bool,
) -> EnrichmentTaskResult:
    # Returns operations that can be included in a transaction.
    from spiral import Spiral
    from spiral.settings import ClientSettings

    config = ClientSettings.from_json(config_json)
    sp = Spiral(config=config)
    task_scan = sp.resume_scan(state_json)

    table = sp.table(output_table_id)
    task_tx = table.txn()

    try:
        task_tx.writeback(task_scan, shards=[shard])
        return EnrichmentTaskResult(ops=task_tx.take())
    except Exception as e:
        task_tx.abort()

        if incremental:
            return EnrichmentTaskResult(ops=[], error=str(e))

        logger.error(f"Enrichment task failed for shard {shard}: {e}")
        raise e
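For orientation, a minimal usage sketch of the new module follows. It is not part of the diff: the table name, the column reference syntax (`table["uri"]`), the no-argument `Spiral()` constructor, and the exact `se.s3.get` signature are assumptions; only `Enrichment`, `apply`, and `apply_dask` are defined in the file above.

# Sketch only: "demo.videos", the `uri` column, and the `se.s3.get` call shape
# are hypothetical; the Enrichment API itself comes from the file above.
import spiral.expressions as se
from spiral import Spiral
from spiral.enrichment import Enrichment

sp = Spiral()                    # assumes default credentials/config
table = sp.table("demo.videos")  # hypothetical table identifier

# Derive a new "bytes" column by fetching each row's object from storage,
# as the class docstring describes. Column reference syntax is assumed.
projection = se.pack({"bytes": se.s3.get(table["uri"])})

enrichment = Enrichment(table, projection, where=None)
enrichment.apply()  # streaming, single machine

# Or distribute over Dask with resumable checkpoints: failed key ranges are
# dumped as {"key_ranges": [{"begin": "<hex>", "end": "<hex>"}, ...]} and
# reloaded on the next run.
enrichment.apply_dask(
    max_task_size=100_000,                    # rows per task, drives sharding
    checkpoint_dump="/tmp/enrich.ckpt.json",  # incremental progress file
)

Note that `apply_dask` always commits with `compact=True`, while the streaming `apply` commits without compaction.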
spiral/expressions/__init__.py
ADDED

@@ -0,0 +1,274 @@
import builtins
import functools
import operator
import warnings
from typing import Any

import pyarrow as pa

from spiral import _lib, arrow_

from . import file as file
from . import http as http
from . import list_ as list
from . import s3 as s3
from . import str_ as str
from . import struct as struct
from . import text as text
from .base import Expr, ExprLike, NativeExpr
from .udf import UDF

__all__ = [
    "Expr",
    "add",
    "and_",
    "divide",
    "eq",
    "getitem",
    "gt",
    "gte",
    "is_not_null",
    "is_null",
    "lift",
    "list",
    "lt",
    "lte",
    "merge",
    "modulo",
    "multiply",
    "negate",
    "neq",
    "not_",
    "or_",
    "pack",
    "aux",
    "scalar",
    "select",
    "str",
    "struct",
    "subtract",
    "xor",
    "text",
    "s3",
    "http",
    "file",
    "UDF",
]

# Inline some of the struct expressions since they're so common.
getitem = struct.getitem
merge = struct.merge
pack = struct.pack
select = struct.select


def lift(expr: ExprLike) -> Expr:
    # Convert an ExprLike into an Expr.

    if isinstance(expr, Expr):
        return expr
    if isinstance(expr, NativeExpr):
        return Expr(expr)

    if isinstance(expr, dict):
        # NOTE: we assume this is a struct expression. We could be smarter and be context-aware to determine
        # if this is in fact a struct scalar, but the user can always create one of those manually.

        # First we un-nest any dot-separated field names.
        expr: dict = arrow_.nest_structs(expr)

        return pack({k: lift(v) for k, v in expr.items()})

    if isinstance(expr, builtins.list):
        return lift(pa.array(expr))

    # Unpack tables and chunked arrays.
    if isinstance(expr, pa.Table | pa.RecordBatch):
        expr = expr.to_struct_array()
    if isinstance(expr, pa.ChunkedArray):
        expr = expr.combine_chunks()

    # If the value is struct-like, we un-nest any dot-separated field names.
    if isinstance(expr, pa.StructArray | pa.StructScalar):
        # TODO(marko): Figure out what to do with nullable struct arrays when unpacking them.
        #  We need to merge struct validity into the child validity?
        if isinstance(expr, pa.StructArray) and expr.null_count != 0:
            # raise ValueError("lift: cannot lift a struct array with nulls.")
            warnings.warn("found a struct array with nulls", stacklevel=2)
        if isinstance(expr, pa.StructScalar) and not expr.is_valid:
            # raise ValueError("lift: cannot lift a struct scalar with nulls.")
            warnings.warn("found a struct scalar with nulls", stacklevel=2)
        return lift(arrow_.nest_structs(expr))

    if isinstance(expr, pa.Array):
        return Expr(_lib.expr.array_lit(expr))

    # Otherwise, assume it's a scalar.
    return scalar(expr)


def evaluate(expr: ExprLike) -> pa.RecordBatchReader:
    # TODO(marko): This implementation is currently minimal and most ExprLike-s fail.
    if isinstance(expr, pa.RecordBatchReader):
        return expr
    if isinstance(expr, pa.Table):
        return expr.to_reader()
    if isinstance(expr, pa.RecordBatch):
        return pa.RecordBatchReader.from_batches(expr.schema, [expr])
    if isinstance(expr, pa.StructArray):
        return pa.Table.from_struct_array(expr).to_reader()

    if isinstance(expr, pa.ChunkedArray):
        if not pa.types.is_struct(expr.type):
            raise ValueError("Arrow chunked array must be a struct type.")

        def _iter_batches():
            for chunk in expr.chunks:
                yield pa.RecordBatch.from_struct_array(chunk)

        return pa.RecordBatchReader.from_batches(pa.schema(expr.type.fields), _iter_batches())

    if isinstance(expr, pa.Array):
        raise ValueError("Arrow array must be a struct array.")

    if isinstance(expr, Expr) or isinstance(expr, NativeExpr):
        raise NotImplementedError(
            "Expr evaluation not supported yet. Use Arrow to write instead. Reach out if you require this feature."
        )

    if isinstance(expr, dict):
        # NOTE: we assume this is a struct expression. We could be smarter and be context-aware to determine
        # if this is in fact a struct scalar, but the user can always create one of those manually.

        # First we un-nest any dot-separated field names.
        expr: dict = arrow_.nest_structs(expr)
        return evaluate(arrow_.dict_to_table(expr))

    if isinstance(expr, builtins.list):
        return evaluate(pa.array(expr))

    if isinstance(expr, pa.Scalar):
        return evaluate(pa.array([expr]))

    # Otherwise, try scalar.
    return evaluate(scalar(expr))


def aux(name: builtins.str, dtype: pa.DataType) -> Expr:
    """Create a variable expression referencing a column in the auxiliary table.

    The auxiliary table is optionally given to the `Scan#to_record_batches` function when reading only
    specific keys or doing cell pushdown.

    Args:
        name: variable name.
        dtype: must match the dtype of the column in the auxiliary table.
    """
    return Expr(_lib.expr.aux(name, dtype))


def scalar(value: Any) -> Expr:
    """Create a scalar expression."""
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value)
    # TODO(marko): Use Vortex scalar instead of passing as array.
    return Expr(_lib.expr.scalar(pa.array([value.as_py()], type=value.type)))


def cast(expr: ExprLike, dtype: pa.DataType) -> Expr:
    """Cast an expression into another PyArrow DataType."""
    expr = lift(expr)
    return Expr(_lib.expr.cast(expr.__expr__, dtype))


def and_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a conjunction of one or more expressions."""
    return functools.reduce(operator.and_, [lift(e) for e in exprs], lift(expr))


def or_(expr: ExprLike, *exprs: ExprLike) -> Expr:
    """Create a disjunction of one or more expressions."""
    return functools.reduce(operator.or_, [lift(e) for e in exprs], lift(expr))


def eq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create an equality comparison."""
    return operator.eq(lift(lhs), rhs)


def neq(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a not-equal comparison."""
    return operator.ne(lift(lhs), rhs)


def xor(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a XOR comparison."""
    return operator.xor(lift(lhs), rhs)


def lt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than comparison."""
    return operator.lt(lift(lhs), rhs)


def lte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a less-than-or-equal comparison."""
    return operator.le(lift(lhs), rhs)


def gt(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than comparison."""
    return operator.gt(lift(lhs), rhs)


def gte(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Create a greater-than-or-equal comparison."""
    return operator.ge(lift(lhs), rhs)


def negate(expr: ExprLike) -> Expr:
    """Arithmetically negate the given expression."""
    return operator.neg(lift(expr))


def not_(expr: ExprLike) -> Expr:
    """Logically invert the given expression."""
    expr = lift(expr)
    return Expr(_lib.expr.not_(expr.__expr__))


def is_null(expr: ExprLike) -> Expr:
    """Check if the given expression is null."""
    expr = lift(expr)
    return Expr(_lib.expr.is_null(expr.__expr__))


def is_not_null(expr: ExprLike) -> Expr:
    """Check if the given expression is not null."""
    return not_(is_null(expr))


def add(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Add two expressions."""
    return operator.add(lift(lhs), rhs)


def subtract(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Subtract two expressions."""
    return operator.sub(lift(lhs), rhs)


def multiply(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Multiply two expressions."""
    return operator.mul(lift(lhs), rhs)


def divide(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Divide two expressions."""
    return operator.truediv(lift(lhs), rhs)


def modulo(lhs: ExprLike, rhs: ExprLike) -> Expr:
    """Modulo two expressions."""
    return operator.mod(lift(lhs), rhs)
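A short sketch of how these builders compose, using only names defined in the file above; the one assumption is that `Expr`'s operator overloads (defined in `.base`, not shown in this diff) accept plain Python scalars on the right-hand side, which is how `add`, `eq`, and friends defer their `rhs`.

# Sketch only: relies on Expr operator overloads accepting plain Python scalars.
import pyarrow as pa
import spiral.expressions as se

# lift() promotes ExprLike values: dicts become struct expressions (dot-separated
# names are un-nested), Python lists become Arrow array literals, and anything
# else falls through to scalar().
nums = se.lift([1, 2, 3])
user = se.lift({"user.id": [1, 2, 3], "user.score": [0.5, 0.9, 0.1]})  # struct {"user": {...}}

# Comparison and arithmetic helpers lift the left side and hand the right side
# to Expr's operator overloads.
doubled = se.multiply(nums, 2)
in_range = se.and_(se.gte(nums, 1), se.lt(nums, 3))
non_null = se.is_not_null(nums)

# cast() converts to another PyArrow DataType (defined above, though not in __all__).
as_float = se.cast(nums, pa.float64())

Note that `evaluate` currently only materializes Arrow-backed inputs into a `RecordBatchReader`; passing an `Expr` raises `NotImplementedError`, per the TODO in the file.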