vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Late-materialization fixture.
|
|
4
|
+
|
|
5
|
+
Exercises DuckDB's late-materialization optimizer end-to-end against a VGI
|
|
6
|
+
worker. When ``Meta.late_materialization`` is advertised and the table has a
|
|
7
|
+
rowid virtual column, a ``TOP_N`` / ``LIMIT`` / ``SAMPLE`` over the scan is
|
|
8
|
+
rewritten by DuckDB into a SEMI join on the rowid: a narrow ordering scan
|
|
9
|
+
selects survivors, then the wide scan re-fetches their columns with the
|
|
10
|
+
surviving rowids pushed down as a filter.
|
|
11
|
+
|
|
12
|
+
Schema ``(row_id int64 [is_row_id], ord int64, payload utf8, pushed utf8)``:
|
|
13
|
+
|
|
14
|
+
* ``row_id == row index`` — unique, deterministic, and snapshot-stable, so a
|
|
15
|
+
rowid emitted by the ordering scan resolves to the same logical row in the
|
|
16
|
+
(independent) wide scan, even across worker processes. This satisfies the
|
|
17
|
+
late-materialization worker contract.
|
|
18
|
+
* ``ord`` is a *scrambled* function of the index so a Top-N on ``ord`` yields
|
|
19
|
+
scattered survivor rowids — that drives the exact ``IN``-list pushdown path
|
|
20
|
+
(DuckDB only builds an ``IN`` list for ``2..dynamic_or_filter_threshold``
|
|
21
|
+
survivors; above that it pushes a rowid min/max range instead).
|
|
22
|
+
* ``payload`` is the wide column whose materialization the rewrite avoids.
|
|
23
|
+
* ``pushed`` is the **witness**: it echoes, per row, the rowid filter the
|
|
24
|
+
worker received (``in=<n>`` join keys, ``rng=<lo>..<hi>`` bounds). Because
|
|
25
|
+
the rewrite's output columns come from the *wide* scan, selecting ``pushed``
|
|
26
|
+
unambiguously reports what was pushed to that scan. This works over both
|
|
27
|
+
subprocess and HTTP transports (unlike in-band ``client_log``).
|
|
28
|
+
|
|
29
|
+
``dup_row_id=True`` deliberately violates the uniqueness invariant (row_id =
|
|
30
|
+
index // 2) to back the negative gating test. ``null_ord_stride>0`` injects
|
|
31
|
+
NULLs into ``ord`` for the NULL-ordering test.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
from typing import Annotated, Any
|
|
38
|
+
|
|
39
|
+
import pyarrow as pa
|
|
40
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
41
|
+
from vgi_rpc.rpc import OutputCollector
|
|
42
|
+
|
|
43
|
+
from vgi._test_fixtures.table._common import _cardinality_from_count
|
|
44
|
+
from vgi.arguments import Arg
|
|
45
|
+
from vgi.invocation import BindResponse
|
|
46
|
+
from vgi.metadata import FunctionExample
|
|
47
|
+
from vgi.table_filter_pushdown import PushdownFilters
|
|
48
|
+
from vgi.table_function import (
|
|
49
|
+
BindParams,
|
|
50
|
+
ProcessParams,
|
|
51
|
+
TableFunctionGenerator,
|
|
52
|
+
init_single_worker,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Field name of the rowid column; the C++ extension resolves a pushed rowid
|
|
56
|
+
# filter to this name on the wire, so the worker matches it by name.
|
|
57
|
+
_ROWID_NAME = "row_id"
|
|
58
|
+
|
|
59
|
+
# Scramble multiplier (odd, coprime with any reasonable count) used to turn the
|
|
60
|
+
# monotonic index into a scattered ordering key.
|
|
61
|
+
_SCRAMBLE = 2654435761
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _scramble_ord(index: int) -> int:
|
|
65
|
+
"""Deterministic, scattered ordering key for a given row index."""
|
|
66
|
+
return (index * _SCRAMBLE) % 1_000_000_007
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _rowid_pushdown_witness(filters: PushdownFilters | None) -> str:
|
|
70
|
+
"""Summarize the rowid filter the worker received as a stable string.
|
|
71
|
+
|
|
72
|
+
``in=<n>`` — total number of rowid ``IN``-list (join-key) values.
|
|
73
|
+
``rng=<lo>..<hi>`` — min/max rowid range bounds, or ``none`` if absent.
|
|
74
|
+
"""
|
|
75
|
+
if filters is None:
|
|
76
|
+
return "rid:in=0;rng=none"
|
|
77
|
+
|
|
78
|
+
from vgi.table_filter_pushdown import AndFilter, ConstantFilter, InFilter, OrFilter
|
|
79
|
+
|
|
80
|
+
in_count = 0
|
|
81
|
+
lo: Any = None
|
|
82
|
+
hi: Any = None
|
|
83
|
+
|
|
84
|
+
def walk(f: object) -> None:
|
|
85
|
+
nonlocal in_count, lo, hi
|
|
86
|
+
if isinstance(f, (AndFilter, OrFilter)):
|
|
87
|
+
for child in f.children:
|
|
88
|
+
walk(child)
|
|
89
|
+
elif isinstance(f, InFilter) and f.column_name == _ROWID_NAME:
|
|
90
|
+
in_count += len(f.values)
|
|
91
|
+
elif isinstance(f, ConstantFilter) and f.column_name == _ROWID_NAME:
|
|
92
|
+
sym = f.op.symbol
|
|
93
|
+
if sym.startswith(">"):
|
|
94
|
+
lo = f.value if lo is None else min(lo, f.value)
|
|
95
|
+
elif sym.startswith("<"):
|
|
96
|
+
hi = f.value if hi is None else max(hi, f.value)
|
|
97
|
+
elif sym == "=":
|
|
98
|
+
lo = hi = f.value
|
|
99
|
+
|
|
100
|
+
for f in filters.filters:
|
|
101
|
+
walk(f)
|
|
102
|
+
|
|
103
|
+
rng = f"{lo}..{hi}" if (lo is not None or hi is not None) else "none"
|
|
104
|
+
return f"rid:in={in_count};rng={rng}"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass(slots=True, frozen=True)
|
|
108
|
+
class LateMaterializationFunctionArgs:
|
|
109
|
+
"""Arguments for LateMaterializationFunction."""
|
|
110
|
+
|
|
111
|
+
count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
|
|
112
|
+
batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
|
|
113
|
+
dup_row_id: Annotated[
|
|
114
|
+
bool,
|
|
115
|
+
Arg("dup_row_id", default=False, doc="Emit a deliberately non-unique row_id (index // 2)"),
|
|
116
|
+
]
|
|
117
|
+
null_ord_stride: Annotated[
|
|
118
|
+
int,
|
|
119
|
+
Arg("null_ord_stride", default=0, doc="Emit NULL ord every Nth row (0 = never)", ge=0),
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass(kw_only=True)
class LateMaterializationState(ArrowSerializableDataclass):
    """Mutable scan state: rows left, next row index, and the latched witness.

    ``witness`` is a regular serialized field (not Transient) so that the HTTP
    rehydrate path — which deserializes user state without calling
    ``initial_state`` again — keeps the observed pushdown filters across
    state-token round-trips.
    """

    # Number of rows still to be emitted.
    remaining: int
    # Index of the next row to emit.
    current_index: int = 0
    # Witness string echoed in the "pushed" column; defaults to "no rowid filter seen".
    witness: str = "rid:in=0;rng=none"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@init_single_worker
@_cardinality_from_count
class LateMaterializationFunction(TableFunctionGenerator[LateMaterializationFunctionArgs, LateMaterializationState]):
    """Rowid-bearing generator that participates in late materialization.

    SCHEMA
    ------
    Output: {"row_id": int64 [is_row_id], "ord": int64, "payload": utf8,
             "pushed": utf8}

    Example:
    -------
    SELECT row_id, payload FROM late_materialization(100000) ORDER BY ord LIMIT 10

    """

    FunctionArguments = LateMaterializationFunctionArgs

    class Meta:
        """Metadata for LateMaterializationFunction."""

        name = "late_materialization"
        description = "Rowid generator that participates in late materialization"
        categories = ["generator", "diagnostic"]
        projection_pushdown = True
        filter_pushdown = True
        auto_apply_filters = True
        late_materialization = True
        examples = [
            FunctionExample(
                sql="SELECT row_id, payload FROM late_materialization(100000) ORDER BY ord LIMIT 10",
                description="Top-N is late-materialized: payload fetched only for survivors",
            ),
        ]

    @classmethod
    def on_bind(cls, params: BindParams[LateMaterializationFunctionArgs]) -> BindResponse:
        """Return the bind response carrying the four-column output schema.

        The ``row_id`` field is tagged with ``is_row_id`` field metadata; the
        other fields are ``ord`` (int64), ``payload`` (utf8) and ``pushed`` (utf8).
        """
        output_schema = pa.schema(
            [
                pa.field(_ROWID_NAME, pa.int64(), metadata={b"is_row_id": b""}),
                pa.field("ord", pa.int64()),
                pa.field("payload", pa.utf8()),
                pa.field("pushed", pa.utf8()),
            ]
        )
        return BindResponse(output_schema=output_schema)

    @classmethod
    def initial_state(cls, params: ProcessParams[LateMaterializationFunctionArgs]) -> LateMaterializationState:
        """Seed the scan state and record any rowid filter present at init time.

        When the init call carries ``pushdown_filters``, they are parsed
        (together with the init call's join keys) and summarized into the
        witness. Otherwise the witness starts as "no rowid filter seen".
        ``process()`` may still replace it with a filter observed per tick.
        """
        call = params.init_call
        if call is None or call.pushdown_filters is None:
            return LateMaterializationState(remaining=params.args.count, witness="rid:in=0;rng=none")

        parsed = cls.pushdown_filters(call.pushdown_filters, join_keys=call.join_keys)
        return LateMaterializationState(
            remaining=params.args.count,
            witness=_rowid_pushdown_witness(parsed),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[LateMaterializationFunctionArgs],
        state: LateMaterializationState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of rows, limited to the projected columns.

        Each tick first summarizes ``params.current_pushdown_filters`` into a
        witness. A non-empty tick witness always replaces the stored one; an
        empty tick witness is only stored while nothing has been latched yet,
        so a filter captured earlier (at init or on a previous tick) is never
        clobbered by a tick that carries no rowid filter.
        """
        no_filter = "rid:in=0;rng=none"
        observed = _rowid_pushdown_witness(params.current_pushdown_filters)
        # Skip the update only when this tick is empty AND something is already latched.
        if not (observed == no_filter and state.witness != no_filter):
            state.witness = observed

        if state.remaining <= 0:
            out.finish()
            return

        size = min(state.remaining, params.args.batch_size)
        first = state.current_index
        indices = range(first, first + size)
        stride = params.args.null_ord_stride

        # Build only the columns present in the (possibly projected) output schema.
        columns: dict[str, list[Any]] = {}
        for column in params.output_schema:
            name = column.name
            if name == _ROWID_NAME:
                if params.args.dup_row_id:
                    # Deliberately non-unique rowids: two consecutive rows share one id.
                    columns[_ROWID_NAME] = [i // 2 for i in indices]
                else:
                    columns[_ROWID_NAME] = list(indices)
            elif name == "ord":
                columns["ord"] = [None if (stride > 0 and i % stride == 0) else _scramble_ord(i) for i in indices]
            elif name == "payload":
                columns["payload"] = [f"payload_{i}" for i in indices]
            elif name == "pushed":
                # Every row of the batch echoes the currently latched witness.
                columns["pushed"] = [state.witness] * size

        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))

        state.current_index += size
        state.remaining -= size
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""make_series_* generators (count/range/step/csv/float)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Annotated, ClassVar
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
12
|
+
from vgi_rpc.rpc import OutputCollector
|
|
13
|
+
|
|
14
|
+
from vgi.arguments import Arg
|
|
15
|
+
from vgi.metadata import FunctionExample
|
|
16
|
+
from vgi.schema_utils import schema
|
|
17
|
+
from vgi.table_function import (
|
|
18
|
+
ProcessParams,
|
|
19
|
+
TableFunctionGenerator,
|
|
20
|
+
bind_fixed_schema,
|
|
21
|
+
init_single_worker,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# ============================================================================
|
|
25
|
+
|
|
26
|
+
# Output schema shared by the integer make_series overloads (count/range/step/csv):
# built with the project's schema() helper from a single ``value`` entry typed int64.
MAKE_SERIES_SCHEMA = schema(value=pa.int64())
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(kw_only=True)
|
|
30
|
+
class MakeSeriesCountArgs:
|
|
31
|
+
"""Arguments for MakeSeriesCountFunction."""
|
|
32
|
+
|
|
33
|
+
count: Annotated[int, Arg(0, doc="Number of values to generate", ge=0)]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(kw_only=True)
|
|
37
|
+
class MakeSeriesRangeArgs:
|
|
38
|
+
"""Arguments for MakeSeriesRangeFunction."""
|
|
39
|
+
|
|
40
|
+
start: Annotated[int, Arg(0, doc="Start value (inclusive)")]
|
|
41
|
+
stop: Annotated[int, Arg(1, doc="Stop value (exclusive)")]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(kw_only=True)
|
|
45
|
+
class MakeSeriesStepArgs:
|
|
46
|
+
"""Arguments for MakeSeriesStepFunction."""
|
|
47
|
+
|
|
48
|
+
start: Annotated[int, Arg(0, doc="Start value (inclusive)")]
|
|
49
|
+
stop: Annotated[int, Arg(1, doc="Stop value (exclusive)")]
|
|
50
|
+
step: Annotated[int, Arg(2, doc="Step between values", ge=1)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(kw_only=True)
class MakeSeriesState(ArrowSerializableDataclass):
    """Mutable state shared by the integer make_series functions."""

    # Every value of the series, computed up front by initial_state().
    values: list[int]
    # Index into ``values`` of the next value to emit.
    offset: int = 0
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _make_series_emit(state: MakeSeriesState, out: OutputCollector) -> None:
|
|
62
|
+
"""Shared process logic for all make_series overloads."""
|
|
63
|
+
if state.offset >= len(state.values):
|
|
64
|
+
out.finish()
|
|
65
|
+
return
|
|
66
|
+
batch_values = state.values[state.offset : state.offset + 1024]
|
|
67
|
+
out.emit(pa.RecordBatch.from_pydict({"value": batch_values}, schema=MAKE_SERIES_SCHEMA))
|
|
68
|
+
state.offset += len(batch_values)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@init_single_worker
@bind_fixed_schema
class MakeSeriesCountFunction(TableFunctionGenerator[MakeSeriesCountArgs, MakeSeriesState]):
    """Generate a series of integers from 0 to count-1.

    Example:
        SELECT * FROM make_series(5)
        Returns: 0, 1, 2, 3, 4

    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = MAKE_SERIES_SCHEMA

    class Meta:
        """Function metadata."""

        name = "make_series"
        description = "Generate integers from 0 to count-1"
        examples = [
            FunctionExample(
                sql="SELECT * FROM make_series(5)",
                description="Generate 0..4",
            ),
        ]

    @classmethod
    def initial_state(cls, params: ProcessParams[MakeSeriesCountArgs]) -> MakeSeriesState:
        """Materialize ``0 .. count-1`` up front; emission starts at offset 0."""
        series = list(range(params.args.count))
        return MakeSeriesState(values=series)

    @classmethod
    def process(cls, params: ProcessParams[MakeSeriesCountArgs], state: MakeSeriesState, out: OutputCollector) -> None:
        """Emit the next batch of values via the shared make_series helper."""
        _make_series_emit(state, out)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@init_single_worker
@bind_fixed_schema
class MakeSeriesRangeFunction(TableFunctionGenerator[MakeSeriesRangeArgs, MakeSeriesState]):
    """Generate a series of integers from start to stop-1.

    Example:
        SELECT * FROM make_series(3, 7)
        Returns: 3, 4, 5, 6

    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = MAKE_SERIES_SCHEMA

    class Meta:
        """Function metadata."""

        name = "make_series"
        description = "Generate integers from start to stop-1"
        examples = [
            FunctionExample(
                sql="SELECT * FROM make_series(3, 7)",
                description="Generate 3..6",
            ),
        ]

    @classmethod
    def initial_state(cls, params: ProcessParams[MakeSeriesRangeArgs]) -> MakeSeriesState:
        """Materialize ``start .. stop-1`` up front; emission starts at offset 0."""
        args = params.args
        series = list(range(args.start, args.stop))
        return MakeSeriesState(values=series)

    @classmethod
    def process(cls, params: ProcessParams[MakeSeriesRangeArgs], state: MakeSeriesState, out: OutputCollector) -> None:
        """Emit the next batch of values via the shared make_series helper."""
        _make_series_emit(state, out)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@init_single_worker
@bind_fixed_schema
class MakeSeriesStepFunction(TableFunctionGenerator[MakeSeriesStepArgs, MakeSeriesState]):
    """Generate a series of integers from start to stop-1 with step.

    Example:
        SELECT * FROM make_series(0, 10, 3)
        Returns: 0, 3, 6, 9

    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = MAKE_SERIES_SCHEMA

    class Meta:
        """Function metadata."""

        name = "make_series"
        description = "Generate integers from start to stop-1 with step"
        examples = [
            FunctionExample(
                sql="SELECT * FROM make_series(0, 10, 3)",
                description="Generate 0, 3, 6, 9",
            ),
        ]

    @classmethod
    def initial_state(cls, params: ProcessParams[MakeSeriesStepArgs]) -> MakeSeriesState:
        """Materialize ``range(start, stop, step)`` up front; emission starts at offset 0."""
        args = params.args
        series = list(range(args.start, args.stop, args.step))
        return MakeSeriesState(values=series)

    @classmethod
    def process(cls, params: ProcessParams[MakeSeriesStepArgs], state: MakeSeriesState, out: OutputCollector) -> None:
        """Emit the next batch of values via the shared make_series helper."""
        _make_series_emit(state, out)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# ============================================================================
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass(kw_only=True)
|
|
183
|
+
class MakeSeriesCsvArgs:
|
|
184
|
+
"""Arguments for MakeSeriesCsvFunction."""
|
|
185
|
+
|
|
186
|
+
values: Annotated[str, Arg(0, doc="Comma-separated integers")]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@init_single_worker
@bind_fixed_schema
class MakeSeriesCsvFunction(TableFunctionGenerator[MakeSeriesCsvArgs, MakeSeriesState]):
    """Parse a CSV string of integers into rows.

    Example:
        SELECT * FROM make_series('10,20,30')
        Returns: 10, 20, 30

    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = MAKE_SERIES_SCHEMA

    class Meta:
        """Function metadata."""

        name = "make_series"
        description = "Parse comma-separated integers into rows"

    @classmethod
    def initial_state(cls, params: ProcessParams[MakeSeriesCsvArgs]) -> MakeSeriesState:
        """Parse the comma-separated argument into the value list.

        Whitespace around each token is ignored. A blank argument (empty or
        whitespace only) yields an empty series instead of an error.

        Raises:
            ValueError: if a token of a non-blank argument is not an integer
                literal (this includes empty tokens such as in ``"1,,2"``).
        """
        text = params.args.values
        if not text.strip():
            # "".split(",") returns [""], and int("") raises ValueError; treat
            # blank input as "no values" rather than surfacing that error.
            return MakeSeriesState(values=[])
        return MakeSeriesState(values=[int(token.strip()) for token in text.split(",")])

    @classmethod
    def process(cls, params: ProcessParams[MakeSeriesCsvArgs], state: MakeSeriesState, out: OutputCollector) -> None:
        """Emit the next batch of values via the shared make_series helper."""
        _make_series_emit(state, out)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Output schema of the float make_series overload: built with the project's
# schema() helper from a single ``value`` entry typed float64.
MAKE_SERIES_FLOAT_SCHEMA = schema(value=pa.float64())
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dataclass(kw_only=True)
|
|
223
|
+
class MakeSeriesFloatArgs:
|
|
224
|
+
"""Arguments for MakeSeriesFloatFunction."""
|
|
225
|
+
|
|
226
|
+
step: Annotated[float, Arg(0, doc="Step size between values")]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
@dataclass(kw_only=True)
class MakeSeriesFloatState(ArrowSerializableDataclass):
    """Mutable state for the float make_series function."""

    # Every value of the series; each instance gets its own fresh empty list by default.
    values: list[float] = field(default_factory=list)
    # Index into ``values`` of the next value to emit.
    offset: int = 0
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@init_single_worker
@bind_fixed_schema
class MakeSeriesFloatFunction(TableFunctionGenerator[MakeSeriesFloatArgs, MakeSeriesFloatState]):
    """Generate 10 float values: 0.0, step, 2*step, ..., 9*step.

    Example:
        SELECT * FROM make_series(0.5)
        Returns: 0.0, 0.5, 1.0, ..., 4.5

    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = MAKE_SERIES_FLOAT_SCHEMA

    class Meta:
        """Function metadata."""

        name = "make_series"
        description = "Generate 10 float values with given step size"

    @classmethod
    def initial_state(cls, params: ProcessParams[MakeSeriesFloatArgs]) -> MakeSeriesFloatState:
        """Materialize the ten multiples ``0*step .. 9*step`` up front."""
        step = params.args.step
        multiples = [i * step for i in range(10)]
        return MakeSeriesFloatState(values=multiples)

    @classmethod
    def process(
        cls, params: ProcessParams[MakeSeriesFloatArgs], state: MakeSeriesFloatState, out: OutputCollector
    ) -> None:
        """Emit the next slice of float values, or finish once all are sent.

        At most 1024 values go out per call. ``state.offset`` is advanced
        before the batch is handed to ``out.emit``.
        """
        total = len(state.values)
        begin = state.offset
        if begin >= total:
            out.finish()
            return

        stop = min(begin + 1024, total)
        window = state.values[begin:stop]
        state.offset = stop
        out.emit(pa.RecordBatch.from_pydict({"value": window}, schema=MAKE_SERIES_FLOAT_SCHEMA))
|