vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,710 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Sequence-style table generators (sequence, double_sequence, partitioned_sequence, etc.)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import struct
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Annotated, Any, ClassVar
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pyarrow as pa
|
|
13
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
14
|
+
from vgi_rpc.rpc import OutputCollector
|
|
15
|
+
|
|
16
|
+
from vgi._test_fixtures.table._common import (
|
|
17
|
+
CountBatchArgs,
|
|
18
|
+
CountdownState,
|
|
19
|
+
_BaseSequenceFunction,
|
|
20
|
+
_cardinality_from_count,
|
|
21
|
+
)
|
|
22
|
+
from vgi.arguments import Arg
|
|
23
|
+
from vgi.catalog.catalog_interface import ColumnStatistics
|
|
24
|
+
from vgi.invocation import BindResponse, GlobalInitResponse
|
|
25
|
+
from vgi.metadata import FunctionExample
|
|
26
|
+
from vgi.schema_utils import schema
|
|
27
|
+
from vgi.table_function import (
|
|
28
|
+
BindParams,
|
|
29
|
+
InitParams,
|
|
30
|
+
ProcessParams,
|
|
31
|
+
TableCardinality,
|
|
32
|
+
TableFunctionGenerator,
|
|
33
|
+
bind_fixed_schema,
|
|
34
|
+
init_single_worker,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class SequenceFunctionArgs(CountBatchArgs):
    """Argument set for ``SequenceFunction``.

    Inherits everything declared on ``CountBatchArgs`` (defined in
    ``_common``; not visible here) and adds one named option.
    """

    # Named ``increment`` option: declared default is 1, constrained with ge=1.
    increment: Annotated[
        int,
        Arg("increment", default=1, doc="Step between values", ge=1),
    ]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class SequenceFunction(_BaseSequenceFunction):
    """Generates a sequence of integers from 0 to n-1 with optional increment.

    USE CASE
    --------
    Produce test data, row numbers, or a fixed run of integers to join or
    filter against. Setting ``increment`` spaces the values out, giving
    sequences such as 0, 2, 4, 6, ... or 0, 10, 20, 30, ...

    SCHEMA
    ------
    Output: {"n": int64}

    Example:
    -------
        SELECT * FROM sequence(5)
        Returns: [{"n": 0}, {"n": 1}, {"n": 2}, {"n": 3}, {"n": 4}]

        SELECT * FROM sequence(5, increment := 2)
        Returns: [{"n": 0}, {"n": 2}, {"n": 4}, {"n": 6}, {"n": 8}]

        SELECT * FROM sequence(1000, batch_size := 100)
        Returns: integers 0-999 in batches of 100 rows each

    """

    # Argument dataclass describing what callers may pass.
    FunctionArguments = SequenceFunctionArgs

    class Meta:
        """Catalog metadata for SequenceFunction."""

        name = "sequence"
        description = "Generates a sequence of integers from 0 to n-1"
        categories = ["generator", "utility"]
        tags = {"category": "generator", "type": "utility"}
        projection_pushdown = True
        filter_pushdown = True
        auto_apply_filters = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM sequence(10)",
                description="Generate integers 0-9",
            ),
            FunctionExample(
                sql="SELECT * FROM sequence(1000, batch_size := 100)",
                description="Generate integers 0-999 in batches of 100",
            ),
            FunctionExample(
                sql="SELECT * FROM sequence(5, batch_size := 10000, increment := 10)",
                description="Generate 0, 10, 20, 30, 40",
            ),
        ]

    # Complete (un-projected) output schema: a single int64 column ``n``.
    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())
    # NumPy scalar type paired with the column above; presumably consumed by
    # ``_BaseSequenceFunction`` when it builds values (base not visible here).
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.int64
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass(slots=True, frozen=True)
|
|
107
|
+
class NamedParamsEchoFunctionArgs:
|
|
108
|
+
"""Arguments for NamedParamsEchoFunction.
|
|
109
|
+
|
|
110
|
+
Note: keeps its own ``count`` (no ``batch_size``) because the function
|
|
111
|
+
uses a fixed ``BATCH_SIZE_FALLBACK``. Subclassing CountBatchArgs would
|
|
112
|
+
expose a user knob this fixture intentionally hides.
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
|
|
116
|
+
greeting: Annotated[str, Arg("greeting", default="hello", doc="Greeting text echoed in output")]
|
|
117
|
+
multiplier: Annotated[int, Arg("multiplier", default=1, doc="Multiplier for value column")]
|
|
118
|
+
scale: Annotated[float, Arg("scale", default=1.0, doc="Scale factor for float_value column")]
|
|
119
|
+
enabled: Annotated[bool, Arg("enabled", default=True, doc="Boolean echoed in output")]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class NamedParamsEchoFunction(_BaseSequenceFunction):
    """Echoes named parameter values directly in output columns.

    USE CASE
    --------
    Verifies that named parameters of several types (VARCHAR, BIGINT, DOUBLE,
    BOOLEAN) travel correctly from DuckDB to the worker. Every named
    parameter is written straight into an output column, so a test can
    assert on the values it passed in.

    SCHEMA
    ------
    Output: {"id": int64, "greeting": string, "value": int64, "float_value": float64, "enabled": bool}

    Example:
    -------
        SELECT * FROM named_params_echo(3)
        Returns: rows with id=0..2, greeting='hello', value=id*1, float_value=id*1.0, enabled=true

        SELECT * FROM named_params_echo(3, greeting := 'hi', multiplier := 10)
        Returns: rows with id=0..2, greeting='hi', value=id*10, float_value=id*1.0, enabled=true

    """

    # Argument dataclass describing what callers may pass.
    FunctionArguments = NamedParamsEchoFunctionArgs

    class Meta:
        """Catalog metadata for NamedParamsEchoFunction."""

        name = "named_params_echo"
        description = "Echoes named parameter values in output columns"
        categories = ["generator", "testing"]
        tags = {"category": "testing", "type": "params"}
        examples = [
            FunctionExample(
                sql="SELECT * FROM named_params_echo(3)",
                description="Echo default parameter values for 3 rows",
            ),
            FunctionExample(
                sql="SELECT * FROM named_params_echo(3, greeting := 'hi', multiplier := 10)",
                description="Echo custom greeting and multiplier",
            ),
        ]

    # Output schema: a row id plus one column per echoed parameter.
    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(
        {
            "id": pa.int64(),
            "greeting": pa.string(),
            "value": pa.int64(),
            "float_value": pa.float64(),
            "enabled": pa.bool_(),
        }
    )

    @classmethod
    def statistics(cls, params: BindParams[NamedParamsEchoFunctionArgs]) -> list[ColumnStatistics] | None:
        """Report no column statistics, overriding the inherited implementation."""
        return None

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[NamedParamsEchoFunctionArgs],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Emit ``size`` rows with ids ``start .. start + size - 1``.

        ``greeting`` and ``enabled`` are repeated unchanged on every row;
        ``value`` and ``float_value`` are the row id multiplied by
        ``multiplier`` and ``scale`` respectively.
        """
        args = params.args
        row_ids = list(range(start, start + size))
        scaled_ints = [row_id * args.multiplier for row_id in row_ids]
        scaled_floats = [row_id * args.scale for row_id in row_ids]

        columns: dict[str, list[int] | list[str] | list[float] | list[bool]] = {
            "id": row_ids,
            "greeting": [args.greeting] * size,
            "value": scaled_ints,
            "float_value": scaled_floats,
            "enabled": [args.enabled] * size,
        }
        batch = pa.RecordBatch.from_pydict(columns, schema=params.output_schema)
        out.emit(batch)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@dataclass(frozen=True)
class NestedSequenceFunctionArguments(CountBatchArgs):
    """Argument set for ``NestedSequenceFunction``.

    Inherits everything declared on ``CountBatchArgs`` (not visible here)
    and adds the size of the per-row ``history`` window.
    """

    # Named ``history_size`` option: declared default is 20, constrained with ge=1.
    history_size: Annotated[
        int,
        Arg("history_size", default=20, doc="Max items in history list", ge=1),
    ]
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class NestedSequenceFunction(_BaseSequenceFunction):
    """Generates a sequence with nested struct and list columns.

    USE CASE
    --------
    Exercise filter pushdown against complex types (structs and lists). Each
    generated row carries:
    - n: sequence index (0 to count-1)
    - metadata: struct with {index: int64, label: string}
    - history: list of the most recent sequence values up to and including
      ``n``, at most ``history_size`` of them (20 by default)

    SCHEMA
    ------
    Output: {
        "n": int64,
        "metadata": struct<index: int64, label: string>,
        "history": list<int64>
    }

    Example:
    -------
        SELECT * FROM nested_sequence(5)
        Returns rows with n=0..4, metadata structs, and history lists

        SELECT * FROM nested_sequence(100) WHERE n >= 50
        Test filter pushdown on the sequence column

        SELECT metadata.index FROM nested_sequence(10)
        Test projection pushdown with struct field access

    """

    class Meta:
        """Catalog metadata for NestedSequenceFunction."""

        name = "nested_sequence"
        description = "Generates a sequence with nested struct and list columns"
        categories = ["generator", "utility", "testing"]
        tags = {"category": "generator", "type": "testing"}
        projection_pushdown = True
        filter_pushdown = True
        auto_apply_filters = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM nested_sequence(10)",
                description="Generate 10 rows with nested columns",
            ),
            FunctionExample(
                sql="SELECT n, metadata FROM nested_sequence(100) WHERE n >= 50",
                description="Filter and project nested sequence",
            ),
        ]

    # Argument dataclass describing what callers may pass.
    FunctionArguments = NestedSequenceFunctionArguments

    # Complete (un-projected) output schema.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("n", pa.int64()),
            pa.field(
                "metadata",
                pa.struct([("index", pa.int64()), ("label", pa.string())]),
            ),
            pa.field("history", pa.list_(pa.int64())),
        ]
    )

    @classmethod
    def statistics(cls, params: BindParams[NestedSequenceFunctionArguments]) -> list[ColumnStatistics] | None:
        """Report no column statistics, overriding the inherited implementation."""
        return None

    @classmethod
    def _get_projected_column_names(cls, projection_ids: list[int] | None) -> set[str]:
        """Return the names of the columns that have to be generated.

        ``projection_ids`` are positions into ``FIXED_SCHEMA``; ``None``
        selects every column of the schema.
        """
        full_schema = cls.FIXED_SCHEMA
        if projection_ids is None:
            return {column.name for column in full_schema}
        return {full_schema.field(position).name for position in projection_ids}

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[NestedSequenceFunctionArguments],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Emit rows ``start .. start + size - 1``, building only projected columns."""
        assert params.init_call is not None
        wanted = cls._get_projected_column_names(params.init_call.projection_ids)
        row_numbers = list(range(start, start + size))
        columns: dict[str, Any] = {}

        if "n" in wanted:
            columns["n"] = row_numbers

        if "metadata" in wanted:
            columns["metadata"] = [{"index": n, "label": f"row_{n}"} for n in row_numbers]

        if "history" in wanted:
            depth = params.args.history_size
            # Window of at most ``depth`` values ending at n, clamped at 0.
            columns["history"] = [list(range(max(0, n - depth + 1), n + 1)) for n in row_numbers]

        batch = pa.RecordBatch.from_pydict(columns, schema=params.output_schema)
        out.emit(batch)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
@dataclass(frozen=True)
class DoubleSequenceFunctionArguments(CountBatchArgs):
    """Argument set for ``DoubleSequenceFunction``.

    Inherits everything declared on ``CountBatchArgs`` (not visible here)
    and adds a floating-point step.
    """

    # Named ``increment`` option: declared default is 1.0, constrained with gt=0.0.
    increment: Annotated[
        float,
        Arg("increment", default=1.0, doc="Step between values", gt=0.0),
    ]
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DoubleSequenceFunction(_BaseSequenceFunction):
    """Generates a sequence of floats from 0.0 to n-1 with optional increment.

    USE CASE
    --------
    Produce floating-point test data, or sequences suited to interpolation
    or sampling. Setting ``increment`` spaces the values out, giving
    sequences such as 0.0, 0.5, 1.0, 1.5, ... or 0.0, 0.1, 0.2, 0.3, ...

    SCHEMA
    ------
    Output: {"n": float64}

    Example:
    -------
        SELECT * FROM double_sequence(5)
        Returns: [{"n": 0.0}, {"n": 1.0}, {"n": 2.0}, {"n": 3.0}, {"n": 4.0}]

        SELECT * FROM double_sequence(5, increment := 0.5)
        Returns: [{"n": 0.0}, {"n": 0.5}, {"n": 1.0}, {"n": 1.5}, {"n": 2.0}]

        SELECT * FROM double_sequence(1000, batch_size := 100)
        Returns: floats 0.0-999.0 in batches of 100 rows each

    """

    # Argument dataclass describing what callers may pass.
    FunctionArguments = DoubleSequenceFunctionArguments

    class Meta:
        """Catalog metadata for DoubleSequenceFunction."""

        name = "double_sequence"
        description = "Generates a sequence of floating-point numbers from 0 to n-1"
        categories = ["generator", "utility"]
        tags = {"category": "generator", "type": "utility"}
        examples = [
            FunctionExample(
                sql="SELECT * FROM double_sequence(10)",
                description="Generate floats 0.0-9.0",
            ),
            FunctionExample(
                sql="SELECT * FROM double_sequence(1000, batch_size := 100)",
                description="Generate floats 0.0-999.0 in batches of 100",
            ),
            FunctionExample(
                sql="SELECT * FROM double_sequence(5, increment := 0.5)",
                description="Generate 0.0, 0.5, 1.0, 1.5, 2.0",
            ),
        ]

    # Complete output schema: a single float64 column ``n``.
    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.float64())
    # NumPy / Arrow types paired with the column above; presumably consumed by
    # ``_BaseSequenceFunction`` for value generation and statistics (base not
    # visible here).
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.float64
    STATS_ARROW_TYPE: ClassVar[pa.DataType] = pa.float64()
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
@dataclass(slots=True, frozen=True)
|
|
391
|
+
class PartitionedSequenceFunctionArguments:
|
|
392
|
+
"""Arguments for PartitionedSequenceFunction."""
|
|
393
|
+
|
|
394
|
+
count: Annotated[int, Arg(0, doc="Total number of integers to generate", ge=0)]
|
|
395
|
+
increment: Annotated[int, Arg("increment", default=1, doc="Step between values", ge=1)]
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
@dataclass(kw_only=True)
class PartitionedSequenceState(ArrowSerializableDataclass):
    """Per-worker progress through the work chunk currently being emitted."""

    # Bounds of the chunk most recently popped from the queue, as a half-open
    # index range [current_start, current_end); both None until the first pop.
    current_start: int | None = None
    current_end: int | None = None
    # Next sequence index to emit within the current chunk.
    current_idx: int = 0
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedSequenceFunction(
    TableFunctionGenerator[PartitionedSequenceFunctionArguments, PartitionedSequenceState]
):
    """Generates a partitioned sequence of integers for multi-worker execution.

    USE CASE
    --------
    Produce a sequence of values through a work-queue pattern. During
    initialization the primary worker fills a queue with work chunks. Every
    worker (the primary included) then pulls chunks from that queue and
    generates the output for them.

    Because work is pulled rather than pre-assigned, all of it still gets
    done when fewer workers start than expected.

    SCHEMA
    ------
    Output: {"n": int64}

    PARALLELIZATION
    ---------------
    Fully parallelizable using a shared work queue. Each worker pulls chunks
    atomically from the queue and generates values for that chunk.

    The union of all workers' output produces the complete sequence.

    Example:
    -------
        With count=3000 and MAX_PARTITIONS=24 (chunk = ceil(3000/24) = 125):
        Queue is populated with 24 items: [(0, 125), (125, 250), ...].
        Workers pull chunks and generate values for each range.
        Combined output: [0, 1, 2, ..., 2999]

        With count=5 and increment=10:
        Combined output: [0, 10, 20, 30, 40]

    """

    class Meta:
        """Catalog metadata for PartitionedSequenceFunction."""

        name = "partitioned_sequence"
        description = "Generates a partitioned sequence for multi-worker execution"
        categories = ["generator", "utility"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_sequence(100)",
                description="Generate 0-99 in parallel across workers",
            ),
            FunctionExample(
                sql="SELECT * FROM partitioned_sequence(5, increment := 10)",
                description="Generate 0, 10, 20, 30, 40 in parallel",
            ),
        ]

    # Keep the work queue at roughly MAX_PARTITIONS items whatever the count is,
    # by making each chunk ceil(count / MAX_PARTITIONS) long. The queue is
    # drained one item per round-trip and serialized at the per-attach DO, so it
    # is the *number* of partitions that drives remote cost. No single fixed
    # chunk size suits both a large query and a small distribution query
    # (chunks that are too large fold the small query into one partition and
    # kill fan-out); capping the partition count keeps ~24 partitions at any
    # scale. Every work item is a fixed-size (start, end) range — rows are
    # generated locally and emitted in BATCH_SIZE batches — so this only
    # changes the *count* of tiny pops, never any HTTP body size. The output is
    # the plain sequence (partition-independent), so assertions are unchanged.
    MAX_PARTITIONS: ClassVar[int] = 24
    # Maximum rows per emitted batch within a chunk.
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def on_init(
        cls,
        params: InitParams[PartitionedSequenceFunctionArguments],
    ) -> GlobalInitResponse:
        """Fill the storage queue with one packed ``(start, end)`` item per chunk."""
        total = params.args.count
        # ceil(total / MAX_PARTITIONS), but never below one index per chunk.
        whole, leftover = divmod(total, cls.MAX_PARTITIONS)
        chunk = max(1, whole + (1 if leftover else 0))

        # Each item is two big-endian unsigned 64-bit integers: (start_idx, end_idx).
        work_items: list[bytes] = [
            struct.pack(">QQ", lower, min(lower + chunk, total)) for lower in range(0, total, chunk)
        ]

        # Push even an empty list so the invocation is always registered.
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[PartitionedSequenceFunctionArguments]) -> PartitionedSequenceState:
        """Return a fresh state with no chunk claimed yet."""
        return PartitionedSequenceState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[PartitionedSequenceFunctionArguments],
        state: PartitionedSequenceState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch, claiming a new chunk from the queue when needed.

        Finishes the output once the queue has no more work items.
        """
        chunk_exhausted = state.current_start is None or state.current_idx >= (state.current_end or 0)
        if chunk_exhausted:
            payload = params.storage.queue_pop()
            if payload is None:
                # Queue drained: nothing left for this worker to do.
                out.finish()
                return
            chunk_start, chunk_end = struct.unpack(">QQ", payload)
            state.current_start = chunk_start
            state.current_end = chunk_end
            state.current_idx = chunk_start

        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        step = params.args.increment
        values = [index * step for index in range(state.current_idx, stop)]

        batch = pa.RecordBatch.from_pydict({"n": values}, schema=params.output_schema)
        out.emit(batch)

        state.current_idx = stop
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
@dataclass(slots=True, frozen=True)
|
|
529
|
+
class TenThousandFunctionArguments:
|
|
530
|
+
"""Arguments for TenThousandFunction."""
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
@dataclass(kw_only=True)
class TenThousandState(ArrowSerializableDataclass):
    """Progress cursor for TenThousandFunction."""

    # First value of the next batch to emit.
    start: int = 0
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
@init_single_worker
@bind_fixed_schema
class TenThousandFunction(TableFunctionGenerator[TenThousandFunctionArguments, TenThousandState]):
    """Generates 10000 rows with integers from 0 to 9999.

    USE CASE
    --------
    A fixed-size test data generator. Handy for tests and benchmarks that
    should not have to pass any parameters.

    SCHEMA
    ------
    Output: {"n": int64}

    Example:
    -------
        SELECT * FROM ten_thousand()
        Returns: [{"n": 0}, {"n": 1}, ..., {"n": 9999}]

    """

    class Meta:
        """Catalog metadata for TenThousandFunction."""

        name = "ten_thousand"
        description = "Generates 10000 integers from 0 to 9999"
        categories = ["generator", "utility"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM ten_thousand()",
                description="Generate integers 0-9999",
            ),
        ]

    # Maximum rows per emitted batch.
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def cardinality(cls, params: BindParams[TenThousandFunctionArguments]) -> TableCardinality:
        """Report the row count, which is always exactly 10000."""
        return TableCardinality(estimate=10000, max=10000)

    @classmethod
    def initial_state(cls, params: ProcessParams[TenThousandFunctionArguments]) -> TenThousandState:
        """Return a fresh state positioned at 0."""
        return TenThousandState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[TenThousandFunctionArguments],
        state: TenThousandState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of at most ``BATCH_SIZE`` integers, finishing at 10000."""
        total = 10000
        begin = state.start
        if begin >= total:
            out.finish()
            return

        stop = min(begin + cls.BATCH_SIZE, total)
        column = np.arange(begin, stop, dtype=np.int64)
        batch = pa.RecordBatch.from_pydict({"n": column}, schema=params.output_schema)
        out.emit(batch)
        state.start = stop
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
@dataclass(slots=True, frozen=True)
|
|
607
|
+
class RowIdSequenceFunctionArgs:
|
|
608
|
+
"""Arguments for RowIdSequenceFunction."""
|
|
609
|
+
|
|
610
|
+
count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
|
|
611
|
+
layout: Annotated[
|
|
612
|
+
str,
|
|
613
|
+
Arg(
|
|
614
|
+
"layout",
|
|
615
|
+
default="first",
|
|
616
|
+
doc="Row ID column position",
|
|
617
|
+
choices=("first", "middle", "last"),
|
|
618
|
+
),
|
|
619
|
+
]
|
|
620
|
+
row_id_type: Annotated[
|
|
621
|
+
str,
|
|
622
|
+
Arg(
|
|
623
|
+
"row_id_type",
|
|
624
|
+
default="int64",
|
|
625
|
+
doc="Row ID type",
|
|
626
|
+
choices=("int64", "string", "struct"),
|
|
627
|
+
),
|
|
628
|
+
]
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
@init_single_worker
|
|
632
|
+
class RowIdSequenceFunction(_BaseSequenceFunction):
|
|
633
|
+
"""Generates a sequence with a row_id column for testing row_id support.
|
|
634
|
+
|
|
635
|
+
The layout argument controls where the row_id column appears in the schema,
|
|
636
|
+
and row_id_type controls the type of the row_id column.
|
|
637
|
+
|
|
638
|
+
"""
|
|
639
|
+
|
|
640
|
+
FunctionArguments = RowIdSequenceFunctionArgs
|
|
641
|
+
|
|
642
|
+
class Meta:
|
|
643
|
+
"""Metadata for RowIdSequenceFunction."""
|
|
644
|
+
|
|
645
|
+
name = "rowid_sequence"
|
|
646
|
+
description = "Sequence with row_id column"
|
|
647
|
+
projection_pushdown = True
|
|
648
|
+
|
|
649
|
+
@classmethod
|
|
650
|
+
def statistics(cls, params: BindParams[RowIdSequenceFunctionArgs]) -> list[ColumnStatistics] | None:
|
|
651
|
+
"""Skip the base ``int64`` arange stats — schema is dynamic per-args here."""
|
|
652
|
+
return None
|
|
653
|
+
|
|
654
|
+
@classmethod
|
|
655
|
+
def on_bind(cls, params: BindParams[RowIdSequenceFunctionArgs]) -> BindResponse:
|
|
656
|
+
"""Build schema with is_row_id metadata on the appropriate field."""
|
|
657
|
+
layout = params.args.layout
|
|
658
|
+
row_id_type = params.args.row_id_type
|
|
659
|
+
|
|
660
|
+
# Build the row_id field with is_row_id metadata
|
|
661
|
+
rid_metadata = {b"is_row_id": b""}
|
|
662
|
+
rid_field: pa.Field[Any]
|
|
663
|
+
if row_id_type == "string":
|
|
664
|
+
rid_field = pa.field("row_id", pa.string(), metadata=rid_metadata)
|
|
665
|
+
elif row_id_type == "struct":
|
|
666
|
+
rid_field = pa.field(
|
|
667
|
+
"row_id",
|
|
668
|
+
pa.struct([("a", pa.int64()), ("b", pa.string())]),
|
|
669
|
+
metadata=rid_metadata,
|
|
670
|
+
)
|
|
671
|
+
else: # int64
|
|
672
|
+
rid_field = pa.field("row_id", pa.int64(), metadata=rid_metadata)
|
|
673
|
+
|
|
674
|
+
name_field = pa.field("name", pa.string())
|
|
675
|
+
value_field = pa.field("value", pa.string())
|
|
676
|
+
|
|
677
|
+
if layout == "middle":
|
|
678
|
+
fields = [name_field, rid_field, value_field]
|
|
679
|
+
elif layout == "last":
|
|
680
|
+
fields = [name_field, value_field, rid_field]
|
|
681
|
+
else: # first
|
|
682
|
+
fields = [rid_field, name_field, value_field]
|
|
683
|
+
|
|
684
|
+
return BindResponse(output_schema=pa.schema(fields))
|
|
685
|
+
|
|
686
|
+
@classmethod
|
|
687
|
+
def _emit_chunk(
|
|
688
|
+
cls,
|
|
689
|
+
params: ProcessParams[RowIdSequenceFunctionArgs],
|
|
690
|
+
state: CountdownState,
|
|
691
|
+
out: OutputCollector,
|
|
692
|
+
start: int,
|
|
693
|
+
size: int,
|
|
694
|
+
) -> None:
|
|
695
|
+
"""Emit a batch of row_id + data columns matching the dynamic output schema."""
|
|
696
|
+
columns: dict[str, Any] = {}
|
|
697
|
+
for f in params.output_schema:
|
|
698
|
+
if f.name == "row_id":
|
|
699
|
+
if pa.types.is_string(f.type):
|
|
700
|
+
columns["row_id"] = [f"rid_{i}" for i in range(start, start + size)]
|
|
701
|
+
elif pa.types.is_struct(f.type):
|
|
702
|
+
columns["row_id"] = [{"a": i, "b": f"s_{i}"} for i in range(start, start + size)]
|
|
703
|
+
else:
|
|
704
|
+
columns["row_id"] = list(range(start, start + size))
|
|
705
|
+
elif f.name == "name":
|
|
706
|
+
columns["name"] = [f"item_{i}" for i in range(start, start + size)]
|
|
707
|
+
elif f.name == "value":
|
|
708
|
+
columns["value"] = [f"val_{i}" for i in range(start, start + size)]
|
|
709
|
+
|
|
710
|
+
out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))
|