vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Misc fixtures: GeneratorException, LoggingGenerator, ProjectedData, OrderEcho, SampleEcho."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Annotated, ClassVar
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass, Transient
|
|
12
|
+
from vgi_rpc.log import Level
|
|
13
|
+
from vgi_rpc.rpc import OutputCollector
|
|
14
|
+
|
|
15
|
+
from vgi._test_fixtures.table._common import (
|
|
16
|
+
CountdownState,
|
|
17
|
+
_cardinality_from_count,
|
|
18
|
+
)
|
|
19
|
+
from vgi.arguments import Arg
|
|
20
|
+
from vgi.metadata import FunctionExample
|
|
21
|
+
from vgi.schema_utils import schema
|
|
22
|
+
from vgi.table_function import (
|
|
23
|
+
ProcessParams,
|
|
24
|
+
TableFunctionGenerator,
|
|
25
|
+
bind_fixed_schema,
|
|
26
|
+
init_single_worker,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(slots=True, frozen=True)
class GeneratorExceptionFunctionArguments:
    """Immutable argument record for GeneratorExceptionFunction."""

    # Declared with ``Arg(0, ...)`` and constrained to ``>= 0``. The function's
    # ``process`` raises once its batch counter reaches this value.
    fail_after: Annotated[int, Arg(0, doc="Number of batches before failure", ge=0)]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(kw_only=True)
class GeneratorExceptionState(ArrowSerializableDataclass):
    """Progress counter carried between ``process`` calls of GeneratorExceptionFunction."""

    # Number of batches emitted so far; the same value is written to column "n".
    batch_count: int = 0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@init_single_worker
@bind_fixed_schema
class GeneratorExceptionFunction(TableFunctionGenerator[GeneratorExceptionFunctionArguments, GeneratorExceptionState]):
    """Function that emits a few batches and then deliberately raises.

    USE CASE
    --------
    Testing exception handling in the generator protocol.

    SCHEMA
    ------
    Output: {"n": int64}

    Every ``process`` call emits a single-row batch holding the running batch
    counter. When the counter has reached ``fail_after`` the call raises
    ``ValueError`` instead; ``process`` never calls ``out.finish()``.

    """

    class Meta:
        """Metadata for GeneratorExceptionFunction."""

        name = "generator_exception"
        description = "Raises an exception after N batches for testing"
        categories = ["testing"]
        tags = {"category": "testing", "type": "error-handling"}

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def initial_state(cls, params: ProcessParams[GeneratorExceptionFunctionArguments]) -> GeneratorExceptionState:
        """Return a fresh state whose batch counter starts at zero."""
        return GeneratorExceptionState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[GeneratorExceptionFunctionArguments],
        state: GeneratorExceptionState,
        out: OutputCollector,
    ) -> None:
        """Emit one single-row batch, or raise once ``fail_after`` batches were produced.

        Raises:
            ValueError: when ``state.batch_count`` has reached ``params.args.fail_after``.

        """
        emitted_so_far = state.batch_count
        if emitted_so_far >= params.args.fail_after:
            raise ValueError(f"Intentional failure after {params.args.fail_after} batches")

        # The batch carries the counter value as its only row.
        batch = pa.RecordBatch.from_pydict({"n": [emitted_so_far]}, schema=params.output_schema)
        out.emit(batch)
        state.batch_count += 1
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass(slots=True, frozen=True)
class LoggingGeneratorFunctionArguments:
    """Immutable argument record for LoggingGeneratorFunction."""

    # Declared with ``Arg(0, ...)`` and constrained to ``>= 0``. Generation stops
    # once the state's index reaches this value.
    count: Annotated[int, Arg(0, doc="Number of values to generate", ge=0)]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(kw_only=True)
class LoggingGeneratorState(ArrowSerializableDataclass):
    """Cursor carried between ``process`` calls of LoggingGeneratorFunction."""

    # Next value to emit in column "n"; starts at 0 and grows by one per batch.
    index: int = 0
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@init_single_worker
@bind_fixed_schema
class LoggingGeneratorFunction(TableFunctionGenerator[LoggingGeneratorFunctionArguments, LoggingGeneratorState]):
    """Function that sends client log messages while generating values.

    USE CASE
    --------
    Testing log message handling in the generator protocol.

    SCHEMA
    ------
    Output: {"n": int64}

    One INFO message is logged on the first ``process`` call and one more
    when generation completes; in between, each call emits a single-row batch.

    """

    class Meta:
        """Metadata for LoggingGeneratorFunction."""

        name = "logging_generator"
        description = "Emits log messages during generation"
        categories = ["testing"]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def initial_state(cls, params: ProcessParams[LoggingGeneratorFunctionArguments]) -> LoggingGeneratorState:
        """Return a fresh state positioned at index zero."""
        return LoggingGeneratorState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[LoggingGeneratorFunctionArguments],
        state: LoggingGeneratorState,
        out: OutputCollector,
    ) -> None:
        """Emit the next value, logging at the start and at the end of generation."""
        position = state.index

        # The very first call announces how many values will follow.
        if position == 0:
            out.client_log(Level.INFO, f"Starting generation of {params.args.count} values")

        if position < params.args.count:
            out.emit(pa.RecordBatch.from_pydict({"n": [position]}, schema=params.output_schema))
            state.index += 1
            return

        # All requested values were produced: log completion and end the stream.
        out.client_log(Level.INFO, "Generation complete")
        out.finish()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass(slots=True, frozen=True)
class ProjectedDataFunctionArguments:
    """Immutable argument record for ProjectedDataFunction."""

    # Declared with ``Arg(0, ...)`` and constrained to ``>= 0``. Used as the
    # initial ``remaining`` value of the function's countdown state.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@init_single_worker
|
|
160
|
+
@bind_fixed_schema
|
|
161
|
+
@_cardinality_from_count
|
|
162
|
+
class ProjectedDataFunction(TableFunctionGenerator[ProjectedDataFunctionArguments, CountdownState]):
|
|
163
|
+
"""Generates data with 4 columns, supporting projection pushdown.
|
|
164
|
+
|
|
165
|
+
USE CASE
|
|
166
|
+
--------
|
|
167
|
+
Demonstrates projection pushdown where the function only computes
|
|
168
|
+
columns that are actually requested. This is useful for expensive
|
|
169
|
+
column computations that can be skipped if the column isn't needed.
|
|
170
|
+
|
|
171
|
+
SCHEMA
|
|
172
|
+
------
|
|
173
|
+
Full output: {"id": int64, "name": string, "value": float64, "extra": int64}
|
|
174
|
+
With projection, only the projected columns are included.
|
|
175
|
+
|
|
176
|
+
Example:
|
|
177
|
+
-------
|
|
178
|
+
SELECT id, value FROM projected_data(10) -- Only computes id and value
|
|
179
|
+
Returns: 10 rows with id and value columns only
|
|
180
|
+
|
|
181
|
+
"""
|
|
182
|
+
|
|
183
|
+
class Meta:
|
|
184
|
+
"""Metadata for ProjectedDataFunction."""
|
|
185
|
+
|
|
186
|
+
name = "projected_data"
|
|
187
|
+
description = "Generates data with 4 columns, supporting projection pushdown"
|
|
188
|
+
categories = ["generator", "utility"]
|
|
189
|
+
projection_pushdown = True
|
|
190
|
+
examples = [
|
|
191
|
+
FunctionExample(
|
|
192
|
+
sql="SELECT * FROM projected_data(10)",
|
|
193
|
+
description="Generate 10 rows with all 4 columns",
|
|
194
|
+
),
|
|
195
|
+
FunctionExample(
|
|
196
|
+
sql="SELECT id, value FROM projected_data(10)",
|
|
197
|
+
description="Generate 10 rows with only id and value columns",
|
|
198
|
+
),
|
|
199
|
+
]
|
|
200
|
+
|
|
201
|
+
# Full schema with all 4 columns
|
|
202
|
+
FIXED_SCHEMA: ClassVar[pa.Schema] = schema(
|
|
203
|
+
{
|
|
204
|
+
"id": pa.int64(),
|
|
205
|
+
"name": pa.string(),
|
|
206
|
+
"value": pa.float64(),
|
|
207
|
+
"extra": pa.int64(),
|
|
208
|
+
}
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
BATCH_SIZE: ClassVar[int] = 1000
|
|
212
|
+
|
|
213
|
+
@classmethod
|
|
214
|
+
def _get_projected_column_indices(cls, projection_ids: list[int] | None) -> list[int]:
|
|
215
|
+
"""Get the column indices to generate.
|
|
216
|
+
|
|
217
|
+
Returns indices from projection_ids if set, otherwise all columns.
|
|
218
|
+
"""
|
|
219
|
+
if projection_ids is not None:
|
|
220
|
+
return projection_ids
|
|
221
|
+
return list(range(len(cls.FIXED_SCHEMA)))
|
|
222
|
+
|
|
223
|
+
@classmethod
|
|
224
|
+
def initial_state(cls, params: ProcessParams[ProjectedDataFunctionArguments]) -> CountdownState:
|
|
225
|
+
"""Create initial state with remaining count."""
|
|
226
|
+
return CountdownState(remaining=params.args.count)
|
|
227
|
+
|
|
228
|
+
@classmethod
|
|
229
|
+
def process(
|
|
230
|
+
cls,
|
|
231
|
+
params: ProcessParams[ProjectedDataFunctionArguments],
|
|
232
|
+
state: CountdownState,
|
|
233
|
+
out: OutputCollector,
|
|
234
|
+
) -> None:
|
|
235
|
+
"""Generate data for only the projected columns."""
|
|
236
|
+
if state.remaining <= 0:
|
|
237
|
+
out.finish()
|
|
238
|
+
return
|
|
239
|
+
|
|
240
|
+
assert params.init_call is not None
|
|
241
|
+
projected_indices = cls._get_projected_column_indices(params.init_call.projection_ids)
|
|
242
|
+
batch_size = min(state.remaining, cls.BATCH_SIZE)
|
|
243
|
+
|
|
244
|
+
# Only compute columns that are projected
|
|
245
|
+
columns: dict[str, list[int] | list[str] | list[float]] = {}
|
|
246
|
+
|
|
247
|
+
for idx in projected_indices:
|
|
248
|
+
f = cls.FIXED_SCHEMA.field(idx)
|
|
249
|
+
if f.name == "id":
|
|
250
|
+
columns["id"] = list(range(state.current_index, state.current_index + batch_size))
|
|
251
|
+
elif f.name == "name":
|
|
252
|
+
columns["name"] = [f"item_{i}" for i in range(state.current_index, state.current_index + batch_size)]
|
|
253
|
+
elif f.name == "value":
|
|
254
|
+
columns["value"] = [
|
|
255
|
+
float(i) * 1.5 for i in range(state.current_index, state.current_index + batch_size)
|
|
256
|
+
]
|
|
257
|
+
elif f.name == "extra":
|
|
258
|
+
columns["extra"] = [i * i for i in range(state.current_index, state.current_index + batch_size)]
|
|
259
|
+
|
|
260
|
+
out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))
|
|
261
|
+
|
|
262
|
+
state.current_index += batch_size
|
|
263
|
+
state.remaining -= batch_size
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# ============================================================================
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@dataclass(slots=True, frozen=True)
class _OrderEchoArgs:
    """Immutable argument record for OrderEchoFunction."""

    # Declared with ``Arg(0, ...)``, constrained to ``>= 0``; total rows to produce.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Declared under the name "batch_size" with default 2048 and ``>= 1``;
    # caps the number of rows emitted per ``process`` call.
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
@dataclass(kw_only=True)
class _OrderEchoState(ArrowSerializableDataclass):
    """Countdown plus cached ORDER BY hints for OrderEchoFunction.

    The ``order_*`` fields are annotated with ``Transient()``.
    NOTE(review): presumably that excludes them from Arrow serialization --
    confirm against ``vgi_rpc``.
    """

    # Rows still to be emitted (required, no default).
    remaining: int
    # Value of "n" for the first row of the next batch.
    current_index: int = 0
    # Echoed hint values; "(none)" / -1 stand for "no hint received".
    order_column: Annotated[str, Transient()] = "(none)"
    order_direction: Annotated[str, Transient()] = "(none)"
    order_null_order: Annotated[str, Transient()] = "(none)"
    order_limit: Annotated[int, Transient()] = -1
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# Output schema of OrderEchoFunction: two data columns followed by the
# four echoed ORDER BY / LIMIT hint columns.
_ORDER_ECHO_SCHEMA = schema(
    {
        # Generated data.
        "n": pa.int64(),
        "s": pa.utf8(),
        # Echoed pushdown hints.
        "order_column": pa.utf8(),
        "order_direction": pa.utf8(),
        "order_null_order": pa.utf8(),
        "order_limit": pa.int64(),
    }
)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class OrderEchoFunction(TableFunctionGenerator[_OrderEchoArgs, _OrderEchoState]):
    """Echoes ORDER BY + LIMIT pushdown hints in output columns.

    USE CASE
    --------
    Verify that DuckDB's RowGroupPruner optimizer pushes ORDER BY + LIMIT
    hints to VGI table functions via the ``set_scan_order`` callback.
    The order_* columns report the hints that were received. The function
    does NOT apply the order/limit itself -- DuckDB's operators handle that.

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "order_column": string,
             "order_direction": string, "order_null_order": string,
             "order_limit": int64}

    """

    class Meta:
        """Metadata for OrderEchoFunction."""

        name = "order_echo"
        description = "Echoes ORDER BY + LIMIT pushdown hints in output"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM order_echo(100) ORDER BY n LIMIT 5",
                description="See which ORDER BY hint was pushed down",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _ORDER_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_OrderEchoArgs]) -> _OrderEchoState:
        """Build the state, capturing the order hints of the init call.

        Missing hints are stored as ``"(none)"`` (strings) or ``-1`` (limit).
        """
        init = params.init_call
        assert init is not None

        direction = init.order_by_direction
        null_order = init.order_by_null_order
        limit = init.order_by_limit
        return _OrderEchoState(
            remaining=params.args.count,
            order_column=init.order_by_column_name or "(none)",
            order_direction=direction.name if direction else "(none)",
            order_null_order=null_order.name if null_order else "(none)",
            order_limit=limit if limit is not None else -1,
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[_OrderEchoArgs],
        state: _OrderEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of rows, repeating the cached hints on every row."""
        if state.remaining <= 0:
            out.finish()
            return

        size = min(state.remaining, params.args.batch_size)
        first = state.current_index
        n_values = list(range(first, first + size))

        # Hint values are constant for the whole scan; each one fills its column.
        hints = {
            "order_column": state.order_column,
            "order_direction": state.order_direction,
            "order_null_order": state.order_null_order,
            "order_limit": state.order_limit,
        }
        data = {
            "n": n_values,
            "s": [f"row_{i}" for i in n_values],
            **{column: [value] * size for column, value in hints.items()},
        }
        out.emit(pa.RecordBatch.from_pydict(data, schema=params.output_schema))

        state.current_index += size
        state.remaining -= size
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
# ============================================================================
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
@dataclass(slots=True, frozen=True)
class _SampleEchoArgs:
    """Immutable argument record for SampleEchoFunction."""

    # Declared with ``Arg(0, ...)``, constrained to ``>= 0``; total rows to produce.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Declared under the name "batch_size" with default 2048 and ``>= 1``;
    # caps the number of rows emitted per ``process`` call.
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
@dataclass(kw_only=True)
class _SampleEchoState(ArrowSerializableDataclass):
    """Countdown plus cached TABLESAMPLE hints for SampleEchoFunction.

    The ``sample_*`` fields are annotated with ``Transient()``.
    NOTE(review): presumably that excludes them from Arrow serialization --
    confirm against ``vgi_rpc``.
    """

    # Rows still to be emitted (required, no default).
    remaining: int
    # Value of "n" for the first row of the next batch.
    current_index: int = 0
    # Echoed hint values; -1.0 / -1 stand for "no hint received".
    sample_percentage: Annotated[float, Transient()] = -1.0
    sample_seed: Annotated[int, Transient()] = -1
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
# Output schema of SampleEchoFunction: two data columns followed by the
# two echoed TABLESAMPLE hint columns.
_SAMPLE_ECHO_SCHEMA = schema(
    {
        # Generated data.
        "n": pa.int64(),
        "s": pa.utf8(),
        # Echoed pushdown hints.
        "sample_percentage": pa.float64(),
        "sample_seed": pa.int64(),
    }
)
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class SampleEchoFunction(TableFunctionGenerator[_SampleEchoArgs, _SampleEchoState]):
    """Echoes TABLESAMPLE pushdown hints in output columns.

    USE CASE
    --------
    Verify that DuckDB's SamplingPushdown optimizer pushes TABLESAMPLE SYSTEM
    hints to VGI table functions. The sample_* columns report the hints that
    were received. The function does NOT apply sampling itself -- it returns
    all rows so tests can verify the echo values.

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "sample_percentage": float64,
             "sample_seed": int64}

    """

    class Meta:
        """Metadata for SampleEchoFunction."""

        name = "sample_echo"
        description = "Echoes TABLESAMPLE pushdown hints in output"
        categories = ["generator", "diagnostic"]
        projection_pushdown = True
        sampling_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM sample_echo(100) TABLESAMPLE SYSTEM(10%)",
                description="See which TABLESAMPLE hint was pushed down",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _SAMPLE_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_SampleEchoArgs]) -> _SampleEchoState:
        """Build the state, capturing the sampling hints of the init call.

        Missing hints are stored as ``-1.0`` (percentage) or ``-1`` (seed).
        """
        init = params.init_call
        assert init is not None

        percentage = init.tablesample_percentage
        seed = init.tablesample_seed
        return _SampleEchoState(
            remaining=params.args.count,
            sample_percentage=percentage if percentage is not None else -1.0,
            sample_seed=seed if seed is not None else -1,
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[_SampleEchoArgs],
        state: _SampleEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of rows, repeating the cached hints on every row."""
        if state.remaining <= 0:
            out.finish()
            return

        size = min(state.remaining, params.args.batch_size)
        first = state.current_index
        n_values = list(range(first, first + size))

        # Hint values are constant for the whole scan; each one fills its column.
        hints = {
            "sample_percentage": state.sample_percentage,
            "sample_seed": state.sample_seed,
        }
        data = {
            "n": n_values,
            "s": [f"row_{i}" for i in n_values],
            **{column: [value] * size for column, value in hints.items()},
        }
        out.emit(pa.RecordBatch.from_pydict(data, schema=params.output_schema))

        state.current_index += size
        state.remaining -= size
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Multi-worker partitioned sequence fixtures, one per ``OrderPreservation`` mode.
|
|
4
|
+
|
|
5
|
+
These three fixtures are clones of :class:`PartitionedSequenceFunction` (see
|
|
6
|
+
``sequence.py``); the only difference is ``Meta.preserves_order``. They exist
|
|
7
|
+
so SQL integration tests can verify that DuckDB's planner honors each mode
|
|
8
|
+
end-to-end:
|
|
9
|
+
|
|
10
|
+
* ``partitioned_preserves_order`` — ``PRESERVES_ORDER`` → DuckDB ``INSERTION_ORDER``
|
|
11
|
+
* ``partitioned_no_order_guarantee`` — ``NO_ORDER_GUARANTEE`` → DuckDB ``NO_ORDER``
|
|
12
|
+
* ``partitioned_fixed_order`` — ``FIXED_ORDER`` → DuckDB ``FIXED_ORDER``
|
|
13
|
+
|
|
14
|
+
DuckDB serializes the pipeline (single worker) for ``FIXED_ORDER`` and
|
|
15
|
+
parallelizes for the other two. Tests grep ``conn=`` from ``duckdb_logs()``
|
|
16
|
+
to count distinct workers — the same pattern used by
|
|
17
|
+
``partitioned_sequence.test``.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import struct
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from typing import Annotated, ClassVar
|
|
25
|
+
|
|
26
|
+
import pyarrow as pa
|
|
27
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
28
|
+
from vgi_rpc.rpc import OutputCollector
|
|
29
|
+
|
|
30
|
+
from vgi._test_fixtures.table._common import _cardinality_from_count
|
|
31
|
+
from vgi.arguments import Arg
|
|
32
|
+
from vgi.invocation import GlobalInitResponse
|
|
33
|
+
from vgi.metadata import FunctionExample, OrderPreservation
|
|
34
|
+
from vgi.schema_utils import schema
|
|
35
|
+
from vgi.table_function import (
|
|
36
|
+
InitParams,
|
|
37
|
+
ProcessParams,
|
|
38
|
+
TableFunctionGenerator,
|
|
39
|
+
bind_fixed_schema,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(slots=True, frozen=True)
class _OrderModeArgs:
    """Immutable argument record shared by the order-preservation-mode fixtures."""

    # Declared with ``Arg(0, ...)``, constrained to ``>= 0``; the generated
    # values are the integers in ``range(count)``, split into chunks.
    count: Annotated[int, Arg(0, doc="Total number of integers to generate", ge=0)]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(kw_only=True)
class _OrderModeState(ArrowSerializableDataclass):
    """Per-worker cursor over the chunk currently being emitted."""

    # Bounds of the chunk popped from the work queue (start inclusive, end
    # exclusive); both stay None until the first chunk has been claimed.
    current_start: int | None = None
    current_end: int | None = None
    # Next value to emit within the current chunk.
    current_idx: int = 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class _BasePartitionedOrderMode(TableFunctionGenerator[_OrderModeArgs, _OrderModeState]):
    """Common multi-worker work-queue implementation; subclasses only pin ``Meta``.

    The chunk/batch sizing matches ``PartitionedSequenceFunction``: 1k chunks,
    1k-row output batches. The primary worker enqueues all chunks during
    ``on_init``; every worker (including the primary) pulls chunks atomically
    via ``params.storage.queue_pop``.
    """

    # Width of one work item, i.e. of one queued [start, end) range.
    CHUNK_SIZE: ClassVar[int] = 1000
    # Maximum number of rows per emitted record batch.
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def on_init(cls, params: InitParams[_OrderModeArgs]) -> GlobalInitResponse:
        """Split ``range(count)`` into chunks and push them onto the work queue.

        Each work item is a pair of big-endian unsigned 64-bit integers
        ``(start, end)`` with ``end`` exclusive; the last chunk may be short.
        """
        total = params.args.count
        work_items: list[bytes] = [
            struct.pack(">QQ", chunk_start, min(chunk_start + cls.CHUNK_SIZE, total))
            for chunk_start in range(0, total, cls.CHUNK_SIZE)
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_OrderModeArgs]) -> _OrderModeState:
        """Return a state with no chunk claimed yet."""
        return _OrderModeState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_OrderModeArgs],
        state: _OrderModeState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of the current chunk, claiming a new chunk when needed.

        The stream is finished as soon as the work queue yields no more chunks.
        """
        chunk_exhausted = state.current_start is None or state.current_idx >= (state.current_end or 0)
        if chunk_exhausted:
            packed = params.storage.queue_pop()
            if packed is None:
                # Queue drained: nothing left for this worker.
                out.finish()
                return
            chunk_start, chunk_end = struct.unpack(">QQ", packed)
            state.current_start = chunk_start
            state.current_end = chunk_end
            state.current_idx = chunk_start

        # Never run past the end of the claimed chunk.
        batch_stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        rows = {"n": list(range(state.current_idx, batch_stop))}
        out.emit(pa.RecordBatch.from_pydict(rows, schema=params.output_schema))
        state.current_idx = batch_stop
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedPreservesOrderFunction(_BasePartitionedOrderMode):
    """Multi-worker partitioned sequence — ``PRESERVES_ORDER``.

    All behavior comes from ``_BasePartitionedOrderMode``; this subclass only
    supplies ``Meta`` with ``preserves_order=OrderPreservation.PRESERVES_ORDER``.
    """

    class Meta:
        """Metadata for PartitionedPreservesOrderFunction."""

        name = "partitioned_preserves_order"
        description = (
            "Multi-worker partitioned sequence; preserves_order=PRESERVES_ORDER "
            "(maps to DuckDB INSERTION_ORDER)."
        )
        categories = ["generator", "utility"]
        preserves_order = OrderPreservation.PRESERVES_ORDER
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_preserves_order(100)",
                description="Generate 0-99 in parallel; preserves_order=PRESERVES_ORDER",
            ),
        ]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedNoOrderGuaranteeFunction(_BasePartitionedOrderMode):
    """Multi-worker partitioned sequence — ``NO_ORDER_GUARANTEE``.

    All behavior comes from ``_BasePartitionedOrderMode``; this subclass only
    supplies ``Meta`` with ``preserves_order=OrderPreservation.NO_ORDER_GUARANTEE``.
    """

    class Meta:
        """Metadata for PartitionedNoOrderGuaranteeFunction."""

        name = "partitioned_no_order_guarantee"
        description = (
            "Multi-worker partitioned sequence; preserves_order=NO_ORDER_GUARANTEE "
            "(maps to DuckDB NO_ORDER)."
        )
        categories = ["generator", "utility"]
        preserves_order = OrderPreservation.NO_ORDER_GUARANTEE
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_no_order_guarantee(100)",
                description="Generate 0-99 in parallel; preserves_order=NO_ORDER_GUARANTEE",
            ),
        ]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedFixedOrderFunction(_BasePartitionedOrderMode):
    """Multi-worker partitioned sequence — ``FIXED_ORDER`` (DuckDB serializes).

    All behavior comes from ``_BasePartitionedOrderMode``; this subclass only
    supplies ``Meta`` with ``preserves_order=OrderPreservation.FIXED_ORDER``.
    """

    class Meta:
        """Metadata for PartitionedFixedOrderFunction."""

        name = "partitioned_fixed_order"
        description = (
            "Multi-worker partitioned sequence; preserves_order=FIXED_ORDER "
            "(DuckDB serializes the pipeline so a single worker produces all rows)."
        )
        categories = ["generator", "utility"]
        preserves_order = OrderPreservation.FIXED_ORDER
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_fixed_order(100)",
                description="Generate 0-99; FIXED_ORDER forces single-worker execution",
            ),
        ]
|