vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Projection-pushdown reproducer worker.
|
|
4
|
+
|
|
5
|
+
Two table functions, both declaring ``projection_pushdown = True`` and a
|
|
6
|
+
12-column ``FIXED_SCHEMA`` (mirrors ``vgi-kafka``'s ``kafka_consume``):
|
|
7
|
+
|
|
8
|
+
* ``proj_repro_strict`` — emits batches built strictly from
|
|
9
|
+
``params.output_schema`` (the projected subset). This is what
|
|
10
|
+
``projected_data`` does and what every projection-aware function is
|
|
11
|
+
supposed to do.
|
|
12
|
+
* ``proj_repro_full_schema`` — emits batches built against the
|
|
13
|
+
declared ``FIXED_SCHEMA`` (all 12 columns), even when projection is
|
|
14
|
+
in effect. Mirrors what a worker would do if it didn't observe
|
|
15
|
+
``params.output_schema``.
|
|
16
|
+
|
|
17
|
+
Plus a catalog interface that exposes both as virtual tables under
|
|
18
|
+
``main`` schema, so the same functions can be exercised by end-to-end
|
|
19
|
+
SQL ``SELECT`` against ``projection_repro.main.<name>`` (catalog-routed
|
|
20
|
+
scan).
|
|
21
|
+
|
|
22
|
+
The reproducer test calls each function:
|
|
23
|
+
- directly via ``Client.table_function`` with explicit
|
|
24
|
+
``projection_ids``;
|
|
25
|
+
- through the catalog-routed scan path (DuckDB → C++ extension →
|
|
26
|
+
``table_scan_function_get`` → bind → init with planner-derived
|
|
27
|
+
projection_ids).
|
|
28
|
+
|
|
29
|
+
Mismatches between ``params.output_schema`` and the OutputCollector's
|
|
30
|
+
configured schema (which the framework's ``emit`` uses for the cast)
|
|
31
|
+
will surface as ``ValueError: Target schema's field names are not
|
|
32
|
+
matching the record batch's field names``.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from dataclasses import dataclass
|
|
38
|
+
from typing import Annotated, Any, ClassVar
|
|
39
|
+
|
|
40
|
+
import pyarrow as pa
|
|
41
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
42
|
+
from vgi_rpc.rpc import OutputCollector
|
|
43
|
+
|
|
44
|
+
from vgi import Worker
|
|
45
|
+
from vgi.arguments import Arg
|
|
46
|
+
from vgi.catalog import Catalog, Schema
|
|
47
|
+
from vgi.catalog.catalog_interface import (
|
|
48
|
+
AttachOpaqueData,
|
|
49
|
+
ReadOnlyCatalogInterface,
|
|
50
|
+
ScanFunctionResult,
|
|
51
|
+
SchemaInfo,
|
|
52
|
+
SchemaObjectType,
|
|
53
|
+
SerializedSchema,
|
|
54
|
+
TableInfo,
|
|
55
|
+
TransactionOpaqueData,
|
|
56
|
+
)
|
|
57
|
+
from vgi.function import Function
|
|
58
|
+
from vgi.invocation import GlobalInitResponse
|
|
59
|
+
from vgi.table_function import (
|
|
60
|
+
ProcessParams,
|
|
61
|
+
TableFunctionGenerator,
|
|
62
|
+
bind_fixed_schema,
|
|
63
|
+
init_single_worker,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
CATALOG_NAME = "projection_repro"


# Wide 12-column layout shaped like kafka_consume's CONSUME_SCHEMA: a string
# topic, primitive columns, BLOBs and a list-of-struct ``headers`` column.
# Wide schemas like this are the typical projection_pushdown candidates.
# Each entry below is (column name, Arrow type, nullable).
_WIDE_FIELDS: list[pa.Field[Any]] = [
    pa.field(column, dtype, nullable=nullable)
    for column, dtype, nullable in (
        ("topic", pa.string(), False),
        ("partition", pa.int32(), False),
        ("offset", pa.int64(), False),
        ("timestamp", pa.timestamp("ms", tz="UTC"), True),
        ("timestamp_type", pa.string(), True),
        ("key", pa.binary(), True),
        ("key_string", pa.string(), True),
        ("key_schema_id", pa.int32(), True),
        ("value", pa.binary(), True),
        ("value_string", pa.string(), True),
        ("value_schema_id", pa.int32(), True),
        (
            "headers",
            pa.list_(pa.struct([pa.field("k", pa.string()), pa.field("v", pa.binary())])),
            False,
        ),
    )
]
WIDE_SCHEMA: pa.Schema = pa.schema(_WIDE_FIELDS)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass(frozen=True, slots=True)
class _Args:
    # Single positional argument (index 0) declared for the reproducer
    # functions in this module; ``ge=0`` is passed to ``Arg`` as a lower bound.
    n: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _build_row_dict(i: int) -> dict[str, object]:
|
|
99
|
+
"""One row's worth of values for every column in WIDE_SCHEMA."""
|
|
100
|
+
return {
|
|
101
|
+
"topic": "demo_topic",
|
|
102
|
+
"partition": int(i % 4),
|
|
103
|
+
"offset": int(i),
|
|
104
|
+
"timestamp": None,
|
|
105
|
+
"timestamp_type": None,
|
|
106
|
+
"key": f"k{i}".encode(),
|
|
107
|
+
"key_string": f"k{i}",
|
|
108
|
+
"key_schema_id": None,
|
|
109
|
+
"value": f"v{i}".encode(),
|
|
110
|
+
"value_string": f"v{i}",
|
|
111
|
+
"value_schema_id": None,
|
|
112
|
+
"headers": [],
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@init_single_worker
@bind_fixed_schema
class ProjReproStrict(TableFunctionGenerator[_Args, None]):
    """Projection-aware reproducer: emits exactly ``params.output_schema``.

    Follows the same pattern the module docstring attributes to
    ``projected_data``: the emitted batch contains only the columns DuckDB
    asked for, in the order given by ``params.output_schema``.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_strict"
        description = "projection-pushdown reproducer (strict params.output_schema)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: None,
        out: OutputCollector,
    ) -> None:
        """Emit one batch of ``params.args.n`` rows restricted to the projection, then finish."""
        row_count = params.args.n
        projected: pa.Schema = params.output_schema
        columns = list(projected.names)
        if columns:
            batch = pa.RecordBatch.from_pylist(
                [
                    {column: full_row[column] for column in columns}
                    for full_row in map(_build_row_dict, range(row_count))
                ],
                schema=projected,
            )
        else:
            # Empty projection (count(*) shape): the output schema has no
            # columns, and ``from_pylist`` cannot infer a row count from
            # empty dicts. Build a one-column placeholder of the right length
            # and ``select([])`` it down to zero columns so the row count,
            # which count(*) depends on, is preserved.
            batch = pa.RecordBatch.from_arrays([pa.nulls(row_count)], names=[""]).select([])
        out.emit(batch)
        out.finish()
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@init_single_worker
@bind_fixed_schema
class ProjReproFullSchema(TableFunctionGenerator[_Args, None]):
    """Naive reproducer: always emits all 12 FIXED_SCHEMA columns.

    Models a worker that ignores ``params.output_schema``. The framework is
    expected to respond in one of two deterministic ways:

    * accept the over-wide batch and project it down to ``output_schema``
      itself (the lenient interpretation), or
    * raise a clear error like "expected projected schema, got full".

    Either is acceptable; the confusing "different schema" cast error is not.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_full_schema"
        description = "projection-pushdown reproducer (emits full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: None,
        out: OutputCollector,
    ) -> None:
        """Emit one full-width batch of ``params.args.n`` rows, then finish."""
        full_rows = list(map(_build_row_dict, range(params.args.n)))
        batch = pa.RecordBatch.from_pylist(full_rows, schema=cls.FIXED_SCHEMA)
        out.emit(batch)
        out.finish()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
# Catalog — exposes the strict and full-schema functions as virtual tables under the main schema, so
|
|
202
|
+
# they can be invoked via catalog-routed scan path
|
|
203
|
+
# (table_scan_function_get → bound function with projection_ids from
|
|
204
|
+
# DuckDB's planner).
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@dataclass(kw_only=True)
class _ChunkedState(ArrowSerializableDataclass):
    """Progress carried between ``process()`` ticks of the multi-tick functions.

    The class derives from ``ArrowSerializableDataclass`` so the framework
    can serialize it into the stream-state token. Without that, HTTP
    transport (where every ``process()`` tick is an independent request)
    restarts from ``initial_state()`` on each exchange and the producer loop
    never terminates. Subprocess transport keeps the live object around
    between ticks, which had masked the missing contract.
    """

    # Number of rows already emitted across previous ticks.
    emitted: int = 0
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@init_single_worker
@bind_fixed_schema
class ProjReproChunked(TableFunctionGenerator[_Args, _ChunkedState]):
    """Multi-tick reproducer: one small full-width batch per ``process()`` call.

    Modelled on ``kafka_consume``'s shard-queue pattern, where each
    ``process()`` tick emits a single batch and returns so the framework can
    reschedule. Multi-tick output is where the projection bug was observed
    in vgi-kafka: ``count(*) WHERE value_schema_id IS NOT NULL`` returned a
    non-zero count even though the worker emitted ``None`` for every row's
    ``value_schema_id``.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_chunked"
        description = "projection-pushdown reproducer (multi-tick, full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def initial_state(cls, params: Any) -> _ChunkedState:
        """Start with nothing emitted."""
        return _ChunkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: _ChunkedState,
        out: OutputCollector,
    ) -> None:
        """Emit the next chunk of at most two rows and finish once ``n`` rows are out."""
        total = params.args.n
        start = state.emitted
        if start < total:
            rows_per_tick = 2  # tiny on purpose: exercises the multi-batch shape of kafka shard ticks
            stop = min(start + rows_per_tick, total)
            batch = pa.RecordBatch.from_pylist(
                [_build_row_dict(i) for i in range(start, stop)],
                schema=cls.FIXED_SCHEMA,
            )
            out.emit(batch)
            state.emitted = stop
        # Covers both "nothing left on entry" and "this tick emitted the last rows".
        if state.emitted >= total:
            out.finish()
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@bind_fixed_schema
class ProjReproMultiWorker(TableFunctionGenerator[_Args, _ChunkedState]):
    """Multi-worker, multi-tick reproducer.

    Modelled on ``kafka_consume`` with 4 partitions: ``on_init`` requests
    ``max_workers=4`` and every ``process()`` tick emits a chunk of at most
    2 rows. Combined with full-FIXED_SCHEMA emission and
    projection_pushdown, this exercises the code path that misbehaved in
    vgi-kafka, where ``count(*) WHERE value_schema_id IS NOT NULL`` returned
    4 instead of 0 on a topic whose rows all had ``value_schema_id=None``.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_multi_worker"
        description = "projection-pushdown reproducer (4 workers, multi-tick, full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def on_init(cls, params: Any) -> GlobalInitResponse:
        """Request four workers for the scan."""
        return GlobalInitResponse(max_workers=4)

    @classmethod
    def initial_state(cls, params: Any) -> _ChunkedState:
        """Start with nothing emitted."""
        return _ChunkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: _ChunkedState,
        out: OutputCollector,
    ) -> None:
        """Emit the next chunk of at most two rows and finish once ``n`` rows are out."""
        total = params.args.n
        start = state.emitted
        if start < total:
            rows_per_tick = 2
            stop = min(start + rows_per_tick, total)
            batch = pa.RecordBatch.from_pylist(
                [_build_row_dict(i) for i in range(start, stop)],
                schema=cls.FIXED_SCHEMA,
            )
            out.emit(batch)
            state.emitted = stop
        # Covers both "nothing left on entry" and "this tick emitted the last rows".
        if state.emitted >= total:
            out.finish()
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
# Every reproducer function defined in this module, in registration order.
_FUNCTIONS: list[type[Function]] = [
    ProjReproStrict,
    ProjReproFullSchema,
    ProjReproChunked,
    ProjReproMultiWorker,
]


# Declarative catalog: a single ``main`` schema that registers the functions
# and declares no static tables.
_CATALOG = Catalog(
    name=CATALOG_NAME,
    default_schema="main",
    schemas=[
        Schema(
            name="main",
            comment="projection-pushdown reproducer catalog",
            functions=[*_FUNCTIONS],
            tables=[],
        ),
    ],
)
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _serialize_schema(s: pa.Schema) -> bytes:
    """Return ``s`` encoded as an Arrow IPC stream with no record batches written."""
    buffer = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(buffer, s)
    # No batches are written: the writer is opened only for the schema and
    # closed straight away.
    writer.close()
    return buffer.getvalue().to_pybytes()
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# Virtual table name -> name of the table function that backs its scan.
_TABLE_NAMES: dict[str, str] = {
    "strict_table": "proj_repro_strict",
    "full_table": "proj_repro_full_schema",
}
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
class ProjReproCatalog(ReadOnlyCatalogInterface):
    """Read-only catalog publishing the ``_TABLE_NAMES`` virtual tables under ``main``."""

    catalog = _CATALOG
    catalog_name = CATALOG_NAME

    def _info(self, table_name: str) -> TableInfo:
        """Describe one virtual table; raises ``KeyError`` for names not in ``_TABLE_NAMES``."""
        backing_function = _TABLE_NAMES[table_name]
        serialized_columns = SerializedSchema(_serialize_schema(WIDE_SCHEMA))
        return TableInfo(
            comment=f"reproducer table -> {backing_function}",
            tags={},
            name=table_name,
            schema_name="main",
            columns=serialized_columns,
            not_null_constraints=[],
            unique_constraints=[],
            check_constraints=[],
        )

    def schemas(
        self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
    ) -> list[SchemaInfo]:
        """Return the inherited schema list with the real table count for ``main``."""
        # The declarative ``Schema(tables=[])`` yields
        # ``estimated_object_count[table] = 0``. The C++ client treats that
        # static zero as a hard guarantee and skips
        # ``catalog_schema_contents_tables``, which would hide every table
        # published through ``schema_contents``. Replace it with the real
        # population.
        infos = super().schemas(attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data)
        for position, info in enumerate(infos):
            if info.name != "main":
                continue
            counts = dict(info.estimated_object_count or {})
            counts["table"] = len(_TABLE_NAMES)
            infos[position] = SchemaInfo(
                attach_opaque_data=info.attach_opaque_data,
                name=info.name,
                comment=info.comment,
                tags=info.tags,
                estimated_object_count=counts,
            )
        return infos

    def schema_contents(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        name: str,
        type: Any,
    ) -> Any:
        """List the virtual tables for ``main``; defer every other request to the base class."""
        wants_main_tables = name.lower() == "main" and type == SchemaObjectType.TABLE
        if not wants_main_tables:
            return super().schema_contents(
                attach_opaque_data=attach_opaque_data,
                transaction_opaque_data=transaction_opaque_data,
                name=name,
                type=type,
            )
        return [self._info(table_name) for table_name in _TABLE_NAMES]

    def table_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None = None,
        at_value: str | None = None,
    ) -> TableInfo | None:
        """Return the table description, or ``None`` when the schema or table is unknown."""
        # The schema name is matched case-insensitively, the table name exactly.
        in_main = schema_name.lower() == "main"
        if in_main and name in _TABLE_NAMES:
            return self._info(name)
        return None

    def table_scan_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None,
        at_value: str | None,
    ) -> ScanFunctionResult:
        """Map a virtual table to its backing function call; ``ValueError`` if unknown."""
        if name not in _TABLE_NAMES:
            raise ValueError(f"unknown reproducer table: {name}")
        # The reproducer functions take a single ``n`` argument. A constant
        # 100 is passed so any SELECT against the virtual table has rows; a
        # real workload would derive it from filter pushdown or other state.
        row_count_argument = pa.scalar(100, type=pa.int64())
        return ScanFunctionResult(
            function_name=_TABLE_NAMES[name],
            positional_arguments=[row_count_argument],
            named_arguments={},
            required_extensions=[],
        )
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
class ProjReproWorker(Worker):
    # Worker wiring: the catalog interface class, the catalog name and
    # declarative catalog it serves, and a fresh copy of the function list.
    catalog_interface = ProjReproCatalog
    catalog_name = CATALOG_NAME
    catalog = _CATALOG
    functions = [*_FUNCTIONS]
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Scalar-function fixtures.
|
|
4
|
+
|
|
5
|
+
Originally a single 1,568-line module; split into cohesive sub-modules and
|
|
6
|
+
re-exported here so existing import sites (worker.py, tests) keep working
|
|
7
|
+
unchanged.
|
|
8
|
+
|
|
9
|
+
* :mod:`._common` — numeric type promotion helpers
|
|
10
|
+
* :mod:`.arithmetic` — multiply, double, add_values, sum_values, concat_*
|
|
11
|
+
* :mod:`.formatting` — format_number_*, smart_format_*
|
|
12
|
+
* :mod:`.null_handling` — null_handling, conditional_message
|
|
13
|
+
* :mod:`.binary` — binary_packet, upper_case
|
|
14
|
+
* :mod:`.random_demo` — random_int, random_bytes, bernoulli, hash_seed
|
|
15
|
+
* :mod:`.type_info` — type_info_*, any_mixed_*, pair_type_*
|
|
16
|
+
* :mod:`.geo` — geo_distance_*, geo_centroid_*
|
|
17
|
+
* :mod:`.settings_secrets` — multiply_by_setting, return_secret_value, who_am_i
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from vgi._test_fixtures.scalar.arithmetic import (
|
|
21
|
+
AddValuesFunction,
|
|
22
|
+
ConcatValuesIntFunction,
|
|
23
|
+
ConcatValuesStrFunction,
|
|
24
|
+
DoubleFunction,
|
|
25
|
+
MultiplyFunction,
|
|
26
|
+
SumValuesFunction,
|
|
27
|
+
)
|
|
28
|
+
from vgi._test_fixtures.scalar.binary import (
|
|
29
|
+
BinaryPacketFunction,
|
|
30
|
+
UpperCaseFunction,
|
|
31
|
+
)
|
|
32
|
+
from vgi._test_fixtures.scalar.formatting import (
|
|
33
|
+
FormatNumberDefaultFunction,
|
|
34
|
+
FormatNumberFullFunction,
|
|
35
|
+
FormatNumberPrecisionFunction,
|
|
36
|
+
SmartFormatPrefixFunction,
|
|
37
|
+
SmartFormatWidthFunction,
|
|
38
|
+
)
|
|
39
|
+
from vgi._test_fixtures.scalar.geo import (
|
|
40
|
+
_POINT_STRUCT_TYPE,
|
|
41
|
+
GeoCentroidFixedFunction,
|
|
42
|
+
GeoCentroidListFunction,
|
|
43
|
+
GeoCentroidStructFunction,
|
|
44
|
+
GeoDistanceFixedFunction,
|
|
45
|
+
GeoDistanceListFunction,
|
|
46
|
+
GeoDistanceStructFunction,
|
|
47
|
+
)
|
|
48
|
+
from vgi._test_fixtures.scalar.null_handling import (
|
|
49
|
+
ConditionalMessageFunction,
|
|
50
|
+
NullHandlingFunction,
|
|
51
|
+
)
|
|
52
|
+
from vgi._test_fixtures.scalar.random_demo import (
|
|
53
|
+
BernoulliFunction,
|
|
54
|
+
HashSeedFunction,
|
|
55
|
+
RandomBytesFunction,
|
|
56
|
+
RandomIntFunction,
|
|
57
|
+
)
|
|
58
|
+
from vgi._test_fixtures.scalar.settings_secrets import (
|
|
59
|
+
MultiplyBySettingFunction,
|
|
60
|
+
ReturnSecretValueFunction,
|
|
61
|
+
WhoAmIFunction,
|
|
62
|
+
)
|
|
63
|
+
from vgi._test_fixtures.scalar.type_info import (
|
|
64
|
+
AnyMixedIntFunction,
|
|
65
|
+
AnyMixedStrFunction,
|
|
66
|
+
PairTypeIntIntFunction,
|
|
67
|
+
PairTypeIntStrFunction,
|
|
68
|
+
PairTypeStrStrFunction,
|
|
69
|
+
TypeInfoInt32Function,
|
|
70
|
+
TypeInfoInt64Function,
|
|
71
|
+
TypeInfoStringFunction,
|
|
72
|
+
TypeInfoUInt32Function,
|
|
73
|
+
TypeInfoUInt64Function,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# Explicit re-export list for the fixtures imported above. It also includes
# ``_POINT_STRUCT_TYPE`` from the ``geo`` sub-module despite its leading
# underscore.
__all__ = [
    "_POINT_STRUCT_TYPE",
    "AddValuesFunction",
    "AnyMixedIntFunction",
    "AnyMixedStrFunction",
    "BernoulliFunction",
    "BinaryPacketFunction",
    "ConcatValuesIntFunction",
    "ConcatValuesStrFunction",
    "ConditionalMessageFunction",
    "DoubleFunction",
    "FormatNumberDefaultFunction",
    "FormatNumberFullFunction",
    "FormatNumberPrecisionFunction",
    "GeoCentroidFixedFunction",
    "GeoCentroidListFunction",
    "GeoCentroidStructFunction",
    "GeoDistanceFixedFunction",
    "GeoDistanceListFunction",
    "GeoDistanceStructFunction",
    "HashSeedFunction",
    "MultiplyBySettingFunction",
    "MultiplyFunction",
    "NullHandlingFunction",
    "PairTypeIntIntFunction",
    "PairTypeIntStrFunction",
    "PairTypeStrStrFunction",
    "RandomBytesFunction",
    "RandomIntFunction",
    "ReturnSecretValueFunction",
    "SmartFormatPrefixFunction",
    "SmartFormatWidthFunction",
    "SumValuesFunction",
    "TypeInfoInt32Function",
    "TypeInfoInt64Function",
    "TypeInfoStringFunction",
    "TypeInfoUInt32Function",
    "TypeInfoUInt64Function",
    "UpperCaseFunction",
    "WhoAmIFunction",
]
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Shared scalar fixture helpers (numeric type promotion)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import pyarrow as pa
|
|
8
|
+
|
|
9
|
+
from vgi.exceptions import SchemaValidationError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _is_addable_type(dtype: pa.DataType) -> bool:
    """Return True for types this module treats as valid ``pyarrow.compute.add`` inputs.

    Accepted categories: integer, floating point, decimal and temporal.
    """
    accepted = (
        pa.types.is_integer,
        pa.types.is_floating,
        pa.types.is_decimal,
        pa.types.is_temporal,
    )
    return any(matches(dtype) for matches in accepted)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _is_multipliable_type(dtype: pa.DataType) -> bool:
    """Return True for types this module treats as valid ``pyarrow.compute.multiply`` inputs.

    Narrower than ``_is_addable_type``: temporal types
    (date/time/timestamp/interval) are excluded because pc.multiply has no
    kernel for them. pc.add does, since date + interval is well-defined,
    whereas doubling a date is not.
    """
    accepted = (
        pa.types.is_integer,
        pa.types.is_floating,
        pa.types.is_decimal,
    )
    return any(matches(dtype) for matches in accepted)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _promote_for_addition(dtype: pa.DataType) -> pa.DataType:
    """Pick the result type for an addition so that overflow is less likely.

    Adding two values of one type can overflow that type, so narrow
    integers and floats are widened to the next size up (for example
    int32 + int32 -> int64). Temporal types and already-widest types are
    returned unchanged.

    Raises:
        SchemaValidationError: if ``dtype`` is not temporal, floating,
            integer or decimal.
    """
    if pa.types.is_temporal(dtype):
        return dtype

    if pa.types.is_floating(dtype):
        # float16 and float32 widen to float64; float64 stays as it is.
        return pa.float64() if dtype in (pa.float16(), pa.float32()) else dtype

    if pa.types.is_integer(dtype):
        # (input type, widened type) pairs; a + b can overflow the input
        # width. The 64-bit types have nowhere wider to go.
        widening = (
            (pa.int8(), pa.int16()),
            (pa.int16(), pa.int32()),
            (pa.int32(), pa.int64()),
            (pa.int64(), pa.int64()),
            (pa.uint8(), pa.uint16()),
            (pa.uint16(), pa.uint32()),
            (pa.uint32(), pa.uint64()),
            (pa.uint64(), pa.uint64()),
        )
        for narrow, wide in widening:
            if dtype == narrow:
                return wide
        return dtype

    if not pa.types.is_decimal(dtype):
        raise SchemaValidationError(f"Unsupported numeric type for addition: {dtype}")

    # Adding/doubling a decimal needs one more digit of precision
    # (2 * 10^p uses p+1 digits). DuckDB only consumes decimal128 over the
    # Arrow C ABI (no decimal256 reader), so precision is capped at 38.
    # At the cap the type is unchanged and values >= 5e37 will overflow at
    # compute time.
    widened_precision = min(dtype.precision + 1, 38)
    return pa.decimal128(widened_precision, dtype.scale)
|