vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,454 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Projection-pushdown reproducer worker.
4
+
5
+ Two core table functions (multi-tick variants are defined further down), both declaring ``projection_pushdown = True`` and a
6
+ 12-column ``FIXED_SCHEMA`` (mirrors ``vgi-kafka``'s ``kafka_consume``):
7
+
8
+ * ``proj_repro_strict`` — emits batches built strictly from
9
+ ``params.output_schema`` (the projected subset). This is what
10
+ ``projected_data`` does and what every projection-aware function is
11
+ supposed to do.
12
+ * ``proj_repro_full_schema`` — emits batches built against the
13
+ declared ``FIXED_SCHEMA`` (all 12 columns), even when projection is
14
+ in effect. Mirrors what a worker would do if it didn't observe
15
+ ``params.output_schema``.
16
+
17
+ Plus a catalog interface that exposes both as virtual tables under
18
+ ``main`` schema, so the same functions can be exercised by end-to-end
19
+ SQL ``SELECT`` against ``projection_repro.main.<name>`` (catalog-routed
20
+ scan).
21
+
22
+ The reproducer test calls each function:
23
+ - directly via ``Client.table_function`` with explicit
24
+ ``projection_ids``;
25
+ - through the catalog-routed scan path (DuckDB → C++ extension →
26
+ ``table_scan_function_get`` → bind → init with planner-derived
27
+ projection_ids).
28
+
29
+ Mismatches between ``params.output_schema`` and the OutputCollector's
30
+ configured schema (which the framework's ``emit`` uses for the cast)
31
+ will surface as ``ValueError: Target schema's field names are not
32
+ matching the record batch's field names``.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from dataclasses import dataclass
38
+ from typing import Annotated, Any, ClassVar
39
+
40
+ import pyarrow as pa
41
+ from vgi_rpc import ArrowSerializableDataclass
42
+ from vgi_rpc.rpc import OutputCollector
43
+
44
+ from vgi import Worker
45
+ from vgi.arguments import Arg
46
+ from vgi.catalog import Catalog, Schema
47
+ from vgi.catalog.catalog_interface import (
48
+ AttachOpaqueData,
49
+ ReadOnlyCatalogInterface,
50
+ ScanFunctionResult,
51
+ SchemaInfo,
52
+ SchemaObjectType,
53
+ SerializedSchema,
54
+ TableInfo,
55
+ TransactionOpaqueData,
56
+ )
57
+ from vgi.function import Function
58
+ from vgi.invocation import GlobalInitResponse
59
+ from vgi.table_function import (
60
+ ProcessParams,
61
+ TableFunctionGenerator,
62
+ bind_fixed_schema,
63
+ init_single_worker,
64
+ )
65
+
66
CATALOG_NAME = "projection_repro"


# Twelve columns shaped like kafka_consume's CONSUME_SCHEMA: a string topic,
# integer primitives, a timestamp, BLOBs and a list-of-struct ``headers``
# column. Wide schemas of this kind are typical projection_pushdown
# candidates in real workloads.
_WIDE_FIELDS: list[pa.Field[Any]] = [
    # Record coordinates — never null.
    pa.field("topic", pa.string(), nullable=False),
    pa.field("partition", pa.int32(), nullable=False),
    pa.field("offset", pa.int64(), nullable=False),
    # Timestamp metadata.
    pa.field("timestamp", pa.timestamp("ms", tz="UTC"), nullable=True),
    pa.field("timestamp_type", pa.string(), nullable=True),
    # Key: raw bytes, string form, and schema id.
    pa.field("key", pa.binary(), nullable=True),
    pa.field("key_string", pa.string(), nullable=True),
    pa.field("key_schema_id", pa.int32(), nullable=True),
    # Value: raw bytes, string form, and schema id.
    pa.field("value", pa.binary(), nullable=True),
    pa.field("value_string", pa.string(), nullable=True),
    pa.field("value_schema_id", pa.int32(), nullable=True),
    # Headers: a non-null list of {k: string, v: binary} structs.
    pa.field(
        "headers",
        pa.list_(
            pa.struct(
                [
                    pa.field("k", pa.string()),
                    pa.field("v", pa.binary()),
                ]
            )
        ),
        nullable=False,
    ),
]
WIDE_SCHEMA: pa.Schema = pa.schema(_WIDE_FIELDS)
91
+
92
+
93
@dataclass(slots=True, frozen=True)
class _Args:
    """Argument set used by the reproducer table functions in this module."""

    # How many rows to generate. ``ge=0`` rejects negative counts;
    # ``Arg(0, ...)`` presumably marks this as the first positional
    # argument — confirm against ``vgi.arguments.Arg``.
    n: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
96
+
97
+
98
+ def _build_row_dict(i: int) -> dict[str, object]:
99
+ """One row's worth of values for every column in WIDE_SCHEMA."""
100
+ return {
101
+ "topic": "demo_topic",
102
+ "partition": int(i % 4),
103
+ "offset": int(i),
104
+ "timestamp": None,
105
+ "timestamp_type": None,
106
+ "key": f"k{i}".encode(),
107
+ "key_string": f"k{i}",
108
+ "key_schema_id": None,
109
+ "value": f"v{i}".encode(),
110
+ "value_string": f"v{i}",
111
+ "value_schema_id": None,
112
+ "headers": [],
113
+ }
114
+
115
+
116
@init_single_worker
@bind_fixed_schema
class ProjReproStrict(TableFunctionGenerator[_Args, None]):
    """Reproducer that emits exactly the columns in ``params.output_schema``.

    This is the projection-aware pattern (the same approach
    ``projected_data`` takes): the emitted batch has the shape DuckDB asked
    for and nothing more.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_strict"
        description = "projection-pushdown reproducer (strict params.output_schema)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: None,
        out: OutputCollector,
    ) -> None:
        """Emit a single batch of ``params.args.n`` rows, then finish.

        Only the columns named in ``params.output_schema`` are populated.
        """
        row_count = params.args.n
        projected: pa.Schema = params.output_schema
        columns = list(projected.names)
        if columns:
            rows: list[dict[str, object]] = [
                {column: values[column] for column in columns}
                for values in map(_build_row_dict, range(row_count))
            ]
            batch = pa.RecordBatch.from_pylist(rows, schema=projected)
        else:
            # Empty projection (count(*) shape): the output schema has no
            # columns, and ``from_pylist`` cannot infer a row count from
            # empty dicts. Build a one-column placeholder of the right
            # length and ``select([])`` it down to zero columns, which
            # keeps the row count that DuckDB's count(*) needs.
            placeholder = pa.RecordBatch.from_arrays([pa.nulls(row_count)], names=[""])
            batch = placeholder.select([])
        out.emit(batch)
        out.finish()
160
+
161
+
162
@init_single_worker
@bind_fixed_schema
class ProjReproFullSchema(TableFunctionGenerator[_Args, None]):
    """Reproducer that always emits all 12 ``FIXED_SCHEMA`` columns.

    Models a naive worker that never looks at ``params.output_schema``.
    The framework is expected to do one of two things:

    * accept the over-wide batch and project it down to ``output_schema``
      itself (the lenient interpretation), or
    * raise a clear error like "expected projected schema, got full".

    Either outcome is acceptable as long as it is deterministic and is not
    the confusing "different schema" cast error.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_full_schema"
        description = "projection-pushdown reproducer (emits full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: None,
        out: OutputCollector,
    ) -> None:
        """Emit one full-width batch of ``params.args.n`` rows, then finish."""
        full_rows = list(map(_build_row_dict, range(params.args.n)))
        # Deliberately built against FIXED_SCHEMA, not params.output_schema.
        full_batch = pa.RecordBatch.from_pylist(full_rows, schema=cls.FIXED_SCHEMA)
        out.emit(full_batch)
        out.finish()
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # Catalog — exposes both functions as virtual tables under main schema, so
202
+ # they can be invoked via catalog-routed scan path
203
+ # (table_scan_function_get → bound function with projection_ids from
204
+ # DuckDB's planner).
205
+ # ---------------------------------------------------------------------------
206
+
207
+
208
@dataclass(kw_only=True)
class _ChunkedState(ArrowSerializableDataclass):
    """Progress carried between ``process()`` ticks of the multi-tick functions.

    The class extends ``ArrowSerializableDataclass`` so the framework can
    serialize it into the stream-state token. Over HTTP transport every
    ``process()`` tick is an independent request; without a serializable
    state each exchange would restart from ``initial_state()`` and the
    producer loop would never terminate. Subprocess transport keeps the
    live object around between ticks, which previously masked the missing
    contract.
    """

    # Count of rows emitted so far; also the index of the next row to emit.
    emitted: int = 0
221
+
222
+
223
@init_single_worker
@bind_fixed_schema
class ProjReproChunked(TableFunctionGenerator[_Args, _ChunkedState]):
    """Multi-tick reproducer: at most one small batch per ``process()`` call.

    Follows ``kafka_consume``'s shard-queue pattern, where each tick emits
    a batch and returns so the framework can reschedule. Multi-tick output
    is where the projection bug was observed in vgi-kafka:
    ``count(*) WHERE value_schema_id IS NOT NULL`` returned a non-zero
    count even though the worker emitted ``None`` for every row's
    ``value_schema_id``.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_chunked"
        description = "projection-pushdown reproducer (multi-tick, full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def initial_state(cls, params: Any) -> _ChunkedState:
        """Start every stream with nothing emitted."""
        return _ChunkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: _ChunkedState,
        out: OutputCollector,
    ) -> None:
        """Emit the next chunk of rows and finish once ``n`` rows are out.

        ``state.emitted`` is advanced in place after each emitted batch.
        """
        total = params.args.n
        chunk = 2  # tiny — exercise multi-batch shape like kafka shard ticks
        first = state.emitted
        if first < total:
            last = min(first + chunk, total)
            batch_rows = [_build_row_dict(index) for index in range(first, last)]
            out.emit(pa.RecordBatch.from_pylist(batch_rows, schema=cls.FIXED_SCHEMA))
            state.emitted = last
        # Reached both when nothing was left to emit on entry and when the
        # batch just emitted was the final one.
        if state.emitted >= total:
            out.finish()
267
+
268
+
269
@bind_fixed_schema
class ProjReproMultiWorker(TableFunctionGenerator[_Args, _ChunkedState]):
    """Multi-worker, multi-tick reproducer.

    Modelled on ``kafka_consume`` with 4 partitions: ``on_init`` asks for
    ``max_workers=4`` and every ``process()`` tick emits a chunk of up to
    2 rows. Combined with full-FIXED_SCHEMA emission and
    projection_pushdown, this exercises the code path that misbehaved in
    vgi-kafka, where ``count(*) WHERE value_schema_id IS NOT NULL``
    returned 4 instead of 0 on a topic whose every emitted row had
    ``value_schema_id=None``.
    """

    FunctionArguments = _Args

    class Meta:
        name = "proj_repro_multi_worker"
        description = "projection-pushdown reproducer (4 workers, multi-tick, full FIXED_SCHEMA)"
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = WIDE_SCHEMA

    @classmethod
    def on_init(cls, params: Any) -> GlobalInitResponse:
        """Request four workers for the scan."""
        return GlobalInitResponse(max_workers=4)

    @classmethod
    def initial_state(cls, params: Any) -> _ChunkedState:
        """Return a fresh state with zero rows emitted."""
        return _ChunkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_Args],
        state: _ChunkedState,
        out: OutputCollector,
    ) -> None:
        """Emit up to two more rows; finish when ``n`` rows have been emitted.

        ``state.emitted`` is updated in place to record progress.
        """
        n = params.args.n
        rows_per_tick = 2
        cursor = state.emitted
        remaining = n - cursor
        if remaining > 0:
            stop = cursor + min(rows_per_tick, remaining)
            pending = list(map(_build_row_dict, range(cursor, stop)))
            out.emit(pa.RecordBatch.from_pylist(pending, schema=cls.FIXED_SCHEMA))
            state.emitted = stop
        # Covers both "already done on entry" and "this tick was the last".
        if state.emitted >= n:
            out.finish()
317
+
318
+
319
# Every reproducer table function defined in this module.
_FUNCTIONS: list[type[Function]] = [
    ProjReproStrict,
    ProjReproFullSchema,
    ProjReproChunked,
    ProjReproMultiWorker,
]


# Declarative catalog: a single ``main`` schema that carries the functions
# above and declares no tables of its own.
_CATALOG = Catalog(
    name=CATALOG_NAME,
    default_schema="main",
    schemas=[
        Schema(
            name="main",
            comment="projection-pushdown reproducer catalog",
            functions=[*_FUNCTIONS],
            tables=[],
        ),
    ],
)
339
+
340
+
341
def _serialize_schema(s: pa.Schema) -> bytes:
    """Return ``s`` encoded as an Arrow IPC stream that holds no batches."""
    buffer = pa.BufferOutputStream()
    # Open a stream writer for the schema and close it straight away, so
    # the stream carries the schema and no record batches.
    writer = pa.ipc.new_stream(buffer, s)
    writer.close()
    return buffer.getvalue().to_pybytes()
346
+
347
+
348
# Virtual table name -> name of the table function returned for its scan.
_TABLE_NAMES: dict[str, str] = {
    "strict_table": "proj_repro_strict",
    "full_table": "proj_repro_full_schema",
}
352
+
353
+
354
class ProjReproCatalog(ReadOnlyCatalogInterface):
    """Read-only catalog publishing the ``_TABLE_NAMES`` virtual tables under ``main``."""

    catalog = _CATALOG
    catalog_name = CATALOG_NAME

    def _info(self, table_name: str) -> TableInfo:
        """Build the ``TableInfo`` describing one virtual table.

        Raises ``KeyError`` if ``table_name`` is not a key of ``_TABLE_NAMES``.
        """
        backing_function = _TABLE_NAMES[table_name]
        # Every virtual table advertises the full wide schema.
        wide_columns = SerializedSchema(_serialize_schema(WIDE_SCHEMA))
        return TableInfo(
            comment=f"reproducer table -> {backing_function}",
            tags={},
            name=table_name,
            schema_name="main",
            columns=wide_columns,
            not_null_constraints=[],
            unique_constraints=[],
            check_constraints=[],
        )

    def schemas(
        self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
    ) -> list[SchemaInfo]:
        """Return the inherited schema list with ``main``'s table count corrected."""
        # The declarative ``Schema(tables=[])`` yields
        # ``estimated_object_count[table] = 0``. The C++ client treats that
        # static zero as a hard guarantee and skips
        # ``catalog_schema_contents_tables``, which would hide every table
        # published by ``schema_contents`` — so report the real population.
        infos = super().schemas(attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data)
        for position, entry in enumerate(infos):
            if entry.name != "main":
                continue
            counts = dict(entry.estimated_object_count or {})
            counts["table"] = len(_TABLE_NAMES)
            infos[position] = SchemaInfo(
                attach_opaque_data=entry.attach_opaque_data,
                name=entry.name,
                comment=entry.comment,
                tags=entry.tags,
                estimated_object_count=counts,
            )
        return infos

    def schema_contents(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        name: str,
        type: Any,
    ) -> Any:
        """List the virtual tables for ``main``; defer everything else to the base class."""
        wants_main_tables = name.lower() == "main" and type == SchemaObjectType.TABLE
        if not wants_main_tables:
            return super().schema_contents(
                attach_opaque_data=attach_opaque_data,
                transaction_opaque_data=transaction_opaque_data,
                name=name,
                type=type,
            )
        return list(map(self._info, _TABLE_NAMES))

    def table_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None = None,
        at_value: str | None = None,
    ) -> TableInfo | None:
        """Return the ``TableInfo`` for a known table in ``main``, else ``None``.

        The schema name is compared case-insensitively; the table name must
        match a ``_TABLE_NAMES`` key exactly.
        """
        if schema_name.lower() == "main" and name in _TABLE_NAMES:
            return self._info(name)
        return None

    def table_scan_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None,
        at_value: str | None,
    ) -> ScanFunctionResult:
        """Return the scan function for virtual table ``name``.

        Raises ``ValueError`` when ``name`` is not a known virtual table.
        ``schema_name`` is not consulted.
        """
        if (function_name := _TABLE_NAMES.get(name)) is None:
            raise ValueError(f"unknown reproducer table: {name}")
        # The reproducer functions take a single ``n`` argument. Pass a
        # constant 100 so any SELECT against the virtual table actually has
        # rows. (Real workloads would derive this from filter pushdown or
        # other state; a constant is all that is needed here.)
        row_count_argument = pa.scalar(100, type=pa.int64())
        return ScanFunctionResult(
            function_name=function_name,
            positional_arguments=[row_count_argument],
            named_arguments={},
            required_extensions=[],
        )
448
+
449
+
450
class ProjReproWorker(Worker):
    """Worker wiring together the reproducer catalog and its table functions."""

    catalog_interface = ProjReproCatalog
    catalog_name = CATALOG_NAME
    catalog = _CATALOG
    # Own copy of the function list, independent of the module-level one.
    functions = [*_FUNCTIONS]
@@ -0,0 +1,116 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Scalar-function fixtures.
4
+
5
+ Originally a single 1,568-line module; split into cohesive sub-modules and
6
+ re-exported here so existing import sites (worker.py, tests) keep working
7
+ unchanged.
8
+
9
+ * :mod:`._common` — numeric type promotion helpers
10
+ * :mod:`.arithmetic` — multiply, double, add_values, sum_values, concat_*
11
+ * :mod:`.formatting` — format_number_*, smart_format_*
12
+ * :mod:`.null_handling` — null_handling, conditional_message
13
+ * :mod:`.binary` — binary_packet, upper_case
14
+ * :mod:`.random_demo` — random_int, random_bytes, bernoulli, hash_seed
15
+ * :mod:`.type_info` — type_info_*, any_mixed_*, pair_type_*
16
+ * :mod:`.geo` — geo_distance_*, geo_centroid_*
17
+ * :mod:`.settings_secrets` — multiply_by_setting, return_secret_value, who_am_i
18
+ """
19
+
20
+ from vgi._test_fixtures.scalar.arithmetic import (
21
+ AddValuesFunction,
22
+ ConcatValuesIntFunction,
23
+ ConcatValuesStrFunction,
24
+ DoubleFunction,
25
+ MultiplyFunction,
26
+ SumValuesFunction,
27
+ )
28
+ from vgi._test_fixtures.scalar.binary import (
29
+ BinaryPacketFunction,
30
+ UpperCaseFunction,
31
+ )
32
+ from vgi._test_fixtures.scalar.formatting import (
33
+ FormatNumberDefaultFunction,
34
+ FormatNumberFullFunction,
35
+ FormatNumberPrecisionFunction,
36
+ SmartFormatPrefixFunction,
37
+ SmartFormatWidthFunction,
38
+ )
39
+ from vgi._test_fixtures.scalar.geo import (
40
+ _POINT_STRUCT_TYPE,
41
+ GeoCentroidFixedFunction,
42
+ GeoCentroidListFunction,
43
+ GeoCentroidStructFunction,
44
+ GeoDistanceFixedFunction,
45
+ GeoDistanceListFunction,
46
+ GeoDistanceStructFunction,
47
+ )
48
+ from vgi._test_fixtures.scalar.null_handling import (
49
+ ConditionalMessageFunction,
50
+ NullHandlingFunction,
51
+ )
52
+ from vgi._test_fixtures.scalar.random_demo import (
53
+ BernoulliFunction,
54
+ HashSeedFunction,
55
+ RandomBytesFunction,
56
+ RandomIntFunction,
57
+ )
58
+ from vgi._test_fixtures.scalar.settings_secrets import (
59
+ MultiplyBySettingFunction,
60
+ ReturnSecretValueFunction,
61
+ WhoAmIFunction,
62
+ )
63
+ from vgi._test_fixtures.scalar.type_info import (
64
+ AnyMixedIntFunction,
65
+ AnyMixedStrFunction,
66
+ PairTypeIntIntFunction,
67
+ PairTypeIntStrFunction,
68
+ PairTypeStrStrFunction,
69
+ TypeInfoInt32Function,
70
+ TypeInfoInt64Function,
71
+ TypeInfoStringFunction,
72
+ TypeInfoUInt32Function,
73
+ TypeInfoUInt64Function,
74
+ )
75
+
76
+ __all__ = [
77
+ "_POINT_STRUCT_TYPE",
78
+ "AddValuesFunction",
79
+ "AnyMixedIntFunction",
80
+ "AnyMixedStrFunction",
81
+ "BernoulliFunction",
82
+ "BinaryPacketFunction",
83
+ "ConcatValuesIntFunction",
84
+ "ConcatValuesStrFunction",
85
+ "ConditionalMessageFunction",
86
+ "DoubleFunction",
87
+ "FormatNumberDefaultFunction",
88
+ "FormatNumberFullFunction",
89
+ "FormatNumberPrecisionFunction",
90
+ "GeoCentroidFixedFunction",
91
+ "GeoCentroidListFunction",
92
+ "GeoCentroidStructFunction",
93
+ "GeoDistanceFixedFunction",
94
+ "GeoDistanceListFunction",
95
+ "GeoDistanceStructFunction",
96
+ "HashSeedFunction",
97
+ "MultiplyBySettingFunction",
98
+ "MultiplyFunction",
99
+ "NullHandlingFunction",
100
+ "PairTypeIntIntFunction",
101
+ "PairTypeIntStrFunction",
102
+ "PairTypeStrStrFunction",
103
+ "RandomBytesFunction",
104
+ "RandomIntFunction",
105
+ "ReturnSecretValueFunction",
106
+ "SmartFormatPrefixFunction",
107
+ "SmartFormatWidthFunction",
108
+ "SumValuesFunction",
109
+ "TypeInfoInt32Function",
110
+ "TypeInfoInt64Function",
111
+ "TypeInfoStringFunction",
112
+ "TypeInfoUInt32Function",
113
+ "TypeInfoUInt64Function",
114
+ "UpperCaseFunction",
115
+ "WhoAmIFunction",
116
+ ]
@@ -0,0 +1,69 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Shared scalar fixture helpers (numeric type promotion)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import pyarrow as pa
8
+
9
+ from vgi.exceptions import SchemaValidationError
10
+
11
+
12
def _is_addable_type(dtype: pa.DataType) -> bool:
    """Return True when ``dtype`` is integer, floating, decimal or temporal.

    These are the type families this fixture accepts as inputs to
    ``pyarrow.compute.add``.
    """
    type_checks = (
        pa.types.is_integer,
        pa.types.is_floating,
        pa.types.is_decimal,
        pa.types.is_temporal,
    )
    return any(check(dtype) for check in type_checks)
20
+
21
+
22
def _is_multipliable_type(dtype: pa.DataType) -> bool:
    """Return True when ``dtype`` is integer, floating or decimal.

    Stricter than ``_is_addable_type``: pc.multiply has no kernel for
    temporal types (date/time/timestamp/interval). pc.add does, because
    date + interval is well-defined, whereas doubling a date is not.
    """
    if pa.types.is_integer(dtype):
        return True
    if pa.types.is_floating(dtype):
        return True
    return pa.types.is_decimal(dtype)
30
+
31
+
32
def _promote_for_addition(dtype: pa.DataType) -> pa.DataType:
    """Pick an output type for addition that lowers the risk of overflow.

    Summing two values of one type can overflow, so narrow numeric types
    are widened one step (for example int32 + int32 -> int64). Temporal
    types, float64 and the 64-bit integers are returned unchanged.

    Raises:
        SchemaValidationError: if ``dtype`` is not temporal, floating,
            integer or decimal.
    """
    types = pa.types
    if types.is_temporal(dtype):
        return dtype

    if types.is_floating(dtype):
        # float16 and float32 both widen to float64.
        is_narrow_float = dtype in (pa.float16(), pa.float32())
        return pa.float64() if is_narrow_float else dtype

    if types.is_integer(dtype):
        # (input type, promoted type) pairs; a + b can overflow, so each
        # width moves up one step. The 64-bit types have nowhere to go.
        promotions = (
            (pa.int8(), pa.int16()),
            (pa.int16(), pa.int32()),
            (pa.int32(), pa.int64()),
            (pa.int64(), pa.int64()),
            (pa.uint8(), pa.uint16()),
            (pa.uint16(), pa.uint32()),
            (pa.uint32(), pa.uint64()),
            (pa.uint64(), pa.uint64()),
        )
        for source_type, promoted_type in promotions:
            if dtype == source_type:
                return promoted_type
        return dtype

    if types.is_decimal(dtype):
        # Adding/doubling a decimal needs one more digit of precision to
        # avoid overflow (2 * 10^p uses p+1 digits). DuckDB only consumes
        # decimal128 over the Arrow C ABI (no decimal256 reader), so the
        # precision is capped at 38; at the cap the type is unchanged and
        # values >= 5e37 are accepted to overflow at compute time.
        widened_precision = min(dtype.precision + 1, 38)
        return pa.decimal128(widened_precision, dtype.scale)

    raise SchemaValidationError(f"Unsupported numeric type for addition: {dtype}")