vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,1631 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Example worker with built-in functions for testing.
4
+
5
+ This demonstrates how to create a worker by subclassing Worker
6
+ and listing function classes. Function names are derived from
7
+ each class's metadata (Meta.name or snake_case of class name).
8
+
9
+ The worker supports:
10
+ - TableInOutGenerator: Transforms input batches to output batches
11
+ - TableFunctionGenerator: Generates output batches without input
12
+ - ScalarFunctionGenerator: Transforms input to single-column output (1:1 rows)
13
+
14
+ Settings:
15
+ - vgi_verbose_mode: Enable verbose output with extra columns (bool, default: false)
16
+ - greeting: Custom greeting message (str, default: "Hello")
17
+ - multiplier: Value multiplier (int, default: 1)
18
+ - threshold: Filter threshold for filter_by_setting (int, default: 0)
19
+ - config: Sequence configuration struct for struct_settings (struct, default: None)
20
+
21
+ Usage:
22
+ vgi-fixture-worker
23
+ """
24
+
25
+ # Friendly error if numpy is missing. Several fixture modules below depend on
26
+ # numpy, which the `vgi-fixtures` distribution installs; surface a clear install
27
+ # message instead of a raw ImportError.
28
+ try:
29
+ import numpy # noqa: F401
30
+ except ImportError:
31
+ import sys as _sys
32
+
33
+ _sys.exit("vgi-fixture-worker requires numpy. Install it with: pip install 'vgi-python[test-fixtures]'")
34
+
35
+ import uuid
36
+ from typing import Annotated, Any
37
+
38
+ import pyarrow as pa
39
+
40
+ from vgi._test_fixtures.aggregate import (
41
+ AvgFunction,
42
+ CountFunction,
43
+ DynamicAggregateFunction,
44
+ DynamicMLAggregateFunction,
45
+ GenericSumFunction,
46
+ ListAggFunction,
47
+ PercentileFunction,
48
+ StreamingSumFunction,
49
+ SumAllFunction,
50
+ SumFunction,
51
+ WeightedSumFunction,
52
+ WindowListAggFunction,
53
+ WindowMedianFunction,
54
+ WindowSumBatchFunction,
55
+ WindowSumFunction,
56
+ )
57
+ from vgi._test_fixtures.cancellable import (
58
+ SlowCancellableBufferingFunction,
59
+ SlowCancellableFunction,
60
+ SlowCancellableInOutFunction,
61
+ )
62
+ from vgi._test_fixtures.nest_tensor import NestTensorFunction, UnnestTensorFunction, UnnestTensorRowsFunction
63
+ from vgi._test_fixtures.scalar import (
64
+ AddValuesFunction,
65
+ AnyMixedIntFunction,
66
+ AnyMixedStrFunction,
67
+ BernoulliFunction,
68
+ BinaryPacketFunction,
69
+ ConcatValuesIntFunction,
70
+ ConcatValuesStrFunction,
71
+ ConditionalMessageFunction,
72
+ DoubleFunction,
73
+ FormatNumberDefaultFunction,
74
+ FormatNumberFullFunction,
75
+ FormatNumberPrecisionFunction,
76
+ GeoCentroidFixedFunction,
77
+ GeoCentroidListFunction,
78
+ GeoCentroidStructFunction,
79
+ GeoDistanceFixedFunction,
80
+ GeoDistanceListFunction,
81
+ GeoDistanceStructFunction,
82
+ HashSeedFunction,
83
+ MultiplyBySettingFunction,
84
+ MultiplyFunction,
85
+ NullHandlingFunction,
86
+ PairTypeIntIntFunction,
87
+ PairTypeIntStrFunction,
88
+ PairTypeStrStrFunction,
89
+ RandomBytesFunction,
90
+ RandomIntFunction,
91
+ ReturnSecretValueFunction,
92
+ SmartFormatPrefixFunction,
93
+ SmartFormatWidthFunction,
94
+ SumValuesFunction,
95
+ TypeInfoInt32Function,
96
+ TypeInfoInt64Function,
97
+ TypeInfoStringFunction,
98
+ TypeInfoUInt32Function,
99
+ TypeInfoUInt64Function,
100
+ UpperCaseFunction,
101
+ WhoAmIFunction,
102
+ )
103
+ from vgi._test_fixtures.table import (
104
+ _VERSIONED_CONSTRAINTS_SCHEMAS,
105
+ _VERSIONED_SCHEMAS,
106
+ RFF_MULTI_COLUMNS,
107
+ RFF_NESTED_COLUMNS,
108
+ RFF_NONE_COLUMNS,
109
+ RFF_ROWID_COLUMNS,
110
+ RFF_SIMPLE_COLUMNS,
111
+ RFF_STRUCT_COLUMNS,
112
+ BatchIndexOverflowFunction,
113
+ BrokenMissingPartitionValuesFunction,
114
+ BrokenPartitionColumnAbsentFromBatchFunction,
115
+ BrokenPartitionMinNeqMaxFunction,
116
+ BrokenPartitionValuesNoAnnotationFunction,
117
+ ColorsScanFunction,
118
+ ConstantColumnsFunction,
119
+ CountryPartitionedSalesFunction,
120
+ DepartmentsScanFunction,
121
+ DictFilterEchoFunction,
122
+ DisjointRangePartitionedFunction,
123
+ DoubleSequenceFunction,
124
+ DynamicFilterEchoFunction,
125
+ EmployeesScanFunction,
126
+ ExpressionFilterTestFunction,
127
+ FilterEchoFunction,
128
+ FilterEchoPartitionedFunction,
129
+ FilterEchoTableScanFunction,
130
+ GeneratorExceptionFunction,
131
+ LateMaterializationFunction,
132
+ LoggingGeneratorFunction,
133
+ MakePairsIntFunction,
134
+ MakePairsIntStrFunction,
135
+ MakePairsStrFunction,
136
+ MakeSeriesCountFunction,
137
+ MakeSeriesCsvFunction,
138
+ MakeSeriesFloatFunction,
139
+ MakeSeriesRangeFunction,
140
+ MakeSeriesStepFunction,
141
+ MissingBatchIndexTagFunction,
142
+ NamedParamsEchoFunction,
143
+ NestedSequenceFunction,
144
+ NonMonotoneBatchIndexFunction,
145
+ OrderEchoFunction,
146
+ PartitionedBatchIndexFunction,
147
+ PartitionedBatchIndexMarkedFunction,
148
+ PartitionedFixedOrderFunction,
149
+ PartitionedNoOrderGuaranteeFunction,
150
+ PartitionedPreservesOrderFunction,
151
+ PartitionedSequenceFunction,
152
+ PartitionedWithExplicitOverrideFunction,
153
+ ProductsScanFunction,
154
+ ProfilingDemoFunction,
155
+ ProjectedDataFunction,
156
+ ProjectsScanFunction,
157
+ RegionYearPartitionedFunction,
158
+ RepeatValueIntFunction,
159
+ RepeatValueStrFunction,
160
+ RffMultiScanFunction,
161
+ RffNestedScanFunction,
162
+ RffNoneScanFunction,
163
+ RffRowidScanFunction,
164
+ RffSimpleScanFunction,
165
+ RffStructScanFunction,
166
+ RowIdSequenceFunction,
167
+ SampleEchoFunction,
168
+ ScopedSecretDemoFunction,
169
+ SecretDemoFunction,
170
+ SequenceFunction,
171
+ SettingsAwareFunction,
172
+ SpatialFilterExampleFunction,
173
+ StructSettingsFunction,
174
+ TenThousandFunction,
175
+ TxCachedValueFunction,
176
+ ValuePruneFunction,
177
+ VersionedConstraintsScanFunction,
178
+ VersionedDataFunction,
179
+ resolve_version,
180
+ resolve_versioned_constraints_version,
181
+ )
182
+ from vgi._test_fixtures.table.tt_pushdown import (
183
+ _TT_SCHEMA,
184
+ TimeTravelPushdownFunction,
185
+ TtPushdownColsScanFunction,
186
+ resolve_tt_version,
187
+ )
188
+ from vgi._test_fixtures.table_in_out import (
189
+ BatchIndexBufferInputFunction,
190
+ BufferEmitWideFunction,
191
+ BufferInputFunction,
192
+ CrashOnCombineFunction,
193
+ CrashOnFinalizeFunction,
194
+ CrashOnProcessFunction,
195
+ EchoBufferingFunction,
196
+ EchoFunction,
197
+ EchoWitnessFunction,
198
+ ExceptionFinalizeFunction,
199
+ ExceptionProcessFunction,
200
+ FilterBySettingFunction,
201
+ HangOnProcessFunction,
202
+ LargeStateFunction,
203
+ OrderedBufferInputFunction,
204
+ OrderedSourceFunction,
205
+ RepeatInputsFunction,
206
+ SumAllColumnsFunction,
207
+ SumAllColumnsSimpleDistributed,
208
+ )
209
+ from vgi.arguments import Arguments
210
+ from vgi.catalog import (
211
+ AttachOpaqueData,
212
+ Catalog,
213
+ ForeignKeyDef,
214
+ Index,
215
+ IndexConstraintType,
216
+ Macro,
217
+ MacroType,
218
+ ReadOnlyCatalogInterface,
219
+ ScanBranch,
220
+ ScanBranchesResult,
221
+ ScanFunctionResult,
222
+ Schema,
223
+ SecretTypeSpec,
224
+ SerializedSchema,
225
+ Setting,
226
+ Table,
227
+ TableInfo,
228
+ TransactionOpaqueData,
229
+ View,
230
+ )
231
+ from vgi.catalog.catalog_interface import _validate_at_params
232
+ from vgi.catalog.descriptors import ColumnStatisticsInput
233
+ from vgi.catalog.duckdb_statistics import statistics_from_duckdb
234
+ from vgi.schema_utils import schema
235
+ from vgi.worker import Worker
236
+
237
+
238
+ # ---------------------------------------------------------------------------
239
+ # DuckDB-backed table: demonstrates statistics_from_duckdb() helper.
240
+ # Creates an in-memory table and extracts real statistics from it.
241
+ # ---------------------------------------------------------------------------
242
def _build_numbers_stats() -> dict[str, ColumnStatisticsInput]:
    """Extract statistics for the 'numbers' table (integers 0-99) from DuckDB.

    Demonstrates the ``statistics_from_duckdb()`` helper by creating the same
    data in a DuckDB in-memory table and pulling real statistics from it.

    Returns:
        The statistics mapping produced by ``statistics_from_duckdb()`` for
        the temporary ``numbers`` table (single ``value`` column).
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        conn.execute("CREATE TABLE numbers AS SELECT unnest(range(100)) AS value")
        return statistics_from_duckdb(conn, "numbers")
    finally:
        # This helper is invoked at module import; make sure the connection
        # is released even when table creation or stats extraction raises.
        conn.close()
255
+
256
+
257
+ _NUMBERS_STATS = _build_numbers_stats()
258
+
259
+
260
def _build_geo_stats() -> tuple[pa.Schema, dict[str, ColumnStatisticsInput]]:
    """Build a geometry table in DuckDB and extract spatial statistics.

    Creates a 5x5 grid of points (0,0) to (4,4) with an integer ID.
    Demonstrates geometry statistics via ``statistics_from_duckdb()``.

    Returns:
        A ``(schema, stats)`` pair: the Arrow schema of the ``geo_points``
        table as reported by DuckDB, and the statistics mapping produced by
        ``statistics_from_duckdb()`` for that table.
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        # INSTALL is a no-op when the extension is already cached; fresh
        # environments (CI runners) need the download before LOAD.
        conn.execute("INSTALL spatial")
        conn.execute("LOAD spatial")
        conn.execute(
            "CREATE TABLE geo_points AS "
            "SELECT row_number() OVER () AS id, "
            "ST_Point(x::DOUBLE, y::DOUBLE)::GEOMETRY AS geom "
            "FROM range(5) t1(x), range(5) t2(y)"
        )
        # LIMIT 0 fetches no rows; only the resulting Arrow schema is needed.
        # Named ``arrow_schema`` so it does not shadow the module-level
        # ``schema`` helper imported from vgi.schema_utils.
        arrow_schema = conn.execute("SELECT * FROM geo_points LIMIT 0").to_arrow_table().schema
        stats = statistics_from_duckdb(conn, "geo_points")
    finally:
        # Extension install/load can fail (e.g. no network); always release
        # the connection rather than leaking it on the exception path.
        conn.close()
    return arrow_schema, stats
283
+
284
+
285
+ _GEO_SCHEMA, _GEO_STATS = _build_geo_stats()
286
+
287
+
288
def _build_enum_stats() -> dict[str, ColumnStatisticsInput]:
    """Extract statistics for a table with ENUM (dictionary-encoded) columns.

    Demonstrates that ``statistics_from_duckdb()`` correctly unwraps
    dictionary-encoded min/max to actual string values rather than
    returning dictionary indices.

    Returns:
        The statistics mapping produced by ``statistics_from_duckdb()`` for
        the temporary ``colors`` table (``id``, ``color``, ``hex_code``).
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        conn.execute("CREATE TYPE color AS ENUM ('red', 'green', 'blue')")
        conn.execute(
            "CREATE TABLE colors AS "
            "SELECT unnest(range(3)) + 1 AS id, "
            "unnest(['red', 'green', 'blue'])::color AS color, "
            "unnest(['#FF0000', '#00FF00', '#0000FF']) AS hex_code"
        )
        return statistics_from_duckdb(conn, "colors")
    finally:
        # Release the connection even if type/table creation or statistics
        # extraction raises; this helper runs at module import.
        conn.close()
308
+
309
+
310
+ _ENUM_STATS = _build_enum_stats()
311
+
312
+ _EXAMPLE_CATALOG = Catalog(
313
+ name="example",
314
+ default_schema="main",
315
+ comment="Example VGI catalog for testing",
316
+ tags={"source": "vgi-fixture-worker", "version": "1"},
317
+ schemas=[
318
+ Schema(
319
+ name="main",
320
+ comment="Example functions for testing VGI",
321
+ functions=[
322
+ # TableInOutGenerator - transform input batches
323
+ EchoFunction,
324
+ EchoWitnessFunction,
325
+ BufferInputFunction,
326
+ FilterBySettingFunction,
327
+ RepeatInputsFunction,
328
+ SlowCancellableInOutFunction,
329
+ SumAllColumnsFunction,
330
+ SumAllColumnsSimpleDistributed,
331
+ UnnestTensorRowsFunction,
332
+ ExceptionFinalizeFunction,
333
+ ExceptionProcessFunction,
334
+ CrashOnProcessFunction,
335
+ CrashOnCombineFunction,
336
+ CrashOnFinalizeFunction,
337
+ HangOnProcessFunction,
338
+ LargeStateFunction,
339
+ OrderedBufferInputFunction,
340
+ OrderedSourceFunction,
341
+ BatchIndexBufferInputFunction,
342
+ EchoBufferingFunction,
343
+ BufferEmitWideFunction,
344
+ SlowCancellableBufferingFunction,
345
+ # TableFunctionGenerator - generate output without input
346
+ ConstantColumnsFunction,
347
+ SlowCancellableFunction,
348
+ FilterEchoFunction,
349
+ FilterEchoPartitionedFunction,
350
+ FilterEchoTableScanFunction,
351
+ ValuePruneFunction,
352
+ LateMaterializationFunction,
353
+ DictFilterEchoFunction,
354
+ DoubleSequenceFunction,
355
+ DynamicFilterEchoFunction,
356
+ GeneratorExceptionFunction,
357
+ LoggingGeneratorFunction,
358
+ MakeSeriesCountFunction,
359
+ MakeSeriesCsvFunction,
360
+ MakeSeriesFloatFunction,
361
+ MakeSeriesRangeFunction,
362
+ MakeSeriesStepFunction,
363
+ MakePairsIntFunction,
364
+ MakePairsIntStrFunction,
365
+ MakePairsStrFunction,
366
+ RepeatValueIntFunction,
367
+ RepeatValueStrFunction,
368
+ NamedParamsEchoFunction,
369
+ NestedSequenceFunction,
370
+ ProfilingDemoFunction,
371
+ OrderEchoFunction,
372
+ PartitionedBatchIndexFunction,
373
+ PartitionedBatchIndexMarkedFunction,
374
+ PartitionedFixedOrderFunction,
375
+ PartitionedNoOrderGuaranteeFunction,
376
+ PartitionedPreservesOrderFunction,
377
+ PartitionedSequenceFunction,
378
+ # PartitionColumns (Hive-style partitioning) reference fixtures
379
+ # — see vgi/_test_fixtures/table/partition_columns.py.
380
+ CountryPartitionedSalesFunction,
381
+ DisjointRangePartitionedFunction,
382
+ PartitionedWithExplicitOverrideFunction,
383
+ RegionYearPartitionedFunction,
384
+ # Deliberately-broken batch_index fixtures (see
385
+ # vgi/_test_fixtures/table/batch_index_broken.py). Registered
386
+ # so SQL integration tests in batch_index_contract.test can
387
+ # call them and assert the C++ extension's contract checks
388
+ # fire as typed IOExceptions.
389
+ BatchIndexOverflowFunction,
390
+ MissingBatchIndexTagFunction,
391
+ NonMonotoneBatchIndexFunction,
392
+ # Deliberately-broken PartitionColumns fixtures (see
393
+ # vgi/_test_fixtures/table/partition_columns_broken.py).
394
+ BrokenMissingPartitionValuesFunction,
395
+ BrokenPartitionColumnAbsentFromBatchFunction,
396
+ BrokenPartitionMinNeqMaxFunction,
397
+ BrokenPartitionValuesNoAnnotationFunction,
398
+ ProjectedDataFunction,
399
+ SampleEchoFunction,
400
+ RowIdSequenceFunction,
401
+ SecretDemoFunction,
402
+ ScopedSecretDemoFunction,
403
+ ExpressionFilterTestFunction,
404
+ SequenceFunction,
405
+ SettingsAwareFunction,
406
+ SpatialFilterExampleFunction,
407
+ StructSettingsFunction,
408
+ TenThousandFunction,
409
+ TxCachedValueFunction,
410
+ VersionedDataFunction,
411
+ # Time-travel + filter-pushdown fixtures (one function-backed, one
412
+ # columns-based) — back time_travel_pushdown.test.
413
+ TimeTravelPushdownFunction,
414
+ TtPushdownColsScanFunction,
415
+ # Static data scan functions for constraint-backed tables
416
+ ColorsScanFunction,
417
+ DepartmentsScanFunction,
418
+ EmployeesScanFunction,
419
+ ProductsScanFunction,
420
+ ProjectsScanFunction,
421
+ VersionedConstraintsScanFunction,
422
+ # rff_* scan functions back the Tables exercised by the
423
+ # vgi_required_filters_*.test sqllogictest matrix.
424
+ RffMultiScanFunction,
425
+ RffNestedScanFunction,
426
+ RffNoneScanFunction,
427
+ RffRowidScanFunction,
428
+ RffSimpleScanFunction,
429
+ RffStructScanFunction,
430
+ # ScalarFunctionGenerator - transform to single-column output
431
+ AddValuesFunction,
432
+ BernoulliFunction,
433
+ BinaryPacketFunction,
434
+ ConcatValuesIntFunction,
435
+ ConcatValuesStrFunction,
436
+ ConditionalMessageFunction,
437
+ DoubleFunction,
438
+ FormatNumberDefaultFunction,
439
+ FormatNumberFullFunction,
440
+ FormatNumberPrecisionFunction,
441
+ GeoCentroidFixedFunction,
442
+ GeoCentroidListFunction,
443
+ GeoCentroidStructFunction,
444
+ GeoDistanceFixedFunction,
445
+ GeoDistanceListFunction,
446
+ GeoDistanceStructFunction,
447
+ HashSeedFunction,
448
+ MultiplyBySettingFunction,
449
+ MultiplyFunction,
450
+ NullHandlingFunction,
451
+ PairTypeIntIntFunction,
452
+ PairTypeIntStrFunction,
453
+ PairTypeStrStrFunction,
454
+ RandomBytesFunction,
455
+ RandomIntFunction,
456
+ ReturnSecretValueFunction,
457
+ SmartFormatPrefixFunction,
458
+ SmartFormatWidthFunction,
459
+ SumValuesFunction,
460
+ TypeInfoInt32Function,
461
+ TypeInfoInt64Function,
462
+ TypeInfoStringFunction,
463
+ TypeInfoUInt32Function,
464
+ TypeInfoUInt64Function,
465
+ AnyMixedIntFunction,
466
+ AnyMixedStrFunction,
467
+ UnnestTensorFunction,
468
+ UpperCaseFunction,
469
+ WhoAmIFunction,
470
+ # AggregateFunction - aggregate input rows
471
+ AvgFunction,
472
+ CountFunction,
473
+ DynamicAggregateFunction,
474
+ DynamicMLAggregateFunction,
475
+ GenericSumFunction,
476
+ ListAggFunction,
477
+ NestTensorFunction,
478
+ PercentileFunction,
479
+ StreamingSumFunction,
480
+ SumAllFunction,
481
+ SumFunction,
482
+ WeightedSumFunction,
483
+ WindowListAggFunction,
484
+ WindowMedianFunction,
485
+ WindowSumBatchFunction,
486
+ WindowSumFunction,
487
+ ],
488
+ views=[
489
+ View(
490
+ name="first_ten",
491
+ definition="SELECT * FROM sequence(10)",
492
+ comment="First 10 integers",
493
+ column_comments={"n": "Sequence index 0..9"},
494
+ tags={"layer": "demo", "origin": "sequence"},
495
+ ),
496
+ View(
497
+ name="even_numbers",
498
+ definition="SELECT * FROM sequence(100) WHERE n % 2 = 0",
499
+ comment="Even numbers from 0 to 98",
500
+ ),
501
+ ],
502
+ macros=[
503
+ Macro(
504
+ name="vgi_multiply",
505
+ macro_type=MacroType.SCALAR,
506
+ parameters=["x", "y"],
507
+ definition="x * y",
508
+ comment="Multiply two values",
509
+ ),
510
+ Macro(
511
+ name="vgi_clamp",
512
+ macro_type=MacroType.SCALAR,
513
+ parameters=["val", "lo", "hi"],
514
+ parameter_default_values=pa.RecordBatch.from_pydict(
515
+ {"lo": [pa.scalar(0).as_py()], "hi": [pa.scalar(100).as_py()]},
516
+ schema=schema(lo=pa.int64(), hi=pa.int64()),
517
+ ),
518
+ definition="GREATEST(lo, LEAST(hi, val))",
519
+ comment="Clamp a value between lo and hi (defaults: 0..100)",
520
+ ),
521
+ Macro(
522
+ name="vgi_range_table",
523
+ macro_type=MacroType.TABLE,
524
+ parameters=["n"],
525
+ definition="SELECT * FROM range(n)",
526
+ comment="Table macro returning range of values",
527
+ ),
528
+ ],
529
+ ),
530
+ Schema(
531
+ name="data",
532
+ comment="Example tables backed by functions",
533
+ tables=[
534
+ # Function-backed table: schema derived via bind()
535
+ Table(
536
+ name="large_sequence",
537
+ function=SequenceFunction,
538
+ arguments=Arguments(positional=(pa.scalar(1_000_000),)),
539
+ statistics={
540
+ "n": ColumnStatisticsInput(min=0, max=999_999, has_null=False, distinct_count=1_000_000),
541
+ },
542
+ statistics_cache_max_age_seconds=3600,
543
+ comment="A large sequence of integers from 0 to 1,000,000",
544
+ ),
545
+ # Function-backed table with a no-arg function. Used by the
546
+ # ``inlined_scan_function.test`` integration test to verify
547
+ # the C++ extension reads the inlined ``scan_function`` from
548
+ # ``TableInfo`` and skips ``catalog_table_scan_function_get``.
549
+ Table(
550
+ name="ten_thousand_table",
551
+ function=TenThousandFunction,
552
+ comment="Function-backed table over the no-arg ten_thousand function",
553
+ ),
554
+ # Function-backed table with inlined cardinality. Used by the
555
+ # ``inlined_cardinality.test`` integration test to verify the
556
+ # C++ extension uses ``Table.cardinality_estimate`` /
557
+ # ``cardinality_max`` from ``TableInfo`` and skips the per-bind
558
+ # ``table_function_cardinality`` RPC.
559
+ Table(
560
+ name="cardinality_inlined_table",
561
+ function=TenThousandFunction,
562
+ cardinality_estimate=10000,
563
+ cardinality_max=10000,
564
+ comment="Function-backed table with inlined cardinality (10000 rows)",
565
+ ),
566
+ # Time-travel table: version-specific schema
567
+ Table(
568
+ name="versioned_data",
569
+ columns=schema(id=pa.int64(), score=pa.float64()),
570
+ supports_time_travel=True,
571
+ comment="Versioned data table demonstrating time travel with schema evolution",
572
+ ),
573
+ # Time travel + filter pushdown together. tt_pushdown_fn is
574
+ # function-backed (reads AT at init); tt_pushdown_cols is
575
+ # columns-based (AT → version arg via table_scan_function_get).
576
+ Table(
577
+ name="tt_pushdown_fn",
578
+ function=TimeTravelPushdownFunction,
579
+ supports_time_travel=True,
580
+ comment="Function-backed: prunes by filter AND time-travels (AT read at init).",
581
+ ),
582
+ Table(
583
+ name="tt_pushdown_cols",
584
+ columns=_TT_SCHEMA,
585
+ supports_time_travel=True,
586
+ comment="Columns-based: prunes by filter AND time-travels (AT → version arg).",
587
+ ),
588
+ # Explicit columns table with statistics extracted from DuckDB
589
+ # via statistics_from_duckdb() — demonstrates the helper workflow
590
+ Table(
591
+ name="numbers",
592
+ columns=schema(value=pa.int64()),
593
+ statistics=_NUMBERS_STATS,
594
+ statistics_cache_max_age_seconds=3600,
595
+ comment="First 100 integers (demonstrates explicit columns)",
596
+ ),
597
+ # Geometry table with spatial statistics from DuckDB
598
+ Table(
599
+ name="geo_points",
600
+ columns=_GEO_SCHEMA,
601
+ statistics=_GEO_STATS,
602
+ statistics_cache_max_age_seconds=3600,
603
+ comment="5x5 grid of points with spatial statistics",
604
+ ),
605
+ # Table with TTL=0 (never cache) for cache expiry testing
606
+ Table(
607
+ name="volatile_numbers",
608
+ columns=schema(value=pa.int64()),
609
+ statistics={
610
+ "value": ColumnStatisticsInput(min=0, max=99, has_null=False, distinct_count=100),
611
+ },
612
+ statistics_cache_max_age_seconds=0,
613
+ comment="Numbers with volatile stats (TTL=0, always re-fetched)",
614
+ ),
615
+ # Table with NO declared statistics — stats must come from the underlying
616
+ # scan function (SequenceFunction.statistics) via table_function_statistics RPC.
617
+ # Column name matches the function output ("n") so no rename is needed.
618
+ Table(
619
+ name="funny_numbers",
620
+ columns=schema(n=pa.int64()),
621
+ comment="123456 integers; stats served by the sequence function, not the table",
622
+ ),
623
+ # Multi-branch fixture — two ScanBranch entries both calling
624
+ # sequence() with different counts. SELECT count(*) should
625
+ # return 100 (50 + 50). Exercises VgiMultiScanRewriter end-to-end.
626
+ Table(
627
+ name="multi_branch_numbers",
628
+ columns=schema(n=pa.int64()),
629
+ comment="Multi-branch: UNION of sequence(50) + sequence(50) — used by multi_branch_scan.test",
630
+ ),
631
+ # Multi-branch with branch_filters that partition the value range.
632
+ # Branch A: sequence(100) with `n < 50`; branch B: sequence(100)
633
+ # with `n >= 50`. Non-overlapping; total rows = 100.
634
+ Table(
635
+ name="multi_branch_filtered_numbers",
636
+ columns=schema(n=pa.int64()),
637
+ comment="Multi-branch with complementary branch_filters — exercises pruning",
638
+ ),
639
+ # Heterogeneous multi-branch: one VGI arm + one native read_parquet
640
+ # arm. The parquet file is created by the test at a well-known path
641
+ # (see multi_branch_heterogeneous.test). Demonstrates that cold-tier
642
+ # data can come from any DuckDB function the worker names, without
643
+ # tunneling through the worker pipe.
644
+ Table(
645
+ name="multi_branch_hetero",
646
+ columns=schema(n=pa.int64()),
647
+ comment="Multi-branch: sequence(50) + read_parquet — used by multi_branch_heterogeneous.test",
648
+ ),
649
+ # Column reconciliation: 3 read_parquet branches, the test creates
650
+ # the parquet files with deliberately different column orders and
651
+ # a missing column on one branch. Canonical schema (a, b) is
652
+ # populated by name; missing columns NULL-fill.
653
+ Table(
654
+ name="multi_branch_recon",
655
+ columns=schema(a=pa.int64(), b=pa.int64()),
656
+ comment="Multi-branch: column reconciliation — used by multi_branch_reconciliation.test",
657
+ ),
658
+ # Pushdown-incapable arm test (E3): one VGI sequence() arm
659
+ # (filter_pushdown=True) + one read_csv arm (read_csv lacks
660
+ # native filter pushdown, so filters stay as LogicalFilter
661
+ # above the scan). Tests that the rewriter doesn't assume
662
+ # pushdown always succeeds.
663
+ Table(
664
+ name="multi_branch_nopushdown",
665
+ columns=schema(n=pa.int64()),
666
+ comment="Multi-branch: VGI + read_csv — used by multi_branch_pushdown_incapable.test",
667
+ ),
668
+ # Empty-branches loud-fail test (E6): worker returns
669
+ # branches=[] from table_scan_branches_get. The C++ side's
670
+ # ParseScanBranchesResult must reject this at the wire layer
671
+ # with a BinderException before any plan is built.
672
+ Table(
673
+ name="multi_branch_empty",
674
+ columns=schema(n=pa.int64()),
675
+ comment="Multi-branch: empty branches list — used by multi_branch_empty_branches.test",
676
+ ),
677
+ # Parse-time rejection — worker returns two ScanBranch
678
+ # entries both with writable=True. ParseScanBranchesResult
679
+ # must throw BinderException citing DuckDB's
680
+ # single-writable-catalog rule. See multi_branch_two_writable.test.
681
+ Table(
682
+ name="multi_branch_two_writable",
683
+ columns=schema(n=pa.int64()),
684
+ comment="Multi-branch with two writable=True arms — used by multi_branch_two_writable.test",
685
+ ),
686
+ # ENUM (dictionary-encoded) column table — tests that statistics
687
+ # report actual string values, not dictionary indices.
688
+ Table(
689
+ name="colors",
690
+ columns=schema(id=pa.int64(), color=pa.string(), hex_code=pa.string()),
691
+ statistics=_ENUM_STATS,
692
+ statistics_cache_max_age_seconds=3600,
693
+ comment="Colors table with ENUM-derived statistics",
694
+ ),
695
+ # Row ID position tests (int64 row_id)
696
+ Table(
697
+ name="rowid_first",
698
+ columns=schema(
699
+ row_id=(pa.int64(), {b"is_row_id": b""}),
700
+ name=pa.string(),
701
+ value=pa.string(),
702
+ ),
703
+ comment="Table with row_id at column index 0",
704
+ ),
705
+ Table(
706
+ name="rowid_middle",
707
+ columns=schema(
708
+ name=pa.string(),
709
+ row_id=(pa.int64(), {b"is_row_id": b""}),
710
+ value=pa.string(),
711
+ ),
712
+ comment="Table with row_id at column index 1",
713
+ ),
714
+ Table(
715
+ name="rowid_last",
716
+ columns=schema(
717
+ name=pa.string(),
718
+ value=pa.string(),
719
+ row_id=(pa.int64(), {b"is_row_id": b""}),
720
+ ),
721
+ comment="Table with row_id at column index 2",
722
+ ),
723
+ # Row ID type tests (row_id at index 0)
724
+ Table(
725
+ name="rowid_string",
726
+ columns=schema(
727
+ row_id=(pa.string(), {b"is_row_id": b""}),
728
+ value=pa.int64(),
729
+ ),
730
+ comment="Table with string row_id",
731
+ ),
732
+ Table(
733
+ name="rowid_struct",
734
+ columns=schema(
735
+ row_id=(
736
+ pa.struct([("a", pa.int64()), ("b", pa.string())]),
737
+ {b"is_row_id": b""},
738
+ ),
739
+ value=pa.string(),
740
+ ),
741
+ comment="Table with struct row_id",
742
+ ),
743
+ # ----- Late-materialization tables (rowid + scrambled ord) -----
744
+ # Backed by the late_materialization scan function, which
745
+ # advertises Meta.late_materialization. The row_id is the row
746
+ # index (unique/deterministic/snapshot-stable); ord is a
747
+ # scrambled function of the index so a Top-N on ord yields
748
+ # scattered survivor rowids. pushed echoes the rowid filter the
749
+ # worker received. See late_materialization.test.
750
+ Table(
751
+ name="late_mat",
752
+ columns=schema(
753
+ row_id=(pa.int64(), {b"is_row_id": b""}),
754
+ ord=pa.int64(),
755
+ payload=pa.string(),
756
+ pushed=pa.string(),
757
+ ),
758
+ comment="Late-materialization table (1000 rows, unique rowid)",
759
+ ),
760
+ Table(
761
+ name="late_mat_dup",
762
+ columns=schema(
763
+ row_id=(pa.int64(), {b"is_row_id": b""}),
764
+ ord=pa.int64(),
765
+ payload=pa.string(),
766
+ pushed=pa.string(),
767
+ ),
768
+ comment="Late-materialization table with deliberately non-unique rowid (contract violation)",
769
+ ),
770
+ Table(
771
+ name="late_mat_nulls",
772
+ columns=schema(
773
+ row_id=(pa.int64(), {b"is_row_id": b""}),
774
+ ord=pa.int64(),
775
+ payload=pa.string(),
776
+ pushed=pa.string(),
777
+ ),
778
+ comment="Late-materialization table with NULLs in the ord column",
779
+ ),
780
+ # ----- Generated column example table -----
781
+ Table(
782
+ name="generated_sequence",
783
+ columns=schema(n=pa.int64(), doubled=pa.int64(), label=pa.string()),
784
+ generated_columns={
785
+ "doubled": "n * 2",
786
+ "label": "'item_' || CAST(n AS VARCHAR)",
787
+ },
788
+ comment="Table with generated columns backed by sequence(10)",
789
+ ),
790
+ # ----- Constraint example tables -----
791
+ Table(
792
+ name="departments",
793
+ columns=schema(id=pa.int64(), name=pa.string(), budget=pa.float64()),
794
+ primary_key=(("id",),),
795
+ not_null=("id", "name"),
796
+ unique=(("name",),),
797
+ check=("budget >= 0",),
798
+ defaults={"budget": 0},
799
+ statistics={
800
+ "id": ColumnStatisticsInput(min=1, max=10, has_null=False, distinct_count=10),
801
+ "name": ColumnStatisticsInput(
802
+ min="Accounting",
803
+ max="Sales",
804
+ has_null=False,
805
+ distinct_count=10,
806
+ contains_unicode=False,
807
+ max_string_length=20,
808
+ ),
809
+ "budget": ColumnStatisticsInput(min=50000.0, max=500000.0, has_null=False, distinct_count=10),
810
+ },
811
+ statistics_cache_max_age_seconds=3600,
812
+ comment="Department reference table",
813
+ ),
814
+ Table(
815
+ name="products",
816
+ columns=schema(
817
+ id=pa.int64(),
818
+ name=pa.string(),
819
+ quantity=pa.int64(),
820
+ price=pa.float64(),
821
+ ),
822
+ not_null=("id",),
823
+ primary_key=(("id",),),
824
+ defaults={
825
+ "quantity": 0,
826
+ "name": "unknown",
827
+ "price": 9.99,
828
+ },
829
+ column_comments={
830
+ "id": "Unique product identifier",
831
+ "name": "Product display name",
832
+ "price": "Unit price in USD",
833
+ },
834
+ statistics={
835
+ "id": ColumnStatisticsInput(min=1, max=100, has_null=False, distinct_count=100),
836
+ "name": ColumnStatisticsInput(
837
+ min="Anvil",
838
+ max="Zebra Tape",
839
+ has_null=False,
840
+ distinct_count=100,
841
+ contains_unicode=False,
842
+ max_string_length=30,
843
+ ),
844
+ "quantity": ColumnStatisticsInput(min=0, max=10000, has_null=True, distinct_count=50),
845
+ "price": ColumnStatisticsInput(min=0.99, max=999.99, has_null=False, distinct_count=80),
846
+ },
847
+ statistics_cache_max_age_seconds=3600,
848
+ comment="Product table with column defaults",
849
+ ),
850
+ Table(
851
+ name="employees",
852
+ columns=schema(
853
+ id=pa.int64(),
854
+ name=pa.string(),
855
+ email=pa.string(),
856
+ department_id=pa.int64(),
857
+ ),
858
+ primary_key=(("id",),),
859
+ not_null=("id", "name", "email"),
860
+ unique=(("email",),),
861
+ foreign_key=(
862
+ ForeignKeyDef(
863
+ columns=("department_id",),
864
+ referenced_table="departments",
865
+ referenced_columns=("id",),
866
+ ),
867
+ ),
868
+ comment="Employee table with FK to departments",
869
+ ),
870
+ Table(
871
+ name="projects",
872
+ columns=schema(
873
+ department_id=pa.int64(),
874
+ project_code=pa.string(),
875
+ title=pa.string(),
876
+ ),
877
+ primary_key=(("department_id", "project_code"),),
878
+ not_null=("department_id", "project_code", "title"),
879
+ foreign_key=(
880
+ ForeignKeyDef(
881
+ columns=("department_id",),
882
+ referenced_table="departments",
883
+ referenced_columns=("id",),
884
+ ),
885
+ ),
886
+ comment="Projects with composite PK and FK to departments",
887
+ ),
888
+ # filter_echo_table — catalog table that echoes the pushed-down
889
+ # filters it received (pushed_filters column). Backs
890
+ # ~/Development/vgi/test/sql/integration/table/filter_pushdown_through_view.test,
891
+ # which characterizes filter pushdown directly and through a VIEW.
892
+ # The backing scan opts into expression-filter pushdown so a
893
+ # `LIKE 'prefix%'` predicate is observable here.
894
+ Table(
895
+ name="filter_echo_table",
896
+ columns=schema(n=pa.int64(), s=pa.utf8(), pushed_filters=pa.utf8()),
897
+ comment="Catalog table echoing pushed-down filters (filter-pushdown-through-view tests).",
898
+ ),
899
+ # ----- required_field_filter_paths fixtures -----
900
+ # Exercised by ~/Development/vgi/test/sql/vgi_required_filters_*.test
901
+ # to verify the C++ optimizer extension that enforces the new
902
+ # Table.required_field_filter_paths field.
903
+ Table(
904
+ name="rff_simple",
905
+ columns=RFF_SIMPLE_COLUMNS,
906
+ required_field_filter_paths=("a",),
907
+ comment="rff_simple — requires a filter referencing column 'a'.",
908
+ ),
909
+ Table(
910
+ name="rff_struct",
911
+ columns=RFF_STRUCT_COLUMNS,
912
+ required_field_filter_paths=("s.a", "s.b"),
913
+ comment="rff_struct — requires filters on both struct subfields s.a and s.b.",
914
+ ),
915
+ Table(
916
+ name="rff_nested",
917
+ columns=RFF_NESTED_COLUMNS,
918
+ required_field_filter_paths=("wrapper.mid.leaf",),
919
+ comment="rff_nested — requires a filter on the 3-deep nested path wrapper.mid.leaf.",
920
+ ),
921
+ Table(
922
+ name="rff_multi",
923
+ columns=RFF_MULTI_COLUMNS,
924
+ required_field_filter_paths=("top", "s.a"),
925
+ comment="rff_multi — mixed top-level + struct subfield requirements.",
926
+ ),
927
+ Table(
928
+ name="rff_none",
929
+ columns=RFF_NONE_COLUMNS,
930
+ comment="rff_none — control table with no required_field_filter_paths (opt-out fast path).",
931
+ ),
932
+ Table(
933
+ name="rff_rowid",
934
+ columns=RFF_ROWID_COLUMNS,
935
+ required_field_filter_paths=(
936
+ "bbox.xmin",
937
+ "bbox.xmax",
938
+ "bbox.ymin",
939
+ "bbox.ymax",
940
+ ),
941
+ comment="rff_rowid — row_id virtual column + required bbox.* filters.",
942
+ ),
943
+ # rff_parquet — native read_parquet delegation + required_field_filter_paths
944
+ # on a FLOAT bbox struct (mirrors Overture transportation.segment).
945
+ Table(
946
+ name="rff_parquet",
947
+ columns=pa.schema(
948
+ [
949
+ pa.field(
950
+ "bbox",
951
+ pa.struct(
952
+ [
953
+ pa.field("xmin", pa.float32()),
954
+ pa.field("ymin", pa.float32()),
955
+ pa.field("xmax", pa.float32()),
956
+ pa.field("ymax", pa.float32()),
957
+ ]
958
+ ),
959
+ ),
960
+ pa.field("other", pa.int64()),
961
+ ]
962
+ ),
963
+ required_field_filter_paths=(
964
+ "bbox.xmin",
965
+ "bbox.xmax",
966
+ "bbox.ymin",
967
+ "bbox.ymax",
968
+ ),
969
+ comment="rff_parquet — native read_parquet delegation with bbox.* required filters.",
970
+ ),
971
+ # rff_hive — native read_parquet over a Hive-partitioned glob
972
+ # (theme/type partition columns), bbox at a non-zero index —
973
+ # closely mirrors Overture transportation.segment.
974
+ Table(
975
+ name="rff_hive",
976
+ columns=pa.schema(
977
+ [
978
+ pa.field("id", pa.string()),
979
+ pa.field(
980
+ "bbox",
981
+ pa.struct(
982
+ [
983
+ pa.field("xmin", pa.float32()),
984
+ pa.field("ymin", pa.float32()),
985
+ pa.field("xmax", pa.float32()),
986
+ pa.field("ymax", pa.float32()),
987
+ ]
988
+ ),
989
+ ),
990
+ pa.field("name", pa.string()),
991
+ pa.field("num", pa.int64()),
992
+ pa.field("theme", pa.string()),
993
+ pa.field("type", pa.string()),
994
+ ]
995
+ ),
996
+ required_field_filter_paths=(
997
+ "bbox.xmin",
998
+ "bbox.xmax",
999
+ "bbox.ymin",
1000
+ "bbox.ymax",
1001
+ ),
1002
+ comment="rff_hive — native read_parquet over Hive glob with bbox.* required filters.",
1003
+ ),
1004
+ # rff_hive_mixed — same Hive layout as rff_hive but a MIXED
1005
+ # requirement: a top-level field ('id') plus the struct corners.
1006
+ # Exercises the flat-field branch of the path walker over native
1007
+ # delegation, where 'id' sits at a permuted column_ids slot.
1008
+ Table(
1009
+ name="rff_hive_mixed",
1010
+ columns=pa.schema(
1011
+ [
1012
+ pa.field("id", pa.string()),
1013
+ pa.field(
1014
+ "bbox",
1015
+ pa.struct(
1016
+ [
1017
+ pa.field("xmin", pa.float32()),
1018
+ pa.field("ymin", pa.float32()),
1019
+ pa.field("xmax", pa.float32()),
1020
+ pa.field("ymax", pa.float32()),
1021
+ ]
1022
+ ),
1023
+ ),
1024
+ pa.field("name", pa.string()),
1025
+ pa.field("num", pa.int64()),
1026
+ pa.field("theme", pa.string()),
1027
+ pa.field("type", pa.string()),
1028
+ ]
1029
+ ),
1030
+ required_field_filter_paths=(
1031
+ "id",
1032
+ "bbox.xmin",
1033
+ "bbox.xmax",
1034
+ "bbox.ymin",
1035
+ "bbox.ymax",
1036
+ ),
1037
+ comment="rff_hive_mixed — native read_parquet, top-level 'id' + bbox.* required filters.",
1038
+ ),
1039
+ # Time-travel constraint evolution table
1040
+ Table(
1041
+ name="versioned_constraints",
1042
+ columns=schema(
1043
+ id=pa.int64(),
1044
+ name=pa.string(),
1045
+ email=pa.string(),
1046
+ department_id=pa.int64(),
1047
+ ),
1048
+ supports_time_travel=True,
1049
+ not_null=("id", "name"),
1050
+ primary_key=(("id",),),
1051
+ unique=(("email",),),
1052
+ foreign_key=(
1053
+ ForeignKeyDef(
1054
+ columns=("department_id",),
1055
+ referenced_table="departments",
1056
+ referenced_columns=("id",),
1057
+ ),
1058
+ ),
1059
+ comment="Table with constraints that evolve across versions",
1060
+ ),
1061
+ ],
1062
+ views=[
1063
+ View(
1064
+ name="small_numbers",
1065
+ definition="SELECT * FROM numbers WHERE value < 10",
1066
+ comment="Numbers less than 10",
1067
+ column_comments={"value": "Single-digit value 0..9"},
1068
+ ),
1069
+ ],
1070
+ indexes=[
1071
+ Index(
1072
+ name="idx_numbers_value",
1073
+ table_name="numbers",
1074
+ expressions=("value",),
1075
+ comment="Index on numbers.value",
1076
+ ),
1077
+ Index(
1078
+ name="idx_numbers_value_unique",
1079
+ table_name="numbers",
1080
+ expressions=("value",),
1081
+ constraint_type=IndexConstraintType.UNIQUE,
1082
+ comment="Unique index on numbers.value",
1083
+ ),
1084
+ ],
1085
+ ),
1086
+ ],
1087
+ )
1088
+
1089
+
1090
+ class ExampleCatalog(ReadOnlyCatalogInterface):
1091
+ """Catalog interface for the example worker.
1092
+
1093
+ Defines table_get and table_scan_function_get for tables with explicit
1094
+ columns, including time-travel support for versioned_data.
1095
+
1096
+ """
1097
+
1098
+ catalog = _EXAMPLE_CATALOG
1099
+
1100
+ def table_get(
1101
+ self,
1102
+ *,
1103
+ attach_opaque_data: AttachOpaqueData,
1104
+ transaction_opaque_data: TransactionOpaqueData | None,
1105
+ schema_name: str,
1106
+ name: str,
1107
+ at_unit: str | None = None,
1108
+ at_value: str | None = None,
1109
+ ) -> TableInfo | None:
1110
+ """Return version-specific schema for time-travel tables."""
1111
+ _validate_at_params(at_unit, at_value)
1112
+ if schema_name.lower() == "data" and name.lower() == "versioned_data" and at_unit:
1113
+ version = resolve_version(at_unit, at_value)
1114
+ cols = _VERSIONED_SCHEMAS[version]
1115
+ return TableInfo(
1116
+ name=name,
1117
+ schema_name=schema_name,
1118
+ columns=SerializedSchema(cols.serialize().to_pybytes()),
1119
+ not_null_constraints=[],
1120
+ unique_constraints=[],
1121
+ check_constraints=[],
1122
+ comment="Versioned data table demonstrating time travel with schema evolution",
1123
+ tags={},
1124
+ )
1125
+ if schema_name.lower() == "data" and name.lower() == "versioned_constraints" and at_unit:
1126
+ version = resolve_versioned_constraints_version(at_unit, at_value)
1127
+ cols = _VERSIONED_CONSTRAINTS_SCHEMAS[version]
1128
+ # Constraints evolve with version:
1129
+ # V1: NOT NULL on id only
1130
+ # V2: NOT NULL on id+name, PK on id, UNIQUE on email
1131
+ # V3: NOT NULL on id+name, PK on id, UNIQUE on email, FK department_id→departments.id
1132
+ not_null: list[int] = []
1133
+ pk: list[list[int]] = []
1134
+ unique: list[list[int]] = []
1135
+ fk: list[bytes] = []
1136
+ col_names = [f.name for f in cols]
1137
+ if version >= 1:
1138
+ not_null.append(col_names.index("id"))
1139
+ if version >= 2:
1140
+ not_null.append(col_names.index("name"))
1141
+ pk.append([col_names.index("id")])
1142
+ unique.append([col_names.index("email")])
1143
+ if version >= 3:
1144
+ from vgi_rpc.utils import serialize_record_batch_bytes
1145
+
1146
+ fk_batch = pa.RecordBatch.from_pydict(
1147
+ {
1148
+ "fk_columns": [["department_id"]],
1149
+ "pk_columns": [["id"]],
1150
+ "referenced_table": ["departments"],
1151
+ "referenced_schema": [schema_name],
1152
+ },
1153
+ schema=pa.schema(
1154
+ [
1155
+ ("fk_columns", pa.list_(pa.utf8())),
1156
+ ("pk_columns", pa.list_(pa.utf8())),
1157
+ ("referenced_table", pa.utf8()),
1158
+ ("referenced_schema", pa.utf8()),
1159
+ ]
1160
+ ),
1161
+ )
1162
+ fk.append(serialize_record_batch_bytes(fk_batch))
1163
+ return TableInfo(
1164
+ name=name,
1165
+ schema_name=schema_name,
1166
+ columns=SerializedSchema(cols.serialize().to_pybytes()),
1167
+ not_null_constraints=not_null,
1168
+ unique_constraints=unique,
1169
+ check_constraints=[],
1170
+ primary_key_constraints=pk,
1171
+ foreign_key_constraints=fk,
1172
+ comment="Table with constraints that evolve across versions",
1173
+ tags={},
1174
+ )
1175
+ # Multi-branch tables: accept AT at table_get and pass it through to
1176
+ # the underlying handler with AT stripped. The C++ side's B2 guard
1177
+ # in VgiTableEntry::GetScanFunctionImpl detects branches.size() > 1
1178
+ # and throws BinderException before any scan-function-get RPC fires.
1179
+ # Returning TableInfo here lets the C++ binding flow proceed far enough
1180
+ # to hit that guard with the documented error message.
1181
+ if schema_name.lower() == "data" and name.lower() in ("multi_branch_numbers", "multi_branch_filtered_numbers"):
1182
+ return super().table_get(
1183
+ attach_opaque_data=attach_opaque_data,
1184
+ transaction_opaque_data=transaction_opaque_data,
1185
+ schema_name=schema_name,
1186
+ name=name,
1187
+ at_unit=None,
1188
+ at_value=None,
1189
+ )
1190
+ return super().table_get(
1191
+ attach_opaque_data=attach_opaque_data,
1192
+ transaction_opaque_data=transaction_opaque_data,
1193
+ schema_name=schema_name,
1194
+ name=name,
1195
+ at_unit=at_unit,
1196
+ at_value=at_value,
1197
+ )
1198
+
1199
+ def table_scan_branches_get(
1200
+ self,
1201
+ *,
1202
+ attach_opaque_data: AttachOpaqueData,
1203
+ transaction_opaque_data: TransactionOpaqueData | None,
1204
+ schema_name: str,
1205
+ name: str,
1206
+ at_unit: str | None,
1207
+ at_value: str | None,
1208
+ ) -> ScanBranchesResult:
1209
+ """Return multi-branch scan plans for the multi_branch_* test tables.
1210
+
1211
+ Falls through to the CatalogInterface default-impl shim for every
1212
+ other table, which wraps the legacy table_scan_function_get result
1213
+ as a one-branch list.
1214
+ """
1215
+ _validate_at_params(at_unit, at_value)
1216
+
1217
+ # multi_branch_numbers: two arms, each sequence(50). Union size = 100.
1218
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_numbers":
1219
+ return ScanBranchesResult(
1220
+ branches=[
1221
+ ScanBranch(
1222
+ function_name="sequence",
1223
+ positional_arguments=[pa.scalar(50)],
1224
+ named_arguments={},
1225
+ ),
1226
+ ScanBranch(
1227
+ function_name="sequence",
1228
+ positional_arguments=[pa.scalar(50)],
1229
+ named_arguments={},
1230
+ ),
1231
+ ],
1232
+ required_extensions=[],
1233
+ )
1234
+
1235
+ # multi_branch_filtered_numbers: two arms each sequence(100) with
1236
+ # complementary branch_filters carving the value range in half.
1237
+ # Total rows = 100 (50 from each arm after filtering).
1238
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_filtered_numbers":
1239
+ return ScanBranchesResult(
1240
+ branches=[
1241
+ ScanBranch(
1242
+ function_name="sequence",
1243
+ positional_arguments=[pa.scalar(100)],
1244
+ named_arguments={},
1245
+ branch_filter="n < 50",
1246
+ ),
1247
+ ScanBranch(
1248
+ function_name="sequence",
1249
+ positional_arguments=[pa.scalar(100)],
1250
+ named_arguments={},
1251
+ branch_filter="n >= 50",
1252
+ ),
1253
+ ],
1254
+ required_extensions=[],
1255
+ )
1256
+
1257
+ # multi_branch_hetero: one VGI arm (sequence(50)) + one native
1258
+ # read_parquet arm pointing at a well-known path the test creates
1259
+ # before querying. The parquet file has a single column "n" holding
1260
+ # values 50..99. Total rows = 100.
1261
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_hetero":
1262
+ return ScanBranchesResult(
1263
+ branches=[
1264
+ ScanBranch(
1265
+ function_name="sequence",
1266
+ positional_arguments=[pa.scalar(50)],
1267
+ named_arguments={},
1268
+ ),
1269
+ ScanBranch(
1270
+ function_name="read_parquet",
1271
+ positional_arguments=[pa.scalar("/tmp/vgi_hetero_branch.parquet", pa.string())],
1272
+ named_arguments={},
1273
+ ),
1274
+ ],
1275
+ required_extensions=[],
1276
+ )
1277
+
1278
+ # multi_branch_empty: worker deliberately returns branches=[] to
1279
+ # exercise the C++ side's BinderException loud-fail. ParseScanBranchesResult
1280
+ # must reject this at the wire layer.
1281
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_empty":
1282
+ return ScanBranchesResult(branches=[], required_extensions=[])
1283
+
1284
+ # multi_branch_two_writable: two ScanBranch entries both with
1285
+ # writable=True. ParseScanBranchesResult must reject loudly with
1286
+ # BinderException — DuckDB's single-writable-catalog-per-transaction
1287
+ # rule means at most one branch may be writable.
1288
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_two_writable":
1289
+ return ScanBranchesResult(
1290
+ branches=[
1291
+ ScanBranch(
1292
+ function_name="sequence",
1293
+ positional_arguments=[pa.scalar(10)],
1294
+ named_arguments={},
1295
+ writable=True,
1296
+ ),
1297
+ ScanBranch(
1298
+ function_name="sequence",
1299
+ positional_arguments=[pa.scalar(10)],
1300
+ named_arguments={},
1301
+ writable=True,
1302
+ ),
1303
+ ],
1304
+ required_extensions=[],
1305
+ )
1306
+
1307
+ # multi_branch_nopushdown: VGI sequence(50) + read_csv_auto. read_csv
1308
+ # has filter_pushdown=false in DuckDB, so any user WHERE clause stays
1309
+ # as a LogicalFilter above the csv arm — the rewriter must not assume
1310
+ # pushdown always succeeds.
1311
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_nopushdown":
1312
+ return ScanBranchesResult(
1313
+ branches=[
1314
+ ScanBranch(
1315
+ function_name="sequence",
1316
+ positional_arguments=[pa.scalar(50)],
1317
+ named_arguments={},
1318
+ ),
1319
+ ScanBranch(
1320
+ function_name="read_csv_auto",
1321
+ positional_arguments=[pa.scalar("/tmp/vgi_nopushdown_branch.csv", pa.string())],
1322
+ named_arguments={},
1323
+ ),
1324
+ ],
1325
+ required_extensions=[],
1326
+ )
1327
+
1328
+ # multi_branch_recon: three read_parquet branches with deliberately
1329
+ # mismatched column shapes — used to exercise column-reconciliation
1330
+ # by NAME with NULL-fill for missing canonicals. Canonical schema
1331
+ # is (a int64, b int64). The test creates the parquet files at the
1332
+ # paths below before querying.
1333
+ if schema_name.lower() == "data" and name.lower() == "multi_branch_recon":
1334
+ return ScanBranchesResult(
1335
+ branches=[
1336
+ ScanBranch(
1337
+ function_name="read_parquet",
1338
+ positional_arguments=[pa.scalar("/tmp/vgi_recon_a_b.parquet", pa.string())],
1339
+ named_arguments={},
1340
+ ),
1341
+ ScanBranch(
1342
+ function_name="read_parquet",
1343
+ positional_arguments=[pa.scalar("/tmp/vgi_recon_b_a.parquet", pa.string())],
1344
+ named_arguments={},
1345
+ ),
1346
+ ScanBranch(
1347
+ function_name="read_parquet",
1348
+ positional_arguments=[pa.scalar("/tmp/vgi_recon_a_only.parquet", pa.string())],
1349
+ named_arguments={},
1350
+ ),
1351
+ ],
1352
+ required_extensions=[],
1353
+ )
1354
+
1355
+ # Everything else: fall through to the default-impl shim (wraps
1356
+ # table_scan_function_get as a one-branch list).
1357
+ return super().table_scan_branches_get(
1358
+ attach_opaque_data=attach_opaque_data,
1359
+ transaction_opaque_data=transaction_opaque_data,
1360
+ schema_name=schema_name,
1361
+ name=name,
1362
+ at_unit=at_unit,
1363
+ at_value=at_value,
1364
+ )
1365
+
1366
+ # Column statistics are defined inline on each Table descriptor using
1367
+ # the `statistics` dict. ReadOnlyCatalogInterface auto-serves them —
1368
+ # no override of table_column_statistics_get() needed here.
1369
+
1370
+ def table_scan_function_get(
1371
+ self,
1372
+ *,
1373
+ attach_opaque_data: AttachOpaqueData,
1374
+ transaction_opaque_data: TransactionOpaqueData | None,
1375
+ schema_name: str,
1376
+ name: str,
1377
+ at_unit: str | None,
1378
+ at_value: str | None,
1379
+ ) -> ScanFunctionResult:
1380
+ """Return scan function for tables with explicit columns."""
1381
+ _validate_at_params(at_unit, at_value)
1382
+
1383
+ # Handle the "versioned_data" table with time travel
1384
+ if schema_name.lower() == "data" and name.lower() == "versioned_data":
1385
+ version = resolve_version(at_unit, at_value)
1386
+ return ScanFunctionResult(
1387
+ function_name="versioned_data_scan",
1388
+ positional_arguments=[pa.scalar(version)],
1389
+ named_arguments={},
1390
+ )
1391
+
1392
+ # Columns-based time-travel + pushdown: resolve AT → version and pass it
1393
+ # as a scan-function argument (the native columns-based AT mechanism).
1394
+ if schema_name.lower() == "data" and name.lower() == "tt_pushdown_cols":
1395
+ version = resolve_tt_version(at_unit, at_value)
1396
+ return ScanFunctionResult(
1397
+ function_name="tt_pushdown_cols_scan",
1398
+ positional_arguments=[pa.scalar(version)],
1399
+ named_arguments={},
1400
+ )
1401
+
1402
+ # Handle the versioned_constraints table with time travel
1403
+ if schema_name.lower() == "data" and name.lower() == "versioned_constraints":
1404
+ version = resolve_versioned_constraints_version(at_unit, at_value)
1405
+ return ScanFunctionResult(
1406
+ function_name="versioned_constraints_scan",
1407
+ positional_arguments=[pa.scalar(version)],
1408
+ named_arguments={},
1409
+ )
1410
+
1411
+ # rff_parquet — single-branch native read_parquet delegation.
1412
+ if schema_name.lower() == "data" and name.lower() == "rff_parquet":
1413
+ return ScanFunctionResult(
1414
+ function_name="read_parquet",
1415
+ positional_arguments=[pa.scalar("/tmp/rff_seg.parquet", pa.string())],
1416
+ named_arguments={},
1417
+ )
1418
+
1419
+ # rff_hive / rff_hive_mixed — native read_parquet over a Hive glob.
1420
+ if schema_name.lower() == "data" and name.lower() in ("rff_hive", "rff_hive_mixed"):
1421
+ return ScanFunctionResult(
1422
+ function_name="read_parquet",
1423
+ positional_arguments=[pa.scalar("/tmp/rff_hive/*/*/*.parquet", pa.string())],
1424
+ named_arguments={"hive_partitioning": pa.scalar(True)},
1425
+ )
1426
+
1427
+ # Reject AT clause on tables that don't support time travel
1428
+ if at_unit:
1429
+ raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")
1430
+
1431
+ # Handle the "generated_sequence" table (generated columns, backed by sequence)
1432
+ if schema_name.lower() == "data" and name.lower() == "generated_sequence":
1433
+ return ScanFunctionResult(
1434
+ function_name="sequence",
1435
+ positional_arguments=[pa.scalar(10)],
1436
+ named_arguments={},
1437
+ )
1438
+
1439
+ # Handle "numbers" and "volatile_numbers" — both use sequence(100)
1440
+ if schema_name.lower() == "data" and name.lower() in ("numbers", "volatile_numbers"):
1441
+ return ScanFunctionResult(
1442
+ function_name="sequence",
1443
+ positional_arguments=[pa.scalar(100)],
1444
+ named_arguments={},
1445
+ )
1446
+
1447
+ # funny_numbers — 123456 rows from sequence; statistics deliberately NOT set on
1448
+ # the table so SequenceFunction.statistics() provides them via table_function_statistics.
1449
+ if schema_name.lower() == "data" and name.lower() == "funny_numbers":
1450
+ return ScanFunctionResult(
1451
+ function_name="sequence",
1452
+ positional_arguments=[pa.scalar(123456)],
1453
+ named_arguments={},
1454
+ )
1455
+
1456
+ # Constraint example tables — simple static scan functions
1457
+ _static_scan_tables: dict[str, str] = {
1458
+ "colors": "colors_scan",
1459
+ "departments": "departments_scan",
1460
+ "employees": "employees_scan",
1461
+ "products": "products_scan",
1462
+ "projects": "projects_scan",
1463
+ # filter-pushdown-through-view fixture.
1464
+ "filter_echo_table": "filter_echo_table_scan",
1465
+ # rff_* — required_field_filter_paths fixtures.
1466
+ "rff_simple": "rff_simple_scan",
1467
+ "rff_struct": "rff_struct_scan",
1468
+ "rff_nested": "rff_nested_scan",
1469
+ "rff_multi": "rff_multi_scan",
1470
+ "rff_none": "rff_none_scan",
1471
+ "rff_rowid": "rff_rowid_scan",
1472
+ }
1473
+ if schema_name.lower() == "data" and name.lower() in _static_scan_tables:
1474
+ return ScanFunctionResult(
1475
+ function_name=_static_scan_tables[name.lower()],
1476
+ positional_arguments=[],
1477
+ named_arguments={},
1478
+ )
1479
+
1480
+ # Row ID test tables
1481
+ rowid_tables: dict[str, dict[str, str]] = {
1482
+ "rowid_first": {"layout": "first", "row_id_type": "int64"},
1483
+ "rowid_middle": {"layout": "middle", "row_id_type": "int64"},
1484
+ "rowid_last": {"layout": "last", "row_id_type": "int64"},
1485
+ "rowid_string": {"layout": "first", "row_id_type": "string"},
1486
+ "rowid_struct": {"layout": "first", "row_id_type": "struct"},
1487
+ }
1488
+ if schema_name.lower() == "data" and name.lower() in rowid_tables:
1489
+ opts = rowid_tables[name.lower()]
1490
+ return ScanFunctionResult(
1491
+ function_name="rowid_sequence",
1492
+ positional_arguments=[pa.scalar(20)],
1493
+ named_arguments={
1494
+ "layout": pa.scalar(opts["layout"]),
1495
+ "row_id_type": pa.scalar(opts["row_id_type"]),
1496
+ },
1497
+ )
1498
+
1499
+ # Late-materialization tables → late_materialization scan function.
1500
+ # 1000 rows is large enough that LIMIT k << count makes the rewrite a
1501
+ # real win and that LIMIT 200 exceeds dynamic_or_filter_threshold (50).
1502
+ late_mat_tables: dict[str, dict[str, Any]] = {
1503
+ "late_mat": {},
1504
+ "late_mat_dup": {"dup_row_id": pa.scalar(True)},
1505
+ "late_mat_nulls": {"null_ord_stride": pa.scalar(7)},
1506
+ }
1507
+ if schema_name.lower() == "data" and name.lower() in late_mat_tables:
1508
+ return ScanFunctionResult(
1509
+ function_name="late_materialization",
1510
+ positional_arguments=[pa.scalar(1000)],
1511
+ named_arguments=late_mat_tables[name.lower()],
1512
+ )
1513
+
1514
+ return super().table_scan_function_get(
1515
+ attach_opaque_data=attach_opaque_data,
1516
+ transaction_opaque_data=transaction_opaque_data,
1517
+ schema_name=schema_name,
1518
+ name=name,
1519
+ at_unit=at_unit,
1520
+ at_value=at_value,
1521
+ )
1522
+
1523
+ # --------- Transaction lifecycle ---------
1524
+ #
1525
+ # The example catalog has no transactional state of its own — these
1526
+ # methods exist solely so the C++ extension populates
1527
+ # ``BindRequest.transaction_opaque_data`` when SQL is wrapped in
1528
+ # ``BEGIN`` / ``COMMIT``. That id is what makes
1529
+ # ``BindParams.transaction_storage`` non-None, which lets
1530
+ # ``TxCachedValueFunction`` (and any user-written function) cache
1531
+ # per-transaction values via ``FunctionStorage.transaction_state_*``.
1532
+
1533
+ supports_transactions = True
1534
+
1535
+ def catalog_transaction_begin(self, *, attach_opaque_data: AttachOpaqueData) -> TransactionOpaqueData | None:
1536
+ """Allocate a fresh transaction_opaque_data; no catalog-side state to track."""
1537
+ del attach_opaque_data
1538
+ return TransactionOpaqueData(uuid.uuid4().bytes)
1539
+
1540
+ def catalog_transaction_commit(
1541
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
1542
+ ) -> None:
1543
+ """Clear per-transaction storage on commit (best-effort hygiene)."""
1544
+ del attach_opaque_data
1545
+ # transaction_opaque_data plays the role of scope_id in the unified
1546
+ # state_* API; execution_clear wipes every namespace for that scope.
1547
+ TxCachedValueFunction.storage.execution_clear(bytes(transaction_opaque_data))
1548
+
1549
+ def catalog_transaction_rollback(
1550
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
1551
+ ) -> None:
1552
+ """Mirror of commit — same cleanup path."""
1553
+ del attach_opaque_data
1554
+ TxCachedValueFunction.storage.execution_clear(bytes(transaction_opaque_data))
1555
+
1556
+
1557
class ExampleWorker(Worker):
    """Example worker with built-in test functions.

    This worker exposes all example functions via the ExampleCatalog interface,
    allowing clients to discover available functions via the "example" catalog.

    Settings exposed via catalog_attach:
    - vgi_verbose_mode: Enable verbose output (used by SettingsAwareFunction)
    - greeting: Custom greeting message (used by SettingsAwareFunction)
    - multiplier: Value multiplier (used by SettingsAwareFunction, MultiplyBySettingFunction)
    - threshold: Filter threshold (used by FilterBySettingFunction)
    - config: Sequence configuration struct (used by StructSettingsFunction)

    It also declares one secret type, ``vgi_example``, via ``secret_types``.
    """

    # Class that handles runtime catalog operations for this worker.
    catalog_interface = ExampleCatalog
    # catalog is set for introspection (worker page, tests) — runtime catalog
    # operations go through catalog_interface.
    catalog = _EXAMPLE_CATALOG

    class Settings:
        """Settings exposed via catalog_attach."""

        # Each setting pairs its value type with a Setting(desc=...) marker;
        # the assigned value is the default.
        vgi_verbose_mode: Annotated[bool, Setting(desc="Enable verbose output")] = False
        greeting: Annotated[str, Setting(desc="Custom greeting message")] = "Hello"
        multiplier: Annotated[int, Setting(desc="Value multiplier")] = 1
        threshold: Annotated[int, Setting(desc="Filter threshold")] = 0
        # Struct-typed setting described by an Arrow struct type (start,
        # step, label); it defaults to None.
        config: Annotated[  # type: ignore[valid-type]
            pa.struct([("start", pa.int64()), ("step", pa.int64()), ("label", pa.string())]),
            Setting(desc="Sequence configuration struct"),
        ] = None

    # Secret types declared by this worker. The two string fields carry
    # ``redact`` metadata — presumably so their values are hidden when the
    # secret is displayed; confirm against the consumer of this schema.
    secret_types = [
        SecretTypeSpec(
            name="vgi_example",
            description="Example VGI secret for testing",
            schema=pa.schema(
                [
                    pa.field("secret_string", pa.string(), metadata={"redact": "true"}),
                    pa.field("api_key", pa.string(), metadata={"redact": "true"}),
                    pa.field("port", pa.int32()),
                    pa.field("use_ssl", pa.bool_()),
                    pa.field("timeout", pa.float64()),
                ]  # type: ignore[arg-type]  # PyArrow field metadata typing limitation
            ),
        ),
    ]
1603
+
1604
+
1605
def main() -> None:
    """Serve every fixture catalog from a single worker process.

    The base ExampleWorker catalog and the ``projection_repro``,
    ``schema_reconcile``, and ``accumulate`` fixture catalogs are always
    served (all depend on the ``vgi[test-fixtures]`` extra). The writable
    catalog is added only when the ``vgi[test-fixtures-writable]`` extra
    is also installed.
    """
    from vgi._test_fixtures.accumulate.worker import AccumulateWorker
    from vgi._test_fixtures.projection_repro.worker import ProjReproWorker
    from vgi._test_fixtures.schema_reconcile.worker import SchemaReconcileWorker
    from vgi.meta_worker import MetaWorker

    # The writable fixture is optional: a failed import just means it is
    # not part of this installation.
    try:
        from vgi._test_fixtures.writable.worker import WritableWorker
    except ImportError:
        optional_workers: list[type] = []
    else:
        optional_workers = [WritableWorker]

    MetaWorker.serve(
        ExampleWorker,
        ProjReproWorker,
        SchemaReconcileWorker,
        AccumulateWorker,
        *optional_workers,
    )
1628
+
1629
+
1630
# Script entry point: start the fixture worker when this module is run directly.
if __name__ == "__main__":
    main()