vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,1631 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Example worker with built-in functions for testing.
|
|
4
|
+
|
|
5
|
+
This demonstrates how to create a worker by subclassing Worker
|
|
6
|
+
and listing function classes. Function names are derived from
|
|
7
|
+
each class's metadata (Meta.name or snake_case of class name).
|
|
8
|
+
|
|
9
|
+
The worker supports:
|
|
10
|
+
- TableInOutGenerator: Transforms input batches to output batches
|
|
11
|
+
- TableFunctionGenerator: Generates output batches without input
|
|
12
|
+
- ScalarFunctionGenerator: Transforms input to single-column output (1:1 rows)
|
|
13
|
+
|
|
14
|
+
Settings:
|
|
15
|
+
- vgi_verbose_mode: Enable verbose output with extra columns (bool, default: false)
|
|
16
|
+
- greeting: Custom greeting message (str, default: "Hello")
|
|
17
|
+
- multiplier: Value multiplier (int, default: 1)
|
|
18
|
+
- threshold: Filter threshold for filter_by_setting (int, default: 0)
|
|
19
|
+
- config: Sequence configuration struct for struct_settings (struct, default: None)
|
|
20
|
+
|
|
21
|
+
Usage:
|
|
22
|
+
vgi-fixture-worker
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
# Friendly error if numpy is missing. Several fixture modules below depend on
|
|
26
|
+
# numpy, which the `vgi-fixtures` distribution installs; surface a clear install
|
|
27
|
+
# message instead of a raw ImportError.
|
|
28
|
+
try:
|
|
29
|
+
import numpy # noqa: F401
|
|
30
|
+
except ImportError:
|
|
31
|
+
import sys as _sys
|
|
32
|
+
|
|
33
|
+
_sys.exit("vgi-fixture-worker requires numpy. Install it with: pip install 'vgi-python[test-fixtures]'")
|
|
34
|
+
|
|
35
|
+
import uuid
|
|
36
|
+
from typing import Annotated, Any
|
|
37
|
+
|
|
38
|
+
import pyarrow as pa
|
|
39
|
+
|
|
40
|
+
from vgi._test_fixtures.aggregate import (
|
|
41
|
+
AvgFunction,
|
|
42
|
+
CountFunction,
|
|
43
|
+
DynamicAggregateFunction,
|
|
44
|
+
DynamicMLAggregateFunction,
|
|
45
|
+
GenericSumFunction,
|
|
46
|
+
ListAggFunction,
|
|
47
|
+
PercentileFunction,
|
|
48
|
+
StreamingSumFunction,
|
|
49
|
+
SumAllFunction,
|
|
50
|
+
SumFunction,
|
|
51
|
+
WeightedSumFunction,
|
|
52
|
+
WindowListAggFunction,
|
|
53
|
+
WindowMedianFunction,
|
|
54
|
+
WindowSumBatchFunction,
|
|
55
|
+
WindowSumFunction,
|
|
56
|
+
)
|
|
57
|
+
from vgi._test_fixtures.cancellable import (
|
|
58
|
+
SlowCancellableBufferingFunction,
|
|
59
|
+
SlowCancellableFunction,
|
|
60
|
+
SlowCancellableInOutFunction,
|
|
61
|
+
)
|
|
62
|
+
from vgi._test_fixtures.nest_tensor import NestTensorFunction, UnnestTensorFunction, UnnestTensorRowsFunction
|
|
63
|
+
from vgi._test_fixtures.scalar import (
|
|
64
|
+
AddValuesFunction,
|
|
65
|
+
AnyMixedIntFunction,
|
|
66
|
+
AnyMixedStrFunction,
|
|
67
|
+
BernoulliFunction,
|
|
68
|
+
BinaryPacketFunction,
|
|
69
|
+
ConcatValuesIntFunction,
|
|
70
|
+
ConcatValuesStrFunction,
|
|
71
|
+
ConditionalMessageFunction,
|
|
72
|
+
DoubleFunction,
|
|
73
|
+
FormatNumberDefaultFunction,
|
|
74
|
+
FormatNumberFullFunction,
|
|
75
|
+
FormatNumberPrecisionFunction,
|
|
76
|
+
GeoCentroidFixedFunction,
|
|
77
|
+
GeoCentroidListFunction,
|
|
78
|
+
GeoCentroidStructFunction,
|
|
79
|
+
GeoDistanceFixedFunction,
|
|
80
|
+
GeoDistanceListFunction,
|
|
81
|
+
GeoDistanceStructFunction,
|
|
82
|
+
HashSeedFunction,
|
|
83
|
+
MultiplyBySettingFunction,
|
|
84
|
+
MultiplyFunction,
|
|
85
|
+
NullHandlingFunction,
|
|
86
|
+
PairTypeIntIntFunction,
|
|
87
|
+
PairTypeIntStrFunction,
|
|
88
|
+
PairTypeStrStrFunction,
|
|
89
|
+
RandomBytesFunction,
|
|
90
|
+
RandomIntFunction,
|
|
91
|
+
ReturnSecretValueFunction,
|
|
92
|
+
SmartFormatPrefixFunction,
|
|
93
|
+
SmartFormatWidthFunction,
|
|
94
|
+
SumValuesFunction,
|
|
95
|
+
TypeInfoInt32Function,
|
|
96
|
+
TypeInfoInt64Function,
|
|
97
|
+
TypeInfoStringFunction,
|
|
98
|
+
TypeInfoUInt32Function,
|
|
99
|
+
TypeInfoUInt64Function,
|
|
100
|
+
UpperCaseFunction,
|
|
101
|
+
WhoAmIFunction,
|
|
102
|
+
)
|
|
103
|
+
from vgi._test_fixtures.table import (
|
|
104
|
+
_VERSIONED_CONSTRAINTS_SCHEMAS,
|
|
105
|
+
_VERSIONED_SCHEMAS,
|
|
106
|
+
RFF_MULTI_COLUMNS,
|
|
107
|
+
RFF_NESTED_COLUMNS,
|
|
108
|
+
RFF_NONE_COLUMNS,
|
|
109
|
+
RFF_ROWID_COLUMNS,
|
|
110
|
+
RFF_SIMPLE_COLUMNS,
|
|
111
|
+
RFF_STRUCT_COLUMNS,
|
|
112
|
+
BatchIndexOverflowFunction,
|
|
113
|
+
BrokenMissingPartitionValuesFunction,
|
|
114
|
+
BrokenPartitionColumnAbsentFromBatchFunction,
|
|
115
|
+
BrokenPartitionMinNeqMaxFunction,
|
|
116
|
+
BrokenPartitionValuesNoAnnotationFunction,
|
|
117
|
+
ColorsScanFunction,
|
|
118
|
+
ConstantColumnsFunction,
|
|
119
|
+
CountryPartitionedSalesFunction,
|
|
120
|
+
DepartmentsScanFunction,
|
|
121
|
+
DictFilterEchoFunction,
|
|
122
|
+
DisjointRangePartitionedFunction,
|
|
123
|
+
DoubleSequenceFunction,
|
|
124
|
+
DynamicFilterEchoFunction,
|
|
125
|
+
EmployeesScanFunction,
|
|
126
|
+
ExpressionFilterTestFunction,
|
|
127
|
+
FilterEchoFunction,
|
|
128
|
+
FilterEchoPartitionedFunction,
|
|
129
|
+
FilterEchoTableScanFunction,
|
|
130
|
+
GeneratorExceptionFunction,
|
|
131
|
+
LateMaterializationFunction,
|
|
132
|
+
LoggingGeneratorFunction,
|
|
133
|
+
MakePairsIntFunction,
|
|
134
|
+
MakePairsIntStrFunction,
|
|
135
|
+
MakePairsStrFunction,
|
|
136
|
+
MakeSeriesCountFunction,
|
|
137
|
+
MakeSeriesCsvFunction,
|
|
138
|
+
MakeSeriesFloatFunction,
|
|
139
|
+
MakeSeriesRangeFunction,
|
|
140
|
+
MakeSeriesStepFunction,
|
|
141
|
+
MissingBatchIndexTagFunction,
|
|
142
|
+
NamedParamsEchoFunction,
|
|
143
|
+
NestedSequenceFunction,
|
|
144
|
+
NonMonotoneBatchIndexFunction,
|
|
145
|
+
OrderEchoFunction,
|
|
146
|
+
PartitionedBatchIndexFunction,
|
|
147
|
+
PartitionedBatchIndexMarkedFunction,
|
|
148
|
+
PartitionedFixedOrderFunction,
|
|
149
|
+
PartitionedNoOrderGuaranteeFunction,
|
|
150
|
+
PartitionedPreservesOrderFunction,
|
|
151
|
+
PartitionedSequenceFunction,
|
|
152
|
+
PartitionedWithExplicitOverrideFunction,
|
|
153
|
+
ProductsScanFunction,
|
|
154
|
+
ProfilingDemoFunction,
|
|
155
|
+
ProjectedDataFunction,
|
|
156
|
+
ProjectsScanFunction,
|
|
157
|
+
RegionYearPartitionedFunction,
|
|
158
|
+
RepeatValueIntFunction,
|
|
159
|
+
RepeatValueStrFunction,
|
|
160
|
+
RffMultiScanFunction,
|
|
161
|
+
RffNestedScanFunction,
|
|
162
|
+
RffNoneScanFunction,
|
|
163
|
+
RffRowidScanFunction,
|
|
164
|
+
RffSimpleScanFunction,
|
|
165
|
+
RffStructScanFunction,
|
|
166
|
+
RowIdSequenceFunction,
|
|
167
|
+
SampleEchoFunction,
|
|
168
|
+
ScopedSecretDemoFunction,
|
|
169
|
+
SecretDemoFunction,
|
|
170
|
+
SequenceFunction,
|
|
171
|
+
SettingsAwareFunction,
|
|
172
|
+
SpatialFilterExampleFunction,
|
|
173
|
+
StructSettingsFunction,
|
|
174
|
+
TenThousandFunction,
|
|
175
|
+
TxCachedValueFunction,
|
|
176
|
+
ValuePruneFunction,
|
|
177
|
+
VersionedConstraintsScanFunction,
|
|
178
|
+
VersionedDataFunction,
|
|
179
|
+
resolve_version,
|
|
180
|
+
resolve_versioned_constraints_version,
|
|
181
|
+
)
|
|
182
|
+
from vgi._test_fixtures.table.tt_pushdown import (
|
|
183
|
+
_TT_SCHEMA,
|
|
184
|
+
TimeTravelPushdownFunction,
|
|
185
|
+
TtPushdownColsScanFunction,
|
|
186
|
+
resolve_tt_version,
|
|
187
|
+
)
|
|
188
|
+
from vgi._test_fixtures.table_in_out import (
|
|
189
|
+
BatchIndexBufferInputFunction,
|
|
190
|
+
BufferEmitWideFunction,
|
|
191
|
+
BufferInputFunction,
|
|
192
|
+
CrashOnCombineFunction,
|
|
193
|
+
CrashOnFinalizeFunction,
|
|
194
|
+
CrashOnProcessFunction,
|
|
195
|
+
EchoBufferingFunction,
|
|
196
|
+
EchoFunction,
|
|
197
|
+
EchoWitnessFunction,
|
|
198
|
+
ExceptionFinalizeFunction,
|
|
199
|
+
ExceptionProcessFunction,
|
|
200
|
+
FilterBySettingFunction,
|
|
201
|
+
HangOnProcessFunction,
|
|
202
|
+
LargeStateFunction,
|
|
203
|
+
OrderedBufferInputFunction,
|
|
204
|
+
OrderedSourceFunction,
|
|
205
|
+
RepeatInputsFunction,
|
|
206
|
+
SumAllColumnsFunction,
|
|
207
|
+
SumAllColumnsSimpleDistributed,
|
|
208
|
+
)
|
|
209
|
+
from vgi.arguments import Arguments
|
|
210
|
+
from vgi.catalog import (
|
|
211
|
+
AttachOpaqueData,
|
|
212
|
+
Catalog,
|
|
213
|
+
ForeignKeyDef,
|
|
214
|
+
Index,
|
|
215
|
+
IndexConstraintType,
|
|
216
|
+
Macro,
|
|
217
|
+
MacroType,
|
|
218
|
+
ReadOnlyCatalogInterface,
|
|
219
|
+
ScanBranch,
|
|
220
|
+
ScanBranchesResult,
|
|
221
|
+
ScanFunctionResult,
|
|
222
|
+
Schema,
|
|
223
|
+
SecretTypeSpec,
|
|
224
|
+
SerializedSchema,
|
|
225
|
+
Setting,
|
|
226
|
+
Table,
|
|
227
|
+
TableInfo,
|
|
228
|
+
TransactionOpaqueData,
|
|
229
|
+
View,
|
|
230
|
+
)
|
|
231
|
+
from vgi.catalog.catalog_interface import _validate_at_params
|
|
232
|
+
from vgi.catalog.descriptors import ColumnStatisticsInput
|
|
233
|
+
from vgi.catalog.duckdb_statistics import statistics_from_duckdb
|
|
234
|
+
from vgi.schema_utils import schema
|
|
235
|
+
from vgi.worker import Worker
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
# ---------------------------------------------------------------------------
|
|
239
|
+
# DuckDB-backed table: demonstrates statistics_from_duckdb() helper.
|
|
240
|
+
# Creates an in-memory table and extracts real statistics from it.
|
|
241
|
+
# ---------------------------------------------------------------------------
|
|
242
|
+
def _build_numbers_stats() -> dict[str, ColumnStatisticsInput]:
    """Extract statistics for the 'numbers' table (integers 0-99) from DuckDB.

    Demonstrates the ``statistics_from_duckdb()`` helper by creating the same
    data in a DuckDB in-memory table and pulling real statistics from it.

    Returns:
        Whatever ``statistics_from_duckdb()`` reports for the temporary
        ``numbers`` table (a single ``value`` column).
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        conn.execute("CREATE TABLE numbers AS SELECT unnest(range(100)) AS value")
        return statistics_from_duckdb(conn, "numbers")
    finally:
        # Runs on both the success and the error path so the connection is
        # never leaked when table creation or statistics extraction raises.
        conn.close()
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
_NUMBERS_STATS = _build_numbers_stats()
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _build_geo_stats() -> tuple[pa.Schema, dict[str, ColumnStatisticsInput]]:
    """Build a geometry table in DuckDB and extract spatial statistics.

    Creates a 5x5 grid of points (0,0) to (4,4) with an integer ID.
    Demonstrates geometry statistics via ``statistics_from_duckdb()``.

    Returns:
        A ``(schema, stats)`` pair: the Arrow schema of the temporary
        ``geo_points`` table (read from an empty ``LIMIT 0`` result) and the
        statistics ``statistics_from_duckdb()`` reports for that table.
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        # INSTALL is a no-op when the extension is already cached; fresh
        # environments (CI runners) need the download before LOAD.
        conn.execute("INSTALL spatial")
        conn.execute("LOAD spatial")
        conn.execute(
            "CREATE TABLE geo_points AS "
            "SELECT row_number() OVER () AS id, "
            "ST_Point(x::DOUBLE, y::DOUBLE)::GEOMETRY AS geom "
            "FROM range(5) t1(x), range(5) t2(y)"
        )
        # Named ``arrow_schema`` so it does not shadow the module-level
        # ``schema`` helper imported from ``vgi.schema_utils``.
        arrow_schema = conn.execute("SELECT * FROM geo_points LIMIT 0").to_arrow_table().schema
        stats = statistics_from_duckdb(conn, "geo_points")
    finally:
        # Release the connection even if the extension install/load or any
        # query above fails (e.g. no network access for INSTALL).
        conn.close()
    return arrow_schema, stats
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
_GEO_SCHEMA, _GEO_STATS = _build_geo_stats()
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _build_enum_stats() -> dict[str, ColumnStatisticsInput]:
    """Extract statistics for a table with ENUM (dictionary-encoded) columns.

    Demonstrates that ``statistics_from_duckdb()`` correctly unwraps
    dictionary-encoded min/max to actual string values rather than
    returning dictionary indices.

    Returns:
        Whatever ``statistics_from_duckdb()`` reports for the temporary
        ``colors`` table (columns ``id``, ``color`` and ``hex_code``).
    """
    from vgi._duckdb import connect as engine_connect

    conn = engine_connect()
    try:
        conn.execute("CREATE TYPE color AS ENUM ('red', 'green', 'blue')")
        conn.execute(
            "CREATE TABLE colors AS "
            "SELECT unnest(range(3)) + 1 AS id, "
            "unnest(['red', 'green', 'blue'])::color AS color, "
            "unnest(['#FF0000', '#00FF00', '#0000FF']) AS hex_code"
        )
        return statistics_from_duckdb(conn, "colors")
    finally:
        # Runs on both the success and the error path so the connection is
        # never leaked when a statement or the statistics helper raises.
        conn.close()
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
_ENUM_STATS = _build_enum_stats()
|
|
311
|
+
|
|
312
|
+
_EXAMPLE_CATALOG = Catalog(
|
|
313
|
+
name="example",
|
|
314
|
+
default_schema="main",
|
|
315
|
+
comment="Example VGI catalog for testing",
|
|
316
|
+
tags={"source": "vgi-fixture-worker", "version": "1"},
|
|
317
|
+
schemas=[
|
|
318
|
+
Schema(
|
|
319
|
+
name="main",
|
|
320
|
+
comment="Example functions for testing VGI",
|
|
321
|
+
functions=[
|
|
322
|
+
# TableInOutGenerator - transform input batches
|
|
323
|
+
EchoFunction,
|
|
324
|
+
EchoWitnessFunction,
|
|
325
|
+
BufferInputFunction,
|
|
326
|
+
FilterBySettingFunction,
|
|
327
|
+
RepeatInputsFunction,
|
|
328
|
+
SlowCancellableInOutFunction,
|
|
329
|
+
SumAllColumnsFunction,
|
|
330
|
+
SumAllColumnsSimpleDistributed,
|
|
331
|
+
UnnestTensorRowsFunction,
|
|
332
|
+
ExceptionFinalizeFunction,
|
|
333
|
+
ExceptionProcessFunction,
|
|
334
|
+
CrashOnProcessFunction,
|
|
335
|
+
CrashOnCombineFunction,
|
|
336
|
+
CrashOnFinalizeFunction,
|
|
337
|
+
HangOnProcessFunction,
|
|
338
|
+
LargeStateFunction,
|
|
339
|
+
OrderedBufferInputFunction,
|
|
340
|
+
OrderedSourceFunction,
|
|
341
|
+
BatchIndexBufferInputFunction,
|
|
342
|
+
EchoBufferingFunction,
|
|
343
|
+
BufferEmitWideFunction,
|
|
344
|
+
SlowCancellableBufferingFunction,
|
|
345
|
+
# TableFunctionGenerator - generate output without input
|
|
346
|
+
ConstantColumnsFunction,
|
|
347
|
+
SlowCancellableFunction,
|
|
348
|
+
FilterEchoFunction,
|
|
349
|
+
FilterEchoPartitionedFunction,
|
|
350
|
+
FilterEchoTableScanFunction,
|
|
351
|
+
ValuePruneFunction,
|
|
352
|
+
LateMaterializationFunction,
|
|
353
|
+
DictFilterEchoFunction,
|
|
354
|
+
DoubleSequenceFunction,
|
|
355
|
+
DynamicFilterEchoFunction,
|
|
356
|
+
GeneratorExceptionFunction,
|
|
357
|
+
LoggingGeneratorFunction,
|
|
358
|
+
MakeSeriesCountFunction,
|
|
359
|
+
MakeSeriesCsvFunction,
|
|
360
|
+
MakeSeriesFloatFunction,
|
|
361
|
+
MakeSeriesRangeFunction,
|
|
362
|
+
MakeSeriesStepFunction,
|
|
363
|
+
MakePairsIntFunction,
|
|
364
|
+
MakePairsIntStrFunction,
|
|
365
|
+
MakePairsStrFunction,
|
|
366
|
+
RepeatValueIntFunction,
|
|
367
|
+
RepeatValueStrFunction,
|
|
368
|
+
NamedParamsEchoFunction,
|
|
369
|
+
NestedSequenceFunction,
|
|
370
|
+
ProfilingDemoFunction,
|
|
371
|
+
OrderEchoFunction,
|
|
372
|
+
PartitionedBatchIndexFunction,
|
|
373
|
+
PartitionedBatchIndexMarkedFunction,
|
|
374
|
+
PartitionedFixedOrderFunction,
|
|
375
|
+
PartitionedNoOrderGuaranteeFunction,
|
|
376
|
+
PartitionedPreservesOrderFunction,
|
|
377
|
+
PartitionedSequenceFunction,
|
|
378
|
+
# PartitionColumns (Hive-style partitioning) reference fixtures
|
|
379
|
+
# — see vgi/_test_fixtures/table/partition_columns.py.
|
|
380
|
+
CountryPartitionedSalesFunction,
|
|
381
|
+
DisjointRangePartitionedFunction,
|
|
382
|
+
PartitionedWithExplicitOverrideFunction,
|
|
383
|
+
RegionYearPartitionedFunction,
|
|
384
|
+
# Deliberately-broken batch_index fixtures (see
|
|
385
|
+
# vgi/_test_fixtures/table/batch_index_broken.py). Registered
|
|
386
|
+
# so SQL integration tests in batch_index_contract.test can
|
|
387
|
+
# call them and assert the C++ extension's contract checks
|
|
388
|
+
# fire as typed IOExceptions.
|
|
389
|
+
BatchIndexOverflowFunction,
|
|
390
|
+
MissingBatchIndexTagFunction,
|
|
391
|
+
NonMonotoneBatchIndexFunction,
|
|
392
|
+
# Deliberately-broken PartitionColumns fixtures (see
|
|
393
|
+
# vgi/_test_fixtures/table/partition_columns_broken.py).
|
|
394
|
+
BrokenMissingPartitionValuesFunction,
|
|
395
|
+
BrokenPartitionColumnAbsentFromBatchFunction,
|
|
396
|
+
BrokenPartitionMinNeqMaxFunction,
|
|
397
|
+
BrokenPartitionValuesNoAnnotationFunction,
|
|
398
|
+
ProjectedDataFunction,
|
|
399
|
+
SampleEchoFunction,
|
|
400
|
+
RowIdSequenceFunction,
|
|
401
|
+
SecretDemoFunction,
|
|
402
|
+
ScopedSecretDemoFunction,
|
|
403
|
+
ExpressionFilterTestFunction,
|
|
404
|
+
SequenceFunction,
|
|
405
|
+
SettingsAwareFunction,
|
|
406
|
+
SpatialFilterExampleFunction,
|
|
407
|
+
StructSettingsFunction,
|
|
408
|
+
TenThousandFunction,
|
|
409
|
+
TxCachedValueFunction,
|
|
410
|
+
VersionedDataFunction,
|
|
411
|
+
# Time-travel + filter-pushdown fixtures (one function-backed, one
|
|
412
|
+
# columns-based) — back time_travel_pushdown.test.
|
|
413
|
+
TimeTravelPushdownFunction,
|
|
414
|
+
TtPushdownColsScanFunction,
|
|
415
|
+
# Static data scan functions for constraint-backed tables
|
|
416
|
+
ColorsScanFunction,
|
|
417
|
+
DepartmentsScanFunction,
|
|
418
|
+
EmployeesScanFunction,
|
|
419
|
+
ProductsScanFunction,
|
|
420
|
+
ProjectsScanFunction,
|
|
421
|
+
VersionedConstraintsScanFunction,
|
|
422
|
+
# rff_* scan functions back the Tables exercised by the
|
|
423
|
+
# vgi_required_filters_*.test sqllogictest matrix.
|
|
424
|
+
RffMultiScanFunction,
|
|
425
|
+
RffNestedScanFunction,
|
|
426
|
+
RffNoneScanFunction,
|
|
427
|
+
RffRowidScanFunction,
|
|
428
|
+
RffSimpleScanFunction,
|
|
429
|
+
RffStructScanFunction,
|
|
430
|
+
# ScalarFunctionGenerator - transform to single-column output
|
|
431
|
+
AddValuesFunction,
|
|
432
|
+
BernoulliFunction,
|
|
433
|
+
BinaryPacketFunction,
|
|
434
|
+
ConcatValuesIntFunction,
|
|
435
|
+
ConcatValuesStrFunction,
|
|
436
|
+
ConditionalMessageFunction,
|
|
437
|
+
DoubleFunction,
|
|
438
|
+
FormatNumberDefaultFunction,
|
|
439
|
+
FormatNumberFullFunction,
|
|
440
|
+
FormatNumberPrecisionFunction,
|
|
441
|
+
GeoCentroidFixedFunction,
|
|
442
|
+
GeoCentroidListFunction,
|
|
443
|
+
GeoCentroidStructFunction,
|
|
444
|
+
GeoDistanceFixedFunction,
|
|
445
|
+
GeoDistanceListFunction,
|
|
446
|
+
GeoDistanceStructFunction,
|
|
447
|
+
HashSeedFunction,
|
|
448
|
+
MultiplyBySettingFunction,
|
|
449
|
+
MultiplyFunction,
|
|
450
|
+
NullHandlingFunction,
|
|
451
|
+
PairTypeIntIntFunction,
|
|
452
|
+
PairTypeIntStrFunction,
|
|
453
|
+
PairTypeStrStrFunction,
|
|
454
|
+
RandomBytesFunction,
|
|
455
|
+
RandomIntFunction,
|
|
456
|
+
ReturnSecretValueFunction,
|
|
457
|
+
SmartFormatPrefixFunction,
|
|
458
|
+
SmartFormatWidthFunction,
|
|
459
|
+
SumValuesFunction,
|
|
460
|
+
TypeInfoInt32Function,
|
|
461
|
+
TypeInfoInt64Function,
|
|
462
|
+
TypeInfoStringFunction,
|
|
463
|
+
TypeInfoUInt32Function,
|
|
464
|
+
TypeInfoUInt64Function,
|
|
465
|
+
AnyMixedIntFunction,
|
|
466
|
+
AnyMixedStrFunction,
|
|
467
|
+
UnnestTensorFunction,
|
|
468
|
+
UpperCaseFunction,
|
|
469
|
+
WhoAmIFunction,
|
|
470
|
+
# AggregateFunction - aggregate input rows
|
|
471
|
+
AvgFunction,
|
|
472
|
+
CountFunction,
|
|
473
|
+
DynamicAggregateFunction,
|
|
474
|
+
DynamicMLAggregateFunction,
|
|
475
|
+
GenericSumFunction,
|
|
476
|
+
ListAggFunction,
|
|
477
|
+
NestTensorFunction,
|
|
478
|
+
PercentileFunction,
|
|
479
|
+
StreamingSumFunction,
|
|
480
|
+
SumAllFunction,
|
|
481
|
+
SumFunction,
|
|
482
|
+
WeightedSumFunction,
|
|
483
|
+
WindowListAggFunction,
|
|
484
|
+
WindowMedianFunction,
|
|
485
|
+
WindowSumBatchFunction,
|
|
486
|
+
WindowSumFunction,
|
|
487
|
+
],
|
|
488
|
+
views=[
|
|
489
|
+
View(
|
|
490
|
+
name="first_ten",
|
|
491
|
+
definition="SELECT * FROM sequence(10)",
|
|
492
|
+
comment="First 10 integers",
|
|
493
|
+
column_comments={"n": "Sequence index 0..9"},
|
|
494
|
+
tags={"layer": "demo", "origin": "sequence"},
|
|
495
|
+
),
|
|
496
|
+
View(
|
|
497
|
+
name="even_numbers",
|
|
498
|
+
definition="SELECT * FROM sequence(100) WHERE n % 2 = 0",
|
|
499
|
+
comment="Even numbers from 0 to 98",
|
|
500
|
+
),
|
|
501
|
+
],
|
|
502
|
+
macros=[
|
|
503
|
+
Macro(
|
|
504
|
+
name="vgi_multiply",
|
|
505
|
+
macro_type=MacroType.SCALAR,
|
|
506
|
+
parameters=["x", "y"],
|
|
507
|
+
definition="x * y",
|
|
508
|
+
comment="Multiply two values",
|
|
509
|
+
),
|
|
510
|
+
Macro(
|
|
511
|
+
name="vgi_clamp",
|
|
512
|
+
macro_type=MacroType.SCALAR,
|
|
513
|
+
parameters=["val", "lo", "hi"],
|
|
514
|
+
parameter_default_values=pa.RecordBatch.from_pydict(
|
|
515
|
+
{"lo": [pa.scalar(0).as_py()], "hi": [pa.scalar(100).as_py()]},
|
|
516
|
+
schema=schema(lo=pa.int64(), hi=pa.int64()),
|
|
517
|
+
),
|
|
518
|
+
definition="GREATEST(lo, LEAST(hi, val))",
|
|
519
|
+
comment="Clamp a value between lo and hi (defaults: 0..100)",
|
|
520
|
+
),
|
|
521
|
+
Macro(
|
|
522
|
+
name="vgi_range_table",
|
|
523
|
+
macro_type=MacroType.TABLE,
|
|
524
|
+
parameters=["n"],
|
|
525
|
+
definition="SELECT * FROM range(n)",
|
|
526
|
+
comment="Table macro returning range of values",
|
|
527
|
+
),
|
|
528
|
+
],
|
|
529
|
+
),
|
|
530
|
+
Schema(
|
|
531
|
+
name="data",
|
|
532
|
+
comment="Example tables backed by functions",
|
|
533
|
+
tables=[
|
|
534
|
+
# Function-backed table: schema derived via bind()
|
|
535
|
+
Table(
|
|
536
|
+
name="large_sequence",
|
|
537
|
+
function=SequenceFunction,
|
|
538
|
+
arguments=Arguments(positional=(pa.scalar(1_000_000),)),
|
|
539
|
+
statistics={
|
|
540
|
+
"n": ColumnStatisticsInput(min=0, max=999_999, has_null=False, distinct_count=1_000_000),
|
|
541
|
+
},
|
|
542
|
+
statistics_cache_max_age_seconds=3600,
|
|
543
|
+
comment="A large sequence of integers from 0 to 1,000,000",
|
|
544
|
+
),
|
|
545
|
+
# Function-backed table with a no-arg function. Used by the
|
|
546
|
+
# ``inlined_scan_function.test`` integration test to verify
|
|
547
|
+
# the C++ extension reads the inlined ``scan_function`` from
|
|
548
|
+
# ``TableInfo`` and skips ``catalog_table_scan_function_get``.
|
|
549
|
+
Table(
|
|
550
|
+
name="ten_thousand_table",
|
|
551
|
+
function=TenThousandFunction,
|
|
552
|
+
comment="Function-backed table over the no-arg ten_thousand function",
|
|
553
|
+
),
|
|
554
|
+
# Function-backed table with inlined cardinality. Used by the
|
|
555
|
+
# ``inlined_cardinality.test`` integration test to verify the
|
|
556
|
+
# C++ extension uses ``Table.cardinality_estimate`` /
|
|
557
|
+
# ``cardinality_max`` from ``TableInfo`` and skips the per-bind
|
|
558
|
+
# ``table_function_cardinality`` RPC.
|
|
559
|
+
Table(
|
|
560
|
+
name="cardinality_inlined_table",
|
|
561
|
+
function=TenThousandFunction,
|
|
562
|
+
cardinality_estimate=10000,
|
|
563
|
+
cardinality_max=10000,
|
|
564
|
+
comment="Function-backed table with inlined cardinality (10000 rows)",
|
|
565
|
+
),
|
|
566
|
+
# Time-travel table: version-specific schema
|
|
567
|
+
Table(
|
|
568
|
+
name="versioned_data",
|
|
569
|
+
columns=schema(id=pa.int64(), score=pa.float64()),
|
|
570
|
+
supports_time_travel=True,
|
|
571
|
+
comment="Versioned data table demonstrating time travel with schema evolution",
|
|
572
|
+
),
|
|
573
|
+
# Time travel + filter pushdown together. tt_pushdown_fn is
|
|
574
|
+
# function-backed (reads AT at init); tt_pushdown_cols is
|
|
575
|
+
# columns-based (AT → version arg via table_scan_function_get).
|
|
576
|
+
Table(
|
|
577
|
+
name="tt_pushdown_fn",
|
|
578
|
+
function=TimeTravelPushdownFunction,
|
|
579
|
+
supports_time_travel=True,
|
|
580
|
+
comment="Function-backed: prunes by filter AND time-travels (AT read at init).",
|
|
581
|
+
),
|
|
582
|
+
Table(
|
|
583
|
+
name="tt_pushdown_cols",
|
|
584
|
+
columns=_TT_SCHEMA,
|
|
585
|
+
supports_time_travel=True,
|
|
586
|
+
comment="Columns-based: prunes by filter AND time-travels (AT → version arg).",
|
|
587
|
+
),
|
|
588
|
+
# Explicit columns table with statistics extracted from DuckDB
|
|
589
|
+
# via statistics_from_duckdb() — demonstrates the helper workflow
|
|
590
|
+
Table(
|
|
591
|
+
name="numbers",
|
|
592
|
+
columns=schema(value=pa.int64()),
|
|
593
|
+
statistics=_NUMBERS_STATS,
|
|
594
|
+
statistics_cache_max_age_seconds=3600,
|
|
595
|
+
comment="First 100 integers (demonstrates explicit columns)",
|
|
596
|
+
),
|
|
597
|
+
# Geometry table with spatial statistics from DuckDB
|
|
598
|
+
Table(
|
|
599
|
+
name="geo_points",
|
|
600
|
+
columns=_GEO_SCHEMA,
|
|
601
|
+
statistics=_GEO_STATS,
|
|
602
|
+
statistics_cache_max_age_seconds=3600,
|
|
603
|
+
comment="5x5 grid of points with spatial statistics",
|
|
604
|
+
),
|
|
605
|
+
# Table with TTL=0 (never cache) for cache expiry testing
|
|
606
|
+
Table(
|
|
607
|
+
name="volatile_numbers",
|
|
608
|
+
columns=schema(value=pa.int64()),
|
|
609
|
+
statistics={
|
|
610
|
+
"value": ColumnStatisticsInput(min=0, max=99, has_null=False, distinct_count=100),
|
|
611
|
+
},
|
|
612
|
+
statistics_cache_max_age_seconds=0,
|
|
613
|
+
comment="Numbers with volatile stats (TTL=0, always re-fetched)",
|
|
614
|
+
),
|
|
615
|
+
# Table with NO declared statistics — stats must come from the underlying
|
|
616
|
+
# scan function (SequenceFunction.statistics) via table_function_statistics RPC.
|
|
617
|
+
# Column name matches the function output ("n") so no rename is needed.
|
|
618
|
+
Table(
|
|
619
|
+
name="funny_numbers",
|
|
620
|
+
columns=schema(n=pa.int64()),
|
|
621
|
+
comment="123456 integers; stats served by the sequence function, not the table",
|
|
622
|
+
),
|
|
623
|
+
# Multi-branch fixture — two ScanBranch entries both calling
|
|
624
|
+
# sequence() with different counts. SELECT count(*) should
|
|
625
|
+
# return 100 (50 + 50). Exercises VgiMultiScanRewriter end-to-end.
|
|
626
|
+
Table(
|
|
627
|
+
name="multi_branch_numbers",
|
|
628
|
+
columns=schema(n=pa.int64()),
|
|
629
|
+
comment="Multi-branch: UNION of sequence(50) + sequence(50) — used by multi_branch_scan.test",
|
|
630
|
+
),
|
|
631
|
+
# Multi-branch with branch_filters that partition the value range.
|
|
632
|
+
# Branch A: sequence(100) with `n < 50`; branch B: sequence(100)
|
|
633
|
+
# with `n >= 50`. Non-overlapping; total rows = 100.
|
|
634
|
+
Table(
|
|
635
|
+
name="multi_branch_filtered_numbers",
|
|
636
|
+
columns=schema(n=pa.int64()),
|
|
637
|
+
comment="Multi-branch with complementary branch_filters — exercises pruning",
|
|
638
|
+
),
|
|
639
|
+
# Heterogeneous multi-branch: one VGI arm + one native read_parquet
|
|
640
|
+
# arm. The parquet file is created by the test at a well-known path
|
|
641
|
+
# (see multi_branch_heterogeneous.test). Demonstrates that cold-tier
|
|
642
|
+
# data can come from any DuckDB function the worker names, without
|
|
643
|
+
# tunneling through the worker pipe.
|
|
644
|
+
Table(
|
|
645
|
+
name="multi_branch_hetero",
|
|
646
|
+
columns=schema(n=pa.int64()),
|
|
647
|
+
comment="Multi-branch: sequence(50) + read_parquet — used by multi_branch_heterogeneous.test",
|
|
648
|
+
),
|
|
649
|
+
# Column reconciliation: 3 read_parquet branches, the test creates
|
|
650
|
+
# the parquet files with deliberately different column orders and
|
|
651
|
+
# a missing column on one branch. Canonical schema (a, b) is
|
|
652
|
+
# populated by name; missing columns NULL-fill.
|
|
653
|
+
Table(
|
|
654
|
+
name="multi_branch_recon",
|
|
655
|
+
columns=schema(a=pa.int64(), b=pa.int64()),
|
|
656
|
+
comment="Multi-branch: column reconciliation — used by multi_branch_reconciliation.test",
|
|
657
|
+
),
|
|
658
|
+
# Pushdown-incapable arm test (E3): one VGI sequence() arm
|
|
659
|
+
# (filter_pushdown=True) + one read_csv arm (read_csv lacks
|
|
660
|
+
# native filter pushdown, so filters stay as LogicalFilter
|
|
661
|
+
# above the scan). Tests that the rewriter doesn't assume
|
|
662
|
+
# pushdown always succeeds.
|
|
663
|
+
Table(
|
|
664
|
+
name="multi_branch_nopushdown",
|
|
665
|
+
columns=schema(n=pa.int64()),
|
|
666
|
+
comment="Multi-branch: VGI + read_csv — used by multi_branch_pushdown_incapable.test",
|
|
667
|
+
),
|
|
668
|
+
# Empty-branches loud-fail test (E6): worker returns
|
|
669
|
+
# branches=[] from table_scan_branches_get. The C++ side's
|
|
670
|
+
# ParseScanBranchesResult must reject this at the wire layer
|
|
671
|
+
# with a BinderException before any plan is built.
|
|
672
|
+
Table(
|
|
673
|
+
name="multi_branch_empty",
|
|
674
|
+
columns=schema(n=pa.int64()),
|
|
675
|
+
comment="Multi-branch: empty branches list — used by multi_branch_empty_branches.test",
|
|
676
|
+
),
|
|
677
|
+
# Parse-time rejection — worker returns two ScanBranch
|
|
678
|
+
# entries both with writable=True. ParseScanBranchesResult
|
|
679
|
+
# must throw BinderException citing DuckDB's
|
|
680
|
+
# single-writable-catalog rule. See multi_branch_two_writable.test.
|
|
681
|
+
Table(
|
|
682
|
+
name="multi_branch_two_writable",
|
|
683
|
+
columns=schema(n=pa.int64()),
|
|
684
|
+
comment="Multi-branch with two writable=True arms — used by multi_branch_two_writable.test",
|
|
685
|
+
),
|
|
686
|
+
# ENUM (dictionary-encoded) column table — tests that statistics
|
|
687
|
+
# report actual string values, not dictionary indices.
|
|
688
|
+
Table(
|
|
689
|
+
name="colors",
|
|
690
|
+
columns=schema(id=pa.int64(), color=pa.string(), hex_code=pa.string()),
|
|
691
|
+
statistics=_ENUM_STATS,
|
|
692
|
+
statistics_cache_max_age_seconds=3600,
|
|
693
|
+
comment="Colors table with ENUM-derived statistics",
|
|
694
|
+
),
|
|
695
|
+
# Row ID position tests (int64 row_id)
|
|
696
|
+
Table(
|
|
697
|
+
name="rowid_first",
|
|
698
|
+
columns=schema(
|
|
699
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
700
|
+
name=pa.string(),
|
|
701
|
+
value=pa.string(),
|
|
702
|
+
),
|
|
703
|
+
comment="Table with row_id at column index 0",
|
|
704
|
+
),
|
|
705
|
+
Table(
|
|
706
|
+
name="rowid_middle",
|
|
707
|
+
columns=schema(
|
|
708
|
+
name=pa.string(),
|
|
709
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
710
|
+
value=pa.string(),
|
|
711
|
+
),
|
|
712
|
+
comment="Table with row_id at column index 1",
|
|
713
|
+
),
|
|
714
|
+
Table(
|
|
715
|
+
name="rowid_last",
|
|
716
|
+
columns=schema(
|
|
717
|
+
name=pa.string(),
|
|
718
|
+
value=pa.string(),
|
|
719
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
720
|
+
),
|
|
721
|
+
comment="Table with row_id at column index 2",
|
|
722
|
+
),
|
|
723
|
+
# Row ID type tests (row_id at index 0)
|
|
724
|
+
Table(
|
|
725
|
+
name="rowid_string",
|
|
726
|
+
columns=schema(
|
|
727
|
+
row_id=(pa.string(), {b"is_row_id": b""}),
|
|
728
|
+
value=pa.int64(),
|
|
729
|
+
),
|
|
730
|
+
comment="Table with string row_id",
|
|
731
|
+
),
|
|
732
|
+
Table(
|
|
733
|
+
name="rowid_struct",
|
|
734
|
+
columns=schema(
|
|
735
|
+
row_id=(
|
|
736
|
+
pa.struct([("a", pa.int64()), ("b", pa.string())]),
|
|
737
|
+
{b"is_row_id": b""},
|
|
738
|
+
),
|
|
739
|
+
value=pa.string(),
|
|
740
|
+
),
|
|
741
|
+
comment="Table with struct row_id",
|
|
742
|
+
),
|
|
743
|
+
# ----- Late-materialization tables (rowid + scrambled ord) -----
|
|
744
|
+
# Backed by the late_materialization scan function, which
|
|
745
|
+
# advertises Meta.late_materialization. The row_id is the row
|
|
746
|
+
# index (unique/deterministic/snapshot-stable); ord is a
|
|
747
|
+
# scrambled function of the index so a Top-N on ord yields
|
|
748
|
+
# scattered survivor rowids. pushed echoes the rowid filter the
|
|
749
|
+
# worker received. See late_materialization.test.
|
|
750
|
+
Table(
|
|
751
|
+
name="late_mat",
|
|
752
|
+
columns=schema(
|
|
753
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
754
|
+
ord=pa.int64(),
|
|
755
|
+
payload=pa.string(),
|
|
756
|
+
pushed=pa.string(),
|
|
757
|
+
),
|
|
758
|
+
comment="Late-materialization table (1000 rows, unique rowid)",
|
|
759
|
+
),
|
|
760
|
+
Table(
|
|
761
|
+
name="late_mat_dup",
|
|
762
|
+
columns=schema(
|
|
763
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
764
|
+
ord=pa.int64(),
|
|
765
|
+
payload=pa.string(),
|
|
766
|
+
pushed=pa.string(),
|
|
767
|
+
),
|
|
768
|
+
comment="Late-materialization table with deliberately non-unique rowid (contract violation)",
|
|
769
|
+
),
|
|
770
|
+
Table(
|
|
771
|
+
name="late_mat_nulls",
|
|
772
|
+
columns=schema(
|
|
773
|
+
row_id=(pa.int64(), {b"is_row_id": b""}),
|
|
774
|
+
ord=pa.int64(),
|
|
775
|
+
payload=pa.string(),
|
|
776
|
+
pushed=pa.string(),
|
|
777
|
+
),
|
|
778
|
+
comment="Late-materialization table with NULLs in the ord column",
|
|
779
|
+
),
|
|
780
|
+
# ----- Generated column example table -----
|
|
781
|
+
Table(
|
|
782
|
+
name="generated_sequence",
|
|
783
|
+
columns=schema(n=pa.int64(), doubled=pa.int64(), label=pa.string()),
|
|
784
|
+
generated_columns={
|
|
785
|
+
"doubled": "n * 2",
|
|
786
|
+
"label": "'item_' || CAST(n AS VARCHAR)",
|
|
787
|
+
},
|
|
788
|
+
comment="Table with generated columns backed by sequence(10)",
|
|
789
|
+
),
|
|
790
|
+
# ----- Constraint example tables -----
|
|
791
|
+
Table(
|
|
792
|
+
name="departments",
|
|
793
|
+
columns=schema(id=pa.int64(), name=pa.string(), budget=pa.float64()),
|
|
794
|
+
primary_key=(("id",),),
|
|
795
|
+
not_null=("id", "name"),
|
|
796
|
+
unique=(("name",),),
|
|
797
|
+
check=("budget >= 0",),
|
|
798
|
+
defaults={"budget": 0},
|
|
799
|
+
statistics={
|
|
800
|
+
"id": ColumnStatisticsInput(min=1, max=10, has_null=False, distinct_count=10),
|
|
801
|
+
"name": ColumnStatisticsInput(
|
|
802
|
+
min="Accounting",
|
|
803
|
+
max="Sales",
|
|
804
|
+
has_null=False,
|
|
805
|
+
distinct_count=10,
|
|
806
|
+
contains_unicode=False,
|
|
807
|
+
max_string_length=20,
|
|
808
|
+
),
|
|
809
|
+
"budget": ColumnStatisticsInput(min=50000.0, max=500000.0, has_null=False, distinct_count=10),
|
|
810
|
+
},
|
|
811
|
+
statistics_cache_max_age_seconds=3600,
|
|
812
|
+
comment="Department reference table",
|
|
813
|
+
),
|
|
814
|
+
Table(
|
|
815
|
+
name="products",
|
|
816
|
+
columns=schema(
|
|
817
|
+
id=pa.int64(),
|
|
818
|
+
name=pa.string(),
|
|
819
|
+
quantity=pa.int64(),
|
|
820
|
+
price=pa.float64(),
|
|
821
|
+
),
|
|
822
|
+
not_null=("id",),
|
|
823
|
+
primary_key=(("id",),),
|
|
824
|
+
defaults={
|
|
825
|
+
"quantity": 0,
|
|
826
|
+
"name": "unknown",
|
|
827
|
+
"price": 9.99,
|
|
828
|
+
},
|
|
829
|
+
column_comments={
|
|
830
|
+
"id": "Unique product identifier",
|
|
831
|
+
"name": "Product display name",
|
|
832
|
+
"price": "Unit price in USD",
|
|
833
|
+
},
|
|
834
|
+
statistics={
|
|
835
|
+
"id": ColumnStatisticsInput(min=1, max=100, has_null=False, distinct_count=100),
|
|
836
|
+
"name": ColumnStatisticsInput(
|
|
837
|
+
min="Anvil",
|
|
838
|
+
max="Zebra Tape",
|
|
839
|
+
has_null=False,
|
|
840
|
+
distinct_count=100,
|
|
841
|
+
contains_unicode=False,
|
|
842
|
+
max_string_length=30,
|
|
843
|
+
),
|
|
844
|
+
"quantity": ColumnStatisticsInput(min=0, max=10000, has_null=True, distinct_count=50),
|
|
845
|
+
"price": ColumnStatisticsInput(min=0.99, max=999.99, has_null=False, distinct_count=80),
|
|
846
|
+
},
|
|
847
|
+
statistics_cache_max_age_seconds=3600,
|
|
848
|
+
comment="Product table with column defaults",
|
|
849
|
+
),
|
|
850
|
+
Table(
|
|
851
|
+
name="employees",
|
|
852
|
+
columns=schema(
|
|
853
|
+
id=pa.int64(),
|
|
854
|
+
name=pa.string(),
|
|
855
|
+
email=pa.string(),
|
|
856
|
+
department_id=pa.int64(),
|
|
857
|
+
),
|
|
858
|
+
primary_key=(("id",),),
|
|
859
|
+
not_null=("id", "name", "email"),
|
|
860
|
+
unique=(("email",),),
|
|
861
|
+
foreign_key=(
|
|
862
|
+
ForeignKeyDef(
|
|
863
|
+
columns=("department_id",),
|
|
864
|
+
referenced_table="departments",
|
|
865
|
+
referenced_columns=("id",),
|
|
866
|
+
),
|
|
867
|
+
),
|
|
868
|
+
comment="Employee table with FK to departments",
|
|
869
|
+
),
|
|
870
|
+
Table(
|
|
871
|
+
name="projects",
|
|
872
|
+
columns=schema(
|
|
873
|
+
department_id=pa.int64(),
|
|
874
|
+
project_code=pa.string(),
|
|
875
|
+
title=pa.string(),
|
|
876
|
+
),
|
|
877
|
+
primary_key=(("department_id", "project_code"),),
|
|
878
|
+
not_null=("department_id", "project_code", "title"),
|
|
879
|
+
foreign_key=(
|
|
880
|
+
ForeignKeyDef(
|
|
881
|
+
columns=("department_id",),
|
|
882
|
+
referenced_table="departments",
|
|
883
|
+
referenced_columns=("id",),
|
|
884
|
+
),
|
|
885
|
+
),
|
|
886
|
+
comment="Projects with composite PK and FK to departments",
|
|
887
|
+
),
|
|
888
|
+
# filter_echo_table — catalog table that echoes the pushed-down
|
|
889
|
+
# filters it received (pushed_filters column). Backs
|
|
890
|
+
# ~/Development/vgi/test/sql/integration/table/filter_pushdown_through_view.test,
|
|
891
|
+
# which characterizes filter pushdown directly and through a VIEW.
|
|
892
|
+
# The backing scan opts into expression-filter pushdown so a
|
|
893
|
+
# `LIKE 'prefix%'` predicate is observable here.
|
|
894
|
+
Table(
|
|
895
|
+
name="filter_echo_table",
|
|
896
|
+
columns=schema(n=pa.int64(), s=pa.utf8(), pushed_filters=pa.utf8()),
|
|
897
|
+
comment="Catalog table echoing pushed-down filters (filter-pushdown-through-view tests).",
|
|
898
|
+
),
|
|
899
|
+
# ----- required_field_filter_paths fixtures -----
|
|
900
|
+
# Exercised by ~/Development/vgi/test/sql/vgi_required_filters_*.test
|
|
901
|
+
# to verify the C++ optimizer extension that enforces the new
|
|
902
|
+
# Table.required_field_filter_paths field.
|
|
903
|
+
Table(
|
|
904
|
+
name="rff_simple",
|
|
905
|
+
columns=RFF_SIMPLE_COLUMNS,
|
|
906
|
+
required_field_filter_paths=("a",),
|
|
907
|
+
comment="rff_simple — requires a filter referencing column 'a'.",
|
|
908
|
+
),
|
|
909
|
+
Table(
|
|
910
|
+
name="rff_struct",
|
|
911
|
+
columns=RFF_STRUCT_COLUMNS,
|
|
912
|
+
required_field_filter_paths=("s.a", "s.b"),
|
|
913
|
+
comment="rff_struct — requires filters on both struct subfields s.a and s.b.",
|
|
914
|
+
),
|
|
915
|
+
Table(
|
|
916
|
+
name="rff_nested",
|
|
917
|
+
columns=RFF_NESTED_COLUMNS,
|
|
918
|
+
required_field_filter_paths=("wrapper.mid.leaf",),
|
|
919
|
+
comment="rff_nested — requires a filter on the 3-deep nested path wrapper.mid.leaf.",
|
|
920
|
+
),
|
|
921
|
+
Table(
|
|
922
|
+
name="rff_multi",
|
|
923
|
+
columns=RFF_MULTI_COLUMNS,
|
|
924
|
+
required_field_filter_paths=("top", "s.a"),
|
|
925
|
+
comment="rff_multi — mixed top-level + struct subfield requirements.",
|
|
926
|
+
),
|
|
927
|
+
Table(
|
|
928
|
+
name="rff_none",
|
|
929
|
+
columns=RFF_NONE_COLUMNS,
|
|
930
|
+
comment="rff_none — control table with no required_field_filter_paths (opt-out fast path).",
|
|
931
|
+
),
|
|
932
|
+
Table(
|
|
933
|
+
name="rff_rowid",
|
|
934
|
+
columns=RFF_ROWID_COLUMNS,
|
|
935
|
+
required_field_filter_paths=(
|
|
936
|
+
"bbox.xmin",
|
|
937
|
+
"bbox.xmax",
|
|
938
|
+
"bbox.ymin",
|
|
939
|
+
"bbox.ymax",
|
|
940
|
+
),
|
|
941
|
+
comment="rff_rowid — row_id virtual column + required bbox.* filters.",
|
|
942
|
+
),
|
|
943
|
+
# rff_parquet — native read_parquet delegation + required_field_filter_paths
|
|
944
|
+
# on a FLOAT bbox struct (mirrors Overture transportation.segment).
|
|
945
|
+
Table(
|
|
946
|
+
name="rff_parquet",
|
|
947
|
+
columns=pa.schema(
|
|
948
|
+
[
|
|
949
|
+
pa.field(
|
|
950
|
+
"bbox",
|
|
951
|
+
pa.struct(
|
|
952
|
+
[
|
|
953
|
+
pa.field("xmin", pa.float32()),
|
|
954
|
+
pa.field("ymin", pa.float32()),
|
|
955
|
+
pa.field("xmax", pa.float32()),
|
|
956
|
+
pa.field("ymax", pa.float32()),
|
|
957
|
+
]
|
|
958
|
+
),
|
|
959
|
+
),
|
|
960
|
+
pa.field("other", pa.int64()),
|
|
961
|
+
]
|
|
962
|
+
),
|
|
963
|
+
required_field_filter_paths=(
|
|
964
|
+
"bbox.xmin",
|
|
965
|
+
"bbox.xmax",
|
|
966
|
+
"bbox.ymin",
|
|
967
|
+
"bbox.ymax",
|
|
968
|
+
),
|
|
969
|
+
comment="rff_parquet — native read_parquet delegation with bbox.* required filters.",
|
|
970
|
+
),
|
|
971
|
+
# rff_hive — native read_parquet over a Hive-partitioned glob
|
|
972
|
+
# (theme/type partition columns), bbox at a non-zero index —
|
|
973
|
+
# closely mirrors Overture transportation.segment.
|
|
974
|
+
Table(
|
|
975
|
+
name="rff_hive",
|
|
976
|
+
columns=pa.schema(
|
|
977
|
+
[
|
|
978
|
+
pa.field("id", pa.string()),
|
|
979
|
+
pa.field(
|
|
980
|
+
"bbox",
|
|
981
|
+
pa.struct(
|
|
982
|
+
[
|
|
983
|
+
pa.field("xmin", pa.float32()),
|
|
984
|
+
pa.field("ymin", pa.float32()),
|
|
985
|
+
pa.field("xmax", pa.float32()),
|
|
986
|
+
pa.field("ymax", pa.float32()),
|
|
987
|
+
]
|
|
988
|
+
),
|
|
989
|
+
),
|
|
990
|
+
pa.field("name", pa.string()),
|
|
991
|
+
pa.field("num", pa.int64()),
|
|
992
|
+
pa.field("theme", pa.string()),
|
|
993
|
+
pa.field("type", pa.string()),
|
|
994
|
+
]
|
|
995
|
+
),
|
|
996
|
+
required_field_filter_paths=(
|
|
997
|
+
"bbox.xmin",
|
|
998
|
+
"bbox.xmax",
|
|
999
|
+
"bbox.ymin",
|
|
1000
|
+
"bbox.ymax",
|
|
1001
|
+
),
|
|
1002
|
+
comment="rff_hive — native read_parquet over Hive glob with bbox.* required filters.",
|
|
1003
|
+
),
|
|
1004
|
+
# rff_hive_mixed — same Hive layout as rff_hive but a MIXED
|
|
1005
|
+
# requirement: a top-level field ('id') plus the struct corners.
|
|
1006
|
+
# Exercises the flat-field branch of the path walker over native
|
|
1007
|
+
# delegation, where 'id' sits at a permuted column_ids slot.
|
|
1008
|
+
Table(
|
|
1009
|
+
name="rff_hive_mixed",
|
|
1010
|
+
columns=pa.schema(
|
|
1011
|
+
[
|
|
1012
|
+
pa.field("id", pa.string()),
|
|
1013
|
+
pa.field(
|
|
1014
|
+
"bbox",
|
|
1015
|
+
pa.struct(
|
|
1016
|
+
[
|
|
1017
|
+
pa.field("xmin", pa.float32()),
|
|
1018
|
+
pa.field("ymin", pa.float32()),
|
|
1019
|
+
pa.field("xmax", pa.float32()),
|
|
1020
|
+
pa.field("ymax", pa.float32()),
|
|
1021
|
+
]
|
|
1022
|
+
),
|
|
1023
|
+
),
|
|
1024
|
+
pa.field("name", pa.string()),
|
|
1025
|
+
pa.field("num", pa.int64()),
|
|
1026
|
+
pa.field("theme", pa.string()),
|
|
1027
|
+
pa.field("type", pa.string()),
|
|
1028
|
+
]
|
|
1029
|
+
),
|
|
1030
|
+
required_field_filter_paths=(
|
|
1031
|
+
"id",
|
|
1032
|
+
"bbox.xmin",
|
|
1033
|
+
"bbox.xmax",
|
|
1034
|
+
"bbox.ymin",
|
|
1035
|
+
"bbox.ymax",
|
|
1036
|
+
),
|
|
1037
|
+
comment="rff_hive_mixed — native read_parquet, top-level 'id' + bbox.* required filters.",
|
|
1038
|
+
),
|
|
1039
|
+
# Time-travel constraint evolution table
|
|
1040
|
+
Table(
|
|
1041
|
+
name="versioned_constraints",
|
|
1042
|
+
columns=schema(
|
|
1043
|
+
id=pa.int64(),
|
|
1044
|
+
name=pa.string(),
|
|
1045
|
+
email=pa.string(),
|
|
1046
|
+
department_id=pa.int64(),
|
|
1047
|
+
),
|
|
1048
|
+
supports_time_travel=True,
|
|
1049
|
+
not_null=("id", "name"),
|
|
1050
|
+
primary_key=(("id",),),
|
|
1051
|
+
unique=(("email",),),
|
|
1052
|
+
foreign_key=(
|
|
1053
|
+
ForeignKeyDef(
|
|
1054
|
+
columns=("department_id",),
|
|
1055
|
+
referenced_table="departments",
|
|
1056
|
+
referenced_columns=("id",),
|
|
1057
|
+
),
|
|
1058
|
+
),
|
|
1059
|
+
comment="Table with constraints that evolve across versions",
|
|
1060
|
+
),
|
|
1061
|
+
],
|
|
1062
|
+
views=[
|
|
1063
|
+
View(
|
|
1064
|
+
name="small_numbers",
|
|
1065
|
+
definition="SELECT * FROM numbers WHERE value < 10",
|
|
1066
|
+
comment="Numbers less than 10",
|
|
1067
|
+
column_comments={"value": "Single-digit value 0..9"},
|
|
1068
|
+
),
|
|
1069
|
+
],
|
|
1070
|
+
indexes=[
|
|
1071
|
+
Index(
|
|
1072
|
+
name="idx_numbers_value",
|
|
1073
|
+
table_name="numbers",
|
|
1074
|
+
expressions=("value",),
|
|
1075
|
+
comment="Index on numbers.value",
|
|
1076
|
+
),
|
|
1077
|
+
Index(
|
|
1078
|
+
name="idx_numbers_value_unique",
|
|
1079
|
+
table_name="numbers",
|
|
1080
|
+
expressions=("value",),
|
|
1081
|
+
constraint_type=IndexConstraintType.UNIQUE,
|
|
1082
|
+
comment="Unique index on numbers.value",
|
|
1083
|
+
),
|
|
1084
|
+
],
|
|
1085
|
+
),
|
|
1086
|
+
],
|
|
1087
|
+
)
|
|
1088
|
+
|
|
1089
|
+
|
|
1090
|
+
class ExampleCatalog(ReadOnlyCatalogInterface):
|
|
1091
|
+
"""Catalog interface for the example worker.
|
|
1092
|
+
|
|
1093
|
+
Defines table_get and table_scan_function_get for tables with explicit
|
|
1094
|
+
columns, including time-travel support for versioned_data.
|
|
1095
|
+
|
|
1096
|
+
"""
|
|
1097
|
+
|
|
1098
|
+
catalog = _EXAMPLE_CATALOG
|
|
1099
|
+
|
|
1100
|
+
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Look up a table, serving version-specific metadata for time travel.

    ``data.versioned_data`` and ``data.versioned_constraints`` are answered
    here when an AT clause is present (``at_unit`` is truthy): the AT value
    is resolved to a version and the schema (and, for
    ``versioned_constraints``, the constraint set) of that version is
    returned. Every other lookup is delegated to the parent implementation.

    Args:
        attach_opaque_data: Forwarded unchanged to the parent implementation.
        transaction_opaque_data: Forwarded unchanged to the parent
            implementation.
        schema_name: Schema to look in; compared case-insensitively.
        name: Table name; compared case-insensitively.
        at_unit: AT-clause unit, or ``None`` when no AT clause was given.
        at_value: AT-clause value, or ``None`` when no AT clause was given.

    Returns:
        A ``TableInfo`` built here for the versioned tables, otherwise
        whatever the parent ``table_get`` returns.
    """
    _validate_at_params(at_unit, at_value)

    # AT parameters handed to the parent lookup; cleared below for the
    # multi-branch tables.
    parent_at_unit, parent_at_value = at_unit, at_value

    if schema_name.lower() == "data":
        table = name.lower()

        if table == "versioned_data" and at_unit:
            version = resolve_version(at_unit, at_value)
            versioned_schema = _VERSIONED_SCHEMAS[version]
            return TableInfo(
                name=name,
                schema_name=schema_name,
                columns=SerializedSchema(versioned_schema.serialize().to_pybytes()),
                not_null_constraints=[],
                unique_constraints=[],
                check_constraints=[],
                comment="Versioned data table demonstrating time travel with schema evolution",
                tags={},
            )

        if table == "versioned_constraints" and at_unit:
            version = resolve_versioned_constraints_version(at_unit, at_value)
            versioned_schema = _VERSIONED_CONSTRAINTS_SCHEMAS[version]
            # Constraints are expressed as column positions within the
            # schema of the resolved version.
            position_of = [field.name for field in versioned_schema].index

            # The constraint set grows with the version:
            #   V1: NOT NULL(id)
            #   V2: + NOT NULL(name), PRIMARY KEY(id), UNIQUE(email)
            #   V3: + FOREIGN KEY department_id -> departments.id
            not_null_cols: list[int] = []
            primary_keys: list[list[int]] = []
            unique_keys: list[list[int]] = []
            foreign_keys: list[bytes] = []

            if version >= 1:
                not_null_cols.append(position_of("id"))
            if version >= 2:
                not_null_cols.append(position_of("name"))
                primary_keys.append([position_of("id")])
                unique_keys.append([position_of("email")])
            if version >= 3:
                from vgi_rpc.utils import serialize_record_batch_bytes

                # Foreign keys are shipped as a serialized one-row batch.
                fk_row = {
                    "fk_columns": [["department_id"]],
                    "pk_columns": [["id"]],
                    "referenced_table": ["departments"],
                    "referenced_schema": [schema_name],
                }
                fk_row_schema = pa.schema(
                    [
                        ("fk_columns", pa.list_(pa.utf8())),
                        ("pk_columns", pa.list_(pa.utf8())),
                        ("referenced_table", pa.utf8()),
                        ("referenced_schema", pa.utf8()),
                    ]
                )
                fk_batch = pa.RecordBatch.from_pydict(fk_row, schema=fk_row_schema)
                foreign_keys.append(serialize_record_batch_bytes(fk_batch))

            return TableInfo(
                name=name,
                schema_name=schema_name,
                columns=SerializedSchema(versioned_schema.serialize().to_pybytes()),
                not_null_constraints=not_null_cols,
                unique_constraints=unique_keys,
                check_constraints=[],
                primary_key_constraints=primary_keys,
                foreign_key_constraints=foreign_keys,
                comment="Table with constraints that evolve across versions",
                tags={},
            )

        # Multi-branch tables accept AT here but are looked up with AT
        # stripped. Returning a TableInfo lets the C++ binding proceed far
        # enough for its B2 guard (VgiTableEntry::GetScanFunctionImpl) to
        # see branches.size() > 1 and raise the documented BinderException
        # before any scan-function-get RPC fires.
        if table in ("multi_branch_numbers", "multi_branch_filtered_numbers"):
            parent_at_unit = None
            parent_at_value = None

    return super().table_get(
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        schema_name=schema_name,
        name=name,
        at_unit=parent_at_unit,
        at_value=parent_at_value,
    )
|
|
1198
|
+
|
|
1199
|
+
def table_scan_branches_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanBranchesResult:
    """Describe the scan branches backing the ``data.multi_branch_*`` tables.

    Tables not handled here are delegated to the parent implementation,
    which (per the CatalogInterface default shim) wraps the legacy
    ``table_scan_function_get`` result as a single-branch list.

    Args:
        attach_opaque_data: Forwarded unchanged to the parent implementation.
        transaction_opaque_data: Forwarded unchanged to the parent
            implementation.
        schema_name: Schema to look in; compared case-insensitively.
        name: Table name; compared case-insensitively.
        at_unit: AT-clause unit, or ``None``.
        at_value: AT-clause value, or ``None``.

    Returns:
        A ``ScanBranchesResult`` listing the branches of the requested table.
    """
    _validate_at_params(at_unit, at_value)

    def sequence_arm(count, **extra):
        # VGI arm: sequence(count), plus optional ScanBranch fields.
        return ScanBranch(
            function_name="sequence",
            positional_arguments=[pa.scalar(count)],
            named_arguments={},
            **extra,
        )

    def file_arm(function_name, path):
        # Native DuckDB arm: function_name(path) with a string path.
        return ScanBranch(
            function_name=function_name,
            positional_arguments=[pa.scalar(path, pa.string())],
            named_arguments={},
        )

    if schema_name.lower() == "data":
        table = name.lower()
        # None means "not one of ours"; an empty list is a valid answer.
        arms = None

        if table == "multi_branch_numbers":
            # Two sequence(50) arms; the union has 100 rows.
            arms = [sequence_arm(50), sequence_arm(50)]
        elif table == "multi_branch_filtered_numbers":
            # Two sequence(100) arms whose complementary branch_filters
            # split the value range in half: 50 rows per arm, 100 total.
            arms = [
                sequence_arm(100, branch_filter="n < 50"),
                sequence_arm(100, branch_filter="n >= 50"),
            ]
        elif table == "multi_branch_hetero":
            # One VGI arm plus one native read_parquet arm. The test
            # creates the parquet file (single column "n", values 50..99)
            # at this well-known path before querying. 100 rows total.
            arms = [
                sequence_arm(50),
                file_arm("read_parquet", "/tmp/vgi_hetero_branch.parquet"),
            ]
        elif table == "multi_branch_empty":
            # Deliberately empty: the C++ ParseScanBranchesResult must
            # reject this at the wire layer with a BinderException.
            arms = []
        elif table == "multi_branch_two_writable":
            # Both arms writable: ParseScanBranchesResult must reject
            # this, because DuckDB's single-writable-catalog-per-transaction
            # rule allows at most one writable branch.
            arms = [
                sequence_arm(10, writable=True),
                sequence_arm(10, writable=True),
            ]
        elif table == "multi_branch_nopushdown":
            # VGI sequence(50) plus read_csv_auto. read_csv has
            # filter_pushdown=false in DuckDB, so a user WHERE clause
            # stays as a LogicalFilter above the csv arm; the rewriter
            # must not assume pushdown always succeeds.
            arms = [
                sequence_arm(50),
                file_arm("read_csv_auto", "/tmp/vgi_nopushdown_branch.csv"),
            ]
        elif table == "multi_branch_recon":
            # Three read_parquet arms with deliberately mismatched column
            # shapes, exercising reconciliation by NAME with NULL-fill for
            # missing canonical columns. Canonical schema is
            # (a int64, b int64); the test creates the files beforehand.
            arms = [
                file_arm("read_parquet", "/tmp/vgi_recon_a_b.parquet"),
                file_arm("read_parquet", "/tmp/vgi_recon_b_a.parquet"),
                file_arm("read_parquet", "/tmp/vgi_recon_a_only.parquet"),
            ]

        if arms is not None:
            return ScanBranchesResult(branches=arms, required_extensions=[])

    # Anything else goes to the default shim, which wraps
    # table_scan_function_get as a one-branch list.
    return super().table_scan_branches_get(
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        schema_name=schema_name,
        name=name,
        at_unit=at_unit,
        at_value=at_value,
    )
|
|
1365
|
+
|
|
1366
|
+
# Column statistics are defined inline on each Table descriptor using
|
|
1367
|
+
# the `statistics` dict. ReadOnlyCatalogInterface auto-serves them —
|
|
1368
|
+
# no override of table_column_statistics_get() needed here.
|
|
1369
|
+
|
|
1370
|
+
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Pick the scan function (and its arguments) backing a catalog table.

    Matching is case-insensitive and only tables in the ``data`` schema are
    handled here; every other request is delegated to the parent
    implementation.

    Raises:
        ValueError: If an AT clause is supplied and the request reaches the
            time-travel rejection check (i.e. it was not served by one of
            the branches that return before that check).
    """
    _validate_at_params(at_unit, at_value)

    # Lower-cased table name when the schema is "data"; otherwise an empty
    # string, which matches none of the fixture tables below.
    table = name.lower() if schema_name.lower() == "data" else ""

    # Time-travel tables: resolve the AT clause to a version and pass it to
    # the scan function as its single positional argument.
    time_travel_tables = {
        "versioned_data": (resolve_version, "versioned_data_scan"),
        "tt_pushdown_cols": (resolve_tt_version, "tt_pushdown_cols_scan"),
        "versioned_constraints": (resolve_versioned_constraints_version, "versioned_constraints_scan"),
    }
    if table in time_travel_tables:
        resolve, scan_name = time_travel_tables[table]
        return ScanFunctionResult(
            function_name=scan_name,
            positional_arguments=[pa.scalar(resolve(at_unit, at_value))],
            named_arguments={},
        )

    # rff_parquet — single-branch native read_parquet delegation.
    if table == "rff_parquet":
        return ScanFunctionResult(
            function_name="read_parquet",
            positional_arguments=[pa.scalar("/tmp/rff_seg.parquet", pa.string())],
            named_arguments={},
        )

    # rff_hive / rff_hive_mixed — native read_parquet over a Hive glob.
    if table in ("rff_hive", "rff_hive_mixed"):
        return ScanFunctionResult(
            function_name="read_parquet",
            positional_arguments=[pa.scalar("/tmp/rff_hive/*/*/*.parquet", pa.string())],
            named_arguments={"hive_partitioning": pa.scalar(True)},
        )

    # Everything from here on has no time-travel support, so an AT clause is
    # an error. NOTE(review): the read_parquet branches above return before
    # this check, so an AT clause on them is not rejected — confirm intended.
    if at_unit:
        raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")

    # Tables backed by the "sequence" function, keyed to their row count.
    # funny_numbers deliberately has no statistics on the table so that
    # SequenceFunction.statistics() provides them via table_function_statistics.
    sequence_lengths = {
        "generated_sequence": 10,
        "numbers": 100,
        "volatile_numbers": 100,
        "funny_numbers": 123456,
    }
    if table in sequence_lengths:
        return ScanFunctionResult(
            function_name="sequence",
            positional_arguments=[pa.scalar(sequence_lengths[table])],
            named_arguments={},
        )

    # Tables served by an argument-less "<table>_scan" function: the
    # constraint examples, the filter-pushdown-through-view fixture and the
    # rff_* required_field_filter_paths fixtures.
    static_scan_tables = (
        "colors",
        "departments",
        "employees",
        "products",
        "projects",
        "filter_echo_table",
        "rff_simple",
        "rff_struct",
        "rff_nested",
        "rff_multi",
        "rff_none",
        "rff_rowid",
    )
    if table in static_scan_tables:
        return ScanFunctionResult(
            function_name=f"{table}_scan",
            positional_arguments=[],
            named_arguments={},
        )

    # Row ID test tables: (layout, row_id_type) forwarded as named arguments.
    rowid_layouts = {
        "rowid_first": ("first", "int64"),
        "rowid_middle": ("middle", "int64"),
        "rowid_last": ("last", "int64"),
        "rowid_string": ("first", "string"),
        "rowid_struct": ("first", "struct"),
    }
    if table in rowid_layouts:
        layout, row_id_type = rowid_layouts[table]
        return ScanFunctionResult(
            function_name="rowid_sequence",
            positional_arguments=[pa.scalar(20)],
            named_arguments={
                "layout": pa.scalar(layout),
                "row_id_type": pa.scalar(row_id_type),
            },
        )

    # Late-materialization tables → late_materialization scan function.
    # 1000 rows is large enough that LIMIT k << count makes the rewrite a
    # real win and that LIMIT 200 exceeds dynamic_or_filter_threshold (50).
    late_mat_named_args: dict[str, dict[str, Any]] = {
        "late_mat": {},
        "late_mat_dup": {"dup_row_id": pa.scalar(True)},
        "late_mat_nulls": {"null_ord_stride": pa.scalar(7)},
    }
    if table in late_mat_named_args:
        return ScanFunctionResult(
            function_name="late_materialization",
            positional_arguments=[pa.scalar(1000)],
            named_arguments=late_mat_named_args[table],
        )

    # No fixture-specific mapping: defer to the parent implementation.
    return super().table_scan_function_get(
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        schema_name=schema_name,
        name=name,
        at_unit=at_unit,
        at_value=at_value,
    )
|
|
1522
|
+
|
|
1523
|
+
# --------- Transaction lifecycle ---------
|
|
1524
|
+
#
|
|
1525
|
+
# The example catalog has no transactional state of its own — these
|
|
1526
|
+
# methods exist solely so the C++ extension populates
|
|
1527
|
+
# ``BindRequest.transaction_opaque_data`` when SQL is wrapped in
|
|
1528
|
+
# ``BEGIN`` / ``COMMIT``. That id is what makes
|
|
1529
|
+
# ``BindParams.transaction_storage`` non-None, which lets
|
|
1530
|
+
# ``TxCachedValueFunction`` (and any user-written function) cache
|
|
1531
|
+
# per-transaction values via ``FunctionStorage.transaction_state_*``.
|
|
1532
|
+
|
|
1533
|
+
# Class-level flag advertising transaction support; presumably read by the
# base catalog interface to enable the lifecycle methods below — confirm.
supports_transactions = True
|
|
1534
|
+
|
|
1535
|
+
def catalog_transaction_begin(self, *, attach_opaque_data: AttachOpaqueData) -> TransactionOpaqueData | None:
    """Start a transaction by minting a new random identifier.

    The catalog keeps no state per transaction, so the attach data is
    ignored and the returned value is just the 16 raw bytes of a fresh
    UUID4 wrapped as ``TransactionOpaqueData``.
    """
    del attach_opaque_data
    transaction_id = uuid.uuid4().bytes
    return TransactionOpaqueData(transaction_id)
|
|
1539
|
+
|
|
1540
|
+
def catalog_transaction_commit(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
) -> None:
    """Discard storage scoped to the committed transaction (best-effort hygiene)."""
    del attach_opaque_data
    # The transaction's opaque bytes serve as the scope_id in the unified
    # state_* API; execution_clear wipes every namespace for that scope.
    scope_id = bytes(transaction_opaque_data)
    TxCachedValueFunction.storage.execution_clear(scope_id)
|
|
1548
|
+
|
|
1549
|
+
def catalog_transaction_rollback(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
) -> None:
    """Discard storage scoped to the rolled-back transaction (same cleanup as commit)."""
    del attach_opaque_data
    # The transaction's opaque bytes identify the scope whose state is cleared.
    scope_id = bytes(transaction_opaque_data)
    TxCachedValueFunction.storage.execution_clear(scope_id)
|
|
1555
|
+
|
|
1556
|
+
|
|
1557
|
+
class ExampleWorker(Worker):
    """Example worker with built-in test functions.

    This worker exposes all example functions via the ExampleCatalog interface,
    allowing clients to discover available functions via the "example" catalog.

    Settings exposed via catalog_attach:
    - vgi_verbose_mode: Enable verbose output (used by SettingsAwareFunction)
    - greeting: Custom greeting message (used by SettingsAwareFunction)
    - multiplier: Value multiplier (used by SettingsAwareFunction, MultiplyBySettingFunction)
    - threshold: Filter threshold (used by FilterBySettingFunction)
    - config: Sequence configuration struct (used by StructSettingsFunction)
    """

    # Catalog interface class used for runtime catalog operations.
    catalog_interface = ExampleCatalog
    # catalog is set for introspection (worker page, tests) — runtime catalog
    # operations go through catalog_interface.
    catalog = _EXAMPLE_CATALOG

    class Settings:
        """Settings exposed via catalog_attach."""

        # Each field is declared as Annotated[<type>, Setting(desc=...)] and
        # assigned its default value.
        vgi_verbose_mode: Annotated[bool, Setting(desc="Enable verbose output")] = False
        greeting: Annotated[str, Setting(desc="Custom greeting message")] = "Hello"
        multiplier: Annotated[int, Setting(desc="Value multiplier")] = 1
        threshold: Annotated[int, Setting(desc="Filter threshold")] = 0
        # Struct-typed setting (start: int64, step: int64, label: string)
        # annotated with an Arrow type rather than a Python type, hence the
        # type-checker suppression; defaults to None.
        config: Annotated[  # type: ignore[valid-type]
            pa.struct([("start", pa.int64()), ("step", pa.int64()), ("label", pa.string())]),
            Setting(desc="Sequence configuration struct"),
        ] = None

    # Secret types declared by this worker: a single "vgi_example" type whose
    # secret_string and api_key fields carry {"redact": "true"} metadata.
    secret_types = [
        SecretTypeSpec(
            name="vgi_example",
            description="Example VGI secret for testing",
            schema=pa.schema(
                [
                    pa.field("secret_string", pa.string(), metadata={"redact": "true"}),
                    pa.field("api_key", pa.string(), metadata={"redact": "true"}),
                    pa.field("port", pa.int32()),
                    pa.field("use_ssl", pa.bool_()),
                    pa.field("timeout", pa.float64()),
                ]  # type: ignore[arg-type]  # PyArrow field metadata typing limitation
            ),
        ),
    ]
|
|
1603
|
+
|
|
1604
|
+
|
|
1605
|
+
def main() -> None:
    """Serve every available fixture worker from a single process.

    The base ExampleWorker catalog is always served together with the
    ``projection_repro``, ``schema_reconcile``, and ``accumulate`` fixture
    catalogs (all of which need the ``vgi[test-fixtures]`` extra). The
    writable catalog is appended only when its module can be imported, i.e.
    when the ``vgi[test-fixtures-writable]`` extra is installed as well.
    """
    # Imported lazily so that merely importing this module does not pull in
    # the other fixture packages.
    from vgi._test_fixtures.accumulate.worker import AccumulateWorker
    from vgi._test_fixtures.projection_repro.worker import ProjReproWorker
    from vgi._test_fixtures.schema_reconcile.worker import SchemaReconcileWorker
    from vgi.meta_worker import MetaWorker

    # Optional worker: an ImportError simply means it is not served.
    optional_workers: list[type] = []
    try:
        from vgi._test_fixtures.writable.worker import WritableWorker
    except ImportError:
        pass
    else:
        optional_workers = [WritableWorker]

    MetaWorker.serve(
        ExampleWorker,
        ProjReproWorker,
        SchemaReconcileWorker,
        AccumulateWorker,
        *optional_workers,
    )
|
|
1628
|
+
|
|
1629
|
+
|
|
1630
|
+
# Script entry point: start the fixture worker only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|