vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Table-function fixtures.
|
|
4
|
+
|
|
5
|
+
Originally a single 3,270-line module; split into cohesive sub-modules and
|
|
6
|
+
re-exported here so existing import sites (worker.py, tests) keep working
|
|
7
|
+
unchanged.
|
|
8
|
+
|
|
9
|
+
If you're looking for a specific fixture, the module names below should
|
|
10
|
+
point you at the right file:
|
|
11
|
+
|
|
12
|
+
* :mod:`._common` — ``CountdownState``, ``_BaseSequenceFunction``
|
|
13
|
+
* :mod:`.sequence` — sequence / partitioned / nested / row_id
|
|
14
|
+
* :mod:`.make_series` — make_series_count / range / step / csv / float
|
|
15
|
+
* :mod:`.pairs` — make_pairs_*, repeat_value_*, constant_columns
|
|
16
|
+
* :mod:`.settings` — settings_aware, struct_settings, secret_demo
|
|
17
|
+
* :mod:`.filters` — filter_echo, dynamic_filter_echo, expression_filter,
|
|
18
|
+
spatial_filter
|
|
19
|
+
* :mod:`.catalog_scans` — colors / departments / employees / products / projects
|
|
20
|
+
* :mod:`.versioned` — versioned_data + versioned_constraints (time travel)
|
|
21
|
+
* :mod:`.misc` — projected_data, generator_exception,
|
|
22
|
+
logging_generator, order_echo, sample_echo
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from vgi._test_fixtures.table.batch_index import (
|
|
26
|
+
PartitionedBatchIndexFunction,
|
|
27
|
+
PartitionedBatchIndexMarkedFunction,
|
|
28
|
+
)
|
|
29
|
+
from vgi._test_fixtures.table.batch_index_broken import (
|
|
30
|
+
BatchIndexOverflowFunction,
|
|
31
|
+
MissingBatchIndexTagFunction,
|
|
32
|
+
NonMonotoneBatchIndexFunction,
|
|
33
|
+
)
|
|
34
|
+
from vgi._test_fixtures.table.catalog_scans import (
|
|
35
|
+
ColorsScanFunction,
|
|
36
|
+
DepartmentsScanFunction,
|
|
37
|
+
EmployeesScanFunction,
|
|
38
|
+
ProductsScanFunction,
|
|
39
|
+
ProjectsScanFunction,
|
|
40
|
+
)
|
|
41
|
+
from vgi._test_fixtures.table.filters import (
|
|
42
|
+
DictFilterEchoFunction,
|
|
43
|
+
DynamicFilterEchoFunction,
|
|
44
|
+
ExpressionFilterTestFunction,
|
|
45
|
+
FilterEchoFunction,
|
|
46
|
+
FilterEchoPartitionedFunction,
|
|
47
|
+
FilterEchoTableScanFunction,
|
|
48
|
+
SpatialFilterExampleFunction,
|
|
49
|
+
ValuePruneFunction,
|
|
50
|
+
)
|
|
51
|
+
from vgi._test_fixtures.table.late_materialization import (
|
|
52
|
+
LateMaterializationFunction,
|
|
53
|
+
)
|
|
54
|
+
from vgi._test_fixtures.table.make_series import (
|
|
55
|
+
MakeSeriesCountFunction,
|
|
56
|
+
MakeSeriesCsvFunction,
|
|
57
|
+
MakeSeriesFloatFunction,
|
|
58
|
+
MakeSeriesRangeFunction,
|
|
59
|
+
MakeSeriesStepFunction,
|
|
60
|
+
)
|
|
61
|
+
from vgi._test_fixtures.table.misc import (
|
|
62
|
+
GeneratorExceptionFunction,
|
|
63
|
+
LoggingGeneratorFunction,
|
|
64
|
+
OrderEchoFunction,
|
|
65
|
+
ProjectedDataFunction,
|
|
66
|
+
SampleEchoFunction,
|
|
67
|
+
)
|
|
68
|
+
from vgi._test_fixtures.table.order_modes import (
|
|
69
|
+
PartitionedFixedOrderFunction,
|
|
70
|
+
PartitionedNoOrderGuaranteeFunction,
|
|
71
|
+
PartitionedPreservesOrderFunction,
|
|
72
|
+
)
|
|
73
|
+
from vgi._test_fixtures.table.pairs import (
|
|
74
|
+
ConstantColumnsFunction,
|
|
75
|
+
MakePairsIntFunction,
|
|
76
|
+
MakePairsIntStrFunction,
|
|
77
|
+
MakePairsStrFunction,
|
|
78
|
+
RepeatValueIntFunction,
|
|
79
|
+
RepeatValueStrFunction,
|
|
80
|
+
)
|
|
81
|
+
from vgi._test_fixtures.table.partition_columns import (
|
|
82
|
+
CountryPartitionedSalesFunction,
|
|
83
|
+
DisjointRangePartitionedFunction,
|
|
84
|
+
PartitionedWithExplicitOverrideFunction,
|
|
85
|
+
RegionYearPartitionedFunction,
|
|
86
|
+
)
|
|
87
|
+
from vgi._test_fixtures.table.partition_columns_broken import (
|
|
88
|
+
BrokenMissingPartitionValuesFunction,
|
|
89
|
+
BrokenPartitionColumnAbsentFromBatchFunction,
|
|
90
|
+
BrokenPartitionMinNeqMaxFunction,
|
|
91
|
+
BrokenPartitionValuesNoAnnotationFunction,
|
|
92
|
+
)
|
|
93
|
+
from vgi._test_fixtures.table.profiling_example import (
|
|
94
|
+
ProfilingDemoFunction,
|
|
95
|
+
)
|
|
96
|
+
from vgi._test_fixtures.table.required_filters import (
|
|
97
|
+
RFF_MULTI_COLUMNS,
|
|
98
|
+
RFF_NESTED_COLUMNS,
|
|
99
|
+
RFF_NONE_COLUMNS,
|
|
100
|
+
RFF_ROWID_COLUMNS,
|
|
101
|
+
RFF_SIMPLE_COLUMNS,
|
|
102
|
+
RFF_STRUCT_COLUMNS,
|
|
103
|
+
RffMultiScanFunction,
|
|
104
|
+
RffNestedScanFunction,
|
|
105
|
+
RffNoneScanFunction,
|
|
106
|
+
RffRowidScanFunction,
|
|
107
|
+
RffSimpleScanFunction,
|
|
108
|
+
RffStructScanFunction,
|
|
109
|
+
)
|
|
110
|
+
from vgi._test_fixtures.table.sequence import (
|
|
111
|
+
DoubleSequenceFunction,
|
|
112
|
+
NamedParamsEchoFunction,
|
|
113
|
+
NestedSequenceFunction,
|
|
114
|
+
PartitionedSequenceFunction,
|
|
115
|
+
RowIdSequenceFunction,
|
|
116
|
+
SequenceFunction,
|
|
117
|
+
TenThousandFunction,
|
|
118
|
+
)
|
|
119
|
+
from vgi._test_fixtures.table.settings import (
|
|
120
|
+
ScopedSecretDemoFunction,
|
|
121
|
+
SecretDemoFunction,
|
|
122
|
+
SettingsAwareFunction,
|
|
123
|
+
StructSettingsFunction,
|
|
124
|
+
)
|
|
125
|
+
from vgi._test_fixtures.table.transaction_storage import TxCachedValueFunction
|
|
126
|
+
from vgi._test_fixtures.table.versioned import (
|
|
127
|
+
_CURRENT_VERSION,
|
|
128
|
+
_VERSIONED_CONSTRAINTS_CURRENT,
|
|
129
|
+
_VERSIONED_CONSTRAINTS_DATA,
|
|
130
|
+
_VERSIONED_CONSTRAINTS_SCHEMAS,
|
|
131
|
+
_VERSIONED_DATA,
|
|
132
|
+
_VERSIONED_SCHEMAS,
|
|
133
|
+
VersionedConstraintsScanFunction,
|
|
134
|
+
VersionedDataFunction,
|
|
135
|
+
resolve_version,
|
|
136
|
+
resolve_versioned_constraints_version,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
__all__ = [
|
|
140
|
+
"_CURRENT_VERSION",
|
|
141
|
+
"_VERSIONED_CONSTRAINTS_CURRENT",
|
|
142
|
+
"_VERSIONED_CONSTRAINTS_DATA",
|
|
143
|
+
"_VERSIONED_CONSTRAINTS_SCHEMAS",
|
|
144
|
+
"_VERSIONED_DATA",
|
|
145
|
+
"_VERSIONED_SCHEMAS",
|
|
146
|
+
"BatchIndexOverflowFunction",
|
|
147
|
+
"BrokenMissingPartitionValuesFunction",
|
|
148
|
+
"BrokenPartitionColumnAbsentFromBatchFunction",
|
|
149
|
+
"BrokenPartitionMinNeqMaxFunction",
|
|
150
|
+
"BrokenPartitionValuesNoAnnotationFunction",
|
|
151
|
+
"ColorsScanFunction",
|
|
152
|
+
"ConstantColumnsFunction",
|
|
153
|
+
"CountryPartitionedSalesFunction",
|
|
154
|
+
"DisjointRangePartitionedFunction",
|
|
155
|
+
"DepartmentsScanFunction",
|
|
156
|
+
"DictFilterEchoFunction",
|
|
157
|
+
"DoubleSequenceFunction",
|
|
158
|
+
"DynamicFilterEchoFunction",
|
|
159
|
+
"EmployeesScanFunction",
|
|
160
|
+
"ExpressionFilterTestFunction",
|
|
161
|
+
"FilterEchoFunction",
|
|
162
|
+
"FilterEchoPartitionedFunction",
|
|
163
|
+
"FilterEchoTableScanFunction",
|
|
164
|
+
"GeneratorExceptionFunction",
|
|
165
|
+
"ValuePruneFunction",
|
|
166
|
+
"LateMaterializationFunction",
|
|
167
|
+
"LoggingGeneratorFunction",
|
|
168
|
+
"MakePairsIntFunction",
|
|
169
|
+
"MakePairsIntStrFunction",
|
|
170
|
+
"MakePairsStrFunction",
|
|
171
|
+
"MakeSeriesCountFunction",
|
|
172
|
+
"MakeSeriesCsvFunction",
|
|
173
|
+
"MakeSeriesFloatFunction",
|
|
174
|
+
"MakeSeriesRangeFunction",
|
|
175
|
+
"MakeSeriesStepFunction",
|
|
176
|
+
"MissingBatchIndexTagFunction",
|
|
177
|
+
"NamedParamsEchoFunction",
|
|
178
|
+
"NestedSequenceFunction",
|
|
179
|
+
"NonMonotoneBatchIndexFunction",
|
|
180
|
+
"OrderEchoFunction",
|
|
181
|
+
"PartitionedBatchIndexFunction",
|
|
182
|
+
"PartitionedBatchIndexMarkedFunction",
|
|
183
|
+
"PartitionedFixedOrderFunction",
|
|
184
|
+
"PartitionedNoOrderGuaranteeFunction",
|
|
185
|
+
"PartitionedPreservesOrderFunction",
|
|
186
|
+
"PartitionedSequenceFunction",
|
|
187
|
+
"PartitionedWithExplicitOverrideFunction",
|
|
188
|
+
"ProductsScanFunction",
|
|
189
|
+
"ProfilingDemoFunction",
|
|
190
|
+
"ProjectedDataFunction",
|
|
191
|
+
"ProjectsScanFunction",
|
|
192
|
+
"RegionYearPartitionedFunction",
|
|
193
|
+
"RepeatValueIntFunction",
|
|
194
|
+
"RepeatValueStrFunction",
|
|
195
|
+
"RFF_MULTI_COLUMNS",
|
|
196
|
+
"RFF_NESTED_COLUMNS",
|
|
197
|
+
"RFF_NONE_COLUMNS",
|
|
198
|
+
"RFF_ROWID_COLUMNS",
|
|
199
|
+
"RFF_SIMPLE_COLUMNS",
|
|
200
|
+
"RFF_STRUCT_COLUMNS",
|
|
201
|
+
"RffMultiScanFunction",
|
|
202
|
+
"RffNestedScanFunction",
|
|
203
|
+
"RffNoneScanFunction",
|
|
204
|
+
"RffRowidScanFunction",
|
|
205
|
+
"RffSimpleScanFunction",
|
|
206
|
+
"RffStructScanFunction",
|
|
207
|
+
"RowIdSequenceFunction",
|
|
208
|
+
"SampleEchoFunction",
|
|
209
|
+
"ScopedSecretDemoFunction",
|
|
210
|
+
"SecretDemoFunction",
|
|
211
|
+
"SequenceFunction",
|
|
212
|
+
"SettingsAwareFunction",
|
|
213
|
+
"SpatialFilterExampleFunction",
|
|
214
|
+
"StructSettingsFunction",
|
|
215
|
+
"TenThousandFunction",
|
|
216
|
+
"TxCachedValueFunction",
|
|
217
|
+
"VersionedConstraintsScanFunction",
|
|
218
|
+
"VersionedDataFunction",
|
|
219
|
+
"resolve_version",
|
|
220
|
+
"resolve_versioned_constraints_version",
|
|
221
|
+
]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Shared infrastructure for table fixture functions.
|
|
4
|
+
|
|
5
|
+
Holds the cardinality decorator, the common ``CountdownState``, the
|
|
6
|
+
``CountBatchArgs`` base for fixtures that take ``(count, batch_size)``,
|
|
7
|
+
and the ``_BaseSequenceFunction`` template-method base class for
|
|
8
|
+
countdown-style generators.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from typing import Annotated, Any, ClassVar
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pyarrow as pa
|
|
18
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
19
|
+
from vgi_rpc.rpc import OutputCollector
|
|
20
|
+
|
|
21
|
+
from vgi.arguments import Arg
|
|
22
|
+
from vgi.catalog.catalog_interface import ColumnStatistics
|
|
23
|
+
from vgi.table_function import (
|
|
24
|
+
BindParams,
|
|
25
|
+
ProcessParams,
|
|
26
|
+
TableCardinality,
|
|
27
|
+
TableFunctionGenerator,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _cardinality_from_count[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
|
|
32
|
+
"""Class decorator to implement cardinality() based on a 'count' argument."""
|
|
33
|
+
if "cardinality" not in cls.__dict__: # only inject if subclass hasn't overridden
|
|
34
|
+
|
|
35
|
+
def cardinality_impl(cls_: type[T], params: BindParams[Any]) -> TableCardinality:
|
|
36
|
+
count = getattr(params.args, "count", None)
|
|
37
|
+
if not isinstance(count, int) or count < 0:
|
|
38
|
+
raise ValueError(f"Expected a non-negative integer 'count' argument for {cls_.__name__}")
|
|
39
|
+
return TableCardinality(estimate=count, max=count)
|
|
40
|
+
|
|
41
|
+
cls.cardinality = classmethod(cardinality_impl) # type: ignore[assignment]
|
|
42
|
+
|
|
43
|
+
return cls
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(kw_only=True)
class CountdownState(ArrowSerializableDataclass):
    """Mutable cursor for countdown-style generators.

    Both fields are keyword-only. ``_BaseSequenceFunction.process`` moves
    ``current_index`` forward and ``remaining`` down by the size of each
    emitted chunk.
    """

    # Rows still to be produced; must be supplied by the caller.
    remaining: int
    # Index of the next row to produce; starts at the beginning.
    current_index: int = 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True)
class CountBatchArgs:
    """Shared ``(count, batch_size)`` arguments for countdown-style fixtures.

    Intended as a base class: a fixture that needs extra knobs subclasses
    this instead of declaring the two common fields again. ``slots=True`` is
    deliberately left off so that such subclasses do not have to deal with
    slot conflicts.
    """

    # Positional argument 0: how many rows the fixture produces (>= 0).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Named argument ``batch_size``: rows per emitted batch (>= 1, Arg default 1000).
    batch_size: Annotated[int, Arg("batch_size", default=1000, doc="Batch size for output", ge=1)]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(slots=True, frozen=True)
|
|
68
|
+
class _EmptyArgs:
|
|
69
|
+
"""No arguments."""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(kw_only=True)
class _OneShotState(ArrowSerializableDataclass):
    """State for fixtures that emit their data exactly once."""

    # Keyword-only flag; starts False.
    done: bool = False
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class _BaseSequenceFunction(TableFunctionGenerator[Any, CountdownState]):
    """Template-method base for countdown-style fixture generators.

    Provides ``initial_state``, the countdown bookkeeping in ``process``, and
    a default arange-times-increment ``_emit_chunk`` used by SequenceFunction /
    DoubleSequenceFunction. Subclasses with non-arange output (e.g. echoes,
    nested types, row-id sequences) override ``_emit_chunk``.

    ``BATCH_SIZE_FALLBACK`` is used when ``params.args`` has no ``batch_size``
    field — i.e. fixtures that want a fixed batch size rather than a user knob.
    """

    # Element type of the array built by the default ``_emit_chunk``.
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.int64
    # Arrow type used for the min/max scalars reported by ``statistics``.
    STATS_ARROW_TYPE: ClassVar[pa.DataType] = pa.int64()
    # Name of the single output column that ``statistics`` describes.
    STATS_COLUMN_NAME: ClassVar[str] = "n"
    BATCH_SIZE_FALLBACK: ClassVar[int] = 1000

    @classmethod
    def initial_state(cls, params: ProcessParams[Any]) -> CountdownState:
        """Create the initial state with ``remaining`` set to the ``count`` argument."""
        return CountdownState(remaining=params.args.count)

    @classmethod
    def statistics(cls, params: BindParams[Any]) -> list[ColumnStatistics] | None:
        """Exact per-column stats derived from the user's bind args.

        For sequence(count, increment=k) the output values are
        ``0, k, 2k, ..., (count - 1) * k`` with no nulls. ``increment``
        defaults to 1 when the args carry no such field.

        Returns an empty list when ``count`` is missing, not an ``int``, or
        not positive. Fixtures whose output isn't a single arange column
        should override this (the return type allows ``None`` for "no stats").
        """
        count = getattr(params.args, "count", None)
        increment = getattr(params.args, "increment", 1)
        if not isinstance(count, int) or count <= 0:
            return []

        # The last value is the maximum for a positive increment but the
        # minimum for a negative one; order the bounds so min <= max always.
        last_value = (count - 1) * increment
        low, high = (0, last_value) if last_value >= 0 else (last_value, 0)
        # A zero increment repeats the same value, so only one is distinct.
        distinct = count if increment != 0 else 1
        return [
            ColumnStatistics(
                column_name=cls.STATS_COLUMN_NAME,
                min=pa.scalar(low, cls.STATS_ARROW_TYPE),
                max=pa.scalar(high, cls.STATS_ARROW_TYPE),
                has_null=False,
                has_not_null=True,
                distinct_count=distinct,
            )
        ]

    @classmethod
    def process(cls, params: ProcessParams[Any], state: CountdownState, out: OutputCollector) -> None:
        """Run one step of the countdown; delegate batch contents to ``_emit_chunk``.

        Calls ``out.finish()`` once no rows remain. Otherwise emits one chunk
        of at most ``batch_size`` rows and advances the state by that amount.
        """
        if state.remaining <= 0:
            out.finish()
            return

        batch_size = getattr(params.args, "batch_size", cls.BATCH_SIZE_FALLBACK)
        size = min(state.remaining, batch_size)
        cls._emit_chunk(params, state, out, state.current_index, size)
        state.current_index += size
        state.remaining -= size

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[Any],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Default implementation: emit ``(start + i) * increment`` for ``i`` in ``[0, size)``.

        Subclasses with non-arange output override this hook. ``state`` is
        passed in case subclasses want to track additional info; the standard
        countdown bookkeeping (``current_index``/``remaining``) is handled by
        ``process`` itself, so subclass hooks should NOT mutate them.
        """
        # Same default as ``statistics`` so the two agree when the args
        # dataclass has no ``increment`` field.
        increment = getattr(params.args, "increment", 1)
        # Build the integer index range first and scale it afterwards. Passing
        # ``increment`` as the arange step can yield size + 1 elements for
        # non-integer steps, which would break the bookkeeping in ``process``.
        indices = np.arange(start, start + size, dtype=np.int64)
        values = (indices * increment).astype(cls.NUMPY_DTYPE, copy=False)
        out.emit(pa.RecordBatch.from_arrays([pa.array(values)], schema=params.output_schema))
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Partitioned-queue fixtures that opt in to ``supports_batch_index``.
|
|
4
|
+
|
|
5
|
+
These exist so SQL integration tests can verify the batch_index feature:
|
|
6
|
+
|
|
7
|
+
* ``partitioned_batch_index(count)`` — single-column ``n int64`` output;
|
|
8
|
+
parallel scan with FIXED_ORDER preservation. Each queue item is tagged
|
|
9
|
+
with a stable partition_id; the worker emits Arrow batches tagged with
|
|
10
|
+
that id via ``out.emit(batch, batch_index=partition_id)``. The DuckDB
|
|
11
|
+
extension reads the tag from each batch's KeyValueMetadata, threads it
|
|
12
|
+
through ``TableFunction::get_partition_data``, and ordered sinks
|
|
13
|
+
(``PhysicalBatchCollector``, ``PhysicalBatchInsert``,
|
|
14
|
+
``PhysicalBatchCopyToFile``, ``PhysicalLimit``) reassemble output in
|
|
15
|
+
partition_id order. The FIXED_ORDER ``MaxThreads=1`` clamp is dropped
|
|
16
|
+
for opted-in functions.
|
|
17
|
+
|
|
18
|
+
* ``partitioned_batch_index_marked(count, chunk_size)`` — two-column
|
|
19
|
+
``(partition_id int64, seq int64)`` output. Lets tests directly
|
|
20
|
+
observe partition boundaries in the output stream (e.g. "no row with
|
|
21
|
+
partition_id=N appears after a row with partition_id=N+1"). Projection
|
|
22
|
+
pushdown is disabled so the ``partition_id`` column survives even
|
|
23
|
+
``SELECT seq FROM …`` queries.
|
|
24
|
+
|
|
25
|
+
The worker uses the existing in-process ``state`` to carry per-worker
|
|
26
|
+
cursor information across ``process()`` calls — same approach as
|
|
27
|
+
``_BasePartitionedOrderMode`` in ``order_modes.py``. HTTP transport's
|
|
28
|
+
existing STATE_KEY mechanism (in vgi_rpc.http) round-trips this state
|
|
29
|
+
across requests; nothing new is added for HTTP resumption.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import struct
|
|
35
|
+
from dataclasses import dataclass
|
|
36
|
+
from typing import Annotated, ClassVar, cast
|
|
37
|
+
|
|
38
|
+
import pyarrow as pa
|
|
39
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
40
|
+
from vgi_rpc.rpc import OutputCollector
|
|
41
|
+
|
|
42
|
+
from vgi._test_fixtures.table._common import _cardinality_from_count
|
|
43
|
+
from vgi.arguments import Arg
|
|
44
|
+
from vgi.invocation import GlobalInitResponse
|
|
45
|
+
from vgi.metadata import FunctionExample, OrderPreservation
|
|
46
|
+
from vgi.protocol import VgiOutputCollector
|
|
47
|
+
from vgi.schema_utils import schema
|
|
48
|
+
from vgi.table_function import (
|
|
49
|
+
InitParams,
|
|
50
|
+
ProcessParams,
|
|
51
|
+
TableFunctionGenerator,
|
|
52
|
+
bind_fixed_schema,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Wire format of one queue item: three big-endian unsigned 64-bit integers,
# in the order (partition_id, start, end). ``on_init`` packs items with it and
# ``process()`` unpacks them on the worker; the partition_id is the value the
# worker passes to DuckDB through the ``batch_index=`` kwarg.
_ITEM_FMT = ">QQQ"
# Packed size in bytes of a single queue item.
_ITEM_SIZE = struct.calcsize(_ITEM_FMT)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# =============================================================================
|
|
63
|
+
# Single-column variant: partitioned_batch_index(count)
|
|
64
|
+
# =============================================================================
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(slots=True, frozen=True)
|
|
68
|
+
class _BatchIndexArgs:
|
|
69
|
+
"""Arguments for ``partitioned_batch_index``."""
|
|
70
|
+
|
|
71
|
+
count: Annotated[int, Arg(0, doc="Total number of integers to generate", ge=0)]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(kw_only=True)
class _BatchIndexState(ArrowSerializableDataclass):
    """Per-worker cursor over the work item currently being emitted.

    ``partition_id`` is the queue-push order of the current work item and is
    attached to every Arrow batch through the ``batch_index=`` kwarg.
    ``current_idx`` walks through ``[current_start, current_end)`` as batches
    are produced. A fresh state has no work item (the three optional fields
    are ``None``); ``process()`` pops a new item when that is the case or
    when ``current_idx`` has reached ``current_end``.
    """

    # Identifier of the work item being emitted; None before the first pop.
    partition_id: int | None = None
    # Inclusive start of the current work item's range.
    current_start: int | None = None
    # Exclusive end of the current work item's range.
    current_end: int | None = None
    # Next value to emit within the current range.
    current_idx: int = 0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedBatchIndexFunction(TableFunctionGenerator[_BatchIndexArgs, _BatchIndexState]):
    """Parallel-scan integer sequence whose batches carry a batch_index tag.

    ``on_init`` enqueues one work item per chunk, each encoding
    ``(partition_id, start, end)``. A worker takes the next item with
    ``queue_pop``, emits Arrow batches tagged with that partition_id, and
    moves on to another item once the range is exhausted. DuckDB's ordered
    sinks reassemble output in partition_id order, so the final output
    matches a single-threaded scan even though the source fans out across
    threads.
    """

    # NOTE: the chunk size is left at its original small value for now.
    # Capping the partition count (as partitioned_sequence /
    # filter_echo_partitioned do) once made batch_index.test segfault
    # (exit 139). That turned out to be a pre-existing use-after-free in the
    # extension's async cancel path (the cancel dispatcher logged through the
    # destroyed query ClientContext), not a batch_index bug; the cap merely
    # triggered more stream cancellations and exposed it. With that fixed in
    # the extension, capping this fixture is safe to re-enable (verified
    # 15/15 clean under UBSan); it is deferred only so the resize lands as
    # its own change.
    CHUNK_SIZE: ClassVar[int] = 1000
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    class Meta:
        name = "partitioned_batch_index"
        description = (
            "Multi-worker partitioned sequence with per-batch batch_index tagging; "
            "parallel scan + ordered sink reassembly."
        )
        categories = ["generator", "utility"]
        preserves_order = OrderPreservation.FIXED_ORDER
        supports_batch_index = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_batch_index(100)",
                description=(
                    "Generate 0..99 in parallel; DuckDB sinks reassemble output in partition_id (insertion) order."
                ),
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_BatchIndexArgs]) -> GlobalInitResponse:
        """Split ``[0, count)`` into ``CHUNK_SIZE`` ranges and push them onto the queue."""
        total = params.args.count
        step = cls.CHUNK_SIZE
        # partition_id is simply the position of the chunk in push order.
        work_items: list[bytes] = [
            struct.pack(_ITEM_FMT, partition_id, lower, min(lower + step, total))
            for partition_id, lower in enumerate(range(0, total, step))
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_BatchIndexArgs]) -> _BatchIndexState:
        """Start with an empty cursor; the first ``process`` call pops a work item."""
        return _BatchIndexState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BatchIndexArgs],
        state: _BatchIndexState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of the current work item, popping a new item when needed.

        Calls ``out.finish()`` when a new item is needed and the queue is empty.
        """
        exhausted = state.current_idx >= (state.current_end or 0)
        if state.partition_id is None or exhausted:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            item_id, lower, upper = struct.unpack(_ITEM_FMT, packed)
            state.partition_id = item_id
            state.current_start = lower
            state.current_end = upper
            state.current_idx = lower

        # Never run past the end of the current work item.
        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        batch = pa.RecordBatch.from_pydict(
            {"n": list(range(state.current_idx, stop))},
            schema=params.output_schema,
        )
        cast(VgiOutputCollector, out).emit(batch, batch_index=state.partition_id)
        state.current_idx = stop
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# =============================================================================
|
|
178
|
+
# Two-column variant: partitioned_batch_index_marked(count, chunk_size)
|
|
179
|
+
# =============================================================================
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@dataclass(slots=True, frozen=True)
|
|
183
|
+
class _BatchIndexMarkedArgs:
|
|
184
|
+
"""Arguments for ``partitioned_batch_index_marked``."""
|
|
185
|
+
|
|
186
|
+
count: Annotated[int, Arg(0, doc="Total number of rows to generate", ge=0)]
|
|
187
|
+
chunk_size: Annotated[int, Arg("chunk_size", default=1000, doc="Rows per partition", ge=1)]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass(kw_only=True)
class _BatchIndexMarkedState(ArrowSerializableDataclass):
    """Per-worker cursor for ``partitioned_batch_index_marked``.

    Tracks the work item being emitted (``partition_id`` and its
    ``[current_start, current_end)`` range) and the next index to emit.
    """

    # Identifier of the work item being emitted; None before the first pop.
    partition_id: int | None = None
    # Inclusive start of the current work item's range.
    current_start: int | None = None
    # Exclusive end of the current work item's range.
    current_end: int | None = None
    # Next index to emit within the current range.
    current_idx: int = 0
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedBatchIndexMarkedFunction(TableFunctionGenerator[_BatchIndexMarkedArgs, _BatchIndexMarkedState]):
    """Two-column batch_index fixture that makes partition order observable.

    Each output row is ``(partition_id, seq)``: ``partition_id`` is the
    queue-push order of the work item (the same value emitted as
    batch_index) and ``seq`` counts up from 0 within that partition. Tests
    assert that no row with a higher partition_id appears before a row with
    a lower one, which shows that DuckDB's sink-side reassembly worked.

    Projection pushdown is turned off here so that ``SELECT seq FROM …``
    still has the worker emit the partition_id column; the C++ extension's
    projection drops it after the ordering metadata has been observed.
    """

    BATCH_SIZE: ClassVar[int] = 256

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(
        partition_id=pa.int64(),
        seq=pa.int64(),
    )

    class Meta:
        name = "partitioned_batch_index_marked"
        description = (
            "Two-column batch_index demo: rows are (partition_id, seq). "
            "Tests assert that DuckDB's ordered sinks reassemble output "
            "in partition_id order under parallel execution."
        )
        categories = ["generator", "utility", "testing"]
        preserves_order = OrderPreservation.FIXED_ORDER
        supports_batch_index = True
        projection_pushdown = False
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_batch_index_marked(100, chunk_size := 25) LIMIT 5",
                description="First 5 rows of partition 0",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_BatchIndexMarkedArgs]) -> GlobalInitResponse:
        """Split ``[0, count)`` into ``chunk_size`` ranges and push them onto the queue."""
        total = params.args.count
        step = params.args.chunk_size
        # partition_id is simply the position of the chunk in push order.
        work_items: list[bytes] = [
            struct.pack(_ITEM_FMT, partition_id, lower, min(lower + step, total))
            for partition_id, lower in enumerate(range(0, total, step))
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_BatchIndexMarkedArgs]) -> _BatchIndexMarkedState:
        """Start with an empty cursor; the first ``process`` call pops a work item."""
        return _BatchIndexMarkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BatchIndexMarkedArgs],
        state: _BatchIndexMarkedState,
        out: OutputCollector,
    ) -> None:
        """Emit the next ``(partition_id, seq)`` batch, popping a new item when needed.

        Calls ``out.finish()`` when a new item is needed and the queue is empty.
        """
        exhausted = state.current_idx >= (state.current_end or 0)
        if state.partition_id is None or exhausted:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            item_id, lower, upper = struct.unpack(_ITEM_FMT, packed)
            state.partition_id = item_id
            state.current_start = lower
            state.current_end = upper
            state.current_idx = lower

        # Never run past the end of the current work item.
        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        row_count = stop - state.current_idx
        # ``seq`` is relative to the start of the partition, not to the whole scan.
        base = state.current_start or 0
        columns = {
            "partition_id": [state.partition_id] * row_count,
            "seq": list(range(state.current_idx - base, stop - base)),
        }
        batch = pa.RecordBatch.from_pydict(columns, schema=params.output_schema)
        cast(VgiOutputCollector, out).emit(batch, batch_index=state.partition_id)
        state.current_idx = stop
|