vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,221 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Table-function fixtures.
4
+
5
+ Originally a single 3,270-line module; split into cohesive sub-modules and
6
+ re-exported here so existing import sites (worker.py, tests) keep working
7
+ unchanged.
8
+
9
+ If you're looking for a specific fixture, the module names below should
10
+ point you at the right file:
11
+
12
+ * :mod:`._common` — ``CountdownState``, ``_BaseSequenceFunction``
13
+ * :mod:`.sequence` — sequence / partitioned / nested / row_id
14
+ * :mod:`.make_series` — make_series_count / range / step / csv / float
15
+ * :mod:`.pairs` — make_pairs_*, repeat_value_*, constant_columns
16
+ * :mod:`.settings` — settings_aware, struct_settings, secret_demo
17
+ * :mod:`.filters` — filter_echo, dynamic_filter_echo, expression_filter,
18
+ spatial_filter
19
+ * :mod:`.catalog_scans` — colors / departments / employees / products / projects
20
+ * :mod:`.versioned` — versioned_data + versioned_constraints (time travel)
21
+ * :mod:`.misc` — projected_data, generator_exception,
22
+ logging_generator, order_echo, sample_echo
23
+ """
24
+
25
+ from vgi._test_fixtures.table.batch_index import (
26
+ PartitionedBatchIndexFunction,
27
+ PartitionedBatchIndexMarkedFunction,
28
+ )
29
+ from vgi._test_fixtures.table.batch_index_broken import (
30
+ BatchIndexOverflowFunction,
31
+ MissingBatchIndexTagFunction,
32
+ NonMonotoneBatchIndexFunction,
33
+ )
34
+ from vgi._test_fixtures.table.catalog_scans import (
35
+ ColorsScanFunction,
36
+ DepartmentsScanFunction,
37
+ EmployeesScanFunction,
38
+ ProductsScanFunction,
39
+ ProjectsScanFunction,
40
+ )
41
+ from vgi._test_fixtures.table.filters import (
42
+ DictFilterEchoFunction,
43
+ DynamicFilterEchoFunction,
44
+ ExpressionFilterTestFunction,
45
+ FilterEchoFunction,
46
+ FilterEchoPartitionedFunction,
47
+ FilterEchoTableScanFunction,
48
+ SpatialFilterExampleFunction,
49
+ ValuePruneFunction,
50
+ )
51
+ from vgi._test_fixtures.table.late_materialization import (
52
+ LateMaterializationFunction,
53
+ )
54
+ from vgi._test_fixtures.table.make_series import (
55
+ MakeSeriesCountFunction,
56
+ MakeSeriesCsvFunction,
57
+ MakeSeriesFloatFunction,
58
+ MakeSeriesRangeFunction,
59
+ MakeSeriesStepFunction,
60
+ )
61
+ from vgi._test_fixtures.table.misc import (
62
+ GeneratorExceptionFunction,
63
+ LoggingGeneratorFunction,
64
+ OrderEchoFunction,
65
+ ProjectedDataFunction,
66
+ SampleEchoFunction,
67
+ )
68
+ from vgi._test_fixtures.table.order_modes import (
69
+ PartitionedFixedOrderFunction,
70
+ PartitionedNoOrderGuaranteeFunction,
71
+ PartitionedPreservesOrderFunction,
72
+ )
73
+ from vgi._test_fixtures.table.pairs import (
74
+ ConstantColumnsFunction,
75
+ MakePairsIntFunction,
76
+ MakePairsIntStrFunction,
77
+ MakePairsStrFunction,
78
+ RepeatValueIntFunction,
79
+ RepeatValueStrFunction,
80
+ )
81
+ from vgi._test_fixtures.table.partition_columns import (
82
+ CountryPartitionedSalesFunction,
83
+ DisjointRangePartitionedFunction,
84
+ PartitionedWithExplicitOverrideFunction,
85
+ RegionYearPartitionedFunction,
86
+ )
87
+ from vgi._test_fixtures.table.partition_columns_broken import (
88
+ BrokenMissingPartitionValuesFunction,
89
+ BrokenPartitionColumnAbsentFromBatchFunction,
90
+ BrokenPartitionMinNeqMaxFunction,
91
+ BrokenPartitionValuesNoAnnotationFunction,
92
+ )
93
+ from vgi._test_fixtures.table.profiling_example import (
94
+ ProfilingDemoFunction,
95
+ )
96
+ from vgi._test_fixtures.table.required_filters import (
97
+ RFF_MULTI_COLUMNS,
98
+ RFF_NESTED_COLUMNS,
99
+ RFF_NONE_COLUMNS,
100
+ RFF_ROWID_COLUMNS,
101
+ RFF_SIMPLE_COLUMNS,
102
+ RFF_STRUCT_COLUMNS,
103
+ RffMultiScanFunction,
104
+ RffNestedScanFunction,
105
+ RffNoneScanFunction,
106
+ RffRowidScanFunction,
107
+ RffSimpleScanFunction,
108
+ RffStructScanFunction,
109
+ )
110
+ from vgi._test_fixtures.table.sequence import (
111
+ DoubleSequenceFunction,
112
+ NamedParamsEchoFunction,
113
+ NestedSequenceFunction,
114
+ PartitionedSequenceFunction,
115
+ RowIdSequenceFunction,
116
+ SequenceFunction,
117
+ TenThousandFunction,
118
+ )
119
+ from vgi._test_fixtures.table.settings import (
120
+ ScopedSecretDemoFunction,
121
+ SecretDemoFunction,
122
+ SettingsAwareFunction,
123
+ StructSettingsFunction,
124
+ )
125
+ from vgi._test_fixtures.table.transaction_storage import TxCachedValueFunction
126
+ from vgi._test_fixtures.table.versioned import (
127
+ _CURRENT_VERSION,
128
+ _VERSIONED_CONSTRAINTS_CURRENT,
129
+ _VERSIONED_CONSTRAINTS_DATA,
130
+ _VERSIONED_CONSTRAINTS_SCHEMAS,
131
+ _VERSIONED_DATA,
132
+ _VERSIONED_SCHEMAS,
133
+ VersionedConstraintsScanFunction,
134
+ VersionedDataFunction,
135
+ resolve_version,
136
+ resolve_versioned_constraints_version,
137
+ )
138
+
139
# Public re-export surface of the table-fixture package. Ordering convention:
# underscore-prefixed constants first, then classes and RFF_* constants in
# case-insensitive alphabetical order, then the module-level helper functions.
__all__ = [
    "_CURRENT_VERSION",
    "_VERSIONED_CONSTRAINTS_CURRENT",
    "_VERSIONED_CONSTRAINTS_DATA",
    "_VERSIONED_CONSTRAINTS_SCHEMAS",
    "_VERSIONED_DATA",
    "_VERSIONED_SCHEMAS",
    "BatchIndexOverflowFunction",
    "BrokenMissingPartitionValuesFunction",
    "BrokenPartitionColumnAbsentFromBatchFunction",
    "BrokenPartitionMinNeqMaxFunction",
    "BrokenPartitionValuesNoAnnotationFunction",
    "ColorsScanFunction",
    "ConstantColumnsFunction",
    "CountryPartitionedSalesFunction",
    "DepartmentsScanFunction",
    "DictFilterEchoFunction",
    "DisjointRangePartitionedFunction",
    "DoubleSequenceFunction",
    "DynamicFilterEchoFunction",
    "EmployeesScanFunction",
    "ExpressionFilterTestFunction",
    "FilterEchoFunction",
    "FilterEchoPartitionedFunction",
    "FilterEchoTableScanFunction",
    "GeneratorExceptionFunction",
    "LateMaterializationFunction",
    "LoggingGeneratorFunction",
    "MakePairsIntFunction",
    "MakePairsIntStrFunction",
    "MakePairsStrFunction",
    "MakeSeriesCountFunction",
    "MakeSeriesCsvFunction",
    "MakeSeriesFloatFunction",
    "MakeSeriesRangeFunction",
    "MakeSeriesStepFunction",
    "MissingBatchIndexTagFunction",
    "NamedParamsEchoFunction",
    "NestedSequenceFunction",
    "NonMonotoneBatchIndexFunction",
    "OrderEchoFunction",
    "PartitionedBatchIndexFunction",
    "PartitionedBatchIndexMarkedFunction",
    "PartitionedFixedOrderFunction",
    "PartitionedNoOrderGuaranteeFunction",
    "PartitionedPreservesOrderFunction",
    "PartitionedSequenceFunction",
    "PartitionedWithExplicitOverrideFunction",
    "ProductsScanFunction",
    "ProfilingDemoFunction",
    "ProjectedDataFunction",
    "ProjectsScanFunction",
    "RegionYearPartitionedFunction",
    "RepeatValueIntFunction",
    "RepeatValueStrFunction",
    "RFF_MULTI_COLUMNS",
    "RFF_NESTED_COLUMNS",
    "RFF_NONE_COLUMNS",
    "RFF_ROWID_COLUMNS",
    "RFF_SIMPLE_COLUMNS",
    "RFF_STRUCT_COLUMNS",
    "RffMultiScanFunction",
    "RffNestedScanFunction",
    "RffNoneScanFunction",
    "RffRowidScanFunction",
    "RffSimpleScanFunction",
    "RffStructScanFunction",
    "RowIdSequenceFunction",
    "SampleEchoFunction",
    "ScopedSecretDemoFunction",
    "SecretDemoFunction",
    "SequenceFunction",
    "SettingsAwareFunction",
    "SpatialFilterExampleFunction",
    "StructSettingsFunction",
    "TenThousandFunction",
    "TxCachedValueFunction",
    "ValuePruneFunction",
    "VersionedConstraintsScanFunction",
    "VersionedDataFunction",
    "resolve_version",
    "resolve_versioned_constraints_version",
]
@@ -0,0 +1,162 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Shared infrastructure for table fixture functions.
4
+
5
+ Holds the cardinality decorator, the common ``CountdownState``, the
6
+ ``CountBatchArgs`` base for fixtures that take ``(count, batch_size)``,
7
+ and the ``_BaseSequenceFunction`` template-method base class for
8
+ countdown-style generators.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from typing import Annotated, Any, ClassVar
15
+
16
+ import numpy as np
17
+ import pyarrow as pa
18
+ from vgi_rpc import ArrowSerializableDataclass
19
+ from vgi_rpc.rpc import OutputCollector
20
+
21
+ from vgi.arguments import Arg
22
+ from vgi.catalog.catalog_interface import ColumnStatistics
23
+ from vgi.table_function import (
24
+ BindParams,
25
+ ProcessParams,
26
+ TableCardinality,
27
+ TableFunctionGenerator,
28
+ )
29
+
30
+
31
+ def _cardinality_from_count[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
32
+ """Class decorator to implement cardinality() based on a 'count' argument."""
33
+ if "cardinality" not in cls.__dict__: # only inject if subclass hasn't overridden
34
+
35
+ def cardinality_impl(cls_: type[T], params: BindParams[Any]) -> TableCardinality:
36
+ count = getattr(params.args, "count", None)
37
+ if not isinstance(count, int) or count < 0:
38
+ raise ValueError(f"Expected a non-negative integer 'count' argument for {cls_.__name__}")
39
+ return TableCardinality(estimate=count, max=count)
40
+
41
+ cls.cardinality = classmethod(cardinality_impl) # type: ignore[assignment]
42
+
43
+ return cls
44
+
45
+
46
@dataclass(kw_only=True)
class CountdownState(ArrowSerializableDataclass):
    """Mutable countdown cursor: rows still owed plus the current position.

    Both fields are keyword-only (``kw_only=True``).
    """

    # Number of rows still to be produced; has no default, so it is required.
    remaining: int
    # Position of the next row to produce; a fresh state starts at 0.
    current_index: int = 0
52
+
53
+
54
@dataclass(frozen=True)
class CountBatchArgs:
    """Shared ``(count, batch_size)`` arguments for countdown-style fixtures.

    Fixtures that need extra knobs subclass this instead of re-declaring the
    two common fields. ``slots=True`` is deliberately left off so that
    subclasses can add fields without running into slot conflicts.
    """

    # Positional argument 0; validated as >= 0.
    count: Annotated[
        int,
        Arg(0, doc="Number of rows to generate", ge=0),
    ]
    # Named argument ``batch_size``; defaults to 1000 and validated as >= 1.
    batch_size: Annotated[
        int,
        Arg("batch_size", default=1000, doc="Batch size for output", ge=1),
    ]
65
+
66
+
67
+ @dataclass(slots=True, frozen=True)
68
+ class _EmptyArgs:
69
+ """No arguments."""
70
+
71
+
72
@dataclass(kw_only=True)
class _OneShotState(ArrowSerializableDataclass):
    """State for fixtures that emit their data a single time."""

    # Completion flag; a freshly constructed state has it cleared.
    done: bool = False
77
+
78
+
79
class _BaseSequenceFunction(TableFunctionGenerator[Any, CountdownState]):
    """Template-method base for countdown-style fixture generators.

    Supplies ``initial_state``, the countdown bookkeeping in ``process``,
    exact column ``statistics``, and a default ``_emit_chunk`` that emits a
    single column built with ``numpy.arange``. Subclasses whose output is not
    an arange (echoes, nested types, row-id sequences, ...) override
    ``_emit_chunk`` and should override ``statistics`` as well.

    Class attributes:
        NUMPY_DTYPE: dtype passed to ``numpy.arange`` by the default
            ``_emit_chunk``.
        STATS_ARROW_TYPE: Arrow type of the min/max scalars built by
            ``statistics``.
        STATS_COLUMN_NAME: column name reported by ``statistics``.
        BATCH_SIZE_FALLBACK: batch size used by ``process`` when
            ``params.args`` has no ``batch_size`` attribute, i.e. fixtures
            that want a fixed batch size rather than a user knob.
    """

    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.int64
    STATS_ARROW_TYPE: ClassVar[pa.DataType] = pa.int64()
    STATS_COLUMN_NAME: ClassVar[str] = "n"
    BATCH_SIZE_FALLBACK: ClassVar[int] = 1000

    @classmethod
    def initial_state(cls, params: ProcessParams[Any]) -> CountdownState:
        """Create the initial state with ``remaining`` set to ``params.args.count``."""
        return CountdownState(remaining=params.args.count)

    @classmethod
    def statistics(cls, params: BindParams[Any]) -> list[ColumnStatistics] | None:
        """Exact per-column stats derived from the user's bind args.

        The default output is ``count`` values ``0, increment,
        2 * increment, ...``; ``increment`` is taken as 1 when the args have
        no such attribute. The bounds are ordered, so a negative increment
        reports ``min = (count - 1) * increment`` and ``max = 0`` instead of
        an inverted range.

        Returns:
            An empty list when ``count`` is missing, is not an ``int``, or is
            not positive; otherwise a one-element list describing
            ``STATS_COLUMN_NAME`` with no nulls and ``count`` distinct values.
            Fixtures whose output isn't a single arange column should
            override this method.
        """
        count = getattr(params.args, "count", None)
        increment = getattr(params.args, "increment", 1)
        if not isinstance(count, int) or count <= 0:
            return []
        # The sequence starts at 0 and ends at ``last_value``; which of the
        # two is the minimum depends on the sign of the increment.
        last_value = (count - 1) * increment
        min_value = min(0, last_value)
        max_value = max(0, last_value)
        return [
            ColumnStatistics(
                column_name=cls.STATS_COLUMN_NAME,
                min=pa.scalar(min_value, cls.STATS_ARROW_TYPE),
                max=pa.scalar(max_value, cls.STATS_ARROW_TYPE),
                has_null=False,
                has_not_null=True,
                distinct_count=count,
            )
        ]

    @classmethod
    def process(cls, params: ProcessParams[Any], state: CountdownState, out: OutputCollector) -> None:
        """Run one step of the countdown loop; batch contents come from ``_emit_chunk``.

        Calls ``out.finish()`` once ``state.remaining`` has dropped to zero
        or below. Otherwise emits at most ``batch_size`` rows (falling back
        to ``BATCH_SIZE_FALLBACK`` when the args have no ``batch_size``) and
        advances ``state.current_index`` / ``state.remaining`` accordingly.
        """
        if state.remaining <= 0:
            out.finish()
            return

        batch_size = getattr(params.args, "batch_size", cls.BATCH_SIZE_FALLBACK)
        size = min(state.remaining, batch_size)
        cls._emit_chunk(params, state, out, state.current_index, size)
        # Bookkeeping lives here so that ``_emit_chunk`` overrides never
        # have to touch the cursor fields.
        state.current_index += size
        state.remaining -= size

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[Any],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Default implementation: emit ``size`` arange values scaled by ``increment``.

        Subclasses with non-arange output override this hook. ``state`` is
        passed in case subclasses want to track additional info; the standard
        countdown bookkeeping (``current_index``/``remaining``) is handled by
        ``process`` itself, so subclass hooks should NOT mutate them.

        Args:
            params: process parameters; ``params.args.increment`` is used
                when present and defaults to 1 otherwise, matching
                ``statistics``.
            state: current countdown state (unused by this default).
            out: collector that receives the emitted record batch.
            start: row index of the first value in this chunk.
            size: number of rows to emit.
        """
        increment = getattr(params.args, "increment", 1)
        values = np.arange(
            start * increment,
            (start + size) * increment,
            increment,
            dtype=cls.NUMPY_DTYPE,
        )
        out.emit(pa.RecordBatch.from_arrays([pa.array(values)], schema=params.output_schema))
@@ -0,0 +1,283 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Partitioned-queue fixtures that opt in to ``supports_batch_index``.
4
+
5
+ These exist so SQL integration tests can verify the batch_index feature:
6
+
7
+ * ``partitioned_batch_index(count)`` — single-column ``n int64`` output;
8
+ parallel scan with FIXED_ORDER preservation. Each queue item is tagged
9
+ with a stable partition_id; the worker emits Arrow batches tagged with
10
+ that id via ``out.emit(batch, batch_index=partition_id)``. The DuckDB
11
+ extension reads the tag from each batch's KeyValueMetadata, threads it
12
+ through ``TableFunction::get_partition_data``, and ordered sinks
13
+ (``PhysicalBatchCollector``, ``PhysicalBatchInsert``,
14
+ ``PhysicalBatchCopyToFile``, ``PhysicalLimit``) reassemble output in
15
+ partition_id order. The FIXED_ORDER ``MaxThreads=1`` clamp is dropped
16
+ for opted-in functions.
17
+
18
+ * ``partitioned_batch_index_marked(count, chunk_size)`` — two-column
19
+ ``(partition_id int64, seq int64)`` output. Lets tests directly
20
+ observe partition boundaries in the output stream (e.g. "no row with
21
+ partition_id=N appears after a row with partition_id=N+1"). Projection
22
+ pushdown is disabled so the ``partition_id`` column survives even
23
+ ``SELECT seq FROM …`` queries.
24
+
25
+ The worker uses the existing in-process ``state`` to carry per-worker
26
+ cursor information across ``process()`` calls — same approach as
27
+ ``_BasePartitionedOrderMode`` in ``order_modes.py``. HTTP transport's
28
+ existing STATE_KEY mechanism (in vgi_rpc.http) round-trips this state
29
+ across requests; nothing new is added for HTTP resumption.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import struct
35
+ from dataclasses import dataclass
36
+ from typing import Annotated, ClassVar, cast
37
+
38
+ import pyarrow as pa
39
+ from vgi_rpc import ArrowSerializableDataclass
40
+ from vgi_rpc.rpc import OutputCollector
41
+
42
+ from vgi._test_fixtures.table._common import _cardinality_from_count
43
+ from vgi.arguments import Arg
44
+ from vgi.invocation import GlobalInitResponse
45
+ from vgi.metadata import FunctionExample, OrderPreservation
46
+ from vgi.protocol import VgiOutputCollector
47
+ from vgi.schema_utils import schema
48
+ from vgi.table_function import (
49
+ InitParams,
50
+ ProcessParams,
51
+ TableFunctionGenerator,
52
+ bind_fixed_schema,
53
+ )
54
+
55
# Wire layout of one work-queue entry: three big-endian unsigned 64-bit
# integers holding (partition_id, start, end). ``process()`` unpacks the entry
# on the worker, and the partition_id is what gets attached to every emitted
# batch through the ``batch_index=`` keyword.
_ITEM_FMT = ">QQQ"
# Packed size of one entry in bytes.
_ITEM_SIZE = struct.Struct(_ITEM_FMT).size
60
+
61
+
62
+ # =============================================================================
63
+ # Single-column variant: partitioned_batch_index(count)
64
+ # =============================================================================
65
+
66
+
67
@dataclass(frozen=True, slots=True)
class _BatchIndexArgs:
    """Bind arguments accepted by ``partitioned_batch_index``."""

    # Positional argument 0; validated as >= 0.
    count: Annotated[
        int,
        Arg(0, doc="Total number of integers to generate", ge=0),
    ]
72
+
73
+
74
@dataclass(kw_only=True)
class _BatchIndexState(ArrowSerializableDataclass):
    """Per-worker cursor over the work item currently being produced.

    A fresh state has no work item: the three optional fields are ``None``
    until ``process()`` pops an item off the queue and fills them in. The
    current item counts as exhausted once ``current_idx`` reaches
    ``current_end``; the following ``process()`` call then pops a new item
    and overwrites all four fields.
    """

    # Queue-push order of the current work item; sent with every emitted
    # batch through the ``batch_index=`` keyword.
    partition_id: int | None = None
    # Half-open range ``[current_start, current_end)`` covered by the item.
    current_start: int | None = None
    current_end: int | None = None
    # Next value to emit; advances from ``current_start`` towards ``current_end``.
    current_idx: int = 0
89
+
90
+
91
@bind_fixed_schema
@_cardinality_from_count
class PartitionedBatchIndexFunction(TableFunctionGenerator[_BatchIndexArgs, _BatchIndexState]):
    """Parallel-scan integer sequence whose batches carry a batch_index tag.

    ``on_init`` splits ``[0, count)`` into ``CHUNK_SIZE``-sized work items,
    each packed as ``(partition_id, start, end)``, and pushes them onto the
    storage queue. ``process`` pops the next item, emits its values in
    batches tagged with that partition_id, and moves on to another item once
    the current one is exhausted. The tag is what lets DuckDB's ordered sinks
    reassemble the output in partition_id order, so the final result matches
    a single-threaded scan even though the source fans out across threads.
    """

    # NOTE: the chunk size is intentionally still the original small value.
    # Capping the partition count (the way partitioned_sequence and
    # filter_echo_partitioned do) once made batch_index.test segfault
    # (exit 139). The root cause was a pre-existing use-after-free in the
    # extension's async cancel path (the cancel dispatcher logged through the
    # destroyed query ClientContext), not a batch_index bug; the cap merely
    # triggered more stream cancellations and exposed it. With that fixed in
    # the extension, capping this fixture is safe to re-enable (verified
    # 15/15 clean under UBSan); it is deferred only so the resize lands as
    # its own change.
    CHUNK_SIZE: ClassVar[int] = 1000
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    class Meta:
        name = "partitioned_batch_index"
        description = (
            "Multi-worker partitioned sequence with per-batch batch_index "
            "tagging; parallel scan + ordered sink reassembly."
        )
        categories = ["generator", "utility"]
        preserves_order = OrderPreservation.FIXED_ORDER
        supports_batch_index = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_batch_index(100)",
                description=(
                    "Generate 0..99 in parallel; DuckDB sinks reassemble output in partition_id (insertion) order."
                ),
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_BatchIndexArgs]) -> GlobalInitResponse:
        """Push one packed ``(partition_id, start, end)`` item per chunk of ``[0, count)``."""
        total = params.args.count
        chunk = cls.CHUNK_SIZE
        # partition_id is simply the position of the chunk in push order;
        # the final chunk is clipped to ``total``.
        work_items: list[bytes] = [
            struct.pack(_ITEM_FMT, partition_id, lower, min(lower + chunk, total))
            for partition_id, lower in enumerate(range(0, total, chunk))
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_BatchIndexArgs]) -> _BatchIndexState:
        """Start with an empty cursor; the first ``process`` call pops a work item."""
        return _BatchIndexState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BatchIndexArgs],
        state: _BatchIndexState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of the current work item, popping a new item when needed.

        Calls ``out.finish()`` and returns without emitting when a new item
        is needed but the queue is empty.
        """
        needs_item = state.partition_id is None or state.current_idx >= (state.current_end or 0)
        if needs_item:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            popped_id, lower, upper = struct.unpack(_ITEM_FMT, packed)
            state.partition_id = popped_id
            state.current_start = lower
            state.current_end = upper
            state.current_idx = lower

        # Never run past the end of the current work item.
        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        batch = pa.RecordBatch.from_pydict(
            {"n": list(range(state.current_idx, stop))},
            schema=params.output_schema,
        )
        # ``batch_index=`` is passed through the VgiOutputCollector interface,
        # hence the cast from the declared OutputCollector parameter type.
        cast(VgiOutputCollector, out).emit(batch, batch_index=state.partition_id)
        state.current_idx = stop
175
+
176
+
177
+ # =============================================================================
178
+ # Two-column variant: partitioned_batch_index_marked(count, chunk_size)
179
+ # =============================================================================
180
+
181
+
182
@dataclass(frozen=True, slots=True)
class _BatchIndexMarkedArgs:
    """Bind arguments accepted by ``partitioned_batch_index_marked``."""

    # Positional argument 0; validated as >= 0.
    count: Annotated[
        int,
        Arg(0, doc="Total number of rows to generate", ge=0),
    ]
    # Named argument ``chunk_size``; defaults to 1000 and validated as >= 1.
    chunk_size: Annotated[
        int,
        Arg("chunk_size", default=1000, doc="Rows per partition", ge=1),
    ]
187
+ chunk_size: Annotated[int, Arg("chunk_size", default=1000, doc="Rows per partition", ge=1)]
188
+
189
+
190
@dataclass(kw_only=True)
class _BatchIndexMarkedState(ArrowSerializableDataclass):
    """Per-worker cursor for ``partitioned_batch_index_marked``.

    The optional fields stay ``None`` until ``process()`` pops a work item
    and fills them in.
    """

    # Queue-push order of the current work item; also emitted as the
    # ``partition_id`` column and as the batch_index tag.
    partition_id: int | None = None
    # Half-open range ``[current_start, current_end)`` covered by the item.
    current_start: int | None = None
    current_end: int | None = None
    # Next absolute row position to emit within that range.
    current_idx: int = 0
196
+
197
+
198
@bind_fixed_schema
@_cardinality_from_count
class PartitionedBatchIndexMarkedFunction(TableFunctionGenerator[_BatchIndexMarkedArgs, _BatchIndexMarkedState]):
    """Two-column batch_index fixture that makes partition boundaries visible.

    Every output row is ``(partition_id, seq)``: ``partition_id`` is the
    queue-push order of the work item (the same value sent as the
    batch_index tag) and ``seq`` counts up from 0 inside each partition.
    Tests assert that no row with a higher partition_id shows up before a row
    with a lower one, which demonstrates that DuckDB's sink-side reassembly
    worked.

    Projection pushdown is switched off so that ``SELECT seq FROM …`` still
    has the worker emit the partition_id column; the C++ extension's
    projection drops it after the ordering metadata has been observed.
    """

    BATCH_SIZE: ClassVar[int] = 256

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(
        partition_id=pa.int64(),
        seq=pa.int64(),
    )

    class Meta:
        name = "partitioned_batch_index_marked"
        description = (
            "Two-column batch_index demo: rows are (partition_id, seq). Tests "
            "assert that DuckDB's ordered sinks reassemble output in "
            "partition_id order under parallel execution."
        )
        categories = ["generator", "utility", "testing"]
        preserves_order = OrderPreservation.FIXED_ORDER
        supports_batch_index = True
        projection_pushdown = False
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_batch_index_marked(100, chunk_size := 25) LIMIT 5",
                description="First 5 rows of partition 0",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_BatchIndexMarkedArgs]) -> GlobalInitResponse:
        """Push one packed ``(partition_id, start, end)`` item per ``chunk_size`` rows."""
        total = params.args.count
        chunk = params.args.chunk_size
        # partition_id is the position of the chunk in push order; the final
        # chunk is clipped to ``total``.
        work_items: list[bytes] = [
            struct.pack(_ITEM_FMT, partition_id, lower, min(lower + chunk, total))
            for partition_id, lower in enumerate(range(0, total, chunk))
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_BatchIndexMarkedArgs]) -> _BatchIndexMarkedState:
        """Start with an empty cursor; the first ``process`` call pops a work item."""
        return _BatchIndexMarkedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BatchIndexMarkedArgs],
        state: _BatchIndexMarkedState,
        out: OutputCollector,
    ) -> None:
        """Emit the next ``(partition_id, seq)`` batch, popping a new work item when needed.

        Calls ``out.finish()`` and returns without emitting when a new item
        is needed but the queue is empty.
        """
        needs_item = state.partition_id is None or state.current_idx >= (state.current_end or 0)
        if needs_item:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            popped_id, lower, upper = struct.unpack(_ITEM_FMT, packed)
            state.partition_id = popped_id
            state.current_start = lower
            state.current_end = upper
            state.current_idx = lower

        # Never run past the end of the current work item.
        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        row_count = stop - state.current_idx
        # ``seq`` is relative to the start of the partition, not absolute.
        origin = state.current_start or 0
        columns = {
            "partition_id": [state.partition_id] * row_count,
            "seq": list(range(state.current_idx - origin, stop - origin)),
        }
        # ``batch_index=`` is passed through the VgiOutputCollector interface,
        # hence the cast from the declared OutputCollector parameter type.
        cast(VgiOutputCollector, out).emit(
            pa.RecordBatch.from_pydict(columns, schema=params.output_schema),
            batch_index=state.partition_id,
        )
        state.current_idx = stop