vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,472 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Reference fixtures for the v2 PartitionColumns (Hive-style) batch_index mode.
4
+
5
+ These exercise the ``Meta.partition_kind`` + ``partition_field()``
6
+ opt-in. The C++ extension installs ``TableFunction::get_partition_info``
7
+ returning the declared kind, and ``get_partition_data`` populates
8
+ ``OperatorPartitionData::partition_data`` per chunk so DuckDB's planner
9
+ can pick ``PhysicalPartitionedAggregate`` for matching ``GROUP BY``
10
+ queries.
11
+
12
+ Today DuckDB consumes only ``SINGLE_VALUE_PARTITIONS``; OVERLAPPING /
13
+ DISJOINT are wire-level declarable and the C++ extension reports them
14
+ back to the planner, which falls back to ``HASH_GROUP_BY`` for those
15
+ modes until upstream adds consumers.
16
+
17
+ Fixtures:
18
+
19
+ * :class:`CountryPartitionedSalesFunction` — single-column
20
+ SINGLE_VALUE. Each emitted chunk has a single ``country`` value.
21
+ Core fixture for the planner-check assertion.
22
+
23
+ * :class:`RegionYearPartitionedFunction` — multi-column SINGLE_VALUE.
24
+ Each chunk has a single ``(region, year)`` tuple.
25
+
26
+ * :class:`PartitionedWithExplicitOverrideFunction` — declares
27
+ partition on ``category`` and DOES include ``category`` in the
28
+ emitted batch, but still passes the explicit ``partition_values=``
29
+ override on ``out.emit`` to exercise the explicit-override code path.
30
+
31
+ * :class:`DisjointRangePartitionedFunction` — declares
32
+ ``DISJOINT_PARTITIONS``. Each chunk's ``key`` column has a distinct
33
+ disjoint integer range. Verifies the wire path; DuckDB falls back to
34
+ ``HASH_GROUP_BY`` for GROUP BY queries against it.
35
+
36
+ All fixtures use the work-queue pattern (``on_init`` pushes one item
37
+ per partition; each ``process`` call pops one item and emits one batch)
38
+ — they stay simple because the v2 plan is about correctness of the
39
+ partition contract, not parallelism stress. The v1
40
+ partitioned_batch_index stress fixtures already exercise the parallel-emit code path.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import struct
46
+ from dataclasses import dataclass
47
+ from typing import Annotated, ClassVar, cast
48
+
49
+ import pyarrow as pa
50
+ from vgi_rpc import ArrowSerializableDataclass
51
+ from vgi_rpc.rpc import OutputCollector
52
+
53
+ from vgi._test_fixtures.table._common import _cardinality_from_count
54
+ from vgi.arguments import Arg
55
+ from vgi.invocation import GlobalInitResponse
56
+ from vgi.metadata import FunctionExample, PartitionKind
57
+ from vgi.protocol import VgiOutputCollector
58
+ from vgi.schema_utils import partition_field
59
+ from vgi.table_function import (
60
+ InitParams,
61
+ ProcessParams,
62
+ TableFunctionGenerator,
63
+ bind_fixed_schema,
64
+ )
65
+
66
+ # =============================================================================
67
+ # Single-column SINGLE_VALUE_PARTITIONS — core fixture
68
+ # =============================================================================
69
+
70
+
71
@dataclass(slots=True, frozen=True)
class _CountryPartitionedArgs:
    """Arguments for ``country_partitioned_sales``.

    Immutable, slot-based argument record with a single field.
    """

    # Row count of each per-country batch. ``Arg(0, ...)`` presumably binds the
    # first positional SQL argument (the Meta example calls the function as
    # ``country_partitioned_sales(100)``) — confirm against ``vgi.arguments.Arg``.
    # ``ge=1`` declares a lower bound of one.
    rows_per_country: Annotated[int, Arg(0, doc="Rows to emit per country partition", ge=1)]
76
+
77
+
78
@dataclass(kw_only=True)
class _CountryPartitionedState(ArrowSerializableDataclass):
    """Per-worker cursor over countries.

    ``current_country`` and ``current_country_idx`` are set when the
    worker pops a queue item. ``current_idx`` is reset to 0 on that pop
    and set to the per-country row quota once the partition's single
    batch has been emitted, which makes the next ``process`` call pop
    the next item.
    """

    # Name of the country currently being emitted; ``None`` until the first pop.
    current_country: str | None = None
    # Index of that country in ``_COUNTRIES``; -1 until the first pop.
    current_country_idx: int = -1
    # 0 right after a pop, ``rows_per_country`` once the batch has been emitted.
    current_idx: int = 0
90
+
91
+
92
+ # A small, fixed list of partition values gives the SQL tests stable
93
+ # expected outputs and a predictable number of partitions (5).
94
+ _COUNTRIES: list[str] = ["AU", "BR", "CA", "FR", "US"]
95
+ # Queue items are a single packed partition index (no name bytes). Each
96
+ # ``process`` call pops one item and emits one Arrow batch for it.
97
+ _QUEUE_ITEM_FMT = ">i" # int32 country_idx; country name lives in
98
+ # ``_COUNTRIES[idx]`` (avoids variable-length
99
+ # encoding for what's already a stable index).
100
+
101
+
102
@bind_fixed_schema
@_cardinality_from_count
class CountryPartitionedSalesFunction(TableFunctionGenerator[_CountryPartitionedArgs, _CountryPartitionedState]):
    """Table function emitting one Arrow batch per entry of ``_COUNTRIES``.

    Every emitted batch holds exactly one distinct ``country`` value,
    which is what the declared ``SINGLE_VALUE_PARTITIONS`` kind promises.
    The C++ extension reports that kind from ``get_partition_info`` when
    the planner asks about ``country``, so ``GROUP BY country`` plans as
    ``PARTITIONED_AGGREGATE``.

    Partitions are handed out through the storage work queue: ``on_init``
    pushes one packed index per country and each ``process`` call pops
    one of them, so a multi-worker scan spreads partitions across threads
    (each item is intended to be processed exactly once), matching the v1
    ``partitioned_batch_index`` model.
    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("country", pa.string()),
            pa.field("sales", pa.int64()),
        ]
    )

    class Meta:
        name = "country_partitioned_sales"
        description = (
            "Per-country sales rows, one Arrow batch per country. Declares country as a SINGLE_VALUE partition column."
        )
        categories = ["generator", "partitioning"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS
        examples = [
            FunctionExample(
                sql="SELECT country, SUM(sales) FROM country_partitioned_sales(100) GROUP BY country",
                description="Partitioned aggregate over country",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_CountryPartitionedArgs]) -> GlobalInitResponse:
        """Push one packed ``_COUNTRIES`` index per country onto the work queue."""
        packed_indices = [struct.pack(_QUEUE_ITEM_FMT, idx) for idx, _ in enumerate(_COUNTRIES)]
        params.storage.queue_push(packed_indices)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_CountryPartitionedArgs]) -> _CountryPartitionedState:
        """Return a fresh cursor that has not claimed any country yet."""
        return _CountryPartitionedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_CountryPartitionedArgs],
        state: _CountryPartitionedState,
        out: OutputCollector,
    ) -> None:
        """Claim the next country if needed and emit its single batch.

        Calls ``out.finish()`` and returns without emitting once the
        queue is empty.
        """
        row_count = params.args.rows_per_country
        partition_done = state.current_idx >= row_count
        if state.current_country is None or partition_done:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            (country_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_country_idx = country_idx
            state.current_country = _COUNTRIES[country_idx]
            state.current_idx = 0

        # Deterministic, unique sales values per (country, row): each
        # country starts at its index times one million, which keeps the
        # SQL test's SUM checks easy to write.
        first_sale = state.current_country_idx * 1_000_000
        columns = {
            "country": [state.current_country] * row_count,
            "sales": list(range(first_sale, first_sale + row_count)),
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA))
        # The whole partition went out in one batch; mark it exhausted so
        # the next call pops a new queue item.
        state.current_idx = row_count
176
+
177
+
178
+ # =============================================================================
179
+ # Multi-column SINGLE_VALUE_PARTITIONS
180
+ # =============================================================================
181
+
182
+
183
@dataclass(slots=True, frozen=True)
class _RegionYearArgs:
    """Arguments for ``region_year_partitioned``.

    Immutable, slot-based argument record with a single field.
    """

    # Row count of each (region, year) batch. ``Arg(0, ...)`` presumably binds
    # the first positional SQL argument (the Meta example calls the function as
    # ``region_year_partitioned(100)``) — confirm against ``vgi.arguments.Arg``.
    # ``ge=1`` declares a lower bound of one.
    rows_per_partition: Annotated[int, Arg(0, doc="Rows per (region, year) partition", ge=1)]
188
+
189
+
190
@dataclass(kw_only=True)
class _RegionYearState(ArrowSerializableDataclass):
    # Per-worker cursor over the (region, year) partitions.

    # Index into ``_REGIONS_YEARS`` of the partition claimed last; -1 before any pop.
    current_partition_idx: int = -1
    # 0 right after a pop, ``rows_per_partition`` once the batch has been emitted.
    current_idx: int = 0
    # False until the first queue item has been popped.
    started: bool = False
195
+
196
+
197
+ # (region, year) tuples — 6 partitions total
198
+ _REGIONS_YEARS: list[tuple[str, int]] = [
199
+ ("AMER", 2023),
200
+ ("AMER", 2024),
201
+ ("EMEA", 2023),
202
+ ("EMEA", 2024),
203
+ ("APAC", 2023),
204
+ ("APAC", 2024),
205
+ ]
206
+
207
+
208
@bind_fixed_schema
@_cardinality_from_count
class RegionYearPartitionedFunction(TableFunctionGenerator[_RegionYearArgs, _RegionYearState]):
    """Table function emitting one batch per ``(region, year)`` pair.

    Both partition columns are single-valued inside every emitted batch.
    Partitions are handed out through the storage work queue so a
    multi-worker scan distributes them across threads.
    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("region", pa.string()),
            partition_field("year", pa.int64()),
            pa.field("value", pa.float64()),
        ]
    )

    class Meta:
        name = "region_year_partitioned"
        description = (
            "Per-(region, year) value rows. Declares both region and year "
            "as SINGLE_VALUE partition columns; GROUP BY region, year "
            "plans as PARTITIONED_AGGREGATE."
        )
        categories = ["generator", "partitioning"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS
        examples = [
            FunctionExample(
                sql="SELECT region, year, AVG(value) FROM region_year_partitioned(100) GROUP BY region, year",
                description="Partitioned aggregate over (region, year)",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_RegionYearArgs]) -> GlobalInitResponse:
        """Push one packed ``_REGIONS_YEARS`` index per partition onto the work queue."""
        packed_indices = [struct.pack(_QUEUE_ITEM_FMT, idx) for idx, _ in enumerate(_REGIONS_YEARS)]
        params.storage.queue_push(packed_indices)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_RegionYearArgs]) -> _RegionYearState:
        """Return a fresh cursor that has not claimed any partition yet."""
        return _RegionYearState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_RegionYearArgs],
        state: _RegionYearState,
        out: OutputCollector,
    ) -> None:
        """Claim the next ``(region, year)`` partition if needed and emit its batch.

        Calls ``out.finish()`` and returns without emitting once the
        queue is empty.
        """
        row_count = params.args.rows_per_partition
        still_in_partition = state.started and state.current_idx < row_count
        if not still_in_partition:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            (state.current_partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_idx = 0
            state.started = True

        region, year = _REGIONS_YEARS[state.current_partition_idx]
        # Values start at 1000 * partition index so each partition's rows are distinct.
        first_value = float(state.current_partition_idx * 1000)
        columns = {
            "region": [region] * row_count,
            "year": [year] * row_count,
            "value": [first_value + float(offset) for offset in range(row_count)],
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA))
        # One batch per partition; mark it exhausted so the next call pops again.
        state.current_idx = row_count
281
+
282
+
283
+ # =============================================================================
284
+ # Projected-out partition column — exercises explicit override path
285
+ # =============================================================================
286
+
287
+
288
@dataclass(slots=True, frozen=True)
class _ProjectedOutArgs:
    """Arguments for ``partitioned_with_explicit_override``.

    Immutable, slot-based argument record with a single field.
    """

    # Row count of each per-category batch. ``Arg(0, ...)`` presumably binds the
    # first positional SQL argument — confirm against ``vgi.arguments.Arg``.
    # ``ge=1`` declares a lower bound of one.
    rows_per_category: Annotated[int, Arg(0, doc="Rows per category partition", ge=1)]
293
+
294
+
295
@dataclass(kw_only=True)
class _ProjectedOutState(ArrowSerializableDataclass):
    # Per-worker cursor over the category partitions.

    # Index into ``_CATEGORIES`` of the partition claimed last; -1 before any pop.
    current_category_idx: int = -1
    # 0 right after a pop, ``rows_per_category`` once the batch has been emitted.
    current_idx: int = 0
    # False until the first queue item has been popped.
    started: bool = False
300
+
301
+
302
+ _CATEGORIES: list[str] = ["books", "music", "video"]
303
+
304
+
305
@bind_fixed_schema
@_cardinality_from_count
class PartitionedWithExplicitOverrideFunction(TableFunctionGenerator[_ProjectedOutArgs, _ProjectedOutState]):
    """Fixture for the explicit ``partition_values=`` override on ``out.emit``.

    The emitted batches DO contain the ``category`` partition column, so
    auto-extraction would work; the values are passed explicitly anyway
    to drive the type-validation and IPC-batch-construction code path of
    the explicit-override variant.

    A worker whose batches omit the partition column (e.g. under
    aggressive projection pushdown) MUST use this path; this fixture
    covers that contract without wiring up projection pushdown itself.
    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("category", pa.string()),
            pa.field("revenue", pa.int64()),
        ]
    )

    class Meta:
        name = "partitioned_with_explicit_override"
        description = (
            "Partition column ``category`` is in the bind schema and the "
            "emitted batches; worker uses the explicit "
            "``partition_values=`` override on ``out.emit`` to exercise "
            "the override code path."
        )
        categories = ["generator", "partitioning", "testing"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS

    @classmethod
    def on_init(cls, params: InitParams[_ProjectedOutArgs]) -> GlobalInitResponse:
        """Push one packed ``_CATEGORIES`` index per category onto the work queue."""
        packed_indices = [struct.pack(_QUEUE_ITEM_FMT, idx) for idx, _ in enumerate(_CATEGORIES)]
        params.storage.queue_push(packed_indices)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_ProjectedOutArgs]) -> _ProjectedOutState:
        """Return a fresh cursor that has not claimed any category yet."""
        return _ProjectedOutState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ProjectedOutArgs],
        state: _ProjectedOutState,
        out: OutputCollector,
    ) -> None:
        """Claim the next category if needed and emit its batch with explicit partition values.

        Calls ``out.finish()`` and returns without emitting once the
        queue is empty.
        """
        row_count = params.args.rows_per_category
        still_in_partition = state.started and state.current_idx < row_count
        if not still_in_partition:
            packed = params.storage.queue_pop()
            if packed is None:
                out.finish()
                return
            (state.current_category_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_idx = 0
            state.started = True

        category = _CATEGORIES[state.current_category_idx]
        # Revenue starts at 100 for the first category, 200 for the second, and so on.
        first_revenue = (state.current_category_idx + 1) * 100
        columns = {
            "category": [category] * row_count,
            "revenue": list(range(first_revenue, first_revenue + row_count)),
        }
        batch = pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA)

        # Presumably a (min, max) pair per partition column — confirm against
        # ``VgiOutputCollector.emit``. Both entries are the same value here
        # because the batch holds a single category.
        lower = pa.scalar(category, type=pa.string())
        upper = pa.scalar(category, type=pa.string())
        # ``partition_values`` is passed through the ``VgiOutputCollector``
        # view of ``out``, hence the cast.
        collector = cast(VgiOutputCollector, out)
        collector.emit(batch, partition_values={"category": (lower, upper)})
        # One batch per partition; mark it exhausted so the next call pops again.
        state.current_idx = row_count
382
+
383
+
384
+ # =============================================================================
385
+ # DISJOINT_PARTITIONS — wire-level declaration only
386
+ # =============================================================================
387
+
388
+
389
@dataclass(slots=True, frozen=True)
class _DisjointArgs:
    """Arguments for ``disjoint_range_partitioned``.

    Immutable, slot-based argument record.
    """

    # How many queue items (and therefore batches) ``on_init`` creates.
    # ``Arg(0, ...)`` presumably binds the first positional SQL argument —
    # confirm against ``vgi.arguments.Arg``. ``ge=1`` declares a lower bound of one.
    partitions: Annotated[int, Arg(0, doc="Number of disjoint partitions", ge=1)]
    # Row count of each batch; presumably a named argument, defaulting to 10.
    # No upper bound is declared.
    rows_per_partition: Annotated[int, Arg("rows_per_partition", default=10, doc="Rows per partition", ge=1)]
395
+
396
+
397
@dataclass(kw_only=True)
class _DisjointState(ArrowSerializableDataclass):
    # Per-worker cursor over the disjoint key-range partitions.

    # Partition number claimed last; -1 before any pop.
    current_partition_idx: int = -1
    # 0 right after a pop, ``rows_per_partition`` once the batch has been emitted.
    current_idx: int = 0
    # False until the first queue item has been popped.
    started: bool = False
402
+
403
+
404
@bind_fixed_schema
@_cardinality_from_count
class DisjointRangePartitionedFunction(TableFunctionGenerator[_DisjointArgs, _DisjointState]):
    """Per-chunk disjoint integer ranges on ``key``.

    Chunk N emits ``key`` values in ``[N*stride, N*stride + rows)`` where
    ``stride = max(1000, rows_per_partition)``. For up to 1000 rows per
    partition this is the familiar ``[N*1000, N*1000 + rows)``; for larger
    row counts the stride grows so that adjacent chunks never overlap and
    the declared ``DISJOINT_PARTITIONS`` contract keeps holding.

    The C++ extension propagates the declared kind to DuckDB's
    ``get_partition_info``. DuckDB doesn't have a consumer for DISJOINT
    today, so GROUP BY queries fall back to ``HASH_GROUP_BY`` (verified
    by the integration test).

    Purpose: verify the wire path (declaration, per-batch min/max
    metadata, C++ extraction) works for the non-SINGLE_VALUE kinds.
    """

    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("key", pa.int64()),
            pa.field("value", pa.int64()),
        ]
    )

    class Meta:
        name = "disjoint_range_partitioned"
        description = (
            "Disjoint per-chunk integer ranges on ``key``. Declares "
            "DISJOINT_PARTITIONS (wire-level only; DuckDB falls back to "
            "HASH_GROUP_BY for now)."
        )
        categories = ["generator", "partitioning", "testing"]
        partition_kind = PartitionKind.DISJOINT_PARTITIONS

    @classmethod
    def on_init(cls, params: InitParams[_DisjointArgs]) -> GlobalInitResponse:
        """Push one packed partition number per requested partition onto the work queue."""
        items = [struct.pack(_QUEUE_ITEM_FMT, i) for i in range(params.args.partitions)]
        params.storage.queue_push(items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_DisjointArgs]) -> _DisjointState:
        """Return a fresh cursor that has not claimed any partition yet."""
        return _DisjointState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DisjointArgs],
        state: _DisjointState,
        out: OutputCollector,
    ) -> None:
        """Claim the next partition if needed and emit its key range as one batch.

        Calls ``out.finish()`` and returns without emitting once the
        queue is empty.
        """
        rpp = params.args.rows_per_partition
        if not state.started or state.current_idx >= rpp:
            item = params.storage.queue_pop()
            if item is None:
                out.finish()
                return
            (state.current_partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, item)
            state.current_idx = 0
            state.started = True

        # A fixed stride of 1000 made chunk N and chunk N+1 overlap whenever
        # rows_per_partition exceeded 1000. Widening the stride to the row
        # count keeps the ranges disjoint; output is unchanged for <= 1000 rows.
        stride = max(1000, rpp)
        base = state.current_partition_idx * stride
        keys = [base + i for i in range(rpp)]
        values = [state.current_partition_idx * 10 + i for i in range(rpp)]
        batch = pa.RecordBatch.from_pydict(
            {"key": keys, "value": values},
            schema=cls.FIXED_SCHEMA,
        )
        out.emit(batch)
        # One batch per partition; mark it exhausted so the next call pops again.
        state.current_idx = rpp