vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Reference fixtures for the v2 PartitionColumns (Hive-style) batch_index mode.
|
|
4
|
+
|
|
5
|
+
These exercise the ``Meta.partition_kind`` + ``partition_field()``
|
|
6
|
+
opt-in. The C++ extension installs ``TableFunction::get_partition_info``
|
|
7
|
+
returning the declared kind, and ``get_partition_data`` populates
|
|
8
|
+
``OperatorPartitionData::partition_data`` per chunk so DuckDB's planner
|
|
9
|
+
can pick ``PhysicalPartitionedAggregate`` for matching ``GROUP BY``
|
|
10
|
+
queries.
|
|
11
|
+
|
|
12
|
+
Today DuckDB consumes only ``SINGLE_VALUE_PARTITIONS``; OVERLAPPING /
|
|
13
|
+
DISJOINT are wire-level declarable and the C++ extension reports them
|
|
14
|
+
back to the planner, which falls back to ``HASH_GROUP_BY`` for those
|
|
15
|
+
modes until upstream adds consumers.
|
|
16
|
+
|
|
17
|
+
Fixtures:
|
|
18
|
+
|
|
19
|
+
* :class:`CountryPartitionedSalesFunction` — single-column
|
|
20
|
+
SINGLE_VALUE. Each emitted chunk has a single ``country`` value.
|
|
21
|
+
Core fixture for the planner-check assertion.
|
|
22
|
+
|
|
23
|
+
* :class:`RegionYearPartitionedFunction` — multi-column SINGLE_VALUE.
|
|
24
|
+
Each chunk has a single ``(region, year)`` tuple.
|
|
25
|
+
|
|
26
|
+
* :class:`PartitionedWithExplicitOverrideFunction` — declares
|
|
27
|
+
partition on ``category`` and DOES include ``category`` in the
|
|
28
|
+
emitted batch, but still passes the explicit ``partition_values=``
|
|
29
|
+
override on ``out.emit`` to exercise the explicit-override path.
|
|
30
|
+
|
|
31
|
+
* :class:`DisjointRangePartitionedFunction` — declares
|
|
32
|
+
``DISJOINT_PARTITIONS``. Each chunk's ``key`` column has a distinct
|
|
33
|
+
disjoint integer range. Verifies the wire path; DuckDB falls back to
|
|
34
|
+
``HASH_GROUP_BY`` for GROUP BY queries against it.
|
|
35
|
+
|
|
36
|
+
All fixtures use the work-queue pattern (``on_init`` pushes one item
|
|
37
|
+
per partition, ``process`` pops one per call) — they're simpler than
|
|
38
|
+
the v1 partitioned_batch_index since the v2 plan is about correctness
|
|
39
|
+
of the partition contract, not parallelism stress. The v1 stress
|
|
40
|
+
fixtures already exercise the parallel-emit code path.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import struct
|
|
46
|
+
from dataclasses import dataclass
|
|
47
|
+
from typing import Annotated, ClassVar, cast
|
|
48
|
+
|
|
49
|
+
import pyarrow as pa
|
|
50
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
51
|
+
from vgi_rpc.rpc import OutputCollector
|
|
52
|
+
|
|
53
|
+
from vgi._test_fixtures.table._common import _cardinality_from_count
|
|
54
|
+
from vgi.arguments import Arg
|
|
55
|
+
from vgi.invocation import GlobalInitResponse
|
|
56
|
+
from vgi.metadata import FunctionExample, PartitionKind
|
|
57
|
+
from vgi.protocol import VgiOutputCollector
|
|
58
|
+
from vgi.schema_utils import partition_field
|
|
59
|
+
from vgi.table_function import (
|
|
60
|
+
InitParams,
|
|
61
|
+
ProcessParams,
|
|
62
|
+
TableFunctionGenerator,
|
|
63
|
+
bind_fixed_schema,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# =============================================================================
|
|
67
|
+
# Single-column SINGLE_VALUE_PARTITIONS — core fixture
|
|
68
|
+
# =============================================================================
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(slots=True, frozen=True)
class _CountryPartitionedArgs:
    """Arguments for ``country_partitioned_sales``.

    The function takes one argument: the number of rows emitted in each
    per-country batch (the ``100`` in ``country_partitioned_sales(100)``).
    """

    # Row count for every country batch. Presumably ``Arg(0, ...)`` marks this
    # as positional argument 0 and ``ge=1`` rejects values below 1 — confirm
    # against ``vgi.arguments.Arg``.
    rows_per_country: Annotated[int, Arg(0, doc="Rows to emit per country partition", ge=1)]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(kw_only=True)
class _CountryPartitionedState(ArrowSerializableDataclass):
    """Per-worker cursor over countries.

    ``current_country`` and ``current_country_idx`` are set when the
    worker pops a queue item. ``current_idx`` is reset to 0 at that point
    and set to the per-country row quota once the partition's single
    batch has been emitted, which makes the next ``process`` call pop
    the next item.
    """

    # Country code of the partition being processed; ``None`` until the first pop.
    current_country: str | None = None
    # Index into ``_COUNTRIES`` of the current partition; -1 until the first pop.
    current_country_idx: int = -1
    # Rows already emitted for the current partition (0 or the full quota).
    current_idx: int = 0
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# A small, fixed list of partition values gives the SQL tests stable
# expected outputs and a predictable number of partitions (5).
_COUNTRIES: list[str] = ["AU", "BR", "CA", "FR", "US"]
# Each queue item is a single packed partition index (no name bytes).
# Every fixture in this module packs its queue items with this format,
# and each ``process`` call pops one item and emits one Arrow batch.
_QUEUE_ITEM_FMT = ">i"  # big-endian int32 index; country name lives in
# ``_COUNTRIES[idx]`` (avoids variable-length
# encoding for what's already a stable index).
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@bind_fixed_schema
@_cardinality_from_count
class CountryPartitionedSalesFunction(TableFunctionGenerator[_CountryPartitionedArgs, _CountryPartitionedState]):
    """Table function emitting one Arrow batch per entry of ``_COUNTRIES``.

    Every batch carries a single ``country`` value, which is what the
    SINGLE_VALUE_PARTITIONS contract declared in ``Meta`` promises. The
    C++ extension reports SINGLE_VALUE_PARTITIONS from
    ``get_partition_info`` when the planner asks about ``country``, so
    ``GROUP BY country`` plans as ``PARTITIONED_AGGREGATE``.

    Partitions are handed out through the storage work queue so that a
    multi-worker parallel scan processes each partition exactly once,
    matching the v1 ``partitioned_batch_index`` model.
    """

    # ``country`` is declared through ``partition_field``; ``sales`` is a plain column.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("country", pa.string()),
            pa.field("sales", pa.int64()),
        ]
    )

    class Meta:
        name = "country_partitioned_sales"
        description = (
            "Per-country sales rows, one Arrow batch per country. Declares country as a SINGLE_VALUE partition column."
        )
        categories = ["generator", "partitioning"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS
        examples = [
            FunctionExample(
                sql="SELECT country, SUM(sales) FROM country_partitioned_sales(100) GROUP BY country",
                description="Partitioned aggregate over country",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_CountryPartitionedArgs]) -> GlobalInitResponse:
        """Push one packed country index per entry of ``_COUNTRIES`` onto the work queue."""
        packer = struct.Struct(_QUEUE_ITEM_FMT)
        params.storage.queue_push([packer.pack(position) for position, _ in enumerate(_COUNTRIES)])
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_CountryPartitionedArgs]) -> _CountryPartitionedState:
        """Return a fresh cursor that has not yet claimed a partition."""
        return _CountryPartitionedState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_CountryPartitionedArgs],
        state: _CountryPartitionedState,
        out: OutputCollector,
    ) -> None:
        """Claim the next country if needed, then emit its single batch.

        Calls ``out.finish()`` and returns without emitting once the work
        queue has no items left.
        """
        row_count = params.args.rows_per_country

        if state.current_country is None or state.current_idx >= row_count:
            packed = params.storage.queue_pop()
            if packed is None:
                # Nothing left to claim: this worker is done.
                out.finish()
                return
            (country_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_country_idx = country_idx
            state.current_country = _COUNTRIES[country_idx]
            state.current_idx = 0

        # Deterministic sales values, offset per country, keep the SQL
        # test's SUM checks easy to write.
        first_sale = state.current_country_idx * 1_000_000
        columns = {
            "country": [state.current_country] * row_count,
            "sales": list(range(first_sale, first_sale + row_count)),
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA))

        # One batch per partition: flag it exhausted so the next call pops.
        state.current_idx = row_count
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# =============================================================================
|
|
179
|
+
# Multi-column SINGLE_VALUE_PARTITIONS
|
|
180
|
+
# =============================================================================
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass(slots=True, frozen=True)
class _RegionYearArgs:
    """Arguments for ``region_year_partitioned``.

    The function takes one argument: the number of rows emitted in each
    ``(region, year)`` batch (the ``100`` in ``region_year_partitioned(100)``).
    """

    # Row count for every (region, year) batch. Presumably ``Arg(0, ...)``
    # marks positional argument 0 and ``ge=1`` enforces a minimum of 1 —
    # confirm against ``vgi.arguments.Arg``.
    rows_per_partition: Annotated[int, Arg(0, doc="Rows per (region, year) partition", ge=1)]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass(kw_only=True)
class _RegionYearState(ArrowSerializableDataclass):
    # Per-worker cursor over ``_REGIONS_YEARS`` partitions.
    # Index into ``_REGIONS_YEARS`` of the current partition; -1 until the first pop.
    current_partition_idx: int = -1
    # Rows already emitted for the current partition (0 or the full quota).
    current_idx: int = 0
    # False until the worker has popped its first queue item.
    started: bool = False
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# (region, year) tuples — 6 partitions total. Queue items are indices into
# this list, so its order determines which partition each index refers to.
_REGIONS_YEARS: list[tuple[str, int]] = [
    ("AMER", 2023),
    ("AMER", 2024),
    ("EMEA", 2023),
    ("EMEA", 2024),
    ("APAC", 2023),
    ("APAC", 2024),
]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@bind_fixed_schema
@_cardinality_from_count
class RegionYearPartitionedFunction(TableFunctionGenerator[_RegionYearArgs, _RegionYearState]):
    """Table function emitting one batch per ``(region, year)`` pair.

    Both ``region`` and ``year`` hold a single value within each batch.
    Partitions are distributed through the storage work queue so a
    multi-worker scan spreads them across threads.
    """

    # ``region`` and ``year`` are both declared through ``partition_field``.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("region", pa.string()),
            partition_field("year", pa.int64()),
            pa.field("value", pa.float64()),
        ]
    )

    class Meta:
        name = "region_year_partitioned"
        description = (
            "Per-(region, year) value rows. Declares both region and year "
            "as SINGLE_VALUE partition columns; GROUP BY region, year "
            "plans as PARTITIONED_AGGREGATE."
        )
        categories = ["generator", "partitioning"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS
        examples = [
            FunctionExample(
                sql="SELECT region, year, AVG(value) FROM region_year_partitioned(100) GROUP BY region, year",
                description="Partitioned aggregate over (region, year)",
            ),
        ]

    @classmethod
    def on_init(cls, params: InitParams[_RegionYearArgs]) -> GlobalInitResponse:
        """Push one packed index per entry of ``_REGIONS_YEARS`` onto the work queue."""
        packer = struct.Struct(_QUEUE_ITEM_FMT)
        params.storage.queue_push([packer.pack(position) for position, _ in enumerate(_REGIONS_YEARS)])
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_RegionYearArgs]) -> _RegionYearState:
        """Return a fresh cursor that has not yet claimed a partition."""
        return _RegionYearState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_RegionYearArgs],
        state: _RegionYearState,
        out: OutputCollector,
    ) -> None:
        """Claim the next ``(region, year)`` partition if needed and emit its batch.

        Calls ``out.finish()`` and returns without emitting once the work
        queue has no items left.
        """
        row_count = params.args.rows_per_partition

        if not state.started or state.current_idx >= row_count:
            packed = params.storage.queue_pop()
            if packed is None:
                # Nothing left to claim: this worker is done.
                out.finish()
                return
            (partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_partition_idx = partition_idx
            state.current_idx = 0
            state.started = True

        region, year = _REGIONS_YEARS[state.current_partition_idx]
        # Values start at a per-partition offset of ``index * 1000``.
        offset = float(state.current_partition_idx * 1000)
        columns = {
            "region": [region] * row_count,
            "year": [year] * row_count,
            "value": [offset + float(step) for step in range(row_count)],
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA))

        # One batch per partition: flag it exhausted so the next call pops.
        state.current_idx = row_count
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# =============================================================================
|
|
284
|
+
# Projected-out partition column — exercises explicit override path
|
|
285
|
+
# =============================================================================
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass(slots=True, frozen=True)
class _ProjectedOutArgs:
    """Arguments for ``partitioned_with_explicit_override``.

    The function takes one argument: the number of rows emitted in each
    per-category batch.
    """

    # Row count for every category batch. Presumably ``Arg(0, ...)`` marks
    # positional argument 0 and ``ge=1`` enforces a minimum of 1 — confirm
    # against ``vgi.arguments.Arg``.
    rows_per_category: Annotated[int, Arg(0, doc="Rows per category partition", ge=1)]
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
@dataclass(kw_only=True)
class _ProjectedOutState(ArrowSerializableDataclass):
    # Per-worker cursor over ``_CATEGORIES`` partitions.
    # Index into ``_CATEGORIES`` of the current partition; -1 until the first pop.
    current_category_idx: int = -1
    # Rows already emitted for the current partition (0 or the full quota).
    current_idx: int = 0
    # False until the worker has popped its first queue item.
    started: bool = False
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# Partition values for ``partitioned_with_explicit_override`` — 3 partitions.
# Queue items are indices into this list.
_CATEGORIES: list[str] = ["books", "music", "video"]
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
@bind_fixed_schema
@_cardinality_from_count
class PartitionedWithExplicitOverrideFunction(TableFunctionGenerator[_ProjectedOutArgs, _ProjectedOutState]):
    """Fixture for the explicit ``partition_values=`` override on ``out.emit``.

    The emitted batches DO contain the partition column, so auto-extract
    would work; ``partition_values`` is passed explicitly anyway to
    exercise the type-validation + IPC-batch-construction code path of
    the explicit-override variant.

    A worker whose emitted batches lack the partition column (e.g. under
    aggressive projection pushdown) MUST use this path; this fixture
    covers the contract without wiring projection pushdown into the
    fixture itself.
    """

    # ``category`` is declared through ``partition_field``; ``revenue`` is a plain column.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("category", pa.string()),
            pa.field("revenue", pa.int64()),
        ]
    )

    class Meta:
        name = "partitioned_with_explicit_override"
        description = (
            "Partition column ``category`` is in the bind schema and the "
            "emitted batches; worker uses the explicit "
            "``partition_values=`` override on ``out.emit`` to exercise "
            "the override code path."
        )
        categories = ["generator", "partitioning", "testing"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS

    @classmethod
    def on_init(cls, params: InitParams[_ProjectedOutArgs]) -> GlobalInitResponse:
        """Push one packed index per entry of ``_CATEGORIES`` onto the work queue."""
        packer = struct.Struct(_QUEUE_ITEM_FMT)
        params.storage.queue_push([packer.pack(position) for position, _ in enumerate(_CATEGORIES)])
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_ProjectedOutArgs]) -> _ProjectedOutState:
        """Return a fresh cursor that has not yet claimed a partition."""
        return _ProjectedOutState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ProjectedOutArgs],
        state: _ProjectedOutState,
        out: OutputCollector,
    ) -> None:
        """Claim the next category if needed and emit its batch with explicit partition values.

        Calls ``out.finish()`` and returns without emitting once the work
        queue has no items left.
        """
        row_count = params.args.rows_per_category

        if not state.started or state.current_idx >= row_count:
            packed = params.storage.queue_pop()
            if packed is None:
                # Nothing left to claim: this worker is done.
                out.finish()
                return
            (category_idx,) = struct.unpack(_QUEUE_ITEM_FMT, packed)
            state.current_category_idx = category_idx
            state.current_idx = 0
            state.started = True

        category = _CATEGORIES[state.current_category_idx]
        first_revenue = (state.current_category_idx + 1) * 100
        columns = {
            "category": [category] * row_count,
            "revenue": list(range(first_revenue, first_revenue + row_count)),
        }
        batch = pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA)

        # The same value is supplied for both tuple slots (presumably a
        # (min, max) pair — confirm against ``VgiOutputCollector.emit``).
        explicit_values = {
            "category": (
                pa.scalar(category, type=pa.string()),
                pa.scalar(category, type=pa.string()),
            ),
        }
        # ``partition_values`` is only on the VGI collector, hence the cast.
        vgi_out = cast(VgiOutputCollector, out)
        vgi_out.emit(batch, partition_values=explicit_values)

        # One batch per partition: flag it exhausted so the next call pops.
        state.current_idx = row_count
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# =============================================================================
|
|
385
|
+
# DISJOINT_PARTITIONS — wire-level declaration only
|
|
386
|
+
# =============================================================================
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
@dataclass(slots=True, frozen=True)
class _DisjointArgs:
    """Arguments for ``disjoint_range_partitioned``.

    ``partitions`` sets how many queue items (and therefore batches) are
    produced; ``rows_per_partition`` sets the row count of each batch.
    """

    # Number of partitions to enqueue. Presumably ``Arg(0, ...)`` marks
    # positional argument 0 — confirm against ``vgi.arguments.Arg``.
    partitions: Annotated[int, Arg(0, doc="Number of disjoint partitions", ge=1)]
    # Rows emitted per partition; defaults to 10. Presumably a named
    # argument, since ``Arg`` is given a string here — TODO confirm.
    rows_per_partition: Annotated[int, Arg("rows_per_partition", default=10, doc="Rows per partition", ge=1)]
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
@dataclass(kw_only=True)
class _DisjointState(ArrowSerializableDataclass):
    # Per-worker cursor over the disjoint key-range partitions.
    # Index of the current partition as popped from the queue; -1 until the first pop.
    current_partition_idx: int = -1
    # Rows already emitted for the current partition (0 or the full quota).
    current_idx: int = 0
    # False until the worker has popped its first queue item.
    started: bool = False
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@bind_fixed_schema
@_cardinality_from_count
class DisjointRangePartitionedFunction(TableFunctionGenerator[_DisjointArgs, _DisjointState]):
    """Per-chunk disjoint integer ranges on ``key``.

    Chunk N emits ``key`` values in ``[N*stride, N*stride + rows)`` with
    ``stride = max(1000, rows)``. For ``rows <= 1000`` this is the
    ``[N*1000, N*1000 + rows)`` layout; for larger row counts the stride
    widens so neighbouring chunks can never overlap, which a fixed
    stride of 1000 would allow. Declares ``DISJOINT_PARTITIONS``; the
    C++ extension propagates this to DuckDB's ``get_partition_info``.
    DuckDB doesn't have a consumer for DISJOINT today, so GROUP BY
    queries fall back to ``HASH_GROUP_BY`` (verified by the integration
    test).

    Purpose: verify the wire path (declaration, per-batch min/max
    metadata, C++ extraction) works for the non-SINGLE_VALUE kinds.
    """

    # ``key`` is declared through ``partition_field``; ``value`` is a plain column.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("key", pa.int64()),
            pa.field("value", pa.int64()),
        ]
    )

    class Meta:
        name = "disjoint_range_partitioned"
        description = (
            "Disjoint per-chunk integer ranges on ``key``. Declares "
            "DISJOINT_PARTITIONS (wire-level only; DuckDB falls back to "
            "HASH_GROUP_BY for now)."
        )
        categories = ["generator", "partitioning", "testing"]
        partition_kind = PartitionKind.DISJOINT_PARTITIONS

    @classmethod
    def on_init(cls, params: InitParams[_DisjointArgs]) -> GlobalInitResponse:
        """Push one packed partition index per requested partition onto the work queue."""
        items = [struct.pack(_QUEUE_ITEM_FMT, i) for i in range(params.args.partitions)]
        params.storage.queue_push(items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_DisjointArgs]) -> _DisjointState:
        """Return a fresh cursor that has not yet claimed a partition."""
        return _DisjointState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DisjointArgs],
        state: _DisjointState,
        out: OutputCollector,
    ) -> None:
        """Claim the next partition if needed and emit its key range as one batch.

        Calls ``out.finish()`` and returns without emitting once the work
        queue has no items left.
        """
        if not state.started or state.current_idx >= params.args.rows_per_partition:
            item = params.storage.queue_pop()
            if item is None:
                out.finish()
                return
            (state.current_partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, item)
            state.current_idx = 0
            state.started = True

        rpp = params.args.rows_per_partition
        # Ranges start ``stride`` apart. A fixed stride of 1000 overlaps as
        # soon as a partition holds more than 1000 rows, so widen it to
        # the row count in that case to keep the ranges disjoint.
        stride = max(1000, rpp)
        base = state.current_partition_idx * stride
        keys = [base + i for i in range(rpp)]
        values = [state.current_partition_idx * 10 + i for i in range(rpp)]
        batch = pa.RecordBatch.from_pydict(
            {"key": keys, "value": values},
            schema=cls.FIXED_SCHEMA,
        )
        out.emit(batch)
        # One batch per partition: flag it exhausted so the next call pops.
        state.current_idx = rpp
|