vgi-python 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vgi/__init__.py CHANGED
@@ -39,6 +39,7 @@ from vgi.arguments import (
39
39
  Param,
40
40
  Returns,
41
41
  TableInput,
42
+ TaggedUnion,
42
43
  )
43
44
  from vgi.auth import AuthContext, CallContext
44
45
  from vgi.metadata import (
@@ -143,6 +144,7 @@ __all__ = [
143
144
  "TableInOutGenerator",
144
145
  "TableInput",
145
146
  "TableInputValidationError",
147
+ "TaggedUnion",
146
148
  "TypeMismatchError",
147
149
  "Worker",
148
150
  "functions_to_arrow",
@@ -0,0 +1,72 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Fixture worker that deliberately advertises an unrecognized enum value.
4
+
5
+ This fixture exercises the C++ extension's *wire-enum validation* end-to-end:
6
+ the catalog-metadata parser (``ParseFunctionInfo`` in
7
+ ``vgi/src/vgi_catalog_api.cpp``) must reject an enum string it does not
8
+ recognize with a loud ``IOException`` rather than silently falling back to a
9
+ default. A silent fallback would run with behavior inconsistent with what the
10
+ worker declared (e.g. treating a ``SPECIAL`` null-handling function as
11
+ ``DEFAULT``).
12
+
13
+ The trick is entirely Python-side and needs no extension rebuild. The normal
14
+ metadata path can only ever emit valid enum names because the values come from
15
+ typed Python ``Enum`` members. To get a bogus string onto the wire we override
16
+ :meth:`ExampleCatalog._function_to_info` for one scalar function (``double``)
17
+ and swap its ``null_handling`` for :class:`_BogusNullHandling.WEIRD` — a real
18
+ ``Enum`` member whose ``.name`` is ``"WEIRD"``. The vgi-rpc serializer converts
19
+ any ``Enum`` field to ``value.name`` (see ``ArrowSerializableDataclass``), so
20
+ ``"WEIRD"`` lands in the ``null_handling`` Arrow column and the C++ parser
21
+ trips on it the moment the ``double`` function's metadata is loaded.
22
+
23
+ Otherwise this is a drop-in replacement for ``vgi-fixture-worker``: every other
24
+ function and the catalog are inherited unchanged from :class:`ExampleWorker`,
25
+ so any function except ``double`` still resolves normally.
26
+
27
+ Registered as the ``vgi-fixture-bad-enum-worker`` entry point.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ from dataclasses import replace
33
+ from enum import Enum
34
+
35
+ from vgi._test_fixtures.worker import ExampleCatalog, ExampleWorker
36
+ from vgi.catalog.catalog_interface import FunctionInfo
37
+
38
+ # The scalar function whose null_handling we corrupt. Tests reference this name
39
+ # to force the broken metadata onto the parse path.
40
+ BAD_ENUM_FUNCTION = "double"
41
+
42
+
43
+ class _BogusNullHandling(Enum):
44
+ """An enum member whose ``.name`` is a value the C++ parser cannot map."""
45
+
46
+ WEIRD = "WEIRD"
47
+
48
+
49
+ class BadEnumCatalog(ExampleCatalog):
50
+ """ExampleCatalog that advertises a bogus null_handling for one function."""
51
+
52
+ def _function_to_info(self, func_cls: type, schema_name: str) -> FunctionInfo:
53
+ info = super()._function_to_info(func_cls, schema_name)
54
+ if info.name == BAD_ENUM_FUNCTION and info.null_handling is not None:
55
+ # FunctionInfo is frozen; replace() returns a corrupted copy.
56
+ return replace(info, null_handling=_BogusNullHandling.WEIRD) # type: ignore[arg-type]
57
+ return info
58
+
59
+
60
+ class BadEnumWorker(ExampleWorker):
61
+ """ExampleWorker that serves the example catalog with one bad enum value."""
62
+
63
+ catalog_interface = BadEnumCatalog
64
+
65
+
66
+ def main() -> None:
67
+ """Run the bad-enum fixture worker process."""
68
+ BadEnumWorker.main()
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
@@ -0,0 +1,15 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Narrow-bind reproducer fixture.
4
+
5
+ Exposes a catalog whose virtual table advertises *more* columns in its
6
+ listing (``catalog_schema_contents_tables`` / ``catalog_table_get``) than
7
+ its scan function returns from ``on_bind``. A client that trusts the bind
8
+ ``output_schema`` without checking it against the planned catalog columns
9
+ indexes past the end of the worker's narrower batch in
10
+ ``ArrowTableFunction::ArrowToDuckDB`` and SIGSEGVs. The fix makes the
11
+ client fail closed at bind with a clear ``BinderException``.
12
+
13
+ Driven by ``test/sql/integration/narrow_bind_mismatch.test`` in
14
+ ``~/Development/vgi``.
15
+ """
@@ -0,0 +1,237 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Narrow-bind reproducer worker.
4
+
5
+ Two virtual tables, each backed by a table function:
6
+
7
+ * ``mismatch`` — advertises columns ``{id, val}`` in its catalog listing
8
+ but its scan function ``narrow_scan`` binds to ``{id}`` only. This is
9
+ the inconsistency that used to segfault the client at scan time
10
+ (``ArrowTableFunction::ArrowToDuckDB`` walking off the end of the
11
+ worker's 1-column batch). The client must now refuse it at bind with a
12
+ clear ``BinderException``.
13
+
14
+ * ``consistent`` — advertises ``{id, val}`` and its scan function
15
+ ``wide_scan`` binds to ``{id, val}``. Positive control: this must keep
16
+ working unchanged.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from dataclasses import dataclass
22
+ from typing import Annotated, Any
23
+
24
+ import pyarrow as pa
25
+ from vgi_rpc import ArrowSerializableDataclass
26
+ from vgi_rpc.rpc import OutputCollector
27
+
28
+ from vgi import Worker
29
+ from vgi.arguments import Arg
30
+ from vgi.catalog import Catalog, Schema
31
+ from vgi.catalog.catalog_interface import (
32
+ AttachOpaqueData,
33
+ ReadOnlyCatalogInterface,
34
+ ScanFunctionResult,
35
+ SchemaInfo,
36
+ SchemaObjectType,
37
+ SerializedSchema,
38
+ TableInfo,
39
+ TransactionOpaqueData,
40
+ )
41
+ from vgi.function import Function
42
+ from vgi.invocation import BindResponse
43
+ from vgi.table_function import (
44
+ BindParams,
45
+ ProcessParams,
46
+ TableFunctionGenerator,
47
+ init_single_worker,
48
+ )
49
+
50
+ CATALOG_NAME = "narrow_bind"
51
+
52
+ # What the catalog advertises for both tables: two columns.
53
+ _TABLE_SCHEMA: pa.Schema = pa.schema([pa.field("id", pa.int64()), pa.field("val", pa.int64())])
54
+ # What the narrow scan function actually binds to: one column.
55
+ _NARROW_BIND_SCHEMA: pa.Schema = pa.schema([pa.field("id", pa.int64())])
56
+
57
+
58
+ @dataclass(kw_only=True)
59
+ class _State(ArrowSerializableDataclass):
60
+ done: bool = False
61
+
62
+
63
+ @dataclass(frozen=True)
64
+ class _Args:
65
+ count: Annotated[int, Arg(0, doc="rows", ge=0)]
66
+
67
+
68
+ @init_single_worker
69
+ class NarrowScan(TableFunctionGenerator[_Args, _State]):
70
+ """Binds to a NARROWER schema than the catalog advertises (the bug)."""
71
+
72
+ class Meta:
73
+ name = "narrow_scan"
74
+ description = "bind reports a narrower schema than the table advertises"
75
+
76
+ @classmethod
77
+ def on_bind(cls, params: BindParams[_Args]) -> BindResponse:
78
+ return BindResponse(output_schema=_NARROW_BIND_SCHEMA)
79
+
80
+ @classmethod
81
+ def initial_state(cls, params: ProcessParams[_Args]) -> _State:
82
+ return _State()
83
+
84
+ @classmethod
85
+ def process(cls, params: ProcessParams[_Args], state: _State, out: OutputCollector) -> None:
86
+ if state.done:
87
+ out.finish()
88
+ return
89
+ state.done = True
90
+ out.emit(pa.RecordBatch.from_pydict({"id": [0, 1, 2]}, schema=params.output_schema))
91
+
92
+
93
+ @init_single_worker
94
+ class WideScan(TableFunctionGenerator[_Args, _State]):
95
+ """Binds to the full advertised schema (positive control — must work)."""
96
+
97
+ class Meta:
98
+ name = "wide_scan"
99
+ description = "bind matches the table's advertised schema"
100
+
101
+ @classmethod
102
+ def on_bind(cls, params: BindParams[_Args]) -> BindResponse:
103
+ return BindResponse(output_schema=_TABLE_SCHEMA)
104
+
105
+ @classmethod
106
+ def initial_state(cls, params: ProcessParams[_Args]) -> _State:
107
+ return _State()
108
+
109
+ @classmethod
110
+ def process(cls, params: ProcessParams[_Args], state: _State, out: OutputCollector) -> None:
111
+ if state.done:
112
+ out.finish()
113
+ return
114
+ state.done = True
115
+ out.emit(pa.RecordBatch.from_pydict({"id": [0, 1, 2], "val": [10, 20, 30]}, schema=params.output_schema))
116
+
117
+
118
+ _FUNCTIONS: list[type[Function]] = [NarrowScan, WideScan]
119
+
120
+ _CATALOG = Catalog(
121
+ name=CATALOG_NAME,
122
+ default_schema="main",
123
+ schemas=[
124
+ Schema(
125
+ name="main",
126
+ comment="narrow-bind reproducer catalog",
127
+ functions=list(_FUNCTIONS),
128
+ tables=[],
129
+ ),
130
+ ],
131
+ )
132
+
133
+
134
+ def _serialize_schema(s: pa.Schema) -> bytes:
135
+ sink = pa.BufferOutputStream()
136
+ with pa.ipc.new_stream(sink, s):
137
+ pass
138
+ return sink.getvalue().to_pybytes()
139
+
140
+
141
+ # table name -> scan function name. Both advertise _TABLE_SCHEMA (2 cols).
142
+ _TABLE_FUNCTIONS = {
143
+ "mismatch": "narrow_scan",
144
+ "consistent": "wide_scan",
145
+ }
146
+
147
+
148
+ class NarrowBindCatalog(ReadOnlyCatalogInterface):
149
+ catalog = _CATALOG
150
+ catalog_name = CATALOG_NAME
151
+
152
+ def _info(self, table_name: str) -> TableInfo:
153
+ return TableInfo(
154
+ comment=f"narrow-bind reproducer table -> {_TABLE_FUNCTIONS[table_name]}",
155
+ tags={},
156
+ name=table_name,
157
+ schema_name="main",
158
+ columns=SerializedSchema(_serialize_schema(_TABLE_SCHEMA)),
159
+ not_null_constraints=[],
160
+ unique_constraints=[],
161
+ check_constraints=[],
162
+ )
163
+
164
+ def schemas(
165
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
166
+ ) -> list[SchemaInfo]:
167
+ infos = super().schemas(attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data)
168
+ for i, info in enumerate(infos):
169
+ if info.name == "main":
170
+ infos[i] = SchemaInfo(
171
+ attach_opaque_data=info.attach_opaque_data,
172
+ name=info.name,
173
+ comment=info.comment,
174
+ tags=info.tags,
175
+ estimated_object_count={
176
+ **(info.estimated_object_count or {}),
177
+ "table": len(_TABLE_FUNCTIONS),
178
+ },
179
+ )
180
+ return infos
181
+
182
+ def schema_contents(
183
+ self,
184
+ *,
185
+ attach_opaque_data: AttachOpaqueData,
186
+ transaction_opaque_data: TransactionOpaqueData | None,
187
+ name: str,
188
+ type: Any,
189
+ ) -> Any:
190
+ if name.lower() == "main" and type == SchemaObjectType.TABLE:
191
+ return [self._info(table_name) for table_name in _TABLE_FUNCTIONS]
192
+ return super().schema_contents(
193
+ attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data, name=name, type=type
194
+ )
195
+
196
+ def table_get(
197
+ self,
198
+ *,
199
+ attach_opaque_data: AttachOpaqueData,
200
+ transaction_opaque_data: TransactionOpaqueData | None,
201
+ schema_name: str,
202
+ name: str,
203
+ at_unit: str | None = None,
204
+ at_value: str | None = None,
205
+ ) -> TableInfo | None:
206
+ if schema_name.lower() != "main":
207
+ return None
208
+ if name in _TABLE_FUNCTIONS:
209
+ return self._info(name)
210
+ return None
211
+
212
+ def table_scan_function_get(
213
+ self,
214
+ *,
215
+ attach_opaque_data: AttachOpaqueData,
216
+ transaction_opaque_data: TransactionOpaqueData | None,
217
+ schema_name: str,
218
+ name: str,
219
+ at_unit: str | None,
220
+ at_value: str | None,
221
+ ) -> ScanFunctionResult:
222
+ fn = _TABLE_FUNCTIONS.get(name)
223
+ if fn is None:
224
+ raise ValueError(f"unknown narrow-bind reproducer table: {name}")
225
+ return ScanFunctionResult(
226
+ function_name=fn,
227
+ positional_arguments=[pa.scalar(3, type=pa.int64())],
228
+ named_arguments={},
229
+ required_extensions=[],
230
+ )
231
+
232
+
233
+ class NarrowBindWorker(Worker):
234
+ catalog_interface = NarrowBindCatalog
235
+ catalog_name = CATALOG_NAME
236
+ catalog = _CATALOG
237
+ functions = list(_FUNCTIONS)
@@ -52,6 +52,7 @@ from vgi._test_fixtures.scalar.null_handling import (
52
52
  from vgi._test_fixtures.scalar.random_demo import (
53
53
  BernoulliFunction,
54
54
  HashSeedFunction,
55
+ QuerySeedFunction,
55
56
  RandomBytesFunction,
56
57
  RandomIntFunction,
57
58
  )
@@ -102,6 +103,7 @@ __all__ = [
102
103
  "PairTypeIntIntFunction",
103
104
  "PairTypeIntStrFunction",
104
105
  "PairTypeStrStrFunction",
106
+ "QuerySeedFunction",
105
107
  "RandomBytesFunction",
106
108
  "RandomIntFunction",
107
109
  "ReturnSecretValueFunction",
@@ -136,6 +136,50 @@ class HashSeedFunction(ScalarFunction):
136
136
  return pa.array([seed + i for i in range(_length)], type=pa.int64())
137
137
 
138
138
 
139
+ class QuerySeedFunction(ScalarFunction):
140
+ """Adds a per-query-stable seed to each input value.
141
+
142
+ Demonstrates ``FunctionStability.CONSISTENT_WITHIN_QUERY`` — the only
143
+ fixture that emits this stability variant. Semantically the value is fixed
144
+ for the duration of a single query but may differ across queries (like
145
+ ``now()``). DuckDB has no behavioral consumer that this fixture asserts; it
146
+ exists so the wire path for the third stability value stays exercised and
147
+ so other-language workers must specify it.
148
+
149
+ Example:
150
+ SQL: SELECT query_seed(value) FROM data
151
+
152
+ """
153
+
154
+ class Meta:
155
+ """Function metadata."""
156
+
157
+ name = "query_seed"
158
+ description = "Add a per-query-stable seed to each value (demonstrates CONSISTENT_WITHIN_QUERY stability)"
159
+ stability = FunctionStability.CONSISTENT_WITHIN_QUERY
160
+ examples = [
161
+ FunctionExample(
162
+ sql="SELECT query_seed(value) FROM data",
163
+ description="Offset each value by a seed that is constant within a query",
164
+ ),
165
+ ]
166
+
167
+ @classmethod
168
+ def compute(
169
+ cls,
170
+ value: Annotated[pa.Int64Array, Param(doc="Value to offset")],
171
+ ) -> Annotated[pa.Int64Array, Returns()]:
172
+ """Add a fixed per-query offset to each value.
173
+
174
+ The offset is deterministic here (a constant) so SQL tests have a
175
+ stable expected output; the stability flag is what is under test, not
176
+ the numeric result.
177
+ """
178
+ import pyarrow.compute as pc
179
+
180
+ return pc.add(value, 1000)
181
+
182
+
139
183
  class RandomBytesFunction(ScalarFunction):
140
184
  """Generates deterministic pseudo-random binary blobs from a seed."""
141
185
 
@@ -82,6 +82,7 @@ from vgi._test_fixtures.table.pairs import (
82
82
  from vgi._test_fixtures.table.partition_columns import (
83
83
  CountryPartitionedSalesFunction,
84
84
  DisjointRangePartitionedFunction,
85
+ OverlappingRangePartitionedFunction,
85
86
  PartitionedWithExplicitOverrideFunction,
86
87
  RegionYearPartitionedFunction,
87
88
  )
@@ -182,6 +183,7 @@ __all__ = [
182
183
  "NestedSequenceFunction",
183
184
  "NonMonotoneBatchIndexFunction",
184
185
  "OrderEchoFunction",
186
+ "OverlappingRangePartitionedFunction",
185
187
  "PartitionedBatchIndexFunction",
186
188
  "PartitionedBatchIndexMarkedFunction",
187
189
  "PartitionedFixedOrderFunction",
@@ -33,6 +33,11 @@ Fixtures:
33
33
  disjoint integer range. Verifies the wire path; DuckDB falls back to
34
34
  ``HASH_GROUP_BY`` for GROUP BY queries against it.
35
35
 
36
+ * :class:`OverlappingRangePartitionedFunction` — declares
37
+ ``OVERLAPPING_PARTITIONS`` (the only fixture that does). Consecutive
38
+ chunks share ``key`` values. Wire-level only; DuckDB falls back to
39
+ ``HASH_GROUP_BY``.
40
+
36
41
  All fixtures use the in-memory state pattern (no work-queue / no
37
42
  stream_state) — they're simpler than the v1 partitioned_batch_index
38
43
  since the v2 plan is about correctness of the partition contract,
@@ -470,3 +475,99 @@ class DisjointRangePartitionedFunction(TableFunctionGenerator[_DisjointArgs, _Di
470
475
  )
471
476
  out.emit(batch)
472
477
  state.current_idx = rpp
478
+
479
+
480
+ # =============================================================================
481
+ # OVERLAPPING_PARTITIONS — wire-level only
482
+ # =============================================================================
483
+
484
+
485
+ @dataclass(slots=True, frozen=True)
486
+ class _OverlappingArgs:
487
+ """Arguments for ``overlapping_range_partitioned``."""
488
+
489
+ partitions: Annotated[int, Arg(0, doc="Number of overlapping partitions", ge=1)]
490
+ rows_per_partition: Annotated[int, Arg("rows_per_partition", default=10, doc="Rows per partition", ge=1)]
491
+
492
+
493
+ @dataclass(kw_only=True)
494
+ class _OverlappingState(ArrowSerializableDataclass):
495
+ current_partition_idx: int = -1
496
+ current_idx: int = 0
497
+ started: bool = False
498
+
499
+
500
+ @bind_fixed_schema
501
+ @_cardinality_from_count
502
+ class OverlappingRangePartitionedFunction(TableFunctionGenerator[_OverlappingArgs, _OverlappingState]):
503
+ """Per-chunk *overlapping* integer ranges on ``key``.
504
+
505
+ Each chunk N emits ``key`` values in ``[N*500, N*500 + rows)``. With the
506
+ default ``rows_per_partition`` of 10 the ranges are disjoint, but callers
507
+ pass ``rows_per_partition > 500`` to make consecutive chunks overlap on
508
+ ``key`` — distinguishing this from
509
+ :class:`DisjointRangePartitionedFunction`.
510
+
511
+ Declares ``OVERLAPPING_PARTITIONS``. Like DISJOINT, DuckDB has no consumer
512
+ for OVERLAPPING today, so GROUP BY queries fall back to ``HASH_GROUP_BY``;
513
+ the value's purpose here is to keep the wire path (declaration, per-batch
514
+ min/max metadata, C++ extraction → ``get_partition_info``) exercised so
515
+ other-language workers must specify it. This is the only fixture that emits
516
+ ``OVERLAPPING_PARTITIONS``.
517
+ """
518
+
519
+ FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
520
+ [
521
+ partition_field("key", pa.int64()),
522
+ pa.field("value", pa.int64()),
523
+ ]
524
+ )
525
+
526
+ class Meta:
527
+ name = "overlapping_range_partitioned"
528
+ description = (
529
+ "Overlapping per-chunk integer ranges on ``key``. Declares "
530
+ "OVERLAPPING_PARTITIONS (wire-level only; DuckDB falls back to "
531
+ "HASH_GROUP_BY for now)."
532
+ )
533
+ categories = ["generator", "partitioning", "testing"]
534
+ partition_kind = PartitionKind.OVERLAPPING_PARTITIONS
535
+
536
+ @classmethod
537
+ def on_init(cls, params: InitParams[_OverlappingArgs]) -> GlobalInitResponse:
538
+ items = [struct.pack(_QUEUE_ITEM_FMT, i) for i in range(params.args.partitions)]
539
+ params.storage.queue_push(items)
540
+ return GlobalInitResponse()
541
+
542
+ @classmethod
543
+ def initial_state(cls, params: ProcessParams[_OverlappingArgs]) -> _OverlappingState:
544
+ return _OverlappingState()
545
+
546
+ @classmethod
547
+ def process(
548
+ cls,
549
+ params: ProcessParams[_OverlappingArgs],
550
+ state: _OverlappingState,
551
+ out: OutputCollector,
552
+ ) -> None:
553
+ if not state.started or state.current_idx >= params.args.rows_per_partition:
554
+ item = params.storage.queue_pop()
555
+ if item is None:
556
+ out.finish()
557
+ return
558
+ (state.current_partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, item)
559
+ state.current_idx = 0
560
+ state.started = True
561
+
562
+ rpp = params.args.rows_per_partition
563
+ # Stride of 500 (< rpp when callers want overlap) makes consecutive
564
+ # chunks share key values.
565
+ base = state.current_partition_idx * 500
566
+ keys = [base + i for i in range(rpp)]
567
+ values = [state.current_partition_idx * 10 + i for i in range(rpp)]
568
+ batch = pa.RecordBatch.from_pydict(
569
+ {"key": keys, "value": values},
570
+ schema=cls.FIXED_SCHEMA,
571
+ )
572
+ out.emit(batch)
573
+ state.current_idx = rpp
@@ -86,6 +86,7 @@ from vgi._test_fixtures.scalar import (
86
86
  PairTypeIntIntFunction,
87
87
  PairTypeIntStrFunction,
88
88
  PairTypeStrStrFunction,
89
+ QuerySeedFunction,
89
90
  RandomBytesFunction,
90
91
  RandomIntFunction,
91
92
  ReturnSecretValueFunction,
@@ -146,6 +147,7 @@ from vgi._test_fixtures.table import (
146
147
  NestedSequenceFunction,
147
148
  NonMonotoneBatchIndexFunction,
148
149
  OrderEchoFunction,
150
+ OverlappingRangePartitionedFunction,
149
151
  PartitionedBatchIndexFunction,
150
152
  PartitionedBatchIndexMarkedFunction,
151
153
  PartitionedFixedOrderFunction,
@@ -385,6 +387,7 @@ _EXAMPLE_CATALOG = Catalog(
385
387
  # — see vgi/_test_fixtures/table/partition_columns.py.
386
388
  CountryPartitionedSalesFunction,
387
389
  DisjointRangePartitionedFunction,
390
+ OverlappingRangePartitionedFunction,
388
391
  PartitionedWithExplicitOverrideFunction,
389
392
  RegionYearPartitionedFunction,
390
393
  # Deliberately-broken batch_index fixtures (see
@@ -457,6 +460,7 @@ _EXAMPLE_CATALOG = Catalog(
457
460
  PairTypeIntIntFunction,
458
461
  PairTypeIntStrFunction,
459
462
  PairTypeStrStrFunction,
463
+ QuerySeedFunction,
460
464
  RandomBytesFunction,
461
465
  RandomIntFunction,
462
466
  ReturnSecretValueFunction,
@@ -1621,11 +1625,18 @@ def main() -> None:
1621
1625
  extra is also installed.
1622
1626
  """
1623
1627
  from vgi._test_fixtures.accumulate.worker import AccumulateWorker
1628
+ from vgi._test_fixtures.narrow_bind.worker import NarrowBindWorker
1624
1629
  from vgi._test_fixtures.projection_repro.worker import ProjReproWorker
1625
1630
  from vgi._test_fixtures.schema_reconcile.worker import SchemaReconcileWorker
1626
1631
  from vgi.meta_worker import MetaWorker
1627
1632
 
1628
- workers: list[type] = [ExampleWorker, ProjReproWorker, SchemaReconcileWorker, AccumulateWorker]
1633
+ workers: list[type] = [
1634
+ ExampleWorker,
1635
+ ProjReproWorker,
1636
+ SchemaReconcileWorker,
1637
+ AccumulateWorker,
1638
+ NarrowBindWorker,
1639
+ ]
1629
1640
  try:
1630
1641
  from vgi._test_fixtures.writable.worker import WritableWorker
1631
1642
  except ImportError:
vgi/aggregate_function.py CHANGED
@@ -337,6 +337,22 @@ class AggregateFunction[TState: ArrowSerializableDataclass](vgi.function.Functio
337
337
  The ``states`` dict is pre-populated with ``initial_state()`` for
338
338
  all new group_ids. ``group_ids`` is parallel to each column array.
339
339
 
340
+ IMPORTANT — reassign to persist. Treat state as immutable: to record
341
+ a change you MUST write it back with ``states[gid] = new_state``. The
342
+ framework only persists a group whose entry you *assigned* during this
343
+ call (plus groups already saved from an earlier batch). Mutating the
344
+ existing object in place — e.g. ``states[gid].items.append(x)`` — is
345
+ NOT detected for a group first seen in this batch, so its data is
346
+ silently dropped and ``finalize()`` sees only ``initial_state()``. This
347
+ bites single-group / single-batch aggregates hardest. Do::
348
+
349
+ s = states[gid]
350
+ states[gid] = MyState(items=s.items + new_items) # reassign
351
+
352
+ not::
353
+
354
+ states[gid].items.extend(new_items) # in-place: may be lost
355
+
340
356
  """
341
357
  ...
342
358
 
vgi/arguments.py CHANGED
@@ -159,6 +159,7 @@ __all__ = [
159
159
  "PYTHON_TO_ARROW",
160
160
  "Returns",
161
161
  "TableInput",
162
+ "TaggedUnion",
162
163
  "TypeBoundPredicate",
163
164
  "OutputLength",
164
165
  "Setting",
@@ -168,6 +169,58 @@ __all__ = [
168
169
  ]
169
170
 
170
171
 
172
+ @dataclass(frozen=True, slots=True)
173
+ class TaggedUnion:
174
+ """A decoded union-typed argument: which member is set (``tag``) and its ``value``.
175
+
176
+ DuckDB ``UNION`` / Arrow union arguments are *tagged*: the discriminator
177
+ (which member is present) lives in the Arrow ``UnionScalar.type_code``, not
178
+ in the member value. Plain ``Scalar.as_py()`` returns only the member value
179
+ and drops that tag, so union arguments are decoded into this wrapper
180
+ instead — ``tag`` is the active member's field name and ``value`` is its
181
+ Python value.
182
+
183
+ Example::
184
+
185
+ config: Annotated[TaggedUnion, Arg("config", arrow_type=pa.sparse_union([...]))]
186
+ ...
187
+ cfg = params.args.config # TaggedUnion(tag=..., value=...)
188
+ if cfg.tag == "random_forest_classifier":
189
+ grid = cfg.value # the member struct, as a dict
190
+
191
+ """
192
+
193
+ tag: str | None
194
+ value: Any
195
+
196
+
197
+ def _scalar_to_py(scalar: "Scalar[Any]") -> Any:
198
+ """Convert an argument scalar to a Python value, preserving union tags.
199
+
200
+ Identical to ``scalar.as_py()`` for every type except unions: a
201
+ ``UnionScalar`` is decoded to a [`TaggedUnion`][] so the member
202
+ discriminator (which ``as_py()`` discards) is retained.
203
+
204
+ Args:
205
+ scalar: The argument scalar to convert.
206
+
207
+ Returns:
208
+ ``scalar.as_py()`` for non-union scalars; a [`TaggedUnion`][] for unions.
209
+
210
+ """
211
+ if isinstance(scalar, pa.UnionScalar):
212
+ # Map the active ``type_code`` to its member field name via the union
213
+ # type's parallel ``type_codes`` / ``field()``. (``type_code`` is coerced
214
+ # to int — it is an integer at runtime regardless of the stub's typing.)
215
+ union_type = scalar.type
216
+ type_codes = list(union_type.type_codes)
217
+ code = int(scalar.type_code)
218
+ tag = union_type.field(type_codes.index(code)).name if code in type_codes else None
219
+ inner = scalar.value
220
+ return TaggedUnion(tag=tag, value=inner.as_py() if inner is not None else None)
221
+ return scalar.as_py()
222
+
223
+
171
224
  class TableInput:
172
225
  """Sentinel type for table input parameters in table-in-out functions.
173
226
 
@@ -377,7 +430,7 @@ class Arguments:
377
430
  else:
378
431
  raise TypeError(f"Argument '{key}': expected {type}, got {scalar.type}")
379
432
 
380
- return scalar.as_py()
433
+ return _scalar_to_py(scalar)
381
434
 
382
435
  def get_varargs(
383
436
  self,
@@ -410,7 +463,7 @@ class Arguments:
410
463
  if type is not None and scalar.type != type:
411
464
  raise TypeError(f"Argument {i}: expected {type}, got {scalar.type}")
412
465
 
413
- values.append(scalar.as_py())
466
+ values.append(_scalar_to_py(scalar))
414
467
 
415
468
  return tuple(values)
416
469
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vgi-python
3
- Version: 0.8.1
3
+ Version: 0.8.3
4
4
  Summary: Vector Gateway Interface - Connect DuckDB to external programs via Apache Arrow
5
5
  Project-URL: Homepage, https://query.farm
6
6
  Project-URL: Repository, https://github.com/Query-farm/vgi-python
@@ -162,7 +162,7 @@ Requires-Dist: httpx>=0.24
162
162
  Requires-Dist: platformdirs
163
163
  Requires-Dist: pyarrow
164
164
  Requires-Dist: typer>=0.9
165
- Requires-Dist: vgi-rpc>=0.20.4
165
+ Requires-Dist: vgi-rpc>=0.20.5
166
166
  Provides-Extra: azure
167
167
  Requires-Dist: azure-identity>=1.16.0; extra == 'azure'
168
168
  Requires-Dist: pymssql>=2.3.0; extra == 'azure'
@@ -1,9 +1,9 @@
1
- vgi/__init__.py,sha256=RHimcHtz9s4swAt2q3qFTYBFGWSIpqi9ZXonaaqwuPk,3378
1
+ vgi/__init__.py,sha256=PRtFvXxhHEbY_0KVhyXIbGigZDYKHzMS-4gp0p6IJSQ,3414
2
2
  vgi/_duckdb.py,sha256=YB5D7N3Bwg_xP6X8a5QlumtlAovSej1A1Go5XlNGVko,2162
3
3
  vgi/_storage_profile.py,sha256=VkTsXojuE0tHEzurmteQSAiL1vI3CZSgYkL6D_h8GvE,5061
4
- vgi/aggregate_function.py,sha256=s4u7F3nCNXQw-U0XoEoAh86P9LfXv63MCbTuvZZYWZE,24377
4
+ vgi/aggregate_function.py,sha256=vn9TjQEHxAKJl_xzQOzdj5TY_6LplcZjv06JkQMnUyo,25184
5
5
  vgi/argument_spec.py,sha256=fVO17BDDfjnTMUrRoILNr2oFLTl4KKedMObFUk2GRrI,17072
6
- vgi/arguments.py,sha256=cnM9qsHlnUsyufSGVvQoCB4RjJGF1YfjExl1FMUlnJc,64940
6
+ vgi/arguments.py,sha256=02tIMGIR_cRS73u-bsgpFERxhje-rnTokjBgGsm9pQA,67019
7
7
  vgi/auth.py,sha256=3HD2zM-Mt0Ie-_HT5RorpND1OusUw2CPROjPNs7rgbo,1478
8
8
  vgi/exceptions.py,sha256=oX_sZc9xGWi7Xf8cJQf89fX19i3ocDEj_V_76GINgBQ,7294
9
9
  vgi/function.py,sha256=SLXA3qErHIsDsn4R0nm56z83Kcw12SMZMYXnPnCzhZg,8520
@@ -30,6 +30,7 @@ vgi/table_in_out_function.py,sha256=Ouv02ubP-XsRdByIR0ZSagi6VbZfQ8G1e5QWpqlLqvY,
30
30
  vgi/worker.py,sha256=SGZsOpER_Aew6CDhyi6I8jchigTJJuL7pyV-E-Rz100,195818
31
31
  vgi/_test_fixtures/__init__.py,sha256=xQq-QQLk8USQ6TZHsPiPcZldNNUdcJ2gToXMPGzScLI,444
32
32
  vgi/_test_fixtures/attach_options.py,sha256=YtNk1krDdKLnNVtX9yGkgS8XyzdOI0oXtNDbAV73Phg,11091
33
+ vgi/_test_fixtures/bad_enum.py,sha256=lIVemVWTM1JVOHut47Q71Czw-sEiMjfFiyZ15NDaWAk,2910
33
34
  vgi/_test_fixtures/bad_protocol.py,sha256=L8kxafWZ-4Sd73F6EpCRo1kWVC-09qzof6OYRXazS_8,2495
34
35
  vgi/_test_fixtures/cancellable.py,sha256=hetTRLyT4kkazPNcxAdfzj0pYzEmegMksamJbR0w3Ho,12151
35
36
  vgi/_test_fixtures/catalog.py,sha256=CL6E52_vKr_kgGckT0vQO65uUWBSP518Ulfl8nmSey8,28004
@@ -40,7 +41,7 @@ vgi/_test_fixtures/simple_writable.py,sha256=CGmDBUY8kthauEm8eJN2lV72cJ9_TRxOAQC
40
41
  vgi/_test_fixtures/table_in_out.py,sha256=7QckA3NJhYAYuSctcwZLul5yOM2V3KWvLuG_33K0B_w,50459
41
42
  vgi/_test_fixtures/versioned.py,sha256=Itm-x_Zt9WDwLGT4Dl4VzU5GtFF4HkcaJEqg9ErB8As,5784
42
43
  vgi/_test_fixtures/versioned_tables.py,sha256=KRllGGRrwH8JUtqH-tLHT1JL09rKN-EcEYZVeQdbaLs,22112
43
- vgi/_test_fixtures/worker.py,sha256=hFbvCxbW01A6WD1GNqEBFRJiUGodRS8PWDdUwAyEhPQ,70883
44
+ vgi/_test_fixtures/worker.py,sha256=JPTVPONIoWL9VXN8mINoJH6_Puy1Byj5VOg_fsbw8ws,71171
44
45
  vgi/_test_fixtures/accumulate/__init__.py,sha256=4hYT8jqRoVHSjV9TB7v0Z1CMJtdLuPaDWSz4J2fvMDs,868
45
46
  vgi/_test_fixtures/accumulate/worker.py,sha256=yal9m-GjKNKUdLOLtwkCyFkeHVv_nnpUjh8amwueT48,30163
46
47
  vgi/_test_fixtures/aggregate/__init__.py,sha256=tjCVKdCuHlIAZL7uDi-o_q82oMieXsAyoKExesr-7a0,2156
@@ -53,21 +54,23 @@ vgi/_test_fixtures/aggregate/percentile.py,sha256=S2bm9_BVFrkgxFNoYT7Sz06KQGFDIt
53
54
  vgi/_test_fixtures/aggregate/streaming.py,sha256=yMY819ttLFUzE6ffFXTSdWtBfUcOjfhEYVMqtjUY-dY,7840
54
55
  vgi/_test_fixtures/aggregate/varargs.py,sha256=Wi3mp5q-qHiZqzgqtqtL_0zSO-AR-8qbXssdLGAfEkM,2742
55
56
  vgi/_test_fixtures/aggregate/window.py,sha256=AmIkUCAlDAFVo3tQHuVQTnexYCMJl4MC0gEvbEpcyQ0,13713
57
+ vgi/_test_fixtures/narrow_bind/__init__.py,sha256=NNHq6rwmxlCgOpMrLskDwjCyOA7GXSxBEfB_mOTYEgc,667
58
+ vgi/_test_fixtures/narrow_bind/worker.py,sha256=dEBmtnSpefSTxMV32B_1R5EBpJgzqEtTmLtiG9rGIFM,7430
56
59
  vgi/_test_fixtures/projection_repro/__init__.py,sha256=9CoNsJJbBlR1qqLrL76TkyEk6bNsRZgv_uluj8RTFwY,177
57
60
  vgi/_test_fixtures/projection_repro/worker.py,sha256=ypW4fAVOn5FFnrQJCVl_s722SF-ZKiZ-aUEpPFhy7rQ,15598
58
- vgi/_test_fixtures/scalar/__init__.py,sha256=d2lCvfDbu_bI8Y081nBdg0QtimG8-_nMiO65ZO3zBb0,3637
61
+ vgi/_test_fixtures/scalar/__init__.py,sha256=phgPD2CRm85KHeTpk9tIKGNrPPrxUlHKfWKVDn-BE7c,3685
59
62
  vgi/_test_fixtures/scalar/_common.py,sha256=rI2hJS-bq9wYGzDh73qy1abYktJLjvmN4mNlSEoxhtk,2664
60
63
  vgi/_test_fixtures/scalar/arithmetic.py,sha256=Bl6KQ1iMhe3USIRlV7eRU957I4R6dTidb00VXGTwM7M,10538
61
64
  vgi/_test_fixtures/scalar/binary.py,sha256=sfQR-RdNv4PP_9q77Zii3BCmI7RTz_OQQERFYp_9dTk,4121
62
65
  vgi/_test_fixtures/scalar/formatting.py,sha256=Ingqu4q2xjdhRpM5pYv-a8J-oMkVPeOllaBUbN4nIH0,5342
63
66
  vgi/_test_fixtures/scalar/geo.py,sha256=aciYyzXHauKphdr0Gm4KO_40_c-mnXPBEeHsfnpMULM,9480
64
67
  vgi/_test_fixtures/scalar/null_handling.py,sha256=ED0kJ1XfWPCcv7TzmiJX7tI1GysEPlLULjMgAzrwYcI,3834
65
- vgi/_test_fixtures/scalar/random_demo.py,sha256=83HMDfksBhk5BMoNc8ZK4E3fGyn-En4ETBVtIb-jeSE,5906
68
+ vgi/_test_fixtures/scalar/random_demo.py,sha256=_4R8ePfI2x2o8lolEbFRatN1VFitLb5zhZjRLcEJHPo,7482
66
69
  vgi/_test_fixtures/scalar/settings_secrets.py,sha256=ZOGkGY0CoD3ous_ij0--ss5byB4mbE2vVstmn5Aoj0I,5519
67
70
  vgi/_test_fixtures/scalar/type_info.py,sha256=2WeTxakT-_tcWybPfkCrAHVAMOadFN3tb8E3_EmqIyw,6304
68
71
  vgi/_test_fixtures/schema_reconcile/__init__.py,sha256=rCCtM5bd67-PTPeIYg9SCJaKUSglA6YeXsedQBEUlmA,1324
69
72
  vgi/_test_fixtures/schema_reconcile/worker.py,sha256=qkGRdKvI2AKItenlribd3cvUfvWUwPbAc2WrW7_7Ijc,23570
70
- vgi/_test_fixtures/table/__init__.py,sha256=JiXHTTsdNOS5Lj-ctiTxGaktkuF7COVtoSf0vR3zQRM,7287
73
+ vgi/_test_fixtures/table/__init__.py,sha256=PndeOVcsqi17XLwn0VnmPabjw3tFUfvOQFtETMOCjaU,7371
71
74
  vgi/_test_fixtures/table/_common.py,sha256=tO18gShWidcKcdHYn1FIEXDWzC2SwGsVMnA84r9Y3qs,5961
72
75
  vgi/_test_fixtures/table/batch_index.py,sha256=P5ds0xgikuEQanSEWVWKMLbdvIzUeJraI-GuSoPdb6U,11641
73
76
  vgi/_test_fixtures/table/batch_index_broken.py,sha256=kZOGrLL7ZW1rmwPmNEYRmiF_vqIfHsfXioq5vKPWHk0,7314
@@ -78,7 +81,7 @@ vgi/_test_fixtures/table/make_series.py,sha256=K-G_YNq25Kb7I5bp6XK4rCZzwMYNTxgKH
78
81
  vgi/_test_fixtures/table/misc.py,sha256=71WOIFqk5ntnEIqsG-57rZ9DY7ShQqMKHi7yluNAlM4,16250
79
82
  vgi/_test_fixtures/table/order_modes.py,sha256=FU6CoHCK61VyDdFVbl_MnlgZKGINsDsTwDmv3uD8590,6214
80
83
  vgi/_test_fixtures/table/pairs.py,sha256=idkRTbcLHGOAHG_L0n4QGG0_cuFLrArxFjhIFQKe3Iw,14875
81
- vgi/_test_fixtures/table/partition_columns.py,sha256=WBo3dTgY2ZPqATSR8S2yHijgHdjBN6rkrPH28igwqAY,17271
84
+ vgi/_test_fixtures/table/partition_columns.py,sha256=X3zJIo7P-7-FxHovYKE_reBRugU43H7Msnz4JOxMpJ0,21191
82
85
  vgi/_test_fixtures/table/partition_columns_broken.py,sha256=r4u2Kx5XfUtVeRc-1BRW-kVychJyrrYGdYYuvZnm1MM,11265
83
86
  vgi/_test_fixtures/table/profiling_example.py,sha256=Rt3fgKxJhr7Q8QQRss2-OV13wh2mDXNZjnXW9sBMokQ,6754
84
87
  vgi/_test_fixtures/table/required_filters.py,sha256=RqWVofNCZtaS5wS3TM4ycaZffCk_n0TFOl_B4hU04Vc,7156
@@ -119,8 +122,8 @@ vgi/transactor/_duckdb_compat.py,sha256=sXVZ9JLKAQyGR1BjWczSwdQEavtr-TcZPoVZZnTr
119
122
  vgi/transactor/client.py,sha256=7DTeMksogsw6ANjQjGOPpKYrV76rg4_kGjktMJf54jg,4486
120
123
  vgi/transactor/protocol.py,sha256=Mtmll3CdrLFL1B4NY4NZUTO_yi3PT0qhvMQnzapuBWU,4780
121
124
  vgi/transactor/server.py,sha256=WpIqjzy2Mebw17Jui4-w7vyGEo9pD-pEZJG-3Ob1Sk8,29705
122
- vgi_python-0.8.1.dist-info/METADATA,sha256=imB597Wed-YEoEFF7ko_Nz0UagDGIUddc9JQBGoMjuk,24725
123
- vgi_python-0.8.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
124
- vgi_python-0.8.1.dist-info/entry_points.txt,sha256=3Kz1vgodw3pOL_xjtSyDB55-ZRy-U2X-X_Bdr582x0Q,165
125
- vgi_python-0.8.1.dist-info/licenses/LICENSE,sha256=pbJb4zZasP6n5ifEV81wFu017TarjydaYVmGbHcehtY,6103
126
- vgi_python-0.8.1.dist-info/RECORD,,
125
+ vgi_python-0.8.3.dist-info/METADATA,sha256=1IObmLQGq14cxyOL9Q-3rdWmX5HR9y9br2CJf7az7R0,24725
126
+ vgi_python-0.8.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
127
+ vgi_python-0.8.3.dist-info/entry_points.txt,sha256=3Kz1vgodw3pOL_xjtSyDB55-ZRy-U2X-X_Bdr582x0Q,165
128
+ vgi_python-0.8.3.dist-info/licenses/LICENSE,sha256=pbJb4zZasP6n5ifEV81wFu017TarjydaYVmGbHcehtY,6103
129
+ vgi_python-0.8.3.dist-info/RECORD,,