vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/protocol.py ADDED
@@ -0,0 +1,2418 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """VGI protocol definition for vgi_rpc server integration.
4
+
5
+ Defines the VgiProtocol, consolidated request types (BindRequest, InitRequest),
6
+ catalog request/response types, and StreamState implementations for each function type.
7
+
8
+ VgiProtocol Methods
9
+ -------------------
10
+ - **bind()**: Schema resolution and argument validation (unary)
11
+ - **init()**: Worker initialization, returns a Stream for data processing
12
+ - **catalog_*()**: ~35 typed catalog interface methods (unary)
13
+
14
+ StreamState Implementations
15
+ ---------------------------
16
+ - **ScalarExchangeState**: Calls ScalarFunctionGenerator.process() per batch
17
+ - **TableProducerState**: Calls TableFunctionGenerator.process() per tick
18
+ - **TableInOutExchangeState**: Calls TableInOutGenerator.process() per input
19
+ - **BufferedFinalizeState**: Drains a state_log via cursor for streaming-shape
20
+ FINALIZE phase of TableInOutGenerator
21
+
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import base64
27
+ import contextlib
28
+ import dataclasses
29
+ import logging
30
+ from dataclasses import dataclass, field
31
+ from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Protocol, get_args, get_origin
32
+
33
+ import pyarrow as pa
34
+ import pyarrow.compute as pc
35
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType, Transient
36
+ from vgi_rpc.rpc import (
37
+ AnnotatedBatch,
38
+ CallContext,
39
+ ExchangeState,
40
+ OutputCollector,
41
+ ProducerState,
42
+ Stream,
43
+ )
44
+
45
+ from vgi.arguments import Arguments
46
+ from vgi.catalog.catalog_interface import (
47
+ CatalogAttachResult,
48
+ CatalogInfo,
49
+ FunctionInfo,
50
+ IndexConstraintType,
51
+ IndexInfo,
52
+ MacroInfo,
53
+ MacroType,
54
+ OnConflict,
55
+ PartitionKind,
56
+ SchemaInfo,
57
+ SchemaObjectType,
58
+ TableInfo,
59
+ ViewInfo,
60
+ )
61
+ from vgi.function_storage import BoundStorage, attach_catalog_bytes
62
+ from vgi.invocation import BindResponse, FunctionType, GlobalInitResponse
63
+ from vgi.otel import VgiTracer, _batch_bytes, _timed_exchange, get_noop_tracer
64
+ from vgi.scalar_function import ScalarFunctionGenerator
65
+ from vgi.table_function import (
66
+ OrderByDirection,
67
+ OrderByNullOrder,
68
+ ProcessParams,
69
+ SecretsAccessor,
70
+ TableCardinality,
71
+ TableFunctionBase,
72
+ TableFunctionGenerator,
73
+ TableInOutFunctionInitPhase,
74
+ _batch_to_scalar_dict,
75
+ _effective_projection_ids,
76
+ project_schema,
77
+ )
78
+ from vgi.table_in_out_function import TableInOutGenerator
79
+
80
+ __all__ = [
81
+ "BindRequest",
82
+ "CatalogAttachRequest",
83
+ "CatalogCreateRequest",
84
+ "CatalogsResponse",
85
+ "IndexCreateRequest",
86
+ "IndexesResponse",
87
+ "MacroCreateRequest",
88
+ "MacrosResponse",
89
+ "TableCreateRequest",
90
+ "CatalogVersionResponse",
91
+ "FunctionsResponse",
92
+ "InitRequest",
93
+ "ProcessState",
94
+ "ScalarExchangeState",
95
+ "SchemasResponse",
96
+ "TableFunctionDynamicToStringRequest",
97
+ "TableFunctionDynamicToStringResponse",
98
+ "TableInOutExchangeState",
99
+ "BufferedFinalizeState",
100
+ "TableProducerState",
101
+ "TableBufferingCombineRequest",
102
+ "TableBufferingCombineResponse",
103
+ "TableBufferingDestructorRequest",
104
+ "TableBufferingDestructorResponse",
105
+ "TableBufferingFinalizeState",
106
+ "TableBufferingProcessRequest",
107
+ "TableBufferingProcessResponse",
108
+ "TablesResponse",
109
+ "TransactionBeginResponse",
110
+ "VgiProtocol",
111
+ "ViewsResponse",
112
+ ]
113
+
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # Request types
117
+ # ---------------------------------------------------------------------------
118
+
119
+
120
@dataclass(frozen=True, slots=True, kw_only=True)
class BindRequest(ArrowSerializableDataclass):
    """Single bind request shape shared by every function type.

    ``input_schema`` is ``None`` for table functions (they take no input
    schema) and is set for scalar and table-in-out functions.

    """

    function_name: str
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    function_type: FunctionType
    input_schema: Annotated[pa.Schema | None, ArrowType(pa.binary())] = None
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    attach_opaque_data: bytes | None = None
    transaction_opaque_data: bytes | None = None
    resolved_secrets_provided: bool = False

    # Time travel: the AT (TIMESTAMP|VERSION ...) clause of this scan, passed
    # through from DuckDB's per-reference bind. Both fields are None when the
    # scan carries no AT clause.
    # NOTE: inline-bound (function-backed) tables run the *actual* on_bind RPC
    # once at attach, without an AT clause, so both are None there. The
    # per-scan AT travels on the bind request embedded in each InitRequest, so
    # functions should read it at init via
    # ``params.init_call.bind_call.at_value`` (or the
    # ``ProcessParams.at_value`` accessor) rather than in on_bind.
    at_unit: str | None = None
    at_value: str | None = None
148
+
149
+
150
@dataclass(frozen=True, slots=True, kw_only=True)
class InitRequest(ArrowSerializableDataclass):
    """Single init request shape shared by every function type.

    A secondary init request carries ``execution_id`` and
    ``init_opaque_data``; check :attr:`is_secondary` to tell the two kinds
    apart.

    """

    # --- Core fields (always present) ---------------------------------
    bind_call: BindRequest
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Wire-facing: the bytes the framework produced from the typed
    # ``BindResult.opaque_data``. Consumers rebuild the typed value with
    # ``MyConcreteDataclass.deserialize_from_bytes(raw)``. The full contract
    # rationale lives on ``BindResponse.opaque_data`` in vgi/invocation.py
    # (typed producer / bytes wire / explicit consumer; reconstructing from an
    # abstract base is not possible in Python without a class registry).
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # --- Table function extras (None for scalar) ----------------------
    projection_ids: list[int] | None = None
    pushdown_filters: Annotated[pa.RecordBatch | None, ArrowType(pa.large_binary())] = None
    join_keys: Annotated[list[pa.RecordBatch] | None, ArrowType(pa.list_(pa.large_binary()))] = None

    # --- Table-in-out extras ------------------------------------------
    phase: TableInOutFunctionInitPhase | None = None
    # Buffered-table finalize stream: the state_id this stream serves.
    # Required when phase=TABLE_BUFFERING_FINALIZE and None otherwise. The
    # bytes are opaque; the worker picked the encoding when its combine()
    # returned the finalize_state_ids list.
    finalize_state_id: bytes | None = None

    # --- Secondary init (None = global init, set = secondary) ---------
    execution_id: bytes | None = None
    # Follows the same contract as ``bind_opaque_data`` in this class.
    init_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # --- Order pushdown hint from DuckDB's RowGroupPruner optimizer ---
    # All four are None when there is no hint.
    order_by_column_name: str | None = None
    order_by_direction: OrderByDirection | None = None
    order_by_null_order: OrderByNullOrder | None = None
    order_by_limit: int | None = None

    # --- TABLESAMPLE pushdown hint from DuckDB's SamplingPushdown -----
    # Both are None when there is no hint.
    tablesample_percentage: float | None = None
    tablesample_seed: int | None = None

    @property
    def is_secondary(self) -> bool:
        """Return True when this request is a secondary init (``execution_id`` is set)."""
        return self.execution_id is not None
203
+
204
+
205
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionCardinalityRequest(ArrowSerializableDataclass):
    """Request asking a table function for its cardinality."""

    bind_call: BindRequest
    # Bytes-on-the-wire, same contract as InitRequest.bind_opaque_data.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
212
+
213
+
214
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionStatisticsRequest(ArrowSerializableDataclass):
    """Request asking a table function for per-column statistics.

    Shaped like TableFunctionCardinalityRequest: the worker is handed a full
    copy of the original BindRequest (parsed Arguments included), which lets
    it derive per-column stats from the user-supplied args.
    """

    bind_call: BindRequest
    # Bytes-on-the-wire, same contract as InitRequest.bind_opaque_data.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
226
+
227
+
228
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionDynamicToStringRequest(ArrowSerializableDataclass):
    """Profile-info request sent after execution, once per scan thread.

    ``global_execution_id`` lets the function class look up whatever
    diagnostics it persisted during ``process()`` (shared storage, an
    external service, in-memory class state for single-worker setups, etc.).
    VGI does not serialize per-thread ``_user_state`` across the boundary —
    persistence is the user's responsibility.
    """

    bind_call: BindRequest
    # Bytes-on-the-wire, same contract as InitRequest.bind_opaque_data.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
    # Required (no default); legal after a defaulted field because the
    # dataclass is kw_only.
    global_execution_id: bytes
243
+
244
+
245
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionDynamicToStringResponse(ArrowSerializableDataclass):
    """Ordered key/value pairs shown as Extra Info under EXPLAIN ANALYZE.

    ``keys`` and ``values`` are parallel lists so that insertion order is
    explicit on the wire. The C++ side rebuilds them into an
    ``InsertionOrderPreservingMap<string>``.
    """

    keys: Annotated[list[str], ArrowType(pa.list_(pa.string()))]
    values: Annotated[list[str], ArrowType(pa.list_(pa.string()))]
256
+
257
+
258
+ # ---------------------------------------------------------------------------
259
+ # Catalog request types (for methods with complex parameters)
260
+ # ---------------------------------------------------------------------------
261
+
262
+
263
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogAttachRequest(ArrowSerializableDataclass):
    """Request payload for catalog_attach; options travel as a RecordBatch.

    ``data_version_spec`` and ``implementation_version`` hold the semver
    strings (concrete or range) the user gave at ATTACH time. ``None`` means
    unconstrained. Interpreting and validating them is the worker's job.
    """

    name: str
    # A RecordBatch is used so options of mixed types can be carried.
    options: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    # Both required (no default); legal after a defaulted field because the
    # dataclass is kw_only.
    data_version_spec: str | None
    implementation_version: str | None
277
+
278
+
279
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogCreateRequest(ArrowSerializableDataclass):
    """Request payload for catalog_create; options travel as a RecordBatch."""

    name: str
    on_conflict: OnConflict
    # A RecordBatch is used so options of mixed types can be carried.
    options: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
286
+
287
+
288
@dataclass(frozen=True, slots=True, kw_only=True)
class TableCreateRequest(ArrowSerializableDataclass):
    """Request payload for catalog_table_create, including its constraint lists."""

    attach_opaque_data: bytes
    schema_name: str
    name: str
    columns: bytes  # SerializedSchema
    on_conflict: OnConflict
    # Every constraint list defaults to empty (a fresh list per instance).
    not_null_constraints: Annotated[
        list[int],
        ArrowType(pa.list_(pa.int32())),
    ] = field(default_factory=list)
    unique_constraints: Annotated[
        list[list[int]],
        ArrowType(pa.list_(pa.list_(pa.int32()))),
    ] = field(default_factory=list)
    check_constraints: list[str] = field(default_factory=list)
    primary_key_constraints: Annotated[
        list[list[int]],
        ArrowType(pa.list_(pa.list_(pa.int32()))),
    ] = field(default_factory=list)
    foreign_key_constraints: Annotated[
        list[bytes],
        ArrowType(pa.list_(pa.binary())),
    ] = field(default_factory=list)
    transaction_opaque_data: bytes | None = None
307
+
308
+
309
+ # ---------------------------------------------------------------------------
310
+ # Catalog response types
311
+ # ---------------------------------------------------------------------------
312
+
313
+
314
+ # ``CatalogsResponse`` is generated below via ``_catalog_items_response`` once
315
+ # that factory is defined — it wraps a list of CatalogInfo records serialized
316
+ # as bytes, matching the pattern used for other list[Info] responses.
317
+
318
+
319
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogVersionResponse(ArrowSerializableDataclass):
    """Response for catalog_version(): a single integer version."""

    version: int
324
+
325
+
326
@dataclass(frozen=True, slots=True, kw_only=True)
class TransactionBeginResponse(ArrowSerializableDataclass):
    """Response for catalog_transaction_begin(): optional TransactionOpaqueData bytes."""

    transaction_opaque_data: bytes | None = None
331
+
332
+
333
def _catalog_items_response(item_type: type) -> type:
    """Build a response dataclass carrying a list of serialized ``item_type`` records.

    The returned class has one field, ``items`` (one IPC-serialized blob per
    record), plus conversion helpers:

    - ``from_infos(items)`` / ``from_optional(item)`` — serialize into a response
    - ``to_infos()`` / ``to_optional()`` — deserialize back out of a response

    ``item_type`` must offer ``serialize_to_bytes()`` and
    ``deserialize_from_bytes()`` (i.e. be an ArrowSerializableDataclass).
    """
    type_name = item_type.__name__

    @dataclass(frozen=True, slots=True, kw_only=True)
    class _Response(ArrowSerializableDataclass):
        items: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]

        @staticmethod
        def from_infos(infos: list) -> _Response:  # type: ignore[type-arg]
            serialized = [info.serialize_to_bytes() for info in infos]
            return _Response(items=serialized)

        @staticmethod
        def from_optional(info: object | None) -> _Response:
            # Zero-or-one encoding: an absent item becomes an empty list.
            payload = [] if info is None else [info.serialize_to_bytes()]  # type: ignore[attr-defined]
            return _Response(items=payload)

        def to_infos(self) -> list:  # type: ignore[type-arg]
            return [item_type.deserialize_from_bytes(blob) for blob in self.items]  # type: ignore[attr-defined]

        def to_optional(self) -> object | None:
            # Only the first item is decoded; an empty list means "no item".
            return item_type.deserialize_from_bytes(self.items[0]) if self.items else None  # type: ignore[attr-defined,no-any-return]

    # Rename the class so vgi_rpc introspection and repr show something
    # meaningful: "TableInfo" -> "TablesResponse", "IndexInfo" -> "IndexesResponse".
    stem = type_name.removesuffix("Info")
    plural_suffix = "es" if stem.endswith(("x", "s", "sh", "ch")) else "s"
    class_name = f"{stem}{plural_suffix}Response"
    _Response.__name__ = class_name
    _Response.__qualname__ = class_name
    _Response.__doc__ = f"Response wrapping list of {type_name}."

    return _Response
377
+
378
+
379
if TYPE_CHECKING:
    from typing import Self

    # Static-analysis-only shapes: the real response classes are generated at
    # runtime (else-branch), so mypy is given explicit declarations here.
    class _CatalogItemsResponseStub(ArrowSerializableDataclass):
        items: list[bytes]

        @classmethod
        def from_infos(cls, infos: list[Any]) -> Self: ...

        @classmethod
        def from_optional(cls, info: object | None) -> Self: ...

        def to_infos(self) -> list[Any]: ...

        def to_optional(self) -> Any: ...

    class CatalogsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of CatalogInfo."""

    class SchemasResponse(_CatalogItemsResponseStub):
        """Response wrapping list of SchemaInfo."""

    class TablesResponse(_CatalogItemsResponseStub):
        """Response wrapping list of TableInfo."""

    class ViewsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of ViewInfo."""

    class FunctionsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of FunctionInfo."""

    class MacrosResponse(_CatalogItemsResponseStub):
        """Response wrapping list of MacroInfo."""
else:
    # Runtime: one generated response class per catalog item type.
    CatalogsResponse = _catalog_items_response(CatalogInfo)
    SchemasResponse = _catalog_items_response(SchemaInfo)
    TablesResponse = _catalog_items_response(TableInfo)
    ViewsResponse = _catalog_items_response(ViewInfo)
    FunctionsResponse = _catalog_items_response(FunctionInfo)
    MacrosResponse = _catalog_items_response(MacroInfo)
420
+
421
+
422
@dataclass(frozen=True, slots=True, kw_only=True)
class MacroCreateRequest(ArrowSerializableDataclass):
    """Request payload for catalog_macro_create; parameter defaults travel as a RecordBatch."""

    attach_opaque_data: bytes
    schema_name: str
    name: str
    macro_type: MacroType
    parameters: list[str]
    definition: str
    on_conflict: OnConflict
    # Optional batch of default values for the macro's parameters.
    parameter_default_values: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    transaction_opaque_data: bytes | None = None
435
+
436
+
437
if TYPE_CHECKING:

    # Static-analysis-only shape; the runtime class is generated in the
    # else-branch.
    class IndexesResponse(_CatalogItemsResponseStub):  # noqa: E302
        """Response wrapping list of IndexInfo."""
else:
    IndexesResponse = _catalog_items_response(IndexInfo)
443
+
444
+
445
@dataclass(frozen=True, slots=True, kw_only=True)
class IndexCreateRequest(ArrowSerializableDataclass):
    """Request payload for catalog_index_create."""

    # Required identification of the index and the table it belongs to.
    attach_opaque_data: bytes
    schema_name: str
    name: str
    table_name: str
    # Optional details; each falls back to the default shown.
    index_type: str = ""
    constraint_type: IndexConstraintType = IndexConstraintType.NONE
    expressions: list[str] = field(default_factory=list)
    on_conflict: OnConflict = OnConflict.ERROR
    options: dict[str, str] = field(default_factory=dict)
    transaction_opaque_data: bytes | None = None
459
+
460
+
461
+ # ---------------------------------------------------------------------------
462
+ # StreamState implementations
463
+ # ---------------------------------------------------------------------------
464
+
465
+
466
# ScalarExchangeState: per-stream state object for scalar functions. Each
# exchange() call runs one input batch through the function class's process()
# and emits exactly one output batch.
# NOTE(review): this listing is a diff rendering in which leading indentation
# is not visible, so the original lines are kept verbatim and only comments
# are added.
+ @dataclass
467
+ class ScalarExchangeState(ExchangeState):
468
+ """Exchange state for scalar function streams.
469
+
470
+ Calls ``ScalarFunctionGenerator.process()`` per batch. Each ``exchange()``
471
+ call sends one input batch and receives one output batch.
472
+
473
+ ``_init_call`` and ``_init_response`` are serialized into the state token
474
+ so they survive HTTP round-trips. ``_func_cls`` is transient and restored
475
+ via ``rehydrate()``.
476
+
477
+ """
478
+
# Fields: _init_call / _init_response are annotated as Arrow binary;
# _func_cls and _vgi_tracer are annotated Transient() and are re-populated by
# rehydrate(). All fields default to None (or a no-op tracer) and are hidden
# from repr.
479
+ _init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False) # type: ignore[assignment]
480
+ _init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False) # type: ignore[assignment]
481
+ # Full framework attach plaintext (uuid||catalog_bytes) persisted through
482
+ # serialization so each exchange can shard storage on its UUID without
483
+ # re-unwrapping (the auth-scoped seal can't be reopened, and ctx.implementation
484
+ # is the MetaWorker under subprocess transport).
485
+ _plaintext_attach: bytes | None = field(default=None, repr=False)
486
+ _func_cls: Annotated[type[ScalarFunctionGenerator], Transient()] = field(default=None, repr=False) # type: ignore[assignment]
487
+ _vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)
488
+
# rehydrate(): treats ``implementation`` as a vgi.worker.Worker, resolves the
# function class from the stored bind call via worker._resolve_function(), and
# copies the worker's tracer. Worker is imported inside the method body rather
# than at module top.
489
+ def rehydrate(self, implementation: object) -> None:
490
+ """Restore ``_func_cls`` from the worker's function registry."""
491
+ from vgi.worker import Worker
492
+
493
+ worker: Worker = implementation # type: ignore[assignment]
494
+ self._func_cls = worker._resolve_function(self._init_call.bind_call) # type: ignore[assignment]
495
+ self._vgi_tracer = worker._vgi_tracer
496
+
# exchange(): processes input.batch with self._func_cls.process(), validates
# the output row count, records timing metrics, and emits the output through
# ``out``. ``ctx.auth`` is forwarded to process() as auth_context.
497
+ def exchange(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
498
+ """Process one input batch through the scalar function."""
499
+ cls = self._func_cls
500
+ batch = input.batch
501
+
502
+ # Workaround: over HTTP, 0-column batches lose their row count because
503
+ # Arrow IPC RecordBatch messages with no arrays default to length 0.
504
+ # When a scalar function has no column inputs (e.g. "SELECT func()"),
505
+ # the caller expects 1 output row but sends num_rows=0. Add a dummy
506
+ # column so PyArrow preserves the row count, then strip it before
507
+ # validation.
# NOTE(review): the comment above says the dummy column is stripped before
# validation, but no stripping is visible here; the injected one-row batch is
# what gets passed to both process() and _validate_row_count().
508
+ inject_row = batch.num_columns == 0 and batch.num_rows == 0
509
+ if inject_row:
510
+ batch = pa.record_batch({"__row": pa.array([True])})
511
+
512
+ timer = _timed_exchange(
513
+ self._vgi_tracer,
514
+ "vgi.execute.scalar",
515
+ self._init_call.bind_call.function_name,
516
+ self._init_call.bind_call.function_type.value,
517
+ self._init_response.execution_id,
518
+ )
519
+ with timer:
520
+ output = cls.process(
521
+ batch=batch,
522
+ init_call=self._init_call,
523
+ init_response=self._init_response,
524
+ # Shard on the UUID of the full attach plaintext persisted at init.
525
+ storage=BoundStorage(
526
+ cls.storage,
527
+ self._init_response.execution_id,
528
+ attach_plaintext=self._plaintext_attach,
529
+ ),
530
+ auth_context=ctx.auth,
531
+ )
# Both branches validate against the batch that was given to process(): when
# no row was injected, ``batch`` is still ``input.batch``.
532
+ if inject_row:
533
+ cls._validate_row_count(output, batch)
534
+ else:
535
+ cls._validate_row_count(output, input.batch)
# Metrics are taken from the original ``input.batch``, not the injected batch,
# so input_rows is 0 in the injected-row case.
# NOTE(review): whether timer.record() and out.emit() run inside
# ``with timer:`` cannot be confirmed from this rendering.
536
+ timer.record(
537
+ input_rows=input.batch.num_rows,
538
+ output_rows=output.num_rows,
539
+ input_bytes=_batch_bytes(input.batch),
540
+ output_bytes=_batch_bytes(output),
541
+ )
542
+ out.emit(output)
543
+
544
+
545
+ _log = logging.getLogger(__name__)
546
+
547
+
548
def _resolve_state_type(func_cls: type) -> type[ArrowSerializableDataclass] | None:
    """Find the ``TState`` type argument a generator class was parameterized with.

    Scans every class in ``func_cls``'s MRO for a generic base of the form
    ``TableFunctionGenerator[TArgs, TState]`` or
    ``TableInOutGenerator[TArgs, TState]``. The first ``TState`` that is a
    concrete ``ArrowSerializableDataclass`` subclass is returned.

    Returns:
        The state class, or ``None`` when no suitable parameterization exists
        (including when ``TState`` is ``NoneType`` or is not a class at all).

    Raises:
        TypeError: ``TState`` is a concrete class that does not extend
            ArrowSerializableDataclass. Failing here surfaces the problem
            early instead of silently falling back to initial_state() on each
            HTTP exchange.
    """
    generator_bases = (TableFunctionGenerator, TableInOutGenerator)
    for klass in func_cls.__mro__:
        for base in getattr(klass, "__orig_bases__", ()):
            origin = get_origin(base)
            if origin is None or not issubclass(origin, generator_bases):
                continue
            type_args = get_args(base)
            if len(type_args) < 2:
                continue
            candidate = type_args[1]
            if not isinstance(candidate, type):
                # A TypeVar or other non-class argument; keep scanning.
                continue
            if issubclass(candidate, ArrowSerializableDataclass):
                return candidate
            if candidate is not type(None):
                raise TypeError(
                    f"{func_cls.__name__}: TState type {candidate.__name__} must extend "
                    f"ArrowSerializableDataclass for HTTP state serialization."
                )
    return None
580
+
581
+
582
def _partition_fields_from_schema(bind_schema: pa.Schema) -> list[pa.Field[Any]]:
    """Collect the fields of a bind schema that are marked as partition columns.

    A field counts as a partition column when its metadata holds
    ``vgi.partition_column = b"true"``, the marker written by
    :func:`vgi.schema_utils.partition_field`. The table-producer harness
    calls this once at wrapper construction so per-emit validation only has
    to walk the P partition columns (O(P)).
    """
    from vgi.schema_utils import VGI_PARTITION_COLUMN_KEY

    # Fields with no metadata at all are skipped; schema order is preserved.
    return [
        candidate
        for candidate in bind_schema
        if (meta := candidate.metadata) is not None and meta.get(VGI_PARTITION_COLUMN_KEY) == b"true"
    ]
599
+
600
+
601
def _resolve_partition_min_max(
    field: pa.Field[Any],
    partition_kind: PartitionKind,
    batch: pa.RecordBatch,
    explicit: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None,
) -> tuple[pa.Scalar[Any], pa.Scalar[Any]]:
    """Work out the ``(min, max)`` pair for a single partition column.

    There are two sources:

    * Explicit — ``explicit[field.name]`` must be a 2-tuple of ``pa.Scalar``
      values, both typed exactly as ``field.type``; it is returned as given.
    * Auto-extract — the column is read from ``batch`` and its min/max are
      computed. Under SINGLE_VALUE_PARTITIONS the column may hold at most one
      distinct non-null value.

    Raises:
        RuntimeError: the explicit entry is malformed or mistyped, the column
            is missing from ``batch``, or a SINGLE_VALUE_PARTITIONS column
            holds more than one distinct non-null value.
    """
    if explicit is not None and field.name in explicit:
        supplied = explicit[field.name]
        if not (isinstance(supplied, tuple) and len(supplied) == 2):
            raise RuntimeError(f"partition_values[{field.name!r}] must be (min, max) tuple; got {supplied!r}")
        min_s, max_s = supplied
        if not (isinstance(min_s, pa.Scalar) and isinstance(max_s, pa.Scalar)):
            raise RuntimeError(
                f"partition_values[{field.name!r}] elements must be pa.Scalar; "
                f"got ({type(min_s).__name__}, {type(max_s).__name__})"
            )
        if min_s.type != field.type:
            raise RuntimeError(
                f"partition_values[{field.name!r}] min type mismatch: declared {field.type}, got {min_s.type}"
            )
        if max_s.type != field.type:
            raise RuntimeError(
                f"partition_values[{field.name!r}] max type mismatch: declared {field.type}, got {max_s.type}"
            )
        return min_s, max_s

    # No explicit entry: derive the bounds from the emitted data.
    try:
        values = batch.column(field.name)
    except KeyError as exc:
        raise RuntimeError(
            f"column {field.name!r} is partition-annotated but absent from emitted batch; "
            f"pass partition_values={{{field.name!r}: (pa.scalar(...), pa.scalar(...))}}"
        ) from exc

    if partition_kind == PartitionKind.SINGLE_VALUE_PARTITIONS:
        # SINGLE_VALUE allows at most one distinct non-null value. A column
        # that is entirely NULL is fine: DuckDB routes NULL as its own
        # partition (Value::NotDistinctFrom(NULL, NULL) is true).
        present = pc.drop_null(values)
        if len(present) > 0:
            distinct = pc.unique(present)
            if len(distinct) > 1:
                raise RuntimeError(
                    f"column {field.name!r} has {len(distinct)} distinct values; "
                    f"partition_kind=SINGLE_VALUE_PARTITIONS requires 1"
                )

    # ``pa.compute.min_max`` yields a struct scalar with "min" and "max"
    # fields. For an all-null column both come back as nulls of the column's
    # type, which is the desired result.
    bounds = pc.min_max(values)
    return bounds["min"], bounds["max"]
663
+
664
+
665
def _build_partition_values_batch(
    partition_fields: list[pa.Field[Any]],
    resolved: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]],
) -> pa.RecordBatch:
    """Assemble the two-row RecordBatch carrying each column's min and max.

    Row 0 holds the minimum and row 1 the maximum of every partition
    column, in ``partition_fields`` order. The batch schema is built from
    the partition fields themselves.
    """
    columns: list[pa.Array[Any]] = []
    for part_field in partition_fields:
        lower, upper = resolved[part_field.name]
        # The scalars were already checked against the field type during
        # resolution, so passing ``type=`` only pins the storage layout.
        columns.append(pa.array([lower, upper], type=part_field.type))
    return pa.RecordBatch.from_arrays(columns, schema=pa.schema(partition_fields))
681
+
682
+
683
def _serialize_partition_values_batch(batch: pa.RecordBatch) -> str:
    """Encode ``batch`` as a base64 ASCII string of its Arrow IPC stream.

    Follows the same ``#b64`` convention as ``vgi_rpc.stream_state#b64``.
    """
    stream = pa.BufferOutputStream()
    with pa.ipc.new_stream(stream, batch.schema) as ipc_writer:
        ipc_writer.write_batch(batch)
    # The writer is closed at this point, so the buffer holds a complete stream.
    ipc_bytes = stream.getvalue().to_pybytes()
    encoded = base64.b64encode(ipc_bytes)
    return encoded.decode("ascii")
692
+
693
+
694
def _merge_partition_values(
    *,
    partition_fields: list[pa.Field[Any]],
    partition_kind: PartitionKind,
    batch: pa.RecordBatch,
    partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None,
    metadata: dict[str, str] | None,
) -> dict[str, str] | None:
    """Check the ``partition_values`` kwarg and attach its encoding to the metadata.

    The result is stored under the ``vgi_partition_values#b64`` key of a
    copy of ``metadata``; the caller's dict is never mutated.

    Rules:

    * No partition fields: ``partition_values`` must be ``None`` (a
      supplied value means the caller forgot to annotate the columns and
      would otherwise be dropped silently). ``metadata`` is returned as-is.
    * Partition fields present but ``batch`` has zero rows: ``metadata`` is
      returned as-is; the C++ extension does not check 0-row batches.
    * Otherwise: every partition field is resolved to ``(min, max)`` with
      :func:`_resolve_partition_min_max`, the pairs are packed into a
      two-row batch, serialized to IPC, base64-encoded and merged in.

    Raises:
        RuntimeError: ``partition_values`` was given without any
            partition-annotated field, or resolution of a field failed.
    """
    if not partition_fields:
        if partition_values is None:
            return metadata
        raise RuntimeError(
            "out.emit(partition_values=...) requires partition-annotated fields "
            "in the bind schema. Use vgi.schema_utils.partition_field() to mark "
            "the column(s) and set Meta.partition_kind to a non-default value."
        )

    if batch.num_rows == 0:
        # Nothing to describe: skip the IPC + base64 work entirely.
        return metadata

    resolved: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] = {
        part_field.name: _resolve_partition_min_max(part_field, partition_kind, batch, partition_values)
        for part_field in partition_fields
    }
    encoded = _serialize_partition_values_batch(_build_partition_values_batch(partition_fields, resolved))

    # Copy-then-set so an existing entry under the same key is overwritten.
    combined: dict[str, str] = {**metadata} if metadata else {}
    combined["vgi_partition_values#b64"] = encoded
    return combined
751
+
752
+
753
+ def _merge_batch_index(
754
+ *,
755
+ supports_batch_index: bool,
756
+ batch_index: int | None,
757
+ metadata: dict[str, str] | None,
758
+ ) -> dict[str, str] | None:
759
+ """Validate the batch_index kwarg and fold it into the emit metadata dict.
760
+
761
+ Contract:
762
+ * If ``supports_batch_index`` is True, ``batch_index`` MUST be supplied.
763
+ Forgetting the kwarg on an opted-in function is a programming error
764
+ that would otherwise produce a data batch with no
765
+ ``vgi_batch_index`` metadata — the C++ extension would raise an
766
+ IOException at scan time; raising here gives the worker author a
767
+ clearer line number.
768
+ * If ``supports_batch_index`` is False, ``batch_index`` MUST NOT be
769
+ supplied — catches "I forgot to set the Meta flag" bugs.
770
+ * The merged value is a decimal-string of the int (matches the wire
771
+ convention used by ``vgi_filter_version`` / ``vgi_join_keys_version``
772
+ elsewhere in the codebase).
773
+ """
774
+ if supports_batch_index:
775
+ if batch_index is None:
776
+ raise RuntimeError("out.emit() requires batch_index= on a function with Meta.supports_batch_index = True")
777
+ else:
778
+ if batch_index is not None:
779
+ raise RuntimeError("out.emit(batch_index=...) requires Meta.supports_batch_index = True")
780
+ if batch_index is None:
781
+ return metadata
782
+ merged: dict[str, str] = dict(metadata) if metadata else {}
783
+ merged["vgi_batch_index"] = str(batch_index)
784
+ return merged
785
+
786
+
787
class VgiOutputCollector(Protocol):
    """Structural type describing the ``out`` object a table function receives.

    The emit-path wrappers in this module
    (:class:`_TrackingOutputCollector`, :class:`_FilteringOutputCollector`)
    widen vgi-rpc's ``OutputCollector.emit`` with the ``batch_index=`` and
    ``partition_values=`` keyword arguments. A function body that uses
    either feature ``cast``s the framework-supplied ``out`` to this
    protocol before calling ``emit``; the base ``OutputCollector`` type
    cannot carry the wider signature without breaking ``process()``
    override compatibility across every fixture.
    """

    def emit(
        self,
        batch: pa.RecordBatch,
        batch_index: int | None = None,
        partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Emit one data batch with optional batch index, partition values and metadata."""

    def finish(self) -> None:
        """Signal that no further batches will be emitted."""

    def client_log(self, level: Any, message: str, **extra: str) -> None:
        """Send a log message to the client."""
811
+
812
+
813
class _FilteringOutputCollector:
    """Output wrapper that runs the pushdown filter over every emitted batch.

    ``emit()`` filters the batch with the function class's
    ``_apply_pushdown_filter`` and forwards the result to the wrapped
    :class:`_TrackingOutputCollector`. The ``batch_index=``,
    ``partition_values=`` and ``metadata=`` keyword arguments are passed
    through untouched: their validation belongs to the innermost wrapper so
    that it runs exactly once no matter how the wrappers are stacked.
    """

    __slots__ = ("_inner", "_func_cls", "_filters")

    def __init__(self, inner: _TrackingOutputCollector, func_cls: type[TableFunctionBase[Any]], filters: Any) -> None:
        """Store the wrapped collector, the function class and its filters."""
        self._inner = inner
        self._func_cls = func_cls
        self._filters = filters

    def emit(
        self,
        batch: pa.RecordBatch,
        batch_index: int | None = None,
        partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Filter ``batch`` and forward it, with all kwargs, to the inner collector."""
        surviving_rows = self._func_cls._apply_pushdown_filter(batch, self._filters)
        forwarded_kwargs = {
            "batch_index": batch_index,
            "partition_values": partition_values,
            "metadata": metadata,
        }
        self._inner.emit(surviving_rows, **forwarded_kwargs)

    def emit_pydict(self, data: dict[str, Any], schema: pa.Schema | None = None) -> None:
        """Build a batch from ``data`` and emit it through the filtering path.

        Falls back to the inner collector's output schema when ``schema`` is
        not supplied (or is falsy).
        """
        target_schema = schema if schema else self._inner.output_schema
        self.emit(pa.RecordBatch.from_pydict(data, schema=target_schema))

    def finish(self) -> None:
        """Delegate end-of-stream to the inner collector."""
        self._inner.finish()

    @property
    def finished(self) -> bool:
        """Whether the inner collector reports the stream as finished."""
        return self._inner.finished

    def emit_client_log_message(self, msg: Any) -> None:
        """Forward a prebuilt client log message to the inner collector."""
        self._inner.emit_client_log_message(msg)

    def client_log(self, level: Any, message: str, **extra: str) -> None:
        """Forward a client log call to the inner collector."""
        self._inner.client_log(level, message, **extra)

    def propagate(self) -> None:
        """No-op: state already propagated to inner collector."""

    @property
    def output_schema(self) -> pa.Schema:
        """Output schema reported by the inner collector."""
        return self._inner.output_schema
868
+
869
+
870
class _TrackingOutputCollector:
    """Wrapper that counts emitted rows and bytes and delegates everything else.

    It is also where the ``batch_index=`` and ``partition_values=`` kwargs of
    ``out.emit()`` are validated (see :func:`_merge_batch_index` and
    :func:`_merge_partition_values`). This wrapper is always the innermost
    one in the table-function emit path, so validating here happens exactly
    once per emit whether or not :class:`_FilteringOutputCollector` is
    stacked on top.

    Attributes not defined on the wrapper are resolved on the wrapped
    collector through ``__getattr__``.
    """

    __slots__ = (
        "_inner",
        "_supports_batch_index",
        "_partition_fields",
        "_partition_kind",
        "total_rows",
        "total_bytes",
    )

    def __init__(
        self,
        inner: OutputCollector,
        supports_batch_index: bool = False,
        partition_fields: list[pa.Field[Any]] | None = None,
        partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED,
    ) -> None:
        """Wrap ``inner`` and reset the row/byte counters to zero."""
        self._inner = inner
        self._supports_batch_index = supports_batch_index
        # Pre-computed list of partition-annotated fields from the bind
        # schema; empty when the function did not opt in to PartitionColumns.
        self._partition_fields = partition_fields or []
        self._partition_kind = partition_kind
        self.total_rows = 0
        self.total_bytes = 0

    def emit(
        self,
        batch: pa.RecordBatch,
        batch_index: int | None = None,
        partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Validate the kwargs, update the counters and forward the batch.

        Validation runs before the counters are touched, so a rejected emit
        does not count towards ``total_rows`` / ``total_bytes``.

        Raises:
            RuntimeError: Propagated from :func:`_merge_batch_index` or
                :func:`_merge_partition_values` on a contract violation.
        """
        merged_metadata = _merge_batch_index(
            supports_batch_index=self._supports_batch_index,
            batch_index=batch_index,
            metadata=metadata,
        )
        merged_metadata = _merge_partition_values(
            partition_fields=self._partition_fields,
            partition_kind=self._partition_kind,
            batch=batch,
            partition_values=partition_values,
            metadata=merged_metadata,
        )
        self.total_rows += batch.num_rows
        self.total_bytes += _batch_bytes(batch)
        # Only pass metadata= when there is something to send, so the inner
        # collector sees the same call shape as an un-wrapped emit.
        if merged_metadata is None:
            self._inner.emit(batch)
        else:
            self._inner.emit(batch, metadata=merged_metadata)

    @property
    def finished(self) -> bool:
        """Whether the inner collector reports the stream as finished."""
        return self._inner.finished

    @property
    def output_schema(self) -> pa.Schema:
        """Output schema reported by the inner collector."""
        return self._inner.output_schema

    def __getattr__(self, name: str) -> Any:
        """Resolve attributes missing on the wrapper against the inner collector."""
        # __getattr__ only runs after normal lookup failed. If the missing
        # name is ``_inner`` itself (slot not yet assigned, e.g. an instance
        # created without __init__ by copy/unpickle), reading ``self._inner``
        # below would re-enter this method forever. Fail cleanly instead.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
942
+
943
+
944
@dataclass
class TableProducerState(ProducerState):
    """Producer-side stream state that drives a table function.

    Each ``produce()`` tick calls ``TableFunctionGenerator.process()`` with
    an output collector wrapped for row/byte tracking.

    If the function class enables ``auto_apply_filters``, the pushdown
    filters carried by the init request are applied automatically to every
    batch the function emits.

    ``_init_call`` and ``_init_response`` are part of the serialized state
    token, so they survive HTTP round-trips; the transient fields are
    rebuilt by ``rehydrate()``.

    ``_user_state`` travels in the token only when it is an
    ``ArrowSerializableDataclass``. For any other state type, rehydration
    starts again from ``initial_state()``.

    """

    _init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _user_state_bytes: bytes | None = field(default=None, repr=False)
    # Unwrapped (plaintext) attach, for bodies that read it as user data.
    # ``_init_call`` carries the SEALED attach (storage shards on it via
    # request=); this field carries the plaintext through serialization so
    # rehydrate can set params.attach_opaque_data without re-unwrapping
    # (the seal is auth-scoped).
    _plaintext_attach: bytes | None = field(default=None, repr=False)
    _func_cls: Annotated[type[TableFunctionGenerator[Any]], Transient()] = field(default=None, repr=False)  # type: ignore[assignment]
    _params: Annotated[ProcessParams[Any], Transient()] = field(default=None, repr=False)  # type: ignore[arg-type]
    _user_state: Annotated[Any, Transient()] = field(default=None, repr=False)
    _pushdown_filters: Annotated[Any, Transient()] = field(default=None, repr=False)  # PushdownFilters | None
    _auto_apply: Annotated[bool, Transient()] = field(default=False, repr=False)
    _vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)

    def __post_init__(self) -> None:
        """Resolve pushdown filters if auto_apply_filters is enabled."""
        func_cls = self._func_cls
        if func_cls is None or not func_cls._should_auto_apply_filters():
            return
        self._auto_apply = True
        if self._params is None:
            return
        init_call = self._params.init_call
        if init_call is None or init_call.pushdown_filters is None:
            return
        self._pushdown_filters = func_cls.pushdown_filters(
            init_call.pushdown_filters,
            join_keys=init_call.join_keys,
        )

    def _to_row_dict(self) -> dict[str, object]:
        """Serialize _user_state into _user_state_bytes before standard serialization."""
        user_state = self._user_state
        if user_state is not None and isinstance(user_state, ArrowSerializableDataclass):
            self._user_state_bytes = user_state.serialize_to_bytes()
        return super()._to_row_dict()

    def rehydrate(self, implementation: object) -> None:
        """Restore transient fields from serialized init data."""
        from vgi.worker import Worker

        worker: Worker = implementation  # type: ignore[assignment]
        init_call = self._init_call
        bind_call = init_call.bind_call
        func_cls = worker._resolve_function(bind_call)
        assert issubclass(func_cls, TableFunctionGenerator)
        self._func_cls = func_cls
        self._vgi_tracer = worker._vgi_tracer

        projected_ids = _effective_projection_ids(func_cls, init_call.projection_ids)
        projected_schema = project_schema(projected_ids, init_call.output_schema)
        self._params = ProcessParams(
            args=func_cls._parse_arguments(func_cls.FunctionArguments, bind_call.arguments),
            init_call=init_call,
            init_response=self._init_response,
            output_schema=projected_schema,
            settings=_batch_to_scalar_dict(bind_call.settings),
            secrets=SecretsAccessor(bind_call.secrets).to_dict(),
            # Rehydrated tick: the auth-scoped seal can't be reopened here, so
            # storage is sharded on the full plaintext (uuid||catalog_bytes)
            # persisted by the init state; the body sees only the stripped
            # catalog bytes.
            storage=BoundStorage(
                func_cls.storage,
                self._init_response.execution_id,
                attach_plaintext=self._plaintext_attach,
            ),
            attach_opaque_data=attach_catalog_bytes(self._plaintext_attach),
        )

        # Bring back the user's iteration state, preferring the serialized copy.
        state_blob = self._user_state_bytes
        if state_blob is None:
            self._user_state = func_cls.initial_state(self._params)
        else:
            state_type = _resolve_state_type(func_cls)
            if state_type is None:
                _log.debug("State type not serializable, falling back to initial_state()")
                self._user_state = func_cls.initial_state(self._params)
            else:
                self._user_state = state_type.deserialize_from_bytes(state_blob)
                _log.debug("Restored user state from token: %s", type(self._user_state).__name__)

        # Re-derive the pushdown filters the same way __post_init__ does.
        if not func_cls._should_auto_apply_filters():
            return
        self._auto_apply = True
        if init_call.pushdown_filters is not None:
            self._pushdown_filters = func_cls.pushdown_filters(
                init_call.pushdown_filters,
                join_keys=init_call.join_keys,
            )

    def process(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
        """Process tick batch — check for dynamic filter updates, then produce."""
        tick_metadata = input.custom_metadata
        if tick_metadata is not None:
            encoded = tick_metadata.get(b"vgi_pushdown_filters")
            if encoded is not None:
                self._update_filters_from_metadata(encoded)
        self.produce(out, ctx)

    def _update_filters_from_metadata(self, encoded_filters: bytes) -> None:
        """Decode and apply dynamic filter update from tick metadata.

        Best effort: any failure is logged as a warning and the current
        filters are left in place.
        """
        import base64

        from vgi.table_filter_pushdown import deserialize_filters

        try:
            raw_ipc = base64.b64decode(encoded_filters)
            filter_table = pa.ipc.open_stream(raw_ipc).read_all()
            if filter_table.num_rows > 0:
                # Only the first batch of the stream is consulted.
                self._pushdown_filters = deserialize_filters(filter_table.to_batches()[0])
        except Exception:
            _log.warning("Failed to deserialize dynamic filter from tick metadata", exc_info=True)

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Produce the next output batch from the table function."""
        params = dataclasses.replace(
            self._params,
            auth_context=ctx.auth,
            current_pushdown_filters=self._pushdown_filters,
        )
        bind_call = self._init_call.bind_call
        timer = _timed_exchange(
            self._vgi_tracer,
            "vgi.execute.table",
            bind_call.function_name,
            bind_call.function_type.value,
            self._init_response.execution_id,
        )
        with timer:
            tracking_out = _TrackingOutputCollector(
                out,
                supports_batch_index=self._func_cls._supports_batch_index(),
                partition_fields=_partition_fields_from_schema(self._init_call.output_schema),
                partition_kind=self._func_cls._partition_kind(),
            )
            use_filter = self._auto_apply and self._pushdown_filters is not None
            if use_filter:
                sink: Any = _FilteringOutputCollector(tracking_out, self._func_cls, self._pushdown_filters)
            else:
                sink = tracking_out
            self._func_cls.process(params, self._user_state, sink)  # type: ignore[arg-type]
            if use_filter:
                sink.propagate()
            timer.record(
                output_rows=tracking_out.total_rows,
                output_bytes=tracking_out.total_bytes,
            )

    def on_cancel(self, ctx: CallContext) -> None:
        """Forward cancel signal to the user function's classmethod."""
        if self._func_cls is None or self._params is None:
            return
        cancel_params = dataclasses.replace(self._params, auth_context=ctx.auth)
        try:
            self._func_cls.on_cancel(cancel_params, self._user_state)
        except Exception:
            _log.debug("on_cancel hook raised", exc_info=True)
1110
+
1111
+
1112
@dataclass
class TableInOutExchangeState(ExchangeState):
    """Exchange-side stream state for table-in-out functions (INPUT phase).

    Every ``exchange()`` call passes one input batch to
    ``TableInOutGenerator.process()`` together with an output collector
    wrapped for row/byte tracking.

    If the function class enables ``auto_apply_filters``, the pushdown
    filters carried by the init request are applied automatically to every
    emitted batch.

    ``_init_call`` and ``_init_response`` are part of the serialized state
    token, so they survive HTTP round-trips; the transient fields are
    rebuilt by ``rehydrate()``.

    ``_user_state`` travels in the token only when it is an
    ``ArrowSerializableDataclass``.

    """

    _init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _user_state_bytes: bytes | None = field(default=None, repr=False)
    # Unwrapped (plaintext) attach, for bodies that read it as user data.
    # ``_init_call`` carries the SEALED attach (storage shards on it via
    # request=); this field carries the plaintext through serialization so
    # rehydrate can set params.attach_opaque_data without re-unwrapping
    # (the seal is auth-scoped).
    _plaintext_attach: bytes | None = field(default=None, repr=False)
    _func_cls: Annotated[type[TableInOutGenerator[Any]], Transient()] = field(default=None, repr=False)  # type: ignore[assignment]
    _params: Annotated[ProcessParams[Any], Transient()] = field(default=None, repr=False)  # type: ignore[arg-type]
    _user_state: Annotated[Any, Transient()] = field(default=None, repr=False)
    _pushdown_filters: Annotated[Any, Transient()] = field(default=None, repr=False)  # PushdownFilters | None
    _auto_apply: Annotated[bool, Transient()] = field(default=False, repr=False)
    _vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)

    def __post_init__(self) -> None:
        """Resolve pushdown filters if auto_apply_filters is enabled."""
        func_cls = self._func_cls
        if func_cls is None or not func_cls._should_auto_apply_filters():
            return
        self._auto_apply = True
        if self._params is None:
            return
        init_call = self._params.init_call
        if init_call is None or init_call.pushdown_filters is None:
            return
        self._pushdown_filters = func_cls.pushdown_filters(
            init_call.pushdown_filters,
            join_keys=init_call.join_keys,
        )

    def _to_row_dict(self) -> dict[str, object]:
        """Serialize _user_state into _user_state_bytes before standard serialization."""
        user_state = self._user_state
        if user_state is not None and isinstance(user_state, ArrowSerializableDataclass):
            self._user_state_bytes = user_state.serialize_to_bytes()
        return super()._to_row_dict()

    def rehydrate(self, implementation: object) -> None:
        """Restore transient fields from serialized init data."""
        from vgi.worker import Worker

        worker: Worker = implementation  # type: ignore[assignment]
        init_call = self._init_call
        bind_call = init_call.bind_call
        func_cls = worker._resolve_function(bind_call)
        assert issubclass(func_cls, TableInOutGenerator)
        self._func_cls = func_cls
        self._vgi_tracer = worker._vgi_tracer

        projected_ids = _effective_projection_ids(func_cls, init_call.projection_ids)
        projected_schema = project_schema(projected_ids, init_call.output_schema)
        self._params = ProcessParams(
            args=func_cls._parse_arguments(func_cls.FunctionArguments, bind_call.arguments),
            init_call=init_call,
            init_response=self._init_response,
            output_schema=projected_schema,
            settings=_batch_to_scalar_dict(bind_call.settings),
            secrets=SecretsAccessor(bind_call.secrets).to_dict(),
            # Rehydrated tick: storage is sharded on the full plaintext the
            # init state persisted (the auth-scoped seal can't be reopened
            # here); the body sees only the stripped catalog bytes.
            storage=BoundStorage(
                func_cls.storage,
                self._init_response.execution_id,
                attach_plaintext=self._plaintext_attach,
            ),
            attach_opaque_data=attach_catalog_bytes(self._plaintext_attach),
        )

        # Bring back the user's iteration state, preferring the serialized copy.
        state_blob = self._user_state_bytes
        state_type = _resolve_state_type(func_cls) if state_blob is not None else None
        if state_type is None:
            self._user_state = func_cls.initial_state(self._params)
        else:
            self._user_state = state_type.deserialize_from_bytes(state_blob)

        # Re-derive the pushdown filters the same way __post_init__ does.
        if not func_cls._should_auto_apply_filters():
            return
        self._auto_apply = True
        if init_call.pushdown_filters is not None:
            self._pushdown_filters = func_cls.pushdown_filters(
                init_call.pushdown_filters,
                join_keys=init_call.join_keys,
            )

    def exchange(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
        """Process one input batch through the table-in-out function."""
        params = dataclasses.replace(self._params, auth_context=ctx.auth)
        bind_call = self._init_call.bind_call
        timer = _timed_exchange(
            self._vgi_tracer,
            "vgi.execute.table_in_out",
            bind_call.function_name,
            bind_call.function_type.value,
            self._init_response.execution_id,
        )
        with timer:
            tracking_out = _TrackingOutputCollector(
                out,
                supports_batch_index=self._func_cls._supports_batch_index(),
                partition_fields=_partition_fields_from_schema(self._init_call.output_schema),
                partition_kind=self._func_cls._partition_kind(),
            )
            use_filter = self._auto_apply and self._pushdown_filters is not None
            if use_filter:
                sink: Any = _FilteringOutputCollector(tracking_out, self._func_cls, self._pushdown_filters)
            else:
                sink = tracking_out
            self._func_cls.process(params, self._user_state, input.batch, sink)  # type: ignore[arg-type]
            if use_filter:
                sink.propagate()
            timer.record(
                input_rows=input.batch.num_rows,
                output_rows=tracking_out.total_rows,
                input_bytes=_batch_bytes(input.batch),
                output_bytes=tracking_out.total_bytes,
            )

    def on_cancel(self, ctx: CallContext) -> None:
        """Forward cancel signal to the user function's classmethod."""
        if self._func_cls is None or self._params is None:
            return
        cancel_params = dataclasses.replace(self._params, auth_context=ctx.auth)
        try:
            self._func_cls.on_cancel(cancel_params, self._user_state)
        except Exception:
            _log.debug("on_cancel hook raised", exc_info=True)
1247
+
1248
+
1249
@dataclass
class BufferedFinalizeState(ProducerState):
    """Cursor-driven streaming finalize that drains a state log.

    Serves the FINALIZE phase of the streaming-shape
    ``TableInOutGenerator`` (the ``TableBufferingFunction`` path has its
    own ``TableBufferingFinalizeState``). Every field is a plain
    serializable value: nothing is Transient and nothing holds an object
    reference. No user code runs per tick; the worker's init handler
    materializes the user's ``finalize() -> list[batch]`` result into
    storage up front, and each ``produce()`` tick rebuilds the storage
    handle from ``execution_id`` and the attach data, reads the next log
    row after ``cursor``, emits it and advances the cursor.
    """

    execution_id: bytes = b""
    ns: bytes = b""
    key: bytes = b""
    cursor: bytes = b""  # opaque, b"" = before-first
    attach_opaque_data: bytes | None = None

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Emit the next logged batch for ``(ns, key)``, or finish when none is left."""
        # Imported lazily to keep this module's import graph small and to
        # avoid a circular dependency on vgi.worker.
        from vgi.table_in_out_function import pack_int_cursor, unpack_int_cursor
        from vgi.worker import (
            _build_bound_storage_from_fields,
            _decode_ipc_batch,
        )

        storage = _build_bound_storage_from_fields(
            self.execution_id,
            self.attach_opaque_data,
            ctx,
        )
        # OutputCollector allows a single data batch per produce() tick, hence
        # limit=1. The framework keeps ticking until out.finish() is called.
        page = storage.state_log_scan(
            self.ns,
            self.key,
            after_id=unpack_int_cursor(self.cursor),
            limit=1,
        )
        if page:
            entry_id, payload = page[0]
            out.emit(_decode_ipc_batch(payload))
            self.cursor = pack_int_cursor(entry_id)
        else:
            out.finish()
1302
+
1303
+
1304
@dataclass
class TableBufferingFinalizeState(ProducerState):
    """Streaming finalize state for ``TableBufferingFunction.finalize``.

    Producer-mode stream parameterized by (execution_id, finalize_state_id).
    One streaming RPC per finalize_state_id; the framework calls the user's
    ``cls.finalize(params, finalize_state_id, state, out)`` per tick and
    serializes the user's ``state_blob`` between ticks so the stream
    survives worker-process boundaries (HTTP transport).
    """

    function_name: str = ""
    execution_id: bytes = b""
    transaction_id: bytes | None = None
    finalize_state_id: bytes = b""
    # Serialized form of the user's TFinalizeState (ArrowSerializableDataclass
    # bytes), or b"" on the first tick before initial_finalize_state() runs.
    state_blob: bytes = b""
    # True after the user's initial_finalize_state() has been invoked and
    # state_blob is populated. Distinguishes "first tick / build initial"
    # from "subsequent tick / deserialize existing".
    state_initialized: bool = False
    attach_opaque_data: bytes | None = None
    # Pushdown carried from the InitRequest. Wire-serialized on every tick so
    # an HTTP rehydration on a different worker process still knows which
    # columns to project and which filter predicates to apply. The streaming
    # peer ``TableInOutExchangeState`` rehydrates these the same way on every
    # round-trip (``vgi/protocol.py:1106-1119``).
    projection_ids: list[int] | None = None
    pushdown_filters: Annotated[pa.RecordBatch | None, ArrowType(pa.large_binary())] = None

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Drive one tick of the user's finalize() callback."""
        # Local import: keeps the protocol module's import graph minimal
        # and avoids a circular dependency on vgi.worker.
        from vgi.worker import run_table_buffering_finalize_tick

        run_table_buffering_finalize_tick(self, out, ctx)

    def on_cancel(self, ctx: CallContext) -> None:
        """Forward the framework's cancel signal to ``cls.on_cancel``.

        Fired by vgi-rpc when the consumer abandons the stream before EOS
        (DuckDB LIMIT, exception unwind, user break). Resolves func_cls and
        params through the worker's ``_load_table_buffering_params`` and
        deserializes the user's last finalize state from ``self.state_blob``.

        This is a teardown path, so nothing raised here is allowed to
        escape: a failure to load the function, to deserialize the state,
        or inside the user hook itself is swallowed (the latter two are
        logged at debug level) so the original cancel is not masked.

        Idempotent: if ``state_initialized`` is False,
        ``initial_finalize_state`` has not run yet, so there is no user
        state worth forwarding and the hook is skipped.
        """
        if not self.state_initialized:
            return
        worker = ctx.implementation
        if worker is None:
            # produce() raises in this case; on_cancel is teardown so we
            # silently skip rather than crash during pipeline unwind.
            return
        # Local imports: same reason as produce(); the worker module pulls
        # in heavy dependencies (FunctionStorage backends, etc.) that we
        # don't want eager-loaded on protocol import.
        from dataclasses import dataclass as _dc

        from vgi.worker import _deserialize_finalize_state

        @_dc
        class _CancelStubRequest:
            # Minimal stand-in carrying only the request fields populated
            # from this state.
            function_name: str
            execution_id: bytes
            attach_opaque_data: bytes | None
            transaction_id: bytes | None

        stub = _CancelStubRequest(
            function_name=self.function_name,
            execution_id=self.execution_id,
            attach_opaque_data=self.attach_opaque_data,
            transaction_id=self.transaction_id,
        )
        try:
            func_cls, params = worker._load_table_buffering_params(
                stub,
                ctx,
                attach_already_unwrapped=True,
            )
        except Exception:  # noqa: BLE001 — teardown path, swallow
            return
        try:
            # Deserialization sits inside the guard on purpose: a corrupt or
            # incompatible state_blob must not raise out of a teardown hook.
            user_state = _deserialize_finalize_state(func_cls, self.state_blob) if self.state_blob else None
            func_cls.on_cancel(params, self.finalize_state_id, user_state)
        except Exception:  # noqa: BLE001 — teardown path, swallow
            _log.debug("on_cancel hook raised", exc_info=True)
1396
+
1397
+
1398
# Union of every stream-state variant that init() can produce.
# vgi-rpc resolves this union using a method-local numeric tag in HTTP state
# tokens, so recovering a state does not depend on Python class names.
# Member order is kept exactly as declared.
ProcessState = (
    ScalarExchangeState | TableProducerState | TableInOutExchangeState
    | BufferedFinalizeState | TableBufferingFinalizeState
)
1408
+
1409
+
1410
+ # ---------------------------------------------------------------------------
1411
+ # Aggregate Function RPC Types (all unary request/response)
1412
+ # ---------------------------------------------------------------------------
1413
+
1414
+
1415
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateBindRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_bind``, the call that resolves the output schema.

    ``function_name`` and ``arguments`` are required; every other field
    defaults to ``None``.
    """

    # Name of the aggregate function to bind.
    function_name: str
    # Call arguments, declared with an Arrow binary column type.
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    # Optional schema / settings / secrets, each declared as Arrow binary.
    input_schema: Annotated[pa.Schema | None, ArrowType(pa.binary())] = None
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1425
+
1426
+
1427
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateBindResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_bind``: the output schema plus an execution id."""

    # Resolved output schema, declared with an Arrow binary column type.
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Identifier carried as ``execution_id`` by the other aggregate requests.
    execution_id: bytes
1433
+
1434
+
1435
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateUpdateRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_update``: rows to accumulate into per-group state."""

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes (schema + data + EOS).
    input_batch: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1443
+
1444
+
1445
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateUpdateResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_update``; declares no fields."""
1450
+
1451
+
1452
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateCombineRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_combine``: merge source states into target states."""

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes.
    merge_batch: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1460
+
1461
+
1462
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateCombineResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_combine``; declares no fields."""
1467
+
1468
+
1469
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateFinalizeRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_finalize``: produce results for a set of group ids."""

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes.
    group_ids_batch: bytes
    # Output schema, declared with an Arrow binary column type.
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1478
+
1479
+
1480
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateFinalizeResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_finalize``: the result batch as IPC stream bytes."""

    # Full IPC stream bytes.
    result_batch: bytes
1485
+
1486
+
1487
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateDestructorRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_destructor``: best-effort cleanup of aggregate state."""

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes.
    group_ids_batch: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1495
+
1496
+
1497
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateDestructorResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_destructor``; declares no fields."""
1502
+
1503
+
1504
+ # ---------------------------------------------------------------------------
1505
+ # Table Sink+Source RPC Types
1506
+ # ---------------------------------------------------------------------------
1507
+ # Sink+Source PhysicalOperator path for TableBufferingFunction subclasses.
1508
+ # Contract:
1509
+ # * process() is UNARY; the worker-chosen state_id rides on the response
1510
+ # as opaque bytes.
1511
+ # * state_ids / finalize_state_ids are opaque bytes throughout.
1512
+ # * finalize is the existing streaming-init path with new
1513
+ # TableBufferingFinalizeState driving user finalize() per tick.
1514
+
1515
+
1516
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingProcessRequest(ArrowSerializableDataclass):
    """Payload for the unary ``table_buffering_process`` call, which sinks one batch."""

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes.
    input_batch: bytes
    # The remaining fields are optional and default to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
    batch_index: int | None = None
1526
+
1527
+
1528
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingProcessResponse(ArrowSerializableDataclass):
    """Result of ``table_buffering_process``: the state id chosen by the worker."""

    # Worker-chosen state id, carried as opaque bytes.
    state_id: bytes
1533
+
1534
+
1535
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingCombineRequest(ArrowSerializableDataclass):
    """Payload for ``table_buffering_combine``, the once-per-query end-of-input call."""

    function_name: str
    execution_id: bytes
    # State ids, declared as an Arrow list of binary.
    state_ids: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]
    # Optional fields, defaulting to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
1544
+
1545
+
1546
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingCombineResponse(ArrowSerializableDataclass):
    """Result of ``table_buffering_combine``: the opaque finalize partition keys."""

    # Finalize state ids, declared as an Arrow list of binary.
    finalize_state_ids: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]
1551
+
1552
+
1553
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingDestructorRequest(ArrowSerializableDataclass):
    """Payload for ``table_buffering_destructor``, the best-effort cleanup call."""

    function_name: str
    execution_id: bytes
    # Optional fields, defaulting to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
1561
+
1562
+
1563
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingDestructorResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``table_buffering_destructor``; declares no fields."""
1568
+
1569
+
1570
+ # ---------------------------------------------------------------------------
1571
+ # Aggregate Window Function RPC Types
1572
+ # ---------------------------------------------------------------------------
1573
+ # Optional windowed-aggregate protocol: ``aggregate_window_init`` ships the
1574
+ # partition once, ``aggregate_window`` evaluates one output row at a time
1575
+ # (per-call flushing — DuckDB's window callback API has no per-Evaluate hook),
1576
+ # ``aggregate_window_destructor`` evicts the partition from storage.
1577
+
1578
+
1579
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowInitRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_window_init``, which ships one partition to the worker."""

    function_name: str
    execution_id: bytes
    partition_id: int
    row_count: int
    # Full IPC stream bytes (partition's input columns).
    partition_batch: bytes
    # Output schema, declared with an Arrow binary column type.
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Packed-bit bool array, length == row_count.
    filter_mask: bytes
    # 4× int64: ((begin_delta,end_delta),(begin_delta,end_delta)).
    frame_stats: bytes
    # 1 byte per input column.
    all_valid: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1593
+
1594
+
1595
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowInitResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_window_init``; declares no fields."""
1600
+
1601
+
1602
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_window``: evaluate the aggregate for one output row.

    ``frame_starts`` and ``frame_ends`` are parallel arrays holding one entry
    per subframe. Their length is 1–3; 3 occurs only for EXCLUDE TIES /
    EXCLUDE GROUP.
    """

    function_name: str
    execution_id: bytes
    partition_id: int
    rid: int
    # Parallel per-subframe bounds (see the class docstring).
    frame_starts: list[int]
    frame_ends: list[int]
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1617
+
1618
+
1619
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_window``: a one-row RecordBatch holding the scalar result."""

    # Full IPC stream bytes (one row, output schema).
    result_batch: bytes
1624
+
1625
+
1626
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowDestructorRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_window_destructor``, which evicts a partition from storage."""

    function_name: str
    execution_id: bytes
    partition_id: int
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1634
+
1635
+
1636
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowDestructorResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_window_destructor``; declares no fields."""
1641
+
1642
+
1643
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowBatchRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_window_batch``: compute ``count`` output rows in one RPC.

    For output row ``i``, ``frames_per_row[i]`` is its subframe cardinality:
    1 normally, 2–3 for EXCLUDE TIES / EXCLUDE GROUP. ``frame_starts`` and
    ``frame_ends`` are flat arrays, each of length ``sum(frames_per_row)``.
    """

    function_name: str
    execution_id: bytes
    partition_id: int
    row_idx: int
    count: int
    # Subframe count per output row, followed by the flattened bounds.
    frames_per_row: list[int]
    frame_starts: list[int]
    frame_ends: list[int]
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1661
+
1662
+
1663
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowBatchResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_window_batch``: a RecordBatch with ``count`` rows."""

    # Full IPC stream bytes (count rows, output schema).
    result_batch: bytes
1668
+
1669
+
1670
+ # ---------------------------------------------------------------------------
1671
+ # Aggregate Streaming-Partitioned RPC Types
1672
+ # ---------------------------------------------------------------------------
1673
+ # Streaming protocol for partitioned aggregates whose state compresses
1674
+ # heavily relative to input rows (e.g. portfolio_agg's positions dict vs
1675
+ # millions of fills). DuckDB streams input chunks to the worker; the worker
1676
+ # maintains concurrent per-partition state in a hash map keyed by partition
1677
+ # key, dispatches each row to its partition's state, and emits one snapshot
1678
+ # per input row. No DuckDB-side partition materialisation. Cumulative
1679
+ # semantics only (UNBOUNDED PRECEDING -> CURRENT ROW); other frame shapes
1680
+ # fall back to the non-streaming path.
1681
+
1682
+
1683
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingOpenRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_streaming_open``, which starts a streaming session.

    The worker resolves the function, calls ``streaming_open`` to build the
    cross-partition global state, and hands back an ``execution_id`` that
    later chunk/close calls reference.

    ``input_schema`` describes every chunk shipped via ``streaming_chunk``.
    Its columns are laid out in three groups, in this order:

    1. the first ``partition_key_count`` columns are partition keys, used by
       the worker to dispatch rows to the right per-partition state;
    2. the next ``order_key_count`` columns are order keys (informational;
       the worker may verify monotonicity);
    3. the remaining columns are the function's value arguments, in
       declaration order.
    """

    function_name: str
    # Call arguments, declared with an Arrow binary column type.
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    # Schema shared by all chunks of the session (layout described above).
    input_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    partition_key_count: int
    order_key_count: int
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Optional fields, defaulting to None.
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    attach_opaque_data: bytes | None = None
1709
+
1710
+
1711
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingOpenResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_streaming_open``: the session token."""

    # Session token, carried as ``execution_id`` on the chunk/close requests.
    execution_id: bytes
1716
+
1717
+
1718
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingChunkRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_streaming_chunk``, which processes one input chunk.

    The schema of ``input_batch`` must match the ``input_schema`` agreed at
    ``streaming_open``. The worker iterates the rows, dispatches each to its
    per-partition state by the partition-key columns, applies the function's
    update logic, and returns an output array of the same length.
    """

    function_name: str
    execution_id: bytes
    # Full IPC stream bytes.
    input_batch: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1732
+
1733
+
1734
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingChunkResponse(ArrowSerializableDataclass):
    """Result of ``aggregate_streaming_chunk``: an output batch of the same length as the input."""

    # Full IPC stream bytes (one row per input row).
    result_batch: bytes
1739
+
1740
+
1741
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingCloseRequest(ArrowSerializableDataclass):
    """Payload for ``aggregate_streaming_close``, which ends the session and frees its state."""

    function_name: str
    execution_id: bytes
    # Optional opaque attach bytes.
    attach_opaque_data: bytes | None = None
1748
+
1749
+
1750
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingCloseResponse(ArrowSerializableDataclass):
    """Empty acknowledgement returned by ``aggregate_streaming_close``; declares no fields."""
1755
+
1756
+
1757
+ # ---------------------------------------------------------------------------
1758
+ # VGI Protocol
1759
+ # ---------------------------------------------------------------------------
1760
+
1761
+
1762
+ class VgiProtocol(Protocol):
1763
+ """VGI wire protocol definition for vgi_rpc.
1764
+
1765
+ Method families: ``bind()`` / ``init()`` (scalar/table function invocation),
1766
+ ``aggregate_*`` (aggregate RPC methods, all unary), and ``catalog_*`` (~35
1767
+ typed catalog interface methods).
1768
+
1769
+ ``vgi_rpc.RpcServer(VgiProtocol, worker)`` handles serialization,
1770
+ dispatching, error propagation, and stream lifecycle.
1771
+
1772
+ Application protocol surface version
1773
+ ------------------------------------
1774
+ ``protocol_version`` is the canonical semver (MAJOR.MINOR.PATCH) of the
1775
+ method-and-schema contract this Protocol declares. The vgi-rpc framework
1776
+ enforces an exact major+minor match (patch ignored) on every dispatched
1777
+ request: when a client sends a mismatched version, the server raises
1778
+ ``ProtocolVersionError`` at the dispatch boundary with a directional
1779
+ "upgrade the client" / "upgrade the worker" message.
1780
+
1781
+ Bump rules:
1782
+
1783
+ - **Major** — any backwards-incompatible change: removing a method,
1784
+ renaming a method/parameter, changing a parameter or return type,
1785
+ adding a required parameter.
1786
+ - **Minor** — additive: a new method, a new optional parameter, or a new
1787
+ optional response column.
1788
+ - **Patch** — worker-side bug fixes that do not touch the surface.
1789
+
1790
+ Because Arrow's column-count check in the C++ consumer rejects
1791
+ return-schema drift today, even "minor" additive bumps force clients to
1792
+ rebuild in practice. Bump major when the surface changes and you want all
1793
+ deployed clients to refuse to talk to the new server until rebuilt.
1794
+
1795
+ Cross-language consumers (Rust / Go workers) read ``vgi/protocol_version.txt``
1796
+ (generated, committed). The C++ DuckDB extension reads
1797
+ ``VGI_PROTOCOL_VERSION`` from ``vgi/src/generated/vgi_protocol_version.hpp``
1798
+ (also generated; sibling of ``vgi_protocol_constants.hpp`` but produced by
1799
+ a dedicated generator so this version doesn't pollute the byte-key constants).
1800
+ """
1801
+
1802
+ protocol_version: ClassVar[str] = "1.0.0"
1803
+
1804
+ def bind(self, request: BindRequest) -> BindResponse:
1805
+ """Resolve output schema and validate arguments."""
1806
+ ...
1807
+
1808
+ def init(self, request: InitRequest) -> Stream[ProcessState, GlobalInitResponse]:
1809
+ """Initialize a function execution and return a processing stream."""
1810
+ ...
1811
+
1812
+ def table_function_cardinality(self, request: TableFunctionCardinalityRequest) -> TableCardinality:
1813
+ """Estimate the cardinality of a table function's output."""
1814
+ ...
1815
+
1816
+ def table_function_statistics(self, request: TableFunctionStatisticsRequest) -> bytes | None:
1817
+ """Return per-column statistics for a table function's output.
1818
+
1819
+ Returns IPC bytes of a RecordBatch with sparse-union min/max columns
1820
+ (same shape as catalog_table_column_statistics_get), or None if no
1821
+ statistics are available.
1822
+ """
1823
+ ...
1824
+
1825
+ def table_function_dynamic_to_string(
1826
+ self, request: TableFunctionDynamicToStringRequest
1827
+ ) -> TableFunctionDynamicToStringResponse:
1828
+ """Return user-defined diagnostics for EXPLAIN ANALYZE Extra Info.
1829
+
1830
+ Fired once per parallel scan thread at end-of-stream. The function
1831
+ class is responsible for persisting any diagnostics it wants to
1832
+ report and retrieving them by ``global_execution_id`` here.
1833
+
1834
+ Best-effort: must not raise. The dispatcher catches exceptions and
1835
+ returns an empty response so EXPLAIN ANALYZE never breaks the query.
1836
+ """
1837
+ ...
1838
+
1839
+ # ========== Aggregate Function Methods (all unary) ==========
1840
+
1841
+ def aggregate_bind(self, request: AggregateBindRequest) -> AggregateBindResponse:
1842
+ """Bind an aggregate function, return output schema and execution_id."""
1843
+ ...
1844
+
1845
+ def aggregate_update(self, request: AggregateUpdateRequest) -> AggregateUpdateResponse:
1846
+ """Accumulate rows from a DataChunk into per-group state."""
1847
+ ...
1848
+
1849
+ def aggregate_combine(self, request: AggregateCombineRequest) -> AggregateCombineResponse:
1850
+ """Merge source states into target states."""
1851
+ ...
1852
+
1853
+ def aggregate_finalize(self, request: AggregateFinalizeRequest) -> AggregateFinalizeResponse:
1854
+ """Produce results for a chunk of group_ids."""
1855
+ ...
1856
+
1857
+ def aggregate_destructor(self, request: AggregateDestructorRequest) -> AggregateDestructorResponse:
1858
+ """Best-effort cleanup of aggregate states. Must not raise."""
1859
+ ...
1860
+
1861
+ # ========== Table Sink+Source Function Methods ==========
1862
+
1863
+ def table_buffering_process(
1864
+ self,
1865
+ request: TableBufferingProcessRequest,
1866
+ ) -> TableBufferingProcessResponse:
1867
+ """Sink one input batch; return the worker-chosen state_id."""
1868
+ ...
1869
+
1870
+ def table_buffering_combine(
1871
+ self,
1872
+ request: TableBufferingCombineRequest,
1873
+ ) -> TableBufferingCombineResponse:
1874
+ """Once-per-query end-of-input signal. Returns finalize_state_ids."""
1875
+ ...
1876
+
1877
+ def table_buffering_destructor(
1878
+ self,
1879
+ request: TableBufferingDestructorRequest,
1880
+ ) -> TableBufferingDestructorResponse:
1881
+ """Best-effort end-of-query cleanup. Must not raise."""
1882
+ ...
1883
+
1884
+ # ========== Aggregate Window Function Methods (optional, all unary) ==========
1885
+
1886
+ def aggregate_window_init(self, request: AggregateWindowInitRequest) -> AggregateWindowInitResponse:
1887
+ """Ship a partition to the worker for windowed aggregation."""
1888
+ ...
1889
+
1890
+ def aggregate_window(self, request: AggregateWindowRequest) -> AggregateWindowResponse:
1891
+ """Compute an aggregate value for one output row of the window."""
1892
+ ...
1893
+
1894
+ def aggregate_window_destructor(
1895
+ self, request: AggregateWindowDestructorRequest
1896
+ ) -> AggregateWindowDestructorResponse:
1897
+ """Evict a cached partition from storage."""
1898
+ ...
1899
+
1900
+ def aggregate_window_batch(self, request: AggregateWindowBatchRequest) -> AggregateWindowBatchResponse:
1901
+ """Compute ``count`` window output rows in one batched RPC."""
1902
+ ...
1903
+
1904
+ # ========== Aggregate Streaming-Partitioned Methods (optional, all unary) ==========
1905
+
1906
+ def aggregate_streaming_open(self, request: AggregateStreamingOpenRequest) -> AggregateStreamingOpenResponse:
1907
+ """Start a streaming-partitioned aggregate session."""
1908
+ ...
1909
+
1910
+ def aggregate_streaming_chunk(self, request: AggregateStreamingChunkRequest) -> AggregateStreamingChunkResponse:
1911
+ """Process one input chunk; returns one output row per input row."""
1912
+ ...
1913
+
1914
+ def aggregate_streaming_close(self, request: AggregateStreamingCloseRequest) -> AggregateStreamingCloseResponse:
1915
+ """End the streaming session, free per-session state."""
1916
+ ...
1917
+
1918
+ # ========== Catalog - Discovery ==========
1919
+
1920
+ def catalog_catalogs(self) -> CatalogsResponse:
1921
+ """List available catalog names."""
1922
+ ...
1923
+
1924
+ # ========== Catalog - Lifecycle ==========
1925
+
1926
+ def catalog_attach(self, request: CatalogAttachRequest) -> CatalogAttachResult:
1927
+ """Attach to a catalog with options."""
1928
+ ...
1929
+
1930
+ def catalog_detach(self, attach_opaque_data: bytes) -> None:
1931
+ """Detach from a catalog."""
1932
+ ...
1933
+
1934
+ def catalog_create(self, request: CatalogCreateRequest) -> None:
1935
+ """Create a new catalog."""
1936
+ ...
1937
+
1938
+ def catalog_drop(self, name: str) -> None:
1939
+ """Drop a catalog."""
1940
+ ...
1941
+
1942
+ def catalog_version(
1943
+ self, attach_opaque_data: bytes, transaction_opaque_data: bytes | None = None
1944
+ ) -> CatalogVersionResponse:
1945
+ """Get the current catalog version."""
1946
+ ...
1947
+
1948
+ # ========== Catalog - Transactions ==========
1949
+
1950
+ def catalog_transaction_begin(self, attach_opaque_data: bytes) -> TransactionBeginResponse:
1951
+ """Begin a new transaction."""
1952
+ ...
1953
+
1954
+ def catalog_transaction_commit(self, attach_opaque_data: bytes, transaction_opaque_data: bytes) -> None:
1955
+ """Commit a transaction."""
1956
+ ...
1957
+
1958
+ def catalog_transaction_rollback(self, attach_opaque_data: bytes, transaction_opaque_data: bytes) -> None:
1959
+ """Rollback a transaction."""
1960
+ ...
1961
+
1962
+ # ========== Catalog - Schemas ==========
1963
+
1964
+ def catalog_schemas(
1965
+ self, attach_opaque_data: bytes, transaction_opaque_data: bytes | None = None
1966
+ ) -> SchemasResponse:
1967
+ """List schemas in the catalog."""
1968
+ ...
1969
+
1970
+ def catalog_schema_get(
1971
+ self, attach_opaque_data: bytes, name: str, transaction_opaque_data: bytes | None = None
1972
+ ) -> SchemasResponse:
1973
+ """Get information about a schema. Returns 0 or 1 items."""
1974
+ ...
1975
+
1976
+ def catalog_schema_create(
1977
+ self,
1978
+ attach_opaque_data: bytes,
1979
+ name: str,
1980
+ on_conflict: OnConflict = OnConflict.ERROR,
1981
+ comment: str | None = None,
1982
+ tags: dict[str, str] | None = None,
1983
+ transaction_opaque_data: bytes | None = None,
1984
+ ) -> None:
1985
+ """Create a new schema."""
1986
+ ...
1987
+
1988
+ def catalog_schema_drop(
1989
+ self,
1990
+ attach_opaque_data: bytes,
1991
+ name: str,
1992
+ ignore_not_found: bool = False,
1993
+ cascade: bool = False,
1994
+ transaction_opaque_data: bytes | None = None,
1995
+ ) -> None:
1996
+ """Drop a schema."""
1997
+ ...
1998
+
1999
+ def catalog_schema_contents_tables(
2000
+ self,
2001
+ attach_opaque_data: bytes,
2002
+ name: str,
2003
+ transaction_opaque_data: bytes | None = None,
2004
+ ) -> TablesResponse:
2005
+ """List tables in a schema."""
2006
+ ...
2007
+
2008
+ def catalog_schema_contents_views(
2009
+ self,
2010
+ attach_opaque_data: bytes,
2011
+ name: str,
2012
+ transaction_opaque_data: bytes | None = None,
2013
+ ) -> ViewsResponse:
2014
+ """List views in a schema."""
2015
+ ...
2016
+
2017
+ def catalog_schema_contents_functions(
2018
+ self,
2019
+ attach_opaque_data: bytes,
2020
+ name: str,
2021
+ type: SchemaObjectType,
2022
+ transaction_opaque_data: bytes | None = None,
2023
+ ) -> FunctionsResponse:
2024
+ """List functions in a schema (scalar or table)."""
2025
+ ...
2026
+
2027
+ # ========== Catalog - Tables ==========
2028
+
2029
+ def catalog_table_get(
2030
+ self,
2031
+ attach_opaque_data: bytes,
2032
+ schema_name: str,
2033
+ name: str,
2034
+ at_unit: str | None = None,
2035
+ at_value: str | None = None,
2036
+ transaction_opaque_data: bytes | None = None,
2037
+ ) -> TablesResponse:
2038
+ """Get information about a table. Returns 0 or 1 items."""
2039
+ ...
2040
+
2041
+ def catalog_table_create(self, request: TableCreateRequest) -> None:
2042
+ """Create a new table."""
2043
+ ...
2044
+
2045
+ def catalog_table_drop(
2046
+ self,
2047
+ attach_opaque_data: bytes,
2048
+ schema_name: str,
2049
+ name: str,
2050
+ ignore_not_found: bool = False,
2051
+ cascade: bool = False,
2052
+ transaction_opaque_data: bytes | None = None,
2053
+ ) -> None:
2054
+ """Drop a table."""
2055
+ ...
2056
+
2057
+ def catalog_table_scan_function_get(
2058
+ self,
2059
+ attach_opaque_data: bytes,
2060
+ schema_name: str,
2061
+ name: str,
2062
+ at_unit: str | None = None,
2063
+ at_value: str | None = None,
2064
+ transaction_opaque_data: bytes | None = None,
2065
+ ) -> bytes:
2066
+ """Get the scan function for a table. Returns ScanFunctionResult as IPC bytes."""
2067
+ ...
2068
+
2069
+ def catalog_table_scan_branches_get(
2070
+ self,
2071
+ attach_opaque_data: bytes,
2072
+ schema_name: str,
2073
+ name: str,
2074
+ at_unit: str | None = None,
2075
+ at_value: str | None = None,
2076
+ transaction_opaque_data: bytes | None = None,
2077
+ ) -> bytes:
2078
+ """Get the list of scan branches for a multi-branch table. Returns ScanBranchesResult as IPC bytes.
2079
+
2080
+ Additive successor to ``catalog_table_scan_function_get``. Workers that
2081
+ only implement the legacy method continue to work — the VGI extension's
2082
+ C++ side catches ``MethodNotImplementedError`` and falls back to the
2083
+ legacy RPC, wrapping the single-function result as a one-branch list.
2084
+ Workers that implement BOTH the legacy and the branches method
2085
+ guarantee single-process compatibility with both old and new extensions.
2086
+
2087
+ Multi-branch tables compose a logical scan from N physical sources
2088
+ (canonical case: Kafka hot tier + Iceberg cold tier). The extension's
2089
+ optimizer rewrite stitches the branches together via
2090
+ ``LogicalSetOperation(UNION_ALL, ...)``, one arm per branch.
2091
+ """
2092
+ ...
2093
+
2094
+ def catalog_table_column_statistics_get(
2095
+ self,
2096
+ attach_opaque_data: bytes,
2097
+ schema_name: str,
2098
+ name: str,
2099
+ transaction_opaque_data: bytes | None = None,
2100
+ ) -> bytes | None:
2101
+ """Get column statistics for a table.
2102
+
2103
+ Returns IPC bytes of a RecordBatch with sparse-union min/max columns,
2104
+ or None if statistics are not available.
2105
+ """
2106
+ ...
2107
+
2108
+ def catalog_table_insert_function_get(
2109
+ self,
2110
+ attach_opaque_data: bytes,
2111
+ schema_name: str,
2112
+ name: str,
2113
+ transaction_opaque_data: bytes | None = None,
2114
+ writable_branch_function_name: str | None = None,
2115
+ ) -> bytes:
2116
+ """Get the insert function for a table. Returns WriteFunctionResult as IPC bytes.
2117
+
2118
+ ``writable_branch_function_name`` is set by the C++ extension when the
2119
+ table is multi-branch and a branch declared ``writable=True``: the value
2120
+ is the writable arm's ``ScanBranch.function_name``. The worker uses it
2121
+ to disambiguate which physical arm to dispatch the INSERT to without
2122
+ re-resolving the writable arm internally. For single-branch tables (the
2123
+ common case) this is None and the worker dispatches as today.
2124
+ """
2125
+ ...
2126
+
2127
+ def catalog_table_update_function_get(
2128
+ self,
2129
+ attach_opaque_data: bytes,
2130
+ schema_name: str,
2131
+ name: str,
2132
+ transaction_opaque_data: bytes | None = None,
2133
+ ) -> bytes:
2134
+ """Get the update function for a table. Returns WriteFunctionResult as IPC bytes."""
2135
+ ...
2136
+
2137
+ def catalog_table_delete_function_get(
2138
+ self,
2139
+ attach_opaque_data: bytes,
2140
+ schema_name: str,
2141
+ name: str,
2142
+ transaction_opaque_data: bytes | None = None,
2143
+ ) -> bytes:
2144
+ """Get the delete function for a table. Returns WriteFunctionResult as IPC bytes."""
2145
+ ...
2146
+
2147
+ def catalog_table_comment_set(
2148
+ self,
2149
+ attach_opaque_data: bytes,
2150
+ schema_name: str,
2151
+ name: str,
2152
+ comment: str | None = None,
2153
+ ignore_not_found: bool = False,
2154
+ transaction_opaque_data: bytes | None = None,
2155
+ ) -> None:
2156
+ """Set or clear the comment on a table; ``comment`` defaults to None (presumably meaning clear)."""
2157
+ ...
2158
+
2159
+ def catalog_table_column_comment_set(
2160
+ self,
2161
+ attach_opaque_data: bytes,
2162
+ schema_name: str,
2163
+ name: str,
2164
+ column_name: str,
2165
+ comment: str | None = None,
2166
+ ignore_not_found: bool = False,
2167
+ transaction_opaque_data: bytes | None = None,
2168
+ ) -> None:
2169
+ """Set or clear the comment on the table column ``column_name``; ``comment`` defaults to None."""
2170
+ ...
2171
+
2172
+ def catalog_table_rename(
2173
+ self,
2174
+ attach_opaque_data: bytes,
2175
+ schema_name: str,
2176
+ name: str,
2177
+ new_name: str,
2178
+ ignore_not_found: bool = False,
2179
+ transaction_opaque_data: bytes | None = None,
2180
+ ) -> None:
2181
+ """Rename a table to ``new_name``. Returns None."""
2182
+ ...
2183
+
2184
+ def catalog_table_column_add(
2185
+ self,
2186
+ attach_opaque_data: bytes,
2187
+ schema_name: str,
2188
+ name: str,
2189
+ column_definition: bytes,
2190
+ ignore_not_found: bool = False,
2191
+ if_column_not_exists: bool = False,
2192
+ transaction_opaque_data: bytes | None = None,
2193
+ ) -> None:
2194
+ """Add a new column, described by the ``column_definition`` bytes, to a table."""
2195
+ ...
2196
+
2197
+ def catalog_table_column_drop(
2198
+ self,
2199
+ attach_opaque_data: bytes,
2200
+ schema_name: str,
2201
+ name: str,
2202
+ column_name: str,
2203
+ ignore_not_found: bool = False,
2204
+ if_column_exists: bool = False,
2205
+ cascade: bool = False,
2206
+ transaction_opaque_data: bytes | None = None,
2207
+ ) -> None:
2208
+ """Drop the column ``column_name`` from a table; ``if_column_exists`` and ``cascade`` default to False."""
2209
+ ...
2210
+
2211
+ def catalog_table_column_rename(
2212
+ self,
2213
+ attach_opaque_data: bytes,
2214
+ schema_name: str,
2215
+ name: str,
2216
+ column_name: str,
2217
+ new_column_name: str,
2218
+ ignore_not_found: bool = False,
2219
+ transaction_opaque_data: bytes | None = None,
2220
+ ) -> None:
2221
+ """Rename the column ``column_name`` of a table to ``new_column_name``."""
2222
+ ...
2223
+
2224
+ def catalog_table_column_default_set(
2225
+ self,
2226
+ attach_opaque_data: bytes,
2227
+ schema_name: str,
2228
+ name: str,
2229
+ column_name: str,
2230
+ expression: str,
2231
+ ignore_not_found: bool = False,
2232
+ transaction_opaque_data: bytes | None = None,
2233
+ ) -> None:
2234
+ """Set the default value expression of the column ``column_name`` to the string ``expression``."""
2235
+ ...
2236
+
2237
+ def catalog_table_column_default_drop(
2238
+ self,
2239
+ attach_opaque_data: bytes,
2240
+ schema_name: str,
2241
+ name: str,
2242
+ column_name: str,
2243
+ ignore_not_found: bool = False,
2244
+ transaction_opaque_data: bytes | None = None,
2245
+ ) -> None:
2246
+ """Remove the default value from the column ``column_name`` of a table."""
2247
+ ...
2248
+
2249
+ def catalog_table_column_type_change(
2250
+ self,
2251
+ attach_opaque_data: bytes,
2252
+ schema_name: str,
2253
+ name: str,
2254
+ column_definition: bytes,
2255
+ expression: str | None = None,
2256
+ ignore_not_found: bool = False,
2257
+ transaction_opaque_data: bytes | None = None,
2258
+ ) -> None:
2259
+ """Change the type of a column per ``column_definition``; ``expression`` is optional (default None)."""
2260
+ ...
2261
+
2262
+ def catalog_table_not_null_drop(
2263
+ self,
2264
+ attach_opaque_data: bytes,
2265
+ schema_name: str,
2266
+ name: str,
2267
+ column_name: str,
2268
+ ignore_not_found: bool = False,
2269
+ transaction_opaque_data: bytes | None = None,
2270
+ ) -> None:
2271
+ """Remove the NOT NULL constraint from the column ``column_name`` of a table."""
2272
+ ...
2273
+
2274
+ def catalog_table_not_null_set(
2275
+ self,
2276
+ attach_opaque_data: bytes,
2277
+ schema_name: str,
2278
+ name: str,
2279
+ column_name: str,
2280
+ ignore_not_found: bool = False,
2281
+ transaction_opaque_data: bytes | None = None,
2282
+ ) -> None:
2283
+ """Add a NOT NULL constraint to the column ``column_name`` of a table."""
2284
+ ...
2285
+
2286
+ # ========== Catalog - Views ==========
2287
+
2288
+ def catalog_view_get(
2289
+ self,
2290
+ attach_opaque_data: bytes,
2291
+ schema_name: str,
2292
+ name: str,
2293
+ transaction_opaque_data: bytes | None = None,
2294
+ ) -> ViewsResponse:
2295
+ """Get information about a view. The returned ViewsResponse holds 0 or 1 items."""
2296
+ ...
2297
+
2298
+ def catalog_view_create(
2299
+ self,
2300
+ attach_opaque_data: bytes,
2301
+ schema_name: str,
2302
+ name: str,
2303
+ definition: str,
2304
+ on_conflict: OnConflict,
2305
+ transaction_opaque_data: bytes | None = None,
2306
+ ) -> None:
2307
+ """Create a new view from the string ``definition``; ``on_conflict`` is required (no default)."""
2308
+ ...
2309
+
2310
+ def catalog_view_drop(
2311
+ self,
2312
+ attach_opaque_data: bytes,
2313
+ schema_name: str,
2314
+ name: str,
2315
+ ignore_not_found: bool = False,
2316
+ cascade: bool = False,
2317
+ transaction_opaque_data: bytes | None = None,
2318
+ ) -> None:
2319
+ """Drop a view. Returns None; ``ignore_not_found`` and ``cascade`` both default to False."""
2320
+ ...
2321
+
2322
+ def catalog_view_rename(
2323
+ self,
2324
+ attach_opaque_data: bytes,
2325
+ schema_name: str,
2326
+ name: str,
2327
+ new_name: str,
2328
+ ignore_not_found: bool = False,
2329
+ transaction_opaque_data: bytes | None = None,
2330
+ ) -> None:
2331
+ """Rename a view to ``new_name``. Returns None."""
2332
+ ...
2333
+
2334
+ def catalog_view_comment_set(
2335
+ self,
2336
+ attach_opaque_data: bytes,
2337
+ schema_name: str,
2338
+ name: str,
2339
+ comment: str | None = None,
2340
+ ignore_not_found: bool = False,
2341
+ transaction_opaque_data: bytes | None = None,
2342
+ ) -> None:
2343
+ """Set or clear the comment on a view; ``comment`` defaults to None (presumably meaning clear)."""
2344
+ ...
2345
+
2346
+ # ========== Catalog - Macros ==========
2347
+
2348
+ def catalog_macro_get(
2349
+ self,
2350
+ attach_opaque_data: bytes,
2351
+ schema_name: str,
2352
+ name: str,
2353
+ transaction_opaque_data: bytes | None = None,
2354
+ ) -> MacrosResponse:
2355
+ """Get information about a macro. The returned MacrosResponse holds 0 or 1 items."""
2356
+ ...
2357
+
2358
+ def catalog_macro_create(self, request: MacroCreateRequest) -> None:
2359
+ """Create a new macro from a single MacroCreateRequest. Returns None."""
2360
+ ...
2361
+
2362
+ def catalog_macro_drop(
2363
+ self,
2364
+ attach_opaque_data: bytes,
2365
+ schema_name: str,
2366
+ name: str,
2367
+ ignore_not_found: bool = False,
2368
+ transaction_opaque_data: bytes | None = None,
2369
+ ) -> None:
2370
+ """Drop a macro. Returns None; ``ignore_not_found`` defaults to False."""
2371
+ ...
2372
+
2373
+ def catalog_schema_contents_macros(
2374
+ self,
2375
+ attach_opaque_data: bytes,
2376
+ name: str,
2377
+ type: SchemaObjectType,
2378
+ transaction_opaque_data: bytes | None = None,
2379
+ ) -> MacrosResponse:
2380
+ """List macros in a schema (scalar or table, presumably selected by ``type``) as a MacrosResponse."""
2381
+ ...
2382
+
2383
+ # ========== Catalog - Indexes ==========
2384
+
2385
+ def catalog_index_get(
2386
+ self,
2387
+ attach_opaque_data: bytes,
2388
+ schema_name: str,
2389
+ name: str,
2390
+ transaction_opaque_data: bytes | None = None,
2391
+ ) -> IndexesResponse:
2392
+ """Get information about an index. The returned IndexesResponse holds 0 or 1 items."""
2393
+ ...
2394
+
2395
+ def catalog_index_create(self, request: IndexCreateRequest) -> None:
2396
+ """Create a new index from a single IndexCreateRequest. Returns None."""
2397
+ ...
2398
+
2399
+ def catalog_index_drop(
2400
+ self,
2401
+ attach_opaque_data: bytes,
2402
+ schema_name: str,
2403
+ name: str,
2404
+ ignore_not_found: bool = False,
2405
+ cascade: bool = False,
2406
+ transaction_opaque_data: bytes | None = None,
2407
+ ) -> None:
2408
+ """Drop an index. Returns None; ``ignore_not_found`` and ``cascade`` both default to False."""
2409
+ ...
2410
+
2411
+ def catalog_schema_contents_indexes(
2412
+ self,
2413
+ attach_opaque_data: bytes,
2414
+ name: str,
2415
+ transaction_opaque_data: bytes | None = None,
2416
+ ) -> IndexesResponse:
2417
+ """List indexes in a schema, returned as an IndexesResponse."""
2418
+ ...