vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/protocol.py
ADDED
|
@@ -0,0 +1,2418 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""VGI protocol definition for vgi_rpc server integration.
|
|
4
|
+
|
|
5
|
+
Defines the VgiProtocol, consolidated request types (BindRequest, InitRequest),
|
|
6
|
+
catalog request/response types, and StreamState implementations for each function type.
|
|
7
|
+
|
|
8
|
+
VgiProtocol Methods
|
|
9
|
+
-------------------
|
|
10
|
+
- **bind()**: Schema resolution and argument validation (unary)
|
|
11
|
+
- **init()**: Worker initialization, returns a Stream for data processing
|
|
12
|
+
- **catalog_*()**: ~35 typed catalog interface methods (unary)
|
|
13
|
+
|
|
14
|
+
StreamState Implementations
|
|
15
|
+
---------------------------
|
|
16
|
+
- **ScalarExchangeState**: Calls ScalarFunctionGenerator.process() per batch
|
|
17
|
+
- **TableProducerState**: Calls TableFunctionGenerator.process() per tick
|
|
18
|
+
- **TableInOutExchangeState**: Calls TableInOutGenerator.process() per input
|
|
19
|
+
- **BufferedFinalizeState**: Drains a state_log via cursor for streaming-shape
|
|
20
|
+
FINALIZE phase of TableInOutGenerator
|
|
21
|
+
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import base64
|
|
27
|
+
import contextlib
|
|
28
|
+
import dataclasses
|
|
29
|
+
import logging
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Protocol, get_args, get_origin
|
|
32
|
+
|
|
33
|
+
import pyarrow as pa
|
|
34
|
+
import pyarrow.compute as pc
|
|
35
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType, Transient
|
|
36
|
+
from vgi_rpc.rpc import (
|
|
37
|
+
AnnotatedBatch,
|
|
38
|
+
CallContext,
|
|
39
|
+
ExchangeState,
|
|
40
|
+
OutputCollector,
|
|
41
|
+
ProducerState,
|
|
42
|
+
Stream,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
from vgi.arguments import Arguments
|
|
46
|
+
from vgi.catalog.catalog_interface import (
|
|
47
|
+
CatalogAttachResult,
|
|
48
|
+
CatalogInfo,
|
|
49
|
+
FunctionInfo,
|
|
50
|
+
IndexConstraintType,
|
|
51
|
+
IndexInfo,
|
|
52
|
+
MacroInfo,
|
|
53
|
+
MacroType,
|
|
54
|
+
OnConflict,
|
|
55
|
+
PartitionKind,
|
|
56
|
+
SchemaInfo,
|
|
57
|
+
SchemaObjectType,
|
|
58
|
+
TableInfo,
|
|
59
|
+
ViewInfo,
|
|
60
|
+
)
|
|
61
|
+
from vgi.function_storage import BoundStorage, attach_catalog_bytes
|
|
62
|
+
from vgi.invocation import BindResponse, FunctionType, GlobalInitResponse
|
|
63
|
+
from vgi.otel import VgiTracer, _batch_bytes, _timed_exchange, get_noop_tracer
|
|
64
|
+
from vgi.scalar_function import ScalarFunctionGenerator
|
|
65
|
+
from vgi.table_function import (
|
|
66
|
+
OrderByDirection,
|
|
67
|
+
OrderByNullOrder,
|
|
68
|
+
ProcessParams,
|
|
69
|
+
SecretsAccessor,
|
|
70
|
+
TableCardinality,
|
|
71
|
+
TableFunctionBase,
|
|
72
|
+
TableFunctionGenerator,
|
|
73
|
+
TableInOutFunctionInitPhase,
|
|
74
|
+
_batch_to_scalar_dict,
|
|
75
|
+
_effective_projection_ids,
|
|
76
|
+
project_schema,
|
|
77
|
+
)
|
|
78
|
+
from vgi.table_in_out_function import TableInOutGenerator
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
"BindRequest",
|
|
82
|
+
"CatalogAttachRequest",
|
|
83
|
+
"CatalogCreateRequest",
|
|
84
|
+
"CatalogsResponse",
|
|
85
|
+
"IndexCreateRequest",
|
|
86
|
+
"IndexesResponse",
|
|
87
|
+
"MacroCreateRequest",
|
|
88
|
+
"MacrosResponse",
|
|
89
|
+
"TableCreateRequest",
|
|
90
|
+
"CatalogVersionResponse",
|
|
91
|
+
"FunctionsResponse",
|
|
92
|
+
"InitRequest",
|
|
93
|
+
"ProcessState",
|
|
94
|
+
"ScalarExchangeState",
|
|
95
|
+
"SchemasResponse",
|
|
96
|
+
"TableFunctionDynamicToStringRequest",
|
|
97
|
+
"TableFunctionDynamicToStringResponse",
|
|
98
|
+
"TableInOutExchangeState",
|
|
99
|
+
"BufferedFinalizeState",
|
|
100
|
+
"TableProducerState",
|
|
101
|
+
"TableBufferingCombineRequest",
|
|
102
|
+
"TableBufferingCombineResponse",
|
|
103
|
+
"TableBufferingDestructorRequest",
|
|
104
|
+
"TableBufferingDestructorResponse",
|
|
105
|
+
"TableBufferingFinalizeState",
|
|
106
|
+
"TableBufferingProcessRequest",
|
|
107
|
+
"TableBufferingProcessResponse",
|
|
108
|
+
"TablesResponse",
|
|
109
|
+
"TransactionBeginResponse",
|
|
110
|
+
"VgiProtocol",
|
|
111
|
+
"ViewsResponse",
|
|
112
|
+
]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
# Request types
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class BindRequest(ArrowSerializableDataclass):
    """Consolidated bind request for all function types.

    For table functions (no input schema), ``input_schema`` is ``None``.
    For scalar and table-in-out functions, ``input_schema`` is set.

    The dataclass is frozen and keyword-only: instances are immutable and
    every field is passed by name. ``function_name``, ``arguments`` and
    ``function_type`` are required; all remaining fields have defaults.

    """

    # Required fields (no defaults).
    function_name: str
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    function_type: FunctionType
    # Optional payloads; each defaults to ``None`` when not supplied.
    input_schema: Annotated[pa.Schema | None, ArrowType(pa.binary())] = None
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    # Opaque byte blobs; their encoding is not defined in this class.
    attach_opaque_data: bytes | None = None
    transaction_opaque_data: bytes | None = None
    # NOTE(review): presumably signals that ``secrets`` already holds resolved
    # values — confirm against the caller that sets it.
    resolved_secrets_provided: bool = False

    # Time travel: the AT (TIMESTAMP|VERSION ...) clause for this scan, threaded
    # through from DuckDB's per-reference bind. Both None when the scan has no AT
    # clause. NOTE: for inline-bound (function-backed) tables the *actual* on_bind
    # RPC runs once at attach with no AT, so these are None there; the per-scan AT
    # is carried on the bind request embedded in each InitRequest, so functions
    # should read it at init via ``params.init_call.bind_call.at_value`` (or the
    # ``ProcessParams.at_value`` accessor), not at on_bind.
    at_unit: str | None = None
    at_value: str | None = None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class InitRequest(ArrowSerializableDataclass):
    """Consolidated init request for all function types.

    For secondary init requests, ``execution_id`` and ``init_opaque_data``
    are set; use :attr:`is_secondary` to distinguish.

    Only ``bind_call`` and ``output_schema`` are required; every other field
    defaults to ``None``. The dataclass is frozen and keyword-only.

    """

    # Core (always present)
    bind_call: BindRequest
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Wire-facing — bytes the framework produced from the typed
    # ``BindResult.opaque_data``. Consumers reconstruct via
    # ``MyConcreteDataclass.deserialize_from_bytes(raw)``. See
    # ``BindResponse.opaque_data`` in vgi/invocation.py for the full
    # contract rationale (typed producer / bytes wire / explicit
    # consumer; abstract-base reconstruction can't be done in Python
    # without a class registry).
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Table function extras (None for scalar)
    projection_ids: list[int] | None = None
    pushdown_filters: Annotated[pa.RecordBatch | None, ArrowType(pa.large_binary())] = None
    join_keys: Annotated[list[pa.RecordBatch] | None, ArrowType(pa.list_(pa.large_binary()))] = None

    # Table-in-out extras
    phase: TableInOutFunctionInitPhase | None = None
    # Buffered-table finalize stream: which state_id this stream serves.
    # Required when phase=TABLE_BUFFERING_FINALIZE; None otherwise. Opaque
    # bytes — worker chose the encoding when its combine() returned the
    # finalize_state_ids list.
    finalize_state_id: bytes | None = None

    # Secondary init (None = global init, set = secondary)
    execution_id: bytes | None = None
    # Same contract as ``bind_opaque_data`` above.
    init_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Order pushdown hint from DuckDB's RowGroupPruner optimizer (all None when no hint)
    order_by_column_name: str | None = None
    order_by_direction: OrderByDirection | None = None
    order_by_null_order: OrderByNullOrder | None = None
    order_by_limit: int | None = None

    # TABLESAMPLE pushdown hint from DuckDB's SamplingPushdown optimizer (all None when no hint)
    tablesample_percentage: float | None = None
    tablesample_seed: int | None = None

    @property
    def is_secondary(self) -> bool:
        """True if this is a secondary init request.

        Decided solely by whether ``execution_id`` is set (not ``None``);
        ``init_opaque_data`` is not consulted.
        """
        return self.execution_id is not None
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionCardinalityRequest(ArrowSerializableDataclass):
    """Consolidated request for table function cardinality.

    Carries the original bind request plus the optional opaque bytes
    produced at bind time. Frozen and keyword-only.
    """

    # The bind request this cardinality query refers to (required).
    bind_call: BindRequest
    # Same contract as InitRequest.bind_opaque_data above.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionStatisticsRequest(ArrowSerializableDataclass):
    """Consolidated request for table function per-column statistics.

    Mirrors TableFunctionCardinalityRequest: the worker receives a full
    copy of the original BindRequest (including parsed Arguments), so it
    can derive per-column stats from the user-supplied args.

    Field layout is identical to TableFunctionCardinalityRequest.
    """

    # The bind request these statistics refer to (required).
    bind_call: BindRequest
    # Same contract as InitRequest.bind_opaque_data above.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionDynamicToStringRequest(ArrowSerializableDataclass):
    """Post-execution profile-info request, fired once per scan thread.

    Carries ``global_execution_id`` so the function class can retrieve
    whatever diagnostics it persisted during ``process()`` (shared
    storage, external service, in-memory class state for single-worker
    setups, etc.). VGI does not serialize per-thread ``_user_state``
    across the boundary — the user owns persistence.
    """

    # The bind request of the scan being profiled (required).
    bind_call: BindRequest
    # Same contract as InitRequest.bind_opaque_data above.
    bind_opaque_data: Annotated[bytes | None, ArrowType(pa.binary())] = None
    # Required even though it follows a defaulted field: ``kw_only=True``
    # permits non-default fields after defaulted ones.
    global_execution_id: bytes
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableFunctionDynamicToStringResponse(ArrowSerializableDataclass):
    """Ordered key/value pairs surfaced as Extra Info under EXPLAIN ANALYZE.

    Parallel ``keys``/``values`` lists keep insertion order explicit on
    the wire. The C++ side reassembles them into an
    ``InsertionOrderPreservingMap<string>``.

    Nothing in this class enforces that the two lists have equal length;
    producers are expected to keep them aligned index-for-index.
    """

    # Both fields are required lists of strings.
    keys: Annotated[list[str], ArrowType(pa.list_(pa.string()))]
    values: Annotated[list[str], ArrowType(pa.list_(pa.string()))]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# ---------------------------------------------------------------------------
|
|
259
|
+
# Catalog request types (for methods with complex parameters)
|
|
260
|
+
# ---------------------------------------------------------------------------
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogAttachRequest(ArrowSerializableDataclass):
    """Request for catalog_attach. Uses RecordBatch for mixed-type options.

    ``data_version_spec`` and ``implementation_version`` carry semver
    strings the user supplied at ATTACH time (concrete or range). ``None``
    = unconstrained. The worker is responsible for interpreting and
    validating them.
    """

    # Catalog name (required).
    name: str
    # Optional attach options; ``None`` when no options were supplied.
    options: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    # Both version fields accept ``None`` but have NO default, so callers must
    # pass them explicitly. ``kw_only=True`` is what allows these non-default
    # fields to follow the defaulted ``options`` field.
    data_version_spec: str | None
    implementation_version: str | None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogCreateRequest(ArrowSerializableDataclass):
    """Request for catalog_create. Uses RecordBatch for mixed-type options.

    ``name`` and ``on_conflict`` are required; ``options`` defaults to
    ``None``. Frozen and keyword-only.
    """

    # Name of the catalog to create.
    name: str
    # Conflict policy; semantics are defined by the ``OnConflict`` enum.
    on_conflict: OnConflict
    # Optional creation options; ``None`` when not supplied.
    options: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableCreateRequest(ArrowSerializableDataclass):
    """Request for catalog_table_create with complex constraint types.

    The first five fields are required. Every constraint list defaults to a
    fresh empty list via ``default_factory`` (never a shared mutable
    default), and ``transaction_opaque_data`` defaults to ``None``.
    """

    # Required identification of the target table.
    attach_opaque_data: bytes
    schema_name: str
    name: str
    columns: bytes  # SerializedSchema
    on_conflict: OnConflict
    # Constraint lists are integer lists / lists of integer lists on the wire.
    # NOTE(review): the integers are presumably column indexes into
    # ``columns`` — confirm against the consumer.
    not_null_constraints: Annotated[list[int], ArrowType(pa.list_(pa.int32()))] = field(default_factory=list)
    unique_constraints: Annotated[list[list[int]], ArrowType(pa.list_(pa.list_(pa.int32())))] = field(
        default_factory=list
    )
    check_constraints: list[str] = field(default_factory=list)
    primary_key_constraints: Annotated[list[list[int]], ArrowType(pa.list_(pa.list_(pa.int32())))] = field(
        default_factory=list
    )
    # Each foreign key is an opaque bytes element; its encoding is not defined here.
    foreign_key_constraints: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))] = field(default_factory=list)
    transaction_opaque_data: bytes | None = None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# ---------------------------------------------------------------------------
|
|
310
|
+
# Catalog response types
|
|
311
|
+
# ---------------------------------------------------------------------------
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# ``CatalogsResponse`` is generated below via ``_catalog_items_response`` once
|
|
315
|
+
# that factory is defined — it wraps a list of CatalogInfo records serialized
|
|
316
|
+
# as bytes, matching the pattern used for other list[Info] responses.
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class CatalogVersionResponse(ArrowSerializableDataclass):
    """Response wrapping int for catalog_version().

    A single required integer field; frozen and keyword-only.
    """

    # The catalog version value being returned.
    version: int
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TransactionBeginResponse(ArrowSerializableDataclass):
    """Response wrapping optional TransactionOpaqueData for catalog_transaction_begin().

    The single field defaults to ``None``, so an argument-less instance
    represents a response with no transaction data attached.
    """

    # Opaque transaction bytes, or ``None`` when there is none to return.
    transaction_opaque_data: bytes | None = None
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _catalog_items_response(item_type: type) -> type:
    """Build a response dataclass that carries a list of serialized ``item_type`` records.

    The returned class has one field, ``items`` (a list of byte strings), and
    four helpers:

    - ``from_infos(infos)``: serialize every element of ``infos``.
    - ``from_optional(info)``: wrap zero items (``None``) or exactly one item.
    - ``to_infos()``: deserialize every stored item back into ``item_type``.
    - ``to_optional()``: deserialize the first stored item, or ``None`` if empty.

    ``item_type`` must provide ``serialize_to_bytes()`` on instances and
    ``deserialize_from_bytes()`` on the class (i.e. be an
    ArrowSerializableDataclass).

    The generated class is renamed from the item type: a trailing ``Info`` is
    dropped, the stem is pluralised and ``Response`` is appended, e.g.
    ``TableInfo`` -> ``TablesResponse`` and ``IndexInfo`` -> ``IndexesResponse``.
    """
    item_name = item_type.__name__

    @dataclass(frozen=True, slots=True, kw_only=True)
    class _Response(ArrowSerializableDataclass):
        items: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]

        @staticmethod
        def from_infos(infos: list) -> _Response:  # type: ignore[type-arg]
            serialized = [entry.serialize_to_bytes() for entry in infos]
            return _Response(items=serialized)

        @staticmethod
        def from_optional(info: object | None) -> _Response:
            # Zero-or-one encoding: None becomes an empty list.
            payload = [] if info is None else [info.serialize_to_bytes()]  # type: ignore[attr-defined]
            return _Response(items=payload)

        def to_infos(self) -> list:  # type: ignore[type-arg]
            return [item_type.deserialize_from_bytes(raw) for raw in self.items]  # type: ignore[attr-defined]

        def to_optional(self) -> object | None:
            # Only the first item is considered; any further items are ignored.
            if self.items:
                return item_type.deserialize_from_bytes(self.items[0])  # type: ignore[attr-defined,no-any-return]
            return None

    # Rename the class so vgi_rpc introspection and repr show something
    # meaningful instead of the shared local name "_Response".
    stem = item_name.removesuffix("Info")
    suffix = "es" if stem.endswith(("x", "s", "sh", "ch")) else "s"
    generated_name = f"{stem}{suffix}Response"
    _Response.__name__ = _Response.__qualname__ = generated_name
    _Response.__doc__ = f"Response wrapping list of {item_name}."

    return _Response
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
# Static-analysis shim: type checkers see explicit class declarations, while at
# runtime the same names are bound to classes produced by
# ``_catalog_items_response``.
if TYPE_CHECKING:
    from typing import Self

    # Provide mypy with explicit class shapes for the dynamically generated responses.
    class _CatalogItemsResponseStub(ArrowSerializableDataclass):
        items: list[bytes]

        # NOTE: declared as classmethods here so ``Self`` can be used as the
        # return type; the runtime factory defines ``from_infos`` and
        # ``from_optional`` as staticmethods. Both forms are invoked the same
        # way (``Cls.from_infos(...)``).
        @classmethod
        def from_infos(cls, infos: list[Any]) -> Self: ...

        @classmethod
        def from_optional(cls, info: object | None) -> Self: ...

        def to_infos(self) -> list[Any]: ...

        def to_optional(self) -> Any: ...

    class CatalogsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of CatalogInfo."""

    class SchemasResponse(_CatalogItemsResponseStub):
        """Response wrapping list of SchemaInfo."""

    class TablesResponse(_CatalogItemsResponseStub):
        """Response wrapping list of TableInfo."""

    class ViewsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of ViewInfo."""

    class FunctionsResponse(_CatalogItemsResponseStub):
        """Response wrapping list of FunctionInfo."""

    class MacrosResponse(_CatalogItemsResponseStub):
        """Response wrapping list of MacroInfo."""
else:
    # Runtime path: one generated class per info type.
    CatalogsResponse = _catalog_items_response(CatalogInfo)
    SchemasResponse = _catalog_items_response(SchemaInfo)
    TablesResponse = _catalog_items_response(TableInfo)
    ViewsResponse = _catalog_items_response(ViewInfo)
    FunctionsResponse = _catalog_items_response(FunctionInfo)
    MacrosResponse = _catalog_items_response(MacroInfo)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class MacroCreateRequest(ArrowSerializableDataclass):
    """Request for catalog_macro_create with RecordBatch for parameter defaults.

    The first seven fields are required; ``parameter_default_values`` and
    ``transaction_opaque_data`` default to ``None``. Frozen and keyword-only.
    """

    # Required identification and definition of the macro.
    attach_opaque_data: bytes
    schema_name: str
    name: str
    macro_type: MacroType
    parameters: list[str]
    definition: str
    on_conflict: OnConflict
    # Optional defaults for the macro parameters; ``None`` when there are none.
    parameter_default_values: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    transaction_opaque_data: bytes | None = None
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# Same static/runtime split as the other catalog item responses: a stub class
# for type checkers, a generated class at runtime.
if TYPE_CHECKING:

    class IndexesResponse(_CatalogItemsResponseStub):  # noqa: E302
        """Response wrapping list of IndexInfo."""
else:
    IndexesResponse = _catalog_items_response(IndexInfo)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class IndexCreateRequest(ArrowSerializableDataclass):
    """Request for catalog_index_create.

    ``attach_opaque_data``, ``schema_name``, ``name`` and ``table_name`` are
    required; everything else has a default. Frozen and keyword-only.
    """

    # Required identification of the index and the table it belongs to.
    attach_opaque_data: bytes
    schema_name: str
    name: str
    table_name: str
    # Defaults to the empty string when no index type is given.
    index_type: str = ""
    constraint_type: IndexConstraintType = IndexConstraintType.NONE
    # Mutable defaults use ``default_factory`` so each instance gets its own
    # list / dict.
    expressions: list[str] = field(default_factory=list)
    # Unlike the other create requests, ``on_conflict`` has a default here.
    on_conflict: OnConflict = OnConflict.ERROR
    options: dict[str, str] = field(default_factory=dict)
    transaction_opaque_data: bytes | None = None
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# ---------------------------------------------------------------------------
|
|
462
|
+
# StreamState implementations
|
|
463
|
+
# ---------------------------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
@dataclass
class ScalarExchangeState(ExchangeState):
    """Exchange state for scalar function streams.

    Calls ``ScalarFunctionGenerator.process()`` per batch. Each ``exchange()``
    call sends one input batch and receives one output batch.

    ``_init_call`` and ``_init_response`` are serialized into the state token
    so they survive HTTP round-trips. ``_func_cls`` is transient and restored
    via ``rehydrate()``.

    """

    # NOTE: the ``default=None`` values below contradict the non-optional
    # annotations (hence the type-ignores).
    _init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    # Full framework attach plaintext (uuid||catalog_bytes) persisted through
    # serialization so each exchange can shard storage on its UUID without
    # re-unwrapping (the auth-scoped seal can't be reopened, and ctx.implementation
    # is the MetaWorker under subprocess transport).
    _plaintext_attach: bytes | None = field(default=None, repr=False)
    # Marked ``Transient()``; ``rehydrate()`` assigns both of these.
    _func_cls: Annotated[type[ScalarFunctionGenerator], Transient()] = field(default=None, repr=False)  # type: ignore[assignment]
    _vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)

    def rehydrate(self, implementation: object) -> None:
        """Restore ``_func_cls`` from the worker's function registry.

        Also replaces ``_vgi_tracer`` with the worker's tracer.

        Args:
            implementation: Treated as a ``vgi.worker.Worker``; no runtime
                type check is performed.
        """
        # Imported locally — presumably to avoid a circular import with
        # vgi.worker; confirm.
        from vgi.worker import Worker

        worker: Worker = implementation  # type: ignore[assignment]
        self._func_cls = worker._resolve_function(self._init_call.bind_call)  # type: ignore[assignment]
        self._vgi_tracer = worker._vgi_tracer

    def exchange(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
        """Process one input batch through the scalar function.

        Args:
            input: Wrapper whose ``.batch`` is the incoming record batch.
            out: Collector that receives the single output batch via ``emit``.
            ctx: Call context; only ``ctx.auth`` is read and forwarded as
                ``auth_context``.
        """
        # NOTE(review): statement nesting under ``with timer:`` was
        # reconstructed from an indentation-stripped listing — confirm against
        # the original file before relying on the exact span boundaries.
        cls = self._func_cls
        batch = input.batch

        # Workaround: over HTTP, 0-column batches lose their row count because
        # Arrow IPC RecordBatch messages with no arrays default to length 0.
        # When a scalar function has no column inputs (e.g. "SELECT func()"),
        # the caller expects 1 output row but sends num_rows=0. Add a dummy
        # column so PyArrow preserves the row count, then strip it before
        # validation.
        # NOTE(review): no stripping of the dummy ``__row`` column is visible in
        # this method; presumably ``process()`` handles it — confirm.
        inject_row = batch.num_columns == 0 and batch.num_rows == 0
        if inject_row:
            batch = pa.record_batch({"__row": pa.array([True])})

        timer = _timed_exchange(
            self._vgi_tracer,
            "vgi.execute.scalar",
            self._init_call.bind_call.function_name,
            self._init_call.bind_call.function_type.value,
            self._init_response.execution_id,
        )
        with timer:
            output = cls.process(
                batch=batch,
                init_call=self._init_call,
                init_response=self._init_response,
                # Shard on the UUID of the full attach plaintext persisted at init.
                storage=BoundStorage(
                    cls.storage,
                    self._init_response.execution_id,
                    attach_plaintext=self._plaintext_attach,
                ),
                auth_context=ctx.auth,
            )
            # Validate against the batch actually handed to process(): the
            # synthetic one-row batch when a row was injected, otherwise the
            # caller's batch.
            if inject_row:
                cls._validate_row_count(output, batch)
            else:
                cls._validate_row_count(output, input.batch)
            # Metrics use the caller's original batch, so an injected row is
            # reported as 0 input rows.
            timer.record(
                input_rows=input.batch.num_rows,
                output_rows=output.num_rows,
                input_bytes=_batch_bytes(input.batch),
                output_bytes=_batch_bytes(output),
            )
        out.emit(output)
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _resolve_state_type(func_cls: type) -> type[ArrowSerializableDataclass] | None:
|
|
549
|
+
"""Extract the TState type parameter from a TableFunctionGenerator or TableInOutGenerator.
|
|
550
|
+
|
|
551
|
+
Walks the MRO looking for ``TableFunctionGenerator[TArgs, TState]`` or
|
|
552
|
+
``TableInOutGenerator[TArgs, TState]`` and returns ``TState`` if it is a
|
|
553
|
+
concrete ``ArrowSerializableDataclass`` subclass.
|
|
554
|
+
|
|
555
|
+
Raises TypeError if the state type is a concrete class that does not
|
|
556
|
+
extend ArrowSerializableDataclass — this catches the problem early
|
|
557
|
+
rather than silently falling back to initial_state() on each HTTP exchange.
|
|
558
|
+
"""
|
|
559
|
+
for klass in func_cls.__mro__:
|
|
560
|
+
for base in getattr(klass, "__orig_bases__", ()):
|
|
561
|
+
origin = get_origin(base)
|
|
562
|
+
if origin is None:
|
|
563
|
+
continue
|
|
564
|
+
if issubclass(origin, (TableFunctionGenerator, TableInOutGenerator)):
|
|
565
|
+
args = get_args(base)
|
|
566
|
+
if len(args) >= 2:
|
|
567
|
+
state_type = args[1]
|
|
568
|
+
if isinstance(state_type, type) and issubclass(state_type, ArrowSerializableDataclass):
|
|
569
|
+
return state_type
|
|
570
|
+
if (
|
|
571
|
+
isinstance(state_type, type)
|
|
572
|
+
and state_type is not type(None)
|
|
573
|
+
and not issubclass(state_type, ArrowSerializableDataclass)
|
|
574
|
+
):
|
|
575
|
+
raise TypeError(
|
|
576
|
+
f"{func_cls.__name__}: TState type {state_type.__name__} must extend "
|
|
577
|
+
f"ArrowSerializableDataclass for HTTP state serialization."
|
|
578
|
+
)
|
|
579
|
+
return None
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def _partition_fields_from_schema(bind_schema: pa.Schema) -> list[pa.Field[Any]]:
    """Collect the partition-annotated fields of a bind schema, in schema order.

    A field qualifies when its metadata maps ``VGI_PARTITION_COLUMN_KEY`` to
    ``b"true"`` -- the ``vgi.partition_column`` annotation set by
    :func:`vgi.schema_utils.partition_field`. Fields without metadata never
    qualify.

    The table-producer harness hands the returned list to its output wrapper
    so that per-emit validation only walks the partition columns rather than
    the whole schema.
    """
    from vgi.schema_utils import VGI_PARTITION_COLUMN_KEY

    def _is_partition_column(candidate: pa.Field[Any]) -> bool:
        # Read the metadata once per field and test the annotation on it.
        annotations = candidate.metadata
        return annotations is not None and annotations.get(VGI_PARTITION_COLUMN_KEY) == b"true"

    return [candidate for candidate in bind_schema if _is_partition_column(candidate)]
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _resolve_partition_min_max(
|
|
602
|
+
field: pa.Field[Any],
|
|
603
|
+
partition_kind: PartitionKind,
|
|
604
|
+
batch: pa.RecordBatch,
|
|
605
|
+
explicit: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None,
|
|
606
|
+
) -> tuple[pa.Scalar[Any], pa.Scalar[Any]]:
|
|
607
|
+
"""Resolve ``(min, max)`` for one partition column.
|
|
608
|
+
|
|
609
|
+
Two paths:
|
|
610
|
+
* Explicit: ``explicit[field.name]`` is a ``(pa.Scalar, pa.Scalar)``
|
|
611
|
+
tuple with both elements typed to ``field.type``.
|
|
612
|
+
* Auto-extract: read the column from the batch, derive
|
|
613
|
+
``(min, max)``. For SINGLE_VALUE, also validate single distinct
|
|
614
|
+
non-null value.
|
|
615
|
+
"""
|
|
616
|
+
if explicit is not None and field.name in explicit:
|
|
617
|
+
pair = explicit[field.name]
|
|
618
|
+
if not isinstance(pair, tuple) or len(pair) != 2:
|
|
619
|
+
raise RuntimeError(f"partition_values[{field.name!r}] must be (min, max) tuple; got {pair!r}")
|
|
620
|
+
min_s, max_s = pair
|
|
621
|
+
if not isinstance(min_s, pa.Scalar) or not isinstance(max_s, pa.Scalar):
|
|
622
|
+
raise RuntimeError(
|
|
623
|
+
f"partition_values[{field.name!r}] elements must be pa.Scalar; "
|
|
624
|
+
f"got ({type(min_s).__name__}, {type(max_s).__name__})"
|
|
625
|
+
)
|
|
626
|
+
if min_s.type != field.type:
|
|
627
|
+
raise RuntimeError(
|
|
628
|
+
f"partition_values[{field.name!r}] min type mismatch: declared {field.type}, got {min_s.type}"
|
|
629
|
+
)
|
|
630
|
+
if max_s.type != field.type:
|
|
631
|
+
raise RuntimeError(
|
|
632
|
+
f"partition_values[{field.name!r}] max type mismatch: declared {field.type}, got {max_s.type}"
|
|
633
|
+
)
|
|
634
|
+
return min_s, max_s
|
|
635
|
+
|
|
636
|
+
# Auto-extract path.
|
|
637
|
+
try:
|
|
638
|
+
column = batch.column(field.name)
|
|
639
|
+
except KeyError as exc:
|
|
640
|
+
raise RuntimeError(
|
|
641
|
+
f"column {field.name!r} is partition-annotated but absent from emitted batch; "
|
|
642
|
+
f"pass partition_values={{{field.name!r}: (pa.scalar(...), pa.scalar(...))}}"
|
|
643
|
+
) from exc
|
|
644
|
+
|
|
645
|
+
if partition_kind == PartitionKind.SINGLE_VALUE_PARTITIONS:
|
|
646
|
+
# Count distinct non-null values; SINGLE_VALUE requires <= 1.
|
|
647
|
+
# All-NULL columns are accepted: DuckDB routes NULL as its own
|
|
648
|
+
# partition (Value::NotDistinctFrom(NULL, NULL) is true).
|
|
649
|
+
non_null = pc.drop_null(column)
|
|
650
|
+
if len(non_null) > 0:
|
|
651
|
+
unique = pc.unique(non_null)
|
|
652
|
+
if len(unique) > 1:
|
|
653
|
+
raise RuntimeError(
|
|
654
|
+
f"column {field.name!r} has {len(unique)} distinct values; "
|
|
655
|
+
f"partition_kind=SINGLE_VALUE_PARTITIONS requires 1"
|
|
656
|
+
)
|
|
657
|
+
|
|
658
|
+
# ``pa.compute.min_max`` returns a scalar struct with min/max fields.
|
|
659
|
+
# For all-null columns it returns null/null of the column's type,
|
|
660
|
+
# which is exactly what we want.
|
|
661
|
+
mm_struct = pc.min_max(column)
|
|
662
|
+
return mm_struct["min"], mm_struct["max"]
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
def _build_partition_values_batch(
    partition_fields: list[pa.Field[Any]],
    resolved: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]],
) -> pa.RecordBatch:
    """Assemble the two-row ``(min, max)`` RecordBatch for the partition columns.

    The batch has one column per entry of ``partition_fields`` (same order,
    same field definitions); row 0 carries each column's minimum and row 1 its
    maximum, both looked up in ``resolved`` by field name.
    """

    def _bounds_column(pf: pa.Field[Any]) -> pa.Array[Any]:
        lower, upper = resolved[pf.name]
        # The resolve step already validated that the scalars match
        # ``pf.type``, so passing the type explicitly is a no-op except for
        # any storage-layout normalisation.
        return pa.array([lower, upper], type=pf.type)

    columns = [_bounds_column(pf) for pf in partition_fields]
    return pa.RecordBatch.from_arrays(columns, schema=pa.schema(list(partition_fields)))
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _serialize_partition_values_batch(batch: pa.RecordBatch) -> str:
    """Encode ``batch`` as a base64 ASCII string of its Arrow IPC stream.

    The batch is written as a single-batch IPC stream using its own schema.
    This matches the ``vgi_rpc.stream_state#b64`` convention used elsewhere.
    """
    buffer = pa.BufferOutputStream()
    with pa.ipc.new_stream(buffer, batch.schema) as stream_writer:
        stream_writer.write_batch(batch)
    # Read the buffer only after the writer is closed so the stream is complete.
    ipc_bytes = buffer.getvalue().to_pybytes()
    encoded = base64.b64encode(ipc_bytes)
    return encoded.decode("ascii")
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
def _merge_partition_values(
|
|
695
|
+
*,
|
|
696
|
+
partition_fields: list[pa.Field[Any]],
|
|
697
|
+
partition_kind: PartitionKind,
|
|
698
|
+
batch: pa.RecordBatch,
|
|
699
|
+
partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None,
|
|
700
|
+
metadata: dict[str, str] | None,
|
|
701
|
+
) -> dict[str, str] | None:
|
|
702
|
+
"""Validate the partition_values kwarg and fold it into the emit metadata.
|
|
703
|
+
|
|
704
|
+
Folds the resulting Arrow IPC bytes into the emit metadata dict under
|
|
705
|
+
``vgi_partition_values#b64``.
|
|
706
|
+
|
|
707
|
+
Contract:
|
|
708
|
+
|
|
709
|
+
* If ``partition_fields`` is empty (function did not annotate any
|
|
710
|
+
partition column), then ``partition_values`` MUST be None —
|
|
711
|
+
catches "I forgot to mark fields" bugs that would otherwise
|
|
712
|
+
silently drop the kwarg.
|
|
713
|
+
* If ``partition_fields`` is non-empty AND ``batch.num_rows == 0``:
|
|
714
|
+
no metadata is emitted (empty-batch exemption — the C++ extension
|
|
715
|
+
skips its requirement check on 0-row batches).
|
|
716
|
+
* Otherwise: for each partition field, resolve ``(min, max)`` via
|
|
717
|
+
:func:`_resolve_partition_min_max`. Build a 2-row IPC batch,
|
|
718
|
+
serialize, base64-encode, set
|
|
719
|
+
``metadata["vgi_partition_values#b64"]``.
|
|
720
|
+
"""
|
|
721
|
+
if not partition_fields:
|
|
722
|
+
if partition_values is not None:
|
|
723
|
+
raise RuntimeError(
|
|
724
|
+
"out.emit(partition_values=...) requires partition-annotated fields "
|
|
725
|
+
"in the bind schema. Use vgi.schema_utils.partition_field() to mark "
|
|
726
|
+
"the column(s) and set Meta.partition_kind to a non-default value."
|
|
727
|
+
)
|
|
728
|
+
return metadata
|
|
729
|
+
|
|
730
|
+
if batch.num_rows == 0:
|
|
731
|
+
# Empty batches are exempt from partition-values; the C++ side
|
|
732
|
+
# skips its requirement check for 0-row batches. Leave metadata
|
|
733
|
+
# untouched so callers don't pay base64+IPC overhead for nothing.
|
|
734
|
+
return metadata
|
|
735
|
+
|
|
736
|
+
resolved: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] = {}
|
|
737
|
+
for pf in partition_fields:
|
|
738
|
+
resolved[pf.name] = _resolve_partition_min_max(
|
|
739
|
+
pf,
|
|
740
|
+
partition_kind,
|
|
741
|
+
batch,
|
|
742
|
+
partition_values,
|
|
743
|
+
)
|
|
744
|
+
|
|
745
|
+
values_batch = _build_partition_values_batch(partition_fields, resolved)
|
|
746
|
+
b64 = _serialize_partition_values_batch(values_batch)
|
|
747
|
+
|
|
748
|
+
merged: dict[str, str] = dict(metadata) if metadata else {}
|
|
749
|
+
merged["vgi_partition_values#b64"] = b64
|
|
750
|
+
return merged
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
def _merge_batch_index(
|
|
754
|
+
*,
|
|
755
|
+
supports_batch_index: bool,
|
|
756
|
+
batch_index: int | None,
|
|
757
|
+
metadata: dict[str, str] | None,
|
|
758
|
+
) -> dict[str, str] | None:
|
|
759
|
+
"""Validate the batch_index kwarg and fold it into the emit metadata dict.
|
|
760
|
+
|
|
761
|
+
Contract:
|
|
762
|
+
* If ``supports_batch_index`` is True, ``batch_index`` MUST be supplied.
|
|
763
|
+
Forgetting the kwarg on an opted-in function is a programming error
|
|
764
|
+
that would otherwise produce a data batch with no
|
|
765
|
+
``vgi_batch_index`` metadata — the C++ extension would raise an
|
|
766
|
+
IOException at scan time; raising here gives the worker author a
|
|
767
|
+
clearer line number.
|
|
768
|
+
* If ``supports_batch_index`` is False, ``batch_index`` MUST NOT be
|
|
769
|
+
supplied — catches "I forgot to set the Meta flag" bugs.
|
|
770
|
+
* The merged value is a decimal-string of the int (matches the wire
|
|
771
|
+
convention used by ``vgi_filter_version`` / ``vgi_join_keys_version``
|
|
772
|
+
elsewhere in the codebase).
|
|
773
|
+
"""
|
|
774
|
+
if supports_batch_index:
|
|
775
|
+
if batch_index is None:
|
|
776
|
+
raise RuntimeError("out.emit() requires batch_index= on a function with Meta.supports_batch_index = True")
|
|
777
|
+
else:
|
|
778
|
+
if batch_index is not None:
|
|
779
|
+
raise RuntimeError("out.emit(batch_index=...) requires Meta.supports_batch_index = True")
|
|
780
|
+
if batch_index is None:
|
|
781
|
+
return metadata
|
|
782
|
+
merged: dict[str, str] = dict(metadata) if metadata else {}
|
|
783
|
+
merged["vgi_batch_index"] = str(batch_index)
|
|
784
|
+
return merged
|
|
785
|
+
|
|
786
|
+
|
|
787
|
+
class VgiOutputCollector(Protocol):
    """Structural type for the ``out`` handed to a table function's body.

    VGI's emit-path wrappers (:class:`_TrackingOutputCollector`,
    :class:`_FilteringOutputCollector`) extend vgi-rpc's
    ``OutputCollector.emit`` with ``batch_index=`` and ``partition_values=``
    kwargs. Function bodies that opt into those features ``cast`` the
    framework-supplied ``out`` to this protocol before calling ``emit``:
    the base ``OutputCollector`` type cannot carry the wider signature
    without breaking ``process()`` override compatibility across every
    fixture.
    """

    # Emit one data batch. ``batch_index`` and ``partition_values`` are the
    # VGI-specific extensions; ``metadata`` carries extra string key/value
    # pairs alongside the batch.
    def emit(
        self,
        batch: pa.RecordBatch,
        batch_index: int | None = None,
        partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None: ...

    # Part of the structural interface; the wrappers in this module forward it
    # to the wrapped collector.
    def finish(self) -> None: ...

    # Part of the structural interface; the wrappers in this module forward
    # ``level``, ``message`` and the ``extra`` string pairs to the wrapped
    # collector.
    def client_log(self, level: Any, message: str, **extra: str) -> None: ...
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
class _FilteringOutputCollector:
|
|
814
|
+
"""Wrapper that applies pushdown filters to emitted data batches.
|
|
815
|
+
|
|
816
|
+
Intercepts emit() calls and applies the pushdown filter before
|
|
817
|
+
delegating to the real OutputCollector. Threads ``batch_index=`` and
|
|
818
|
+
``metadata=`` kwargs through unchanged — validation lives on the
|
|
819
|
+
innermost wrapper (``_TrackingOutputCollector``) so it happens exactly
|
|
820
|
+
once regardless of which wrappers are stacked.
|
|
821
|
+
"""
|
|
822
|
+
|
|
823
|
+
__slots__ = ("_inner", "_func_cls", "_filters")
|
|
824
|
+
|
|
825
|
+
def __init__(self, inner: _TrackingOutputCollector, func_cls: type[TableFunctionBase[Any]], filters: Any) -> None:
|
|
826
|
+
self._inner = inner
|
|
827
|
+
self._func_cls = func_cls
|
|
828
|
+
self._filters = filters
|
|
829
|
+
|
|
830
|
+
def emit(
|
|
831
|
+
self,
|
|
832
|
+
batch: pa.RecordBatch,
|
|
833
|
+
batch_index: int | None = None,
|
|
834
|
+
partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
|
|
835
|
+
metadata: dict[str, str] | None = None,
|
|
836
|
+
) -> None:
|
|
837
|
+
filtered = self._func_cls._apply_pushdown_filter(batch, self._filters)
|
|
838
|
+
self._inner.emit(
|
|
839
|
+
filtered,
|
|
840
|
+
batch_index=batch_index,
|
|
841
|
+
partition_values=partition_values,
|
|
842
|
+
metadata=metadata,
|
|
843
|
+
)
|
|
844
|
+
|
|
845
|
+
def emit_pydict(self, data: dict[str, Any], schema: pa.Schema | None = None) -> None:
|
|
846
|
+
batch = pa.RecordBatch.from_pydict(data, schema=schema or self._inner.output_schema)
|
|
847
|
+
self.emit(batch)
|
|
848
|
+
|
|
849
|
+
def finish(self) -> None:
|
|
850
|
+
self._inner.finish()
|
|
851
|
+
|
|
852
|
+
@property
|
|
853
|
+
def finished(self) -> bool:
|
|
854
|
+
return self._inner.finished
|
|
855
|
+
|
|
856
|
+
def emit_client_log_message(self, msg: Any) -> None:
|
|
857
|
+
self._inner.emit_client_log_message(msg)
|
|
858
|
+
|
|
859
|
+
def client_log(self, level: Any, message: str, **extra: str) -> None:
|
|
860
|
+
self._inner.client_log(level, message, **extra)
|
|
861
|
+
|
|
862
|
+
def propagate(self) -> None:
|
|
863
|
+
"""No-op: state already propagated to inner collector."""
|
|
864
|
+
|
|
865
|
+
@property
|
|
866
|
+
def output_schema(self) -> pa.Schema:
|
|
867
|
+
return self._inner.output_schema
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
class _TrackingOutputCollector:
    """Wrapper that tracks total rows and bytes emitted, delegating all else.

    Also the validation point for the ``batch_index=`` and
    ``partition_values=`` kwargs on ``out.emit()`` (see
    :func:`_merge_batch_index` and :func:`_merge_partition_values`). This
    wrapper is always the innermost wrapper in the table-function emit
    path, so validating here happens exactly once per emit regardless of
    whether :class:`_FilteringOutputCollector` is also in the stack.

    Attributes not defined here are looked up on the wrapped collector.
    """

    __slots__ = (
        "_inner",
        "_supports_batch_index",
        "_partition_fields",
        "_partition_kind",
        "total_rows",
        "total_bytes",
    )

    def __init__(
        self,
        inner: OutputCollector,
        supports_batch_index: bool = False,
        partition_fields: list[pa.Field[Any]] | None = None,
        partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED,
    ) -> None:
        """Wrap ``inner`` and start the row/byte counters at zero."""
        self._inner = inner
        self._supports_batch_index = supports_batch_index
        # Pre-computed list of partition-annotated fields from the bind
        # schema; empty when the function did not opt in to PartitionColumns.
        self._partition_fields = partition_fields or []
        self._partition_kind = partition_kind
        self.total_rows = 0
        self.total_bytes = 0

    def emit(
        self,
        batch: pa.RecordBatch,
        batch_index: int | None = None,
        partition_values: dict[str, tuple[pa.Scalar[Any], pa.Scalar[Any]]] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> None:
        """Validate the VGI kwargs, count the batch, and forward it.

        ``batch_index`` and ``partition_values`` are folded into the metadata
        dict; the wrapped collector only ever receives ``batch`` and, when
        there is any, ``metadata=``.

        Raises:
            RuntimeError: If ``batch_index`` or ``partition_values`` violate
                the contracts of the merge helpers.
        """
        merged_metadata = _merge_batch_index(
            supports_batch_index=self._supports_batch_index,
            batch_index=batch_index,
            metadata=metadata,
        )
        merged_metadata = _merge_partition_values(
            partition_fields=self._partition_fields,
            partition_kind=self._partition_kind,
            batch=batch,
            partition_values=partition_values,
            metadata=merged_metadata,
        )
        self.total_rows += batch.num_rows
        self.total_bytes += _batch_bytes(batch)
        if merged_metadata is None:
            self._inner.emit(batch)
        else:
            self._inner.emit(batch, metadata=merged_metadata)

    def emit_pydict(self, data: dict[str, Any], schema: pa.Schema | None = None) -> None:
        """Build a batch from ``data`` and route it through :meth:`emit`.

        Defined explicitly (mirroring ``_FilteringOutputCollector.emit_pydict``)
        so the batch is counted and validated here. Without it the call would
        fall through ``__getattr__`` to the wrapped collector and skip both.

        NOTE(review): any other convenience emitters on the wrapped collector
        are still reached through ``__getattr__`` and bypass tracking --
        confirm whether any exist.
        """
        effective_schema = schema if schema is not None else self._inner.output_schema
        self.emit(pa.RecordBatch.from_pydict(data, schema=effective_schema))

    @property
    def finished(self) -> bool:
        """The wrapped collector's ``finished`` flag."""
        return self._inner.finished

    @property
    def output_schema(self) -> pa.Schema:
        """The wrapped collector's output schema."""
        return self._inner.output_schema

    def __getattr__(self, name: str) -> Any:
        """Delegate any attribute not defined on this wrapper to the wrapped collector."""
        # ``__getattr__`` only runs when normal lookup fails. If the ``_inner``
        # slot itself is unset (instance created without ``__init__``, e.g. by
        # copy/pickle machinery), looking it up below would re-enter this
        # method forever; fail with a plain AttributeError instead.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
@dataclass
class TableProducerState(ProducerState):
    """Producer state for table function streams.

    Calls ``TableFunctionGenerator.process()`` per tick. Each ``produce()``
    call delegates to the function's process method which uses ``out`` directly.

    When ``auto_apply_filters`` is enabled on the function class, pushdown
    filters from the init request are automatically applied to each output
    batch after ``process()`` produces it.

    ``_init_call`` and ``_init_response`` are serialized into the state token
    so they survive HTTP round-trips. Transient fields are restored via
    ``rehydrate()``.

    ``_user_state`` is serialized when it is an ``ArrowSerializableDataclass``,
    allowing iteration state to survive HTTP round-trips. When the state type
    is not serializable, it falls back to ``initial_state()`` on rehydration.

    """

    # --- Serialized fields (travel inside the state token) ---
    _init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    _init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False)  # type: ignore[assignment]
    # Serialized form of ``_user_state``; refreshed by ``_to_row_dict``.
    _user_state_bytes: bytes | None = field(default=None, repr=False)
    # Plaintext attach for bodies that read it as user data. ``_init_call`` now
    # carries the SEALED attach (storage shards on it via request=); this carries
    # the unwrapped form through serialization so rehydrate can set
    # params.attach_opaque_data without re-unwrapping (the seal is auth-scoped).
    _plaintext_attach: bytes | None = field(default=None, repr=False)
    # --- Transient fields (rebuilt by ``rehydrate``) ---
    _func_cls: Annotated[type[TableFunctionGenerator[Any]], Transient()] = field(default=None, repr=False)  # type: ignore[assignment]
    _params: Annotated[ProcessParams[Any], Transient()] = field(default=None, repr=False)  # type: ignore[arg-type]
    _user_state: Annotated[Any, Transient()] = field(default=None, repr=False)
    _pushdown_filters: Annotated[Any, Transient()] = field(default=None, repr=False)  # PushdownFilters | None
    _auto_apply: Annotated[bool, Transient()] = field(default=False, repr=False)
    _vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)

    def __post_init__(self) -> None:
        """Resolve pushdown filters if auto_apply_filters is enabled."""
        if self._func_cls is not None and self._func_cls._should_auto_apply_filters():
            self._auto_apply = True
            init_call = self._params.init_call if self._params is not None else None
            if init_call is not None and init_call.pushdown_filters is not None:
                self._pushdown_filters = self._func_cls.pushdown_filters(
                    init_call.pushdown_filters,
                    join_keys=init_call.join_keys,
                )

    def _to_row_dict(self) -> dict[str, object]:
        """Serialize _user_state into _user_state_bytes before standard serialization."""
        # ``isinstance`` is already False for ``None``; no separate None check needed.
        if isinstance(self._user_state, ArrowSerializableDataclass):
            self._user_state_bytes = self._user_state.serialize_to_bytes()
        return super()._to_row_dict()

    def rehydrate(self, implementation: object) -> None:
        """Restore transient fields from serialized init data.

        Raises:
            TypeError: If the function resolved for the persisted bind call is
                not a ``TableFunctionGenerator`` subclass.
        """
        from vgi.worker import Worker

        worker: Worker = implementation  # type: ignore[assignment]
        func_cls = worker._resolve_function(self._init_call.bind_call)
        # Explicit check rather than ``assert``: asserts are stripped under
        # ``python -O``, which would let a wrong function class through and
        # fail later with a far less obvious error.
        if not issubclass(func_cls, TableFunctionGenerator):
            raise TypeError(
                f"{getattr(func_cls, '__name__', func_cls)!s} is not a TableFunctionGenerator; "
                "cannot rehydrate table producer state"
            )
        self._func_cls = func_cls
        self._vgi_tracer = worker._vgi_tracer
        proj_ids = _effective_projection_ids(func_cls, self._init_call.projection_ids)
        output_schema = project_schema(proj_ids, self._init_call.output_schema)
        self._params = ProcessParams(
            args=func_cls._parse_arguments(func_cls.FunctionArguments, self._init_call.bind_call.arguments),
            init_call=self._init_call,
            init_response=self._init_response,
            output_schema=output_schema,
            settings=_batch_to_scalar_dict(self._init_call.bind_call.settings),
            secrets=SecretsAccessor(self._init_call.bind_call.secrets).to_dict(),
            # Rehydrated tick: the auth-scoped seal can't be reopened here, so we
            # shard storage on the full plaintext (uuid||catalog_bytes) the init
            # state persisted; the body sees only the stripped catalog bytes.
            storage=BoundStorage(
                func_cls.storage,
                self._init_response.execution_id,
                attach_plaintext=self._plaintext_attach,
            ),
            attach_opaque_data=attach_catalog_bytes(self._plaintext_attach),
        )
        # Restore _user_state from serialized bytes if available
        if self._user_state_bytes is not None:
            state_type = _resolve_state_type(func_cls)
            if state_type is not None:
                self._user_state = state_type.deserialize_from_bytes(self._user_state_bytes)
                _log.debug("Restored user state from token: %s", type(self._user_state).__name__)
            else:
                _log.debug("State type not serializable, falling back to initial_state()")
                self._user_state = func_cls.initial_state(self._params)
        else:
            self._user_state = func_cls.initial_state(self._params)
        # Re-derive pushdown filters (triggers same logic as __post_init__)
        if func_cls._should_auto_apply_filters():
            self._auto_apply = True
            if self._init_call.pushdown_filters is not None:
                self._pushdown_filters = func_cls.pushdown_filters(
                    self._init_call.pushdown_filters,
                    join_keys=self._init_call.join_keys,
                )

    def process(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
        """Process tick batch — check for dynamic filter updates, then produce."""
        if input.custom_metadata is not None:
            encoded = input.custom_metadata.get(b"vgi_pushdown_filters")
            if encoded is not None:
                self._update_filters_from_metadata(encoded)
        self.produce(out, ctx)

    def _update_filters_from_metadata(self, encoded_filters: bytes) -> None:
        """Decode and apply dynamic filter update from tick metadata.

        Best-effort: any failure is logged as a warning and the previously
        active filters stay in place. A payload with zero rows also leaves
        the current filters unchanged.
        """
        import base64

        from vgi.table_filter_pushdown import deserialize_filters

        try:
            filter_bytes = base64.b64decode(encoded_filters)
            table = pa.ipc.open_stream(filter_bytes).read_all()
            if table.num_rows > 0:
                # Only the first batch of the stream is consulted.
                filter_batch = table.to_batches()[0]
                new_filters = deserialize_filters(filter_batch)
                self._pushdown_filters = new_filters
        except Exception:
            _log.warning("Failed to deserialize dynamic filter from tick metadata", exc_info=True)

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Produce the next output batch from the table function."""
        params = dataclasses.replace(
            self._params,
            auth_context=ctx.auth,
            current_pushdown_filters=self._pushdown_filters,
        )
        timer = _timed_exchange(
            self._vgi_tracer,
            "vgi.execute.table",
            self._init_call.bind_call.function_name,
            self._init_call.bind_call.function_type.value,
            self._init_response.execution_id,
        )
        with timer:
            # The tracking wrapper is always innermost; the filtering wrapper
            # is stacked on top only when auto-apply is active and filters exist.
            tracking_out = _TrackingOutputCollector(
                out,
                supports_batch_index=self._func_cls._supports_batch_index(),
                partition_fields=_partition_fields_from_schema(self._init_call.output_schema),
                partition_kind=self._func_cls._partition_kind(),
            )
            if self._auto_apply and self._pushdown_filters is not None:
                filtered_out = _FilteringOutputCollector(tracking_out, self._func_cls, self._pushdown_filters)
                self._func_cls.process(params, self._user_state, filtered_out)  # type: ignore[arg-type]
                filtered_out.propagate()
            else:
                self._func_cls.process(params, self._user_state, tracking_out)  # type: ignore[arg-type]
            # NOTE(review): recorded while the timed block is still open --
            # confirm this nesting against the packaged source.
            timer.record(
                output_rows=tracking_out.total_rows,
                output_bytes=tracking_out.total_bytes,
            )

    def on_cancel(self, ctx: CallContext) -> None:
        """Forward cancel signal to the user function's classmethod."""
        if self._func_cls is None or self._params is None:
            return
        params = dataclasses.replace(self._params, auth_context=ctx.auth)
        try:
            self._func_cls.on_cancel(params, self._user_state)
        except Exception:
            # A failing user hook is logged at debug level and otherwise ignored.
            _log.debug("on_cancel hook raised", exc_info=True)
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
@dataclass
|
|
1113
|
+
class TableInOutExchangeState(ExchangeState):
|
|
1114
|
+
"""Exchange state for table-in-out function streams (INPUT phase).
|
|
1115
|
+
|
|
1116
|
+
Calls ``TableInOutGenerator.process()`` per input batch. Each
|
|
1117
|
+
``exchange()`` call sends one input batch and receives one output batch.
|
|
1118
|
+
|
|
1119
|
+
When ``auto_apply_filters`` is enabled, pushdown filters from the init
|
|
1120
|
+
request are automatically applied to each output batch.
|
|
1121
|
+
|
|
1122
|
+
``_init_call`` and ``_init_response`` are serialized into the state token
|
|
1123
|
+
so they survive HTTP round-trips. Transient fields are restored via
|
|
1124
|
+
``rehydrate()``.
|
|
1125
|
+
|
|
1126
|
+
``_user_state`` is serialized when it is an ``ArrowSerializableDataclass``,
|
|
1127
|
+
allowing iteration state to survive HTTP round-trips.
|
|
1128
|
+
|
|
1129
|
+
"""
|
|
1130
|
+
|
|
1131
|
+
_init_call: Annotated[InitRequest, ArrowType(pa.binary())] = field(default=None, repr=False) # type: ignore[assignment]
|
|
1132
|
+
_init_response: Annotated[GlobalInitResponse, ArrowType(pa.binary())] = field(default=None, repr=False) # type: ignore[assignment]
|
|
1133
|
+
_user_state_bytes: bytes | None = field(default=None, repr=False)
|
|
1134
|
+
# Plaintext attach for bodies that read it as user data. ``_init_call`` now
|
|
1135
|
+
# carries the SEALED attach (storage shards on it via request=); this carries
|
|
1136
|
+
# the unwrapped form through serialization so rehydrate can set
|
|
1137
|
+
# params.attach_opaque_data without re-unwrapping (the seal is auth-scoped).
|
|
1138
|
+
_plaintext_attach: bytes | None = field(default=None, repr=False)
|
|
1139
|
+
_func_cls: Annotated[type[TableInOutGenerator[Any]], Transient()] = field(default=None, repr=False) # type: ignore[assignment]
|
|
1140
|
+
_params: Annotated[ProcessParams[Any], Transient()] = field(default=None, repr=False) # type: ignore[arg-type]
|
|
1141
|
+
_user_state: Annotated[Any, Transient()] = field(default=None, repr=False)
|
|
1142
|
+
_pushdown_filters: Annotated[Any, Transient()] = field(default=None, repr=False) # PushdownFilters | None
|
|
1143
|
+
_auto_apply: Annotated[bool, Transient()] = field(default=False, repr=False)
|
|
1144
|
+
_vgi_tracer: Annotated[VgiTracer, Transient()] = field(default_factory=get_noop_tracer, repr=False)
|
|
1145
|
+
|
|
1146
|
+
def __post_init__(self) -> None:
|
|
1147
|
+
"""Resolve pushdown filters if auto_apply_filters is enabled."""
|
|
1148
|
+
if self._func_cls is not None and self._func_cls._should_auto_apply_filters():
|
|
1149
|
+
self._auto_apply = True
|
|
1150
|
+
init_call = self._params.init_call if self._params is not None else None
|
|
1151
|
+
if init_call is not None and init_call.pushdown_filters is not None:
|
|
1152
|
+
self._pushdown_filters = self._func_cls.pushdown_filters(
|
|
1153
|
+
init_call.pushdown_filters,
|
|
1154
|
+
join_keys=init_call.join_keys,
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
def _to_row_dict(self) -> dict[str, object]:
|
|
1158
|
+
"""Serialize _user_state into _user_state_bytes before standard serialization."""
|
|
1159
|
+
if self._user_state is not None and isinstance(self._user_state, ArrowSerializableDataclass):
|
|
1160
|
+
self._user_state_bytes = self._user_state.serialize_to_bytes()
|
|
1161
|
+
return super()._to_row_dict()
|
|
1162
|
+
|
|
1163
|
+
def rehydrate(self, implementation: object) -> None:
|
|
1164
|
+
"""Restore transient fields from serialized init data."""
|
|
1165
|
+
from vgi.worker import Worker
|
|
1166
|
+
|
|
1167
|
+
worker: Worker = implementation # type: ignore[assignment]
|
|
1168
|
+
func_cls = worker._resolve_function(self._init_call.bind_call)
|
|
1169
|
+
assert issubclass(func_cls, TableInOutGenerator)
|
|
1170
|
+
self._func_cls = func_cls
|
|
1171
|
+
self._vgi_tracer = worker._vgi_tracer
|
|
1172
|
+
proj_ids = _effective_projection_ids(func_cls, self._init_call.projection_ids)
|
|
1173
|
+
output_schema = project_schema(proj_ids, self._init_call.output_schema)
|
|
1174
|
+
self._params = ProcessParams(
|
|
1175
|
+
args=func_cls._parse_arguments(func_cls.FunctionArguments, self._init_call.bind_call.arguments),
|
|
1176
|
+
init_call=self._init_call,
|
|
1177
|
+
init_response=self._init_response,
|
|
1178
|
+
output_schema=output_schema,
|
|
1179
|
+
settings=_batch_to_scalar_dict(self._init_call.bind_call.settings),
|
|
1180
|
+
secrets=SecretsAccessor(self._init_call.bind_call.secrets).to_dict(),
|
|
1181
|
+
# Rehydrated tick: shard storage on the full plaintext the init state
|
|
1182
|
+
# persisted (the auth-scoped seal can't be reopened here); the body
|
|
1183
|
+
# sees only the stripped catalog bytes.
|
|
1184
|
+
storage=BoundStorage(
|
|
1185
|
+
func_cls.storage,
|
|
1186
|
+
self._init_response.execution_id,
|
|
1187
|
+
attach_plaintext=self._plaintext_attach,
|
|
1188
|
+
),
|
|
1189
|
+
attach_opaque_data=attach_catalog_bytes(self._plaintext_attach),
|
|
1190
|
+
)
|
|
1191
|
+
# Restore _user_state from serialized bytes if available
|
|
1192
|
+
if self._user_state_bytes is not None:
|
|
1193
|
+
state_type = _resolve_state_type(func_cls)
|
|
1194
|
+
if state_type is not None:
|
|
1195
|
+
self._user_state = state_type.deserialize_from_bytes(self._user_state_bytes)
|
|
1196
|
+
else:
|
|
1197
|
+
self._user_state = func_cls.initial_state(self._params)
|
|
1198
|
+
else:
|
|
1199
|
+
self._user_state = func_cls.initial_state(self._params)
|
|
1200
|
+
if func_cls._should_auto_apply_filters():
|
|
1201
|
+
self._auto_apply = True
|
|
1202
|
+
if self._init_call.pushdown_filters is not None:
|
|
1203
|
+
self._pushdown_filters = func_cls.pushdown_filters(
|
|
1204
|
+
self._init_call.pushdown_filters,
|
|
1205
|
+
join_keys=self._init_call.join_keys,
|
|
1206
|
+
)
|
|
1207
|
+
|
|
1208
|
+
def exchange(self, input: AnnotatedBatch, out: OutputCollector, ctx: CallContext) -> None:
    """Feed a single input batch to the user's ``process`` hook.

    The per-call params are the cached ``self._params`` with the caller's
    auth context swapped in. Output always flows through a tracking
    collector (so row/byte totals can be reported), and additionally
    through a filtering collector when auto-apply is enabled and pushdown
    filters were supplied at init.
    """
    func_cls = self._func_cls
    call_params = dataclasses.replace(self._params, auth_context=ctx.auth)
    bind_call = self._init_call.bind_call
    timer = _timed_exchange(
        self._vgi_tracer,
        "vgi.execute.table_in_out",
        bind_call.function_name,
        bind_call.function_type.value,
        self._init_response.execution_id,
    )
    with timer:
        tracking_out = _TrackingOutputCollector(
            out,
            supports_batch_index=func_cls._supports_batch_index(),
            partition_fields=_partition_fields_from_schema(self._init_call.output_schema),
            partition_kind=func_cls._partition_kind(),
        )
        # Filters only take part when auto-apply is on AND filters exist.
        use_filters = self._auto_apply and self._pushdown_filters is not None
        if not use_filters:
            func_cls.process(call_params, self._user_state, input.batch, tracking_out)  # type: ignore[arg-type]
        else:
            filtered_out = _FilteringOutputCollector(tracking_out, func_cls, self._pushdown_filters)
            func_cls.process(call_params, self._user_state, input.batch, filtered_out)  # type: ignore[arg-type]
            # Hand whatever the filtering layer holds on to the tracker.
            filtered_out.propagate()
        # Recorded before the timer context exits so the metrics are
        # attached to this exchange's measurement.
        timer.record(
            input_rows=input.batch.num_rows,
            output_rows=tracking_out.total_rows,
            input_bytes=_batch_bytes(input.batch),
            output_bytes=tracking_out.total_bytes,
        )
|
|
1237
|
+
|
|
1238
|
+
def on_cancel(self, ctx: CallContext) -> None:
    """Relay a cancel notification to the user function's ``on_cancel``.

    Does nothing when the function class or the cached params are not
    set. Exceptions raised by the user hook are logged at debug level and
    never re-raised.
    """
    hook_owner = self._func_cls
    if hook_owner is None:
        return
    if self._params is None:
        return
    # Built outside the try: only the user hook itself is best-effort.
    hook_params = dataclasses.replace(self._params, auth_context=ctx.auth)
    try:
        hook_owner.on_cancel(hook_params, self._user_state)
    except Exception:
        _log.debug("on_cancel hook raised", exc_info=True)
|
|
1247
|
+
|
|
1248
|
+
|
|
1249
|
+
@dataclass
class BufferedFinalizeState(ProducerState):
    """Cursor-based drain of a state log for the streaming FINALIZE phase.

    Serves the streaming-shape ``TableInOutGenerator`` FINALIZE phase; the
    ``TableBufferingFunction`` path uses ``TableBufferingFinalizeState``
    instead. Every field is plain bytes, so the state can travel over the
    wire between ticks without holding object references.

    Each ``produce()`` tick rebuilds the bound storage handle from
    ``execution_id`` + ``attach_opaque_data``, reads the next log row after
    ``cursor``, emits it, and moves the cursor forward. No user code runs
    per tick: the worker's init handler has already written the user's
    ``finalize() -> list[batch]`` result into storage, and this state only
    drains it.
    """

    execution_id: bytes = b""
    ns: bytes = b""
    key: bytes = b""
    cursor: bytes = b""  # opaque, b"" = before-first
    attach_opaque_data: bytes | None = None

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Emit the next log row for (ns, key) after the cursor, or finish."""
        # Imported lazily so this module's import graph stays small and
        # does not form a cycle with vgi.worker.
        from vgi.table_in_out_function import pack_int_cursor, unpack_int_cursor
        from vgi.worker import (
            _build_bound_storage_from_fields,
            _decode_ipc_batch,
        )

        storage = _build_bound_storage_from_fields(self.execution_id, self.attach_opaque_data, ctx)
        resume_after = unpack_int_cursor(self.cursor)
        # One data batch is allowed per produce() tick, hence limit=1; the
        # framework keeps ticking until out.finish() signals end of stream.
        page = storage.state_log_scan(self.ns, self.key, after_id=resume_after, limit=1)
        if page:
            row_id, payload = page[0]
            out.emit(_decode_ipc_batch(payload))
            self.cursor = pack_int_cursor(row_id)
        else:
            out.finish()
|
|
1302
|
+
|
|
1303
|
+
|
|
1304
|
+
@dataclass
class TableBufferingFinalizeState(ProducerState):
    """Streaming finalize state for ``TableBufferingFunction.finalize``.

    Producer-mode stream parameterized by (execution_id, finalize_state_id).
    One streaming RPC per finalize_state_id; framework calls user's
    ``cls.finalize(params, finalize_state_id, state, out)`` per tick,
    serializing the user's ``state_blob`` between ticks so the stream
    survives worker-process boundaries (HTTP transport).
    """

    function_name: str = ""
    execution_id: bytes = b""
    transaction_id: bytes | None = None
    finalize_state_id: bytes = b""
    # Serialized form of the user's TFinalizeState (ArrowSerializableDataclass
    # bytes), or b"" on the first tick before initial_finalize_state() runs.
    state_blob: bytes = b""
    # True after the user's initial_finalize_state() has been invoked and
    # state_blob is populated. Distinguishes "first tick / build initial"
    # from "subsequent tick / deserialize existing".
    state_initialized: bool = False
    attach_opaque_data: bytes | None = None
    # Pushdown carried from the InitRequest. Wire-serialized on every tick so
    # an HTTP rehydration on a different worker process still knows which
    # columns to project and which filter predicates to apply. The streaming
    # peer ``TableInOutExchangeState`` rehydrates these the same way on every
    # round-trip (``vgi/protocol.py:1106-1119``).
    projection_ids: list[int] | None = None
    pushdown_filters: Annotated[pa.RecordBatch | None, ArrowType(pa.large_binary())] = None

    def produce(self, out: OutputCollector, ctx: CallContext) -> None:
        """Drive one tick of the user's finalize() callback."""
        # Local import: keeps the protocol module's import graph minimal
        # and avoids a circular dependency on vgi.worker.
        from vgi.worker import run_table_buffering_finalize_tick

        run_table_buffering_finalize_tick(self, out, ctx)

    def on_cancel(self, ctx: CallContext) -> None:
        """Forward the framework's cancel signal to ``cls.on_cancel``.

        Fired by vgi-rpc when the consumer abandons the stream before EOS
        (DuckDB LIMIT, exception unwind, user break). Resolves func_cls
        + params via ``worker._load_table_buffering_params`` and
        deserializes the user's last-emitted finalize state from
        ``self.state_blob`` (``None`` is forwarded when the blob is empty).

        This is a teardown path, so nothing raised while rehydrating the
        state or inside the user hook is allowed to escape and mask the
        original cancel; such failures are logged at debug level instead.

        Idempotent: if ``state_initialized`` is False we haven't yet run
        ``initial_finalize_state``, so there's no user state worth
        forwarding — skip rather than build a fresh one just to discard it.
        """
        if not self.state_initialized:
            return
        worker = ctx.implementation
        if worker is None:
            # produce() raises in this case; on_cancel is teardown so we
            # silently skip — better than crashing during pipeline unwind.
            return
        # Local import: the worker module pulls in heavy dependencies
        # (FunctionStorage backends, etc.) that we don't want eager-loaded
        # on protocol import.
        from vgi.worker import _deserialize_finalize_state

        @dataclass
        class _CancelStubRequest:
            # Minimal stand-in carrying only the request attributes passed
            # through to _load_table_buffering_params.
            function_name: str
            execution_id: bytes
            attach_opaque_data: bytes | None
            transaction_id: bytes | None

        stub = _CancelStubRequest(
            function_name=self.function_name,
            execution_id=self.execution_id,
            attach_opaque_data=self.attach_opaque_data,
            transaction_id=self.transaction_id,
        )
        try:
            func_cls, params = worker._load_table_buffering_params(
                stub,
                ctx,
                attach_already_unwrapped=True,
            )
            # Deserialization is guarded too: a corrupt or incompatible
            # blob must not turn a cancel into a crash.
            user_state = _deserialize_finalize_state(func_cls, self.state_blob) if self.state_blob else None
        except Exception:  # noqa: BLE001 — teardown path, swallow
            _log.debug("on_cancel: could not rehydrate finalize state", exc_info=True)
            return
        try:
            func_cls.on_cancel(params, self.finalize_state_id, user_state)
        except Exception:  # noqa: BLE001 — teardown path, swallow
            _log.debug("on_cancel hook raised", exc_info=True)
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
# Type alias for the union of all stream state variants produced by init().
# vgi-rpc resolves this union using a method-local numeric tag in HTTP state
# tokens, so state recovery does not depend on Python class names.
# NOTE(review): if that numeric tag is derived from a member's position in
# this union, reordering or inserting members mid-list would change the tags
# of existing states — confirm against vgi-rpc before editing the order.
ProcessState = (
    ScalarExchangeState
    | TableProducerState
    | TableInOutExchangeState
    | BufferedFinalizeState
    | TableBufferingFinalizeState
)
|
|
1408
|
+
|
|
1409
|
+
|
|
1410
|
+
# ---------------------------------------------------------------------------
|
|
1411
|
+
# Aggregate Function RPC Types (all unary request/response)
|
|
1412
|
+
# ---------------------------------------------------------------------------
|
|
1413
|
+
|
|
1414
|
+
|
|
1415
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateBindRequest(ArrowSerializableDataclass):
    """Request for aggregate_bind — resolve output schema."""

    # Immutable, keyword-only message. The ArrowType annotations mark
    # structured payloads as carried in Arrow binary columns.
    function_name: str
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    # Everything below is optional and defaults to None when not supplied.
    input_schema: Annotated[pa.Schema | None, ArrowType(pa.binary())] = None
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    # Opaque attach payload passed through untouched by this message type.
    attach_opaque_data: bytes | None = None
|
|
1425
|
+
|
|
1426
|
+
|
|
1427
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateBindResponse(ArrowSerializableDataclass):
    """Response from aggregate_bind."""

    # Resolved output schema, carried in an Arrow binary column.
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Identifier for this bound execution; the other aggregate_* requests
    # each carry an ``execution_id`` field (presumably this value — confirm).
    execution_id: bytes
|
|
1433
|
+
|
|
1434
|
+
|
|
1435
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateUpdateRequest(ArrowSerializableDataclass):
    """Request for aggregate_update — accumulate rows into per-group state."""

    # Both fields are required; they identify the function and execution
    # the rows belong to.
    function_name: str
    execution_id: bytes
    input_batch: bytes  # Full IPC stream bytes (schema + data + EOS)
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1443
|
+
|
|
1444
|
+
|
|
1445
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateUpdateResponse(ArrowSerializableDataclass):
    """Response from aggregate_update — empty ack.

    Declares no fields.
    """
|
|
1450
|
+
|
|
1451
|
+
|
|
1452
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateCombineRequest(ArrowSerializableDataclass):
    """Request for aggregate_combine — merge source states into targets."""

    function_name: str
    execution_id: bytes
    # Serialized batch describing the merge. Its column layout is not
    # visible in this module — check the worker before relying on it.
    merge_batch: bytes  # Full IPC stream bytes
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1460
|
+
|
|
1461
|
+
|
|
1462
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateCombineResponse(ArrowSerializableDataclass):
    """Response from aggregate_combine — empty ack.

    Declares no fields.
    """
|
|
1467
|
+
|
|
1468
|
+
|
|
1469
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateFinalizeRequest(ArrowSerializableDataclass):
    """Request for aggregate_finalize — produce results for group_ids."""

    function_name: str
    execution_id: bytes
    # Serialized batch naming the groups to finalize.
    group_ids_batch: bytes  # Full IPC stream bytes
    # Required here (no default), unlike the optional attach payload below.
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    attach_opaque_data: bytes | None = None
|
|
1478
|
+
|
|
1479
|
+
|
|
1480
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateFinalizeResponse(ArrowSerializableDataclass):
    """Response from aggregate_finalize — result batch as IPC stream bytes."""

    # The only field: the finalized results, already serialized.
    result_batch: bytes  # Full IPC stream bytes
|
|
1485
|
+
|
|
1486
|
+
|
|
1487
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateDestructorRequest(ArrowSerializableDataclass):
    """Request for aggregate_destructor — best-effort state cleanup."""

    function_name: str
    execution_id: bytes
    # Serialized batch naming the groups whose state should be cleaned up.
    group_ids_batch: bytes  # Full IPC stream bytes
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1495
|
+
|
|
1496
|
+
|
|
1497
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateDestructorResponse(ArrowSerializableDataclass):
    """Response from aggregate_destructor — empty ack.

    Declares no fields.
    """
|
|
1502
|
+
|
|
1503
|
+
|
|
1504
|
+
# ---------------------------------------------------------------------------
|
|
1505
|
+
# Table Sink+Source RPC Types
|
|
1506
|
+
# ---------------------------------------------------------------------------
|
|
1507
|
+
# Sink+Source PhysicalOperator path for TableBufferingFunction subclasses.
|
|
1508
|
+
# Contract:
|
|
1509
|
+
# * process() is UNARY; the worker-chosen state_id rides on the response
|
|
1510
|
+
# as opaque bytes.
|
|
1511
|
+
# * state_ids / finalize_state_ids are opaque bytes throughout.
|
|
1512
|
+
# * finalize is the existing streaming-init path with new
|
|
1513
|
+
# TableBufferingFinalizeState driving user finalize() per tick.
|
|
1514
|
+
|
|
1515
|
+
|
|
1516
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingProcessRequest(ArrowSerializableDataclass):
    """Request for table_buffering_process — sink one batch (unary)."""

    function_name: str
    execution_id: bytes
    input_batch: bytes  # Full IPC stream bytes
    # The remaining fields are optional and default to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
    # NOTE(review): presumably the index of this batch in the input stream
    # when the caller tracks one — confirm against the sender.
    batch_index: int | None = None
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingProcessResponse(ArrowSerializableDataclass):
    """Response from table_buffering_process — the worker-chosen state_id."""

    # Opaque bytes; per the section contract above, callers treat state_ids
    # as opaque throughout.
    state_id: bytes
|
|
1533
|
+
|
|
1534
|
+
|
|
1535
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingCombineRequest(ArrowSerializableDataclass):
    """Request for table_buffering_combine — once-per-query end-of-input."""

    function_name: str
    execution_id: bytes
    # Opaque state ids, carried as an Arrow list-of-binary column
    # (presumably those returned by table_buffering_process — confirm).
    state_ids: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]
    # Optional; default to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
|
|
1544
|
+
|
|
1545
|
+
|
|
1546
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingCombineResponse(ArrowSerializableDataclass):
    """Response from table_buffering_combine — opaque finalize partition keys."""

    # One opaque key per finalize partition, carried as list-of-binary.
    finalize_state_ids: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))]
|
|
1551
|
+
|
|
1552
|
+
|
|
1553
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingDestructorRequest(ArrowSerializableDataclass):
    """Request for table_buffering_destructor — best-effort cleanup."""

    # Identifies the function and execution to clean up.
    function_name: str
    execution_id: bytes
    # Optional; default to None.
    attach_opaque_data: bytes | None = None
    transaction_id: bytes | None = None
|
|
1561
|
+
|
|
1562
|
+
|
|
1563
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class TableBufferingDestructorResponse(ArrowSerializableDataclass):
    """Response from table_buffering_destructor — empty ack.

    Declares no fields.
    """
|
|
1568
|
+
|
|
1569
|
+
|
|
1570
|
+
# ---------------------------------------------------------------------------
|
|
1571
|
+
# Aggregate Window Function RPC Types
|
|
1572
|
+
# ---------------------------------------------------------------------------
|
|
1573
|
+
# Optional windowed-aggregate protocol: ``aggregate_window_init`` ships the
|
|
1574
|
+
# partition once, ``aggregate_window`` evaluates one output row at a time
|
|
1575
|
+
# (per-call flushing — DuckDB's window callback API has no per-Evaluate hook),
|
|
1576
|
+
# ``aggregate_window_destructor`` evicts the partition from storage.
|
|
1577
|
+
|
|
1578
|
+
|
|
1579
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowInitRequest(ArrowSerializableDataclass):
    """Request for aggregate_window_init — ship a partition to the worker."""

    function_name: str
    execution_id: bytes
    # Identifies the partition; the per-row, batch and destructor window
    # requests each carry a ``partition_id`` field as well.
    partition_id: int
    # Number of rows in the partition; filter_mask is sized against it.
    row_count: int
    partition_batch: bytes  # Full IPC stream bytes (partition's input columns)
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    filter_mask: bytes  # Packed-bit bool array, length == row_count
    frame_stats: bytes  # 4× int64: ((begin_delta,end_delta),(begin_delta,end_delta))
    all_valid: bytes  # 1 byte per input column
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1593
|
+
|
|
1594
|
+
|
|
1595
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowInitResponse(ArrowSerializableDataclass):
    """Response from aggregate_window_init — empty ack.

    Declares no fields.
    """
|
|
1600
|
+
|
|
1601
|
+
|
|
1602
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowRequest(ArrowSerializableDataclass):
    """Request for aggregate_window — compute the aggregate for one output row.

    ``frame_starts`` and ``frame_ends`` are parallel arrays of length 1–3
    (one entry per subframe; 3 only for EXCLUDE TIES / EXCLUDE GROUP).
    """

    function_name: str
    execution_id: bytes
    partition_id: int
    # NOTE(review): presumably the row id of the output row being evaluated
    # within the partition — confirm against the caller.
    rid: int
    # Parallel lists: entry i of each describes subframe i.
    frame_starts: list[int]
    frame_ends: list[int]
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1617
|
+
|
|
1618
|
+
|
|
1619
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowResponse(ArrowSerializableDataclass):
    """Response from aggregate_window — one row RecordBatch with the scalar result."""

    # The only field: the single-row result, already serialized.
    result_batch: bytes  # Full IPC stream bytes (one row, output schema)
|
|
1624
|
+
|
|
1625
|
+
|
|
1626
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowDestructorRequest(ArrowSerializableDataclass):
    """Request for aggregate_window_destructor — evict a partition from storage."""

    function_name: str
    execution_id: bytes
    # The partition to evict.
    partition_id: int
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1634
|
+
|
|
1635
|
+
|
|
1636
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowDestructorResponse(ArrowSerializableDataclass):
    """Response from aggregate_window_destructor — empty ack.

    Declares no fields.
    """
|
|
1641
|
+
|
|
1642
|
+
|
|
1643
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowBatchRequest(ArrowSerializableDataclass):
    """Request for aggregate_window_batch — compute ``count`` output rows in one RPC.

    ``frames_per_row[i]`` gives the subframe cardinality for output row ``i``
    (1 normally, 2–3 for EXCLUDE TIES / EXCLUDE GROUP). ``frame_starts`` and
    ``frame_ends`` are flat arrays of length ``sum(frames_per_row)``.
    """

    function_name: str
    execution_id: bytes
    partition_id: int
    # NOTE(review): presumably the index of the first output row covered by
    # this batch, with ``count`` rows following — confirm against the caller.
    row_idx: int
    count: int
    # Subframe count per output row; see the docstring for how it sizes the
    # two flat frame lists below.
    frames_per_row: list[int]
    frame_starts: list[int]
    frame_ends: list[int]
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1661
|
+
|
|
1662
|
+
|
|
1663
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateWindowBatchResponse(ArrowSerializableDataclass):
    """Response from aggregate_window_batch — count-row RecordBatch."""

    # The only field: the batch of results, already serialized.
    result_batch: bytes  # Full IPC stream bytes (count rows, output schema)
|
|
1668
|
+
|
|
1669
|
+
|
|
1670
|
+
# ---------------------------------------------------------------------------
|
|
1671
|
+
# Aggregate Streaming-Partitioned RPC Types
|
|
1672
|
+
# ---------------------------------------------------------------------------
|
|
1673
|
+
# Streaming protocol for partitioned aggregates whose state compresses
|
|
1674
|
+
# heavily relative to input rows (e.g. portfolio_agg's positions dict vs
|
|
1675
|
+
# millions of fills). DuckDB streams input chunks to the worker; the worker
|
|
1676
|
+
# maintains concurrent per-partition state in a hash map keyed by partition
|
|
1677
|
+
# key, dispatches each row to its partition's state, and emits one snapshot
|
|
1678
|
+
# per input row. No DuckDB-side partition materialisation. Cumulative
|
|
1679
|
+
# semantics only (UNBOUNDED PRECEDING -> CURRENT ROW); other frame shapes
|
|
1680
|
+
# fall back to the non-streaming path.
|
|
1681
|
+
|
|
1682
|
+
|
|
1683
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingOpenRequest(ArrowSerializableDataclass):
    """Request for aggregate_streaming_open — start a streaming session.

    The worker resolves the function, calls ``streaming_open`` to build the
    cross-partition global state, and returns an ``execution_id`` that
    subsequent chunk/close calls reference.

    ``input_schema`` is the schema of every chunk shipped via
    ``streaming_chunk``. The first ``partition_key_count`` columns are
    partition-key columns (used by the worker to dispatch rows to the right
    per-partition state). The next ``order_key_count`` columns are
    order-key columns (informational; the worker may verify monotonicity).
    Remaining columns are the function's value arguments, in declaration
    order.
    """

    function_name: str
    arguments: Annotated[Arguments, ArrowType(pa.binary())]
    # Column layout: [partition keys | order keys | value arguments], with
    # the two counts below marking the boundaries (see the docstring).
    input_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    partition_key_count: int
    order_key_count: int
    output_schema: Annotated[pa.Schema, ArrowType(pa.binary())]
    # Everything below is optional and defaults to None when not supplied.
    settings: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    secrets: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    attach_opaque_data: bytes | None = None
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingOpenResponse(ArrowSerializableDataclass):
    """Response from aggregate_streaming_open — session token."""

    # The session token; the streaming chunk and close requests each carry
    # an ``execution_id`` field to reference the session.
    execution_id: bytes
|
|
1716
|
+
|
|
1717
|
+
|
|
1718
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingChunkRequest(ArrowSerializableDataclass):
    """Request for aggregate_streaming_chunk — process one input chunk.

    ``input_batch`` schema must match the ``input_schema`` agreed at
    ``streaming_open``. The worker iterates rows, dispatches to per-partition
    state by the partition-key columns, applies the function's update logic,
    and returns a same-length output array.
    """

    function_name: str
    # References the session opened by aggregate_streaming_open.
    execution_id: bytes
    input_batch: bytes  # Full IPC stream bytes
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1732
|
+
|
|
1733
|
+
|
|
1734
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingChunkResponse(ArrowSerializableDataclass):
    """Response from aggregate_streaming_chunk — same-length output batch."""

    # The only field: one output row per input row, already serialized.
    result_batch: bytes  # Full IPC stream bytes (one row per input row)
|
|
1739
|
+
|
|
1740
|
+
|
|
1741
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingCloseRequest(ArrowSerializableDataclass):
    """Request for aggregate_streaming_close — end the session, free state."""

    function_name: str
    # References the session being closed.
    execution_id: bytes
    # Optional opaque attach payload; None when not supplied.
    attach_opaque_data: bytes | None = None
|
|
1748
|
+
|
|
1749
|
+
|
|
1750
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateStreamingCloseResponse(ArrowSerializableDataclass):
    """Response from aggregate_streaming_close — empty ack.

    Declares no fields.
    """
|
|
1755
|
+
|
|
1756
|
+
|
|
1757
|
+
# ---------------------------------------------------------------------------
|
|
1758
|
+
# VGI Protocol
|
|
1759
|
+
# ---------------------------------------------------------------------------
|
|
1760
|
+
|
|
1761
|
+
|
|
1762
|
+
class VgiProtocol(Protocol):
|
|
1763
|
+
"""VGI wire protocol definition for vgi_rpc.
|
|
1764
|
+
|
|
1765
|
+
Method families: ``bind()`` / ``init()`` (scalar/table function invocation),
|
|
1766
|
+
``aggregate_*`` (aggregate RPC methods, all unary), and ``catalog_*`` (~35
|
|
1767
|
+
typed catalog interface methods).
|
|
1768
|
+
|
|
1769
|
+
``vgi_rpc.RpcServer(VgiProtocol, worker)`` handles serialization,
|
|
1770
|
+
dispatching, error propagation, and stream lifecycle.
|
|
1771
|
+
|
|
1772
|
+
Application protocol surface version
|
|
1773
|
+
------------------------------------
|
|
1774
|
+
``protocol_version`` is the canonical semver (MAJOR.MINOR.PATCH) of the
|
|
1775
|
+
method-and-schema contract this Protocol declares. The vgi-rpc framework
|
|
1776
|
+
enforces an exact major+minor match (patch ignored) on every dispatched
|
|
1777
|
+
request: when a client sends a mismatched version, the server raises
|
|
1778
|
+
``ProtocolVersionError`` at the dispatch boundary with a directional
|
|
1779
|
+
"upgrade the client" / "upgrade the worker" message.
|
|
1780
|
+
|
|
1781
|
+
Bump rules:
|
|
1782
|
+
|
|
1783
|
+
- **Major** — any backwards-incompatible change: removing a method,
|
|
1784
|
+
renaming a method/parameter, changing a parameter or return type,
|
|
1785
|
+
adding a required parameter.
|
|
1786
|
+
- **Minor** — additive: a new method, a new optional parameter, or a new
|
|
1787
|
+
optional response column.
|
|
1788
|
+
- **Patch** — worker-side bug fixes that do not touch the surface.
|
|
1789
|
+
|
|
1790
|
+
Because Arrow's column-count check in the C++ consumer rejects
|
|
1791
|
+
return-schema drift today, even "minor" additive bumps force clients to
|
|
1792
|
+
rebuild in practice. Bump major when the surface changes and you want all
|
|
1793
|
+
deployed clients to refuse to talk to the new server until rebuilt.
|
|
1794
|
+
|
|
1795
|
+
Cross-language consumers (Rust / Go workers) read ``vgi/protocol_version.txt``
|
|
1796
|
+
(generated, committed). The C++ DuckDB extension reads
|
|
1797
|
+
``VGI_PROTOCOL_VERSION`` from ``vgi/src/generated/vgi_protocol_version.hpp``
|
|
1798
|
+
(also generated; sibling of ``vgi_protocol_constants.hpp`` but produced by
|
|
1799
|
+
a dedicated generator so this version doesn't pollute the byte-key constants).
|
|
1800
|
+
"""
|
|
1801
|
+
|
|
1802
|
+
protocol_version: ClassVar[str] = "1.0.0"
|
|
1803
|
+
|
|
1804
|
+
def bind(self, request: BindRequest) -> BindResponse:
|
|
1805
|
+
"""Resolve output schema and validate arguments."""
|
|
1806
|
+
...
|
|
1807
|
+
|
|
1808
|
+
def init(self, request: InitRequest) -> Stream[ProcessState, GlobalInitResponse]:
|
|
1809
|
+
"""Initialize a function execution and return a processing stream."""
|
|
1810
|
+
...
|
|
1811
|
+
|
|
1812
|
+
def table_function_cardinality(self, request: TableFunctionCardinalityRequest) -> TableCardinality:
|
|
1813
|
+
"""Estimate the cardinality of a table function's output."""
|
|
1814
|
+
...
|
|
1815
|
+
|
|
1816
|
+
def table_function_statistics(self, request: TableFunctionStatisticsRequest) -> bytes | None:
|
|
1817
|
+
"""Return per-column statistics for a table function's output.
|
|
1818
|
+
|
|
1819
|
+
Returns IPC bytes of a RecordBatch with sparse-union min/max columns
|
|
1820
|
+
(same shape as catalog_table_column_statistics_get), or None if no
|
|
1821
|
+
statistics are available.
|
|
1822
|
+
"""
|
|
1823
|
+
...
|
|
1824
|
+
|
|
1825
|
+
def table_function_dynamic_to_string(
|
|
1826
|
+
self, request: TableFunctionDynamicToStringRequest
|
|
1827
|
+
) -> TableFunctionDynamicToStringResponse:
|
|
1828
|
+
"""Return user-defined diagnostics for EXPLAIN ANALYZE Extra Info.
|
|
1829
|
+
|
|
1830
|
+
Fired once per parallel scan thread at end-of-stream. The function
|
|
1831
|
+
class is responsible for persisting any diagnostics it wants to
|
|
1832
|
+
report and retrieving them by ``global_execution_id`` here.
|
|
1833
|
+
|
|
1834
|
+
Best-effort: must not raise. The dispatcher catches exceptions and
|
|
1835
|
+
returns an empty response so EXPLAIN ANALYZE never breaks the query.
|
|
1836
|
+
"""
|
|
1837
|
+
...
|
|
1838
|
+
|
|
1839
|
+
# ========== Aggregate Function Methods (all unary) ==========
|
|
1840
|
+
|
|
1841
|
+
def aggregate_bind(self, request: AggregateBindRequest) -> AggregateBindResponse:
|
|
1842
|
+
"""Bind an aggregate function, return output schema and execution_id."""
|
|
1843
|
+
...
|
|
1844
|
+
|
|
1845
|
+
def aggregate_update(self, request: AggregateUpdateRequest) -> AggregateUpdateResponse:
|
|
1846
|
+
"""Accumulate rows from a DataChunk into per-group state."""
|
|
1847
|
+
...
|
|
1848
|
+
|
|
1849
|
+
def aggregate_combine(self, request: AggregateCombineRequest) -> AggregateCombineResponse:
|
|
1850
|
+
"""Merge source states into target states."""
|
|
1851
|
+
...
|
|
1852
|
+
|
|
1853
|
+
def aggregate_finalize(self, request: AggregateFinalizeRequest) -> AggregateFinalizeResponse:
|
|
1854
|
+
"""Produce results for a chunk of group_ids."""
|
|
1855
|
+
...
|
|
1856
|
+
|
|
1857
|
+
def aggregate_destructor(
    self,
    request: AggregateDestructorRequest,
) -> AggregateDestructorResponse:
    """Clean up aggregate states on a best-effort basis.

    Implementations must not raise.

    Args:
        request: The destructor request.

    Returns:
        The destructor response.
    """
    ...
|
|
1860
|
+
|
|
1861
|
+
# ========== Table Sink+Source Function Methods ==========
|
|
1862
|
+
|
|
1863
|
+
def table_buffering_process(self, request: TableBufferingProcessRequest) -> TableBufferingProcessResponse:
    """Sink a single input batch.

    Args:
        request: The process request.

    Returns:
        The response carrying the state_id chosen by the worker.
    """
    ...
|
|
1869
|
+
|
|
1870
|
+
def table_buffering_combine(self, request: TableBufferingCombineRequest) -> TableBufferingCombineResponse:
    """Handle the end-of-input signal, which is sent once per query.

    Args:
        request: The combine request.

    Returns:
        The response carrying the finalize_state_ids.
    """
    ...
|
|
1876
|
+
|
|
1877
|
+
def table_buffering_destructor(self, request: TableBufferingDestructorRequest) -> TableBufferingDestructorResponse:
    """Clean up at end of query on a best-effort basis.

    Implementations must not raise.

    Args:
        request: The destructor request.

    Returns:
        The destructor response.
    """
    ...
|
|
1883
|
+
|
|
1884
|
+
# ========== Aggregate Window Function Methods (optional, all unary) ==========
|
|
1885
|
+
|
|
1886
|
+
def aggregate_window_init(
    self,
    request: AggregateWindowInitRequest,
) -> AggregateWindowInitResponse:
    """Send one partition to the worker so it can be aggregated over windows.

    Args:
        request: The window-init request.

    Returns:
        The window-init response.
    """
    ...
|
|
1889
|
+
|
|
1890
|
+
def aggregate_window(
    self,
    request: AggregateWindowRequest,
) -> AggregateWindowResponse:
    """Evaluate the aggregate for a single output row of the window.

    Args:
        request: The window request.

    Returns:
        The window response.
    """
    ...
|
|
1893
|
+
|
|
1894
|
+
def aggregate_window_destructor(
    self,
    request: AggregateWindowDestructorRequest,
) -> AggregateWindowDestructorResponse:
    """Remove a cached partition from storage.

    Args:
        request: The window-destructor request.

    Returns:
        The window-destructor response.
    """
    ...
|
|
1899
|
+
|
|
1900
|
+
def aggregate_window_batch(
    self,
    request: AggregateWindowBatchRequest,
) -> AggregateWindowBatchResponse:
    """Evaluate ``count`` window output rows with a single batched RPC.

    Args:
        request: The batched window request.

    Returns:
        The batched window response.
    """
    ...
|
|
1903
|
+
|
|
1904
|
+
# ========== Aggregate Streaming-Partitioned Methods (optional, all unary) ==========
|
|
1905
|
+
|
|
1906
|
+
def aggregate_streaming_open(
    self,
    request: AggregateStreamingOpenRequest,
) -> AggregateStreamingOpenResponse:
    """Open a streaming-partitioned aggregate session.

    Args:
        request: The open request.

    Returns:
        The open response.
    """
    ...
|
|
1909
|
+
|
|
1910
|
+
def aggregate_streaming_chunk(
    self,
    request: AggregateStreamingChunkRequest,
) -> AggregateStreamingChunkResponse:
    """Process a single input chunk of a streaming session.

    Args:
        request: The chunk request.

    Returns:
        The chunk response, holding one output row for every input row.
    """
    ...
|
|
1913
|
+
|
|
1914
|
+
def aggregate_streaming_close(
    self,
    request: AggregateStreamingCloseRequest,
) -> AggregateStreamingCloseResponse:
    """Finish a streaming session and release its per-session state.

    Args:
        request: The close request.

    Returns:
        The close response.
    """
    ...
|
|
1917
|
+
|
|
1918
|
+
# ========== Catalog - Discovery ==========
|
|
1919
|
+
|
|
1920
|
+
def catalog_catalogs(self) -> CatalogsResponse:
    """Enumerate the names of the catalogs that are available.

    Returns:
        The response listing the catalog names.
    """
    ...
|
|
1923
|
+
|
|
1924
|
+
# ========== Catalog - Lifecycle ==========
|
|
1925
|
+
|
|
1926
|
+
def catalog_attach(
    self,
    request: CatalogAttachRequest,
) -> CatalogAttachResult:
    """Attach to a catalog, applying the supplied options.

    Args:
        request: The attach request.

    Returns:
        The attach result.
    """
    ...
|
|
1929
|
+
|
|
1930
|
+
def catalog_detach(
    self,
    attach_opaque_data: bytes,
) -> None:
    """Detach from a catalog.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
    """
    ...
|
|
1933
|
+
|
|
1934
|
+
def catalog_create(
    self,
    request: CatalogCreateRequest,
) -> None:
    """Create a new catalog.

    Args:
        request: The catalog-create request.
    """
    ...
|
|
1937
|
+
|
|
1938
|
+
def catalog_drop(
    self,
    name: str,
) -> None:
    """Drop a catalog.

    Args:
        name: Name of the catalog to drop.
    """
    ...
|
|
1941
|
+
|
|
1942
|
+
def catalog_version(
    self,
    attach_opaque_data: bytes,
    transaction_opaque_data: bytes | None = None,
) -> CatalogVersionResponse:
    """Fetch the catalog's current version.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response carrying the current catalog version.
    """
    ...
|
|
1947
|
+
|
|
1948
|
+
# ========== Catalog - Transactions ==========
|
|
1949
|
+
|
|
1950
|
+
def catalog_transaction_begin(
    self,
    attach_opaque_data: bytes,
) -> TransactionBeginResponse:
    """Start a new transaction.

    Args:
        attach_opaque_data: Opaque attach data (bytes).

    Returns:
        The transaction-begin response.
    """
    ...
|
|
1953
|
+
|
|
1954
|
+
def catalog_transaction_commit(
    self,
    attach_opaque_data: bytes,
    transaction_opaque_data: bytes,
) -> None:
    """Commit a transaction.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        transaction_opaque_data: Opaque transaction data (bytes); required.
    """
    ...
|
|
1957
|
+
|
|
1958
|
+
def catalog_transaction_rollback(
    self,
    attach_opaque_data: bytes,
    transaction_opaque_data: bytes,
) -> None:
    """Roll back a transaction.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        transaction_opaque_data: Opaque transaction data (bytes); required.
    """
    ...
|
|
1961
|
+
|
|
1962
|
+
# ========== Catalog - Schemas ==========
|
|
1963
|
+
|
|
1964
|
+
def catalog_schemas(
    self,
    attach_opaque_data: bytes,
    transaction_opaque_data: bytes | None = None,
) -> SchemasResponse:
    """Enumerate the schemas of the catalog.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the schemas.
    """
    ...
|
|
1969
|
+
|
|
1970
|
+
def catalog_schema_get(
    self,
    attach_opaque_data: bytes,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> SchemasResponse:
    """Look up the information for one schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Name of the schema.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A response holding zero or one items.
    """
    ...
|
|
1975
|
+
|
|
1976
|
+
def catalog_schema_create(
    self,
    attach_opaque_data: bytes,
    name: str,
    on_conflict: OnConflict = OnConflict.ERROR,
    comment: str | None = None,
    tags: dict[str, str] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Create a new schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Name of the schema to create.
        on_conflict: Conflict policy; ``OnConflict.ERROR`` by default.
        comment: Optional comment; ``None`` by default.
        tags: Optional string-to-string tag mapping; ``None`` by default.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
1987
|
+
|
|
1988
|
+
def catalog_schema_drop(
    self,
    attach_opaque_data: bytes,
    name: str,
    ignore_not_found: bool = False,
    cascade: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Name of the schema to drop.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing schema (confirm against the implementation).
        cascade: ``False`` by default; presumably extends the drop to
            dependent objects (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
1998
|
+
|
|
1999
|
+
def catalog_schema_contents_tables(
    self,
    attach_opaque_data: bytes,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> TablesResponse:
    """Enumerate the tables contained in a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Presumably the name of the schema to list (the signature only
            calls it ``name``).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the tables.
    """
    ...
|
|
2007
|
+
|
|
2008
|
+
def catalog_schema_contents_views(
    self,
    attach_opaque_data: bytes,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> ViewsResponse:
    """Enumerate the views contained in a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Presumably the name of the schema to list (the signature only
            calls it ``name``).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the views.
    """
    ...
|
|
2016
|
+
|
|
2017
|
+
def catalog_schema_contents_functions(
    self,
    attach_opaque_data: bytes,
    name: str,
    type: SchemaObjectType,
    transaction_opaque_data: bytes | None = None,
) -> FunctionsResponse:
    """Enumerate the functions (scalar or table) contained in a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Presumably the name of the schema to list (the signature only
            calls it ``name``).
        type: The schema object type; presumably selects scalar versus table
            functions (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the functions.
    """
    ...
|
|
2026
|
+
|
|
2027
|
+
# ========== Catalog - Tables ==========
|
|
2028
|
+
|
|
2029
|
+
def catalog_table_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
    transaction_opaque_data: bytes | None = None,
) -> TablesResponse:
    """Look up the information for one table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        at_unit: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        at_value: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A response holding zero or one items.
    """
    ...
|
|
2040
|
+
|
|
2041
|
+
def catalog_table_create(
    self,
    request: TableCreateRequest,
) -> None:
    """Create a new table.

    Args:
        request: The table-create request.
    """
    ...
|
|
2044
|
+
|
|
2045
|
+
def catalog_table_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    ignore_not_found: bool = False,
    cascade: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing table (confirm against the implementation).
        cascade: ``False`` by default; presumably extends the drop to
            dependent objects (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2056
|
+
|
|
2057
|
+
def catalog_table_scan_function_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
    transaction_opaque_data: bytes | None = None,
) -> bytes:
    """Fetch the scan function of a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        at_unit: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        at_value: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A ScanFunctionResult serialized as IPC bytes.
    """
    ...
|
|
2068
|
+
|
|
2069
|
+
def catalog_table_scan_branches_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
    transaction_opaque_data: bytes | None = None,
) -> bytes:
    """Fetch the scan branches of a multi-branch table.

    This method is an additive successor to
    ``catalog_table_scan_function_get``. A worker that implements only the
    legacy method keeps working: the C++ side of the VGI extension catches
    ``MethodNotImplementedError``, falls back to the legacy RPC, and wraps
    the single-function result as a one-branch list. A worker that
    implements BOTH methods guarantees single-process compatibility with old
    and new extensions alike.

    A multi-branch table composes one logical scan out of N physical sources
    (the canonical case is a Kafka hot tier plus an Iceberg cold tier). The
    extension's optimizer rewrite stitches the branches together with
    ``LogicalSetOperation(UNION_ALL, ...)``, using one arm per branch.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        at_unit: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        at_value: Optional string; ``None`` by default. Meaning not visible
            here; see the implementation.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A ScanBranchesResult serialized as IPC bytes.
    """
    ...
|
|
2093
|
+
|
|
2094
|
+
def catalog_table_column_statistics_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> bytes | None:
    """Fetch the column statistics of a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        IPC bytes of a RecordBatch whose min/max columns are sparse unions,
        or ``None`` when statistics are not available.
    """
    ...
|
|
2107
|
+
|
|
2108
|
+
def catalog_table_insert_function_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
    writable_branch_function_name: str | None = None,
) -> bytes:
    """Fetch the insert function of a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
        writable_branch_function_name: Filled in by the C++ extension when
            the table is multi-branch and one branch declared
            ``writable=True``; the value is then that writable arm's
            ``ScanBranch.function_name``. It lets the worker pick the
            physical arm that receives the INSERT without re-resolving the
            writable arm internally. For single-branch tables (the common
            case) it is ``None`` and the worker dispatches as today.

    Returns:
        A WriteFunctionResult serialized as IPC bytes.
    """
    ...
|
|
2126
|
+
|
|
2127
|
+
def catalog_table_update_function_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> bytes:
    """Fetch the update function of a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A WriteFunctionResult serialized as IPC bytes.
    """
    ...
|
|
2136
|
+
|
|
2137
|
+
def catalog_table_delete_function_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> bytes:
    """Fetch the delete function of a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A WriteFunctionResult serialized as IPC bytes.
    """
    ...
|
|
2146
|
+
|
|
2147
|
+
def catalog_table_comment_set(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    comment: str | None = None,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Set the comment on a table, or clear it.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        comment: The comment text; ``None`` by default (presumably ``None``
            is what clears the comment; confirm).
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing table (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2158
|
+
|
|
2159
|
+
def catalog_table_column_comment_set(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    comment: str | None = None,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Set the comment on a table column, or clear it.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Column name.
        comment: The comment text; ``None`` by default (presumably ``None``
            is what clears the comment; confirm).
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2171
|
+
|
|
2172
|
+
def catalog_table_rename(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Rename a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Current table name.
        new_name: New table name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing table (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2183
|
+
|
|
2184
|
+
def catalog_table_column_add(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_definition: bytes,
    ignore_not_found: bool = False,
    if_column_not_exists: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Add a new column to a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_definition: The column definition, as bytes (encoding not
            visible here).
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing table (confirm against the implementation).
        if_column_not_exists: ``False`` by default; presumably tolerates a
            column that already exists (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2196
|
+
|
|
2197
|
+
def catalog_table_column_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool = False,
    if_column_exists: bool = False,
    cascade: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop a column from a table.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Name of the column to drop.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing table (confirm against the implementation).
        if_column_exists: ``False`` by default; presumably tolerates a
            missing column (confirm against the implementation).
        cascade: ``False`` by default; presumably extends the drop to
            dependent objects (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2210
|
+
|
|
2211
|
+
def catalog_table_column_rename(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    new_column_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Rename a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Current column name.
        new_column_name: New column name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2223
|
+
|
|
2224
|
+
def catalog_table_column_default_set(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    expression: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Set the default value expression of a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Column name.
        expression: The default value expression, as a string.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2236
|
+
|
|
2237
|
+
def catalog_table_column_default_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Remove the default value of a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Column name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2248
|
+
|
|
2249
|
+
def catalog_table_column_type_change(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_definition: bytes,
    expression: str | None = None,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Change the type of a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_definition: The column definition, as bytes (encoding not
            visible here).
        expression: Optional expression string; ``None`` by default. Its
            role is not visible here; see the implementation.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2261
|
+
|
|
2262
|
+
def catalog_table_not_null_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Remove the NOT NULL constraint from a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Column name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2273
|
+
|
|
2274
|
+
def catalog_table_not_null_set(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Add a NOT NULL constraint to a column.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Table name.
        column_name: Column name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing target (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2285
|
+
|
|
2286
|
+
# ========== Catalog - Views ==========
|
|
2287
|
+
|
|
2288
|
+
def catalog_view_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> ViewsResponse:
    """Look up the information for one view.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: View name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A response holding zero or one items.
    """
    ...
|
|
2297
|
+
|
|
2298
|
+
def catalog_view_create(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    definition: str,
    on_conflict: OnConflict,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Create a new view.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: View name.
        definition: The view definition, as a string.
        on_conflict: Conflict policy; required (no default here).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2309
|
+
|
|
2310
|
+
def catalog_view_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    ignore_not_found: bool = False,
    cascade: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop a view.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: View name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing view (confirm against the implementation).
        cascade: ``False`` by default; presumably extends the drop to
            dependent objects (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2321
|
+
|
|
2322
|
+
def catalog_view_rename(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Rename a view.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Current view name.
        new_name: New view name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing view (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2333
|
+
|
|
2334
|
+
def catalog_view_comment_set(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    comment: str | None = None,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Set the comment on a view, or clear it.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: View name.
        comment: The comment text; ``None`` by default (presumably ``None``
            is what clears the comment; confirm).
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing view (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2345
|
+
|
|
2346
|
+
# ========== Catalog - Macros ==========
|
|
2347
|
+
|
|
2348
|
+
def catalog_macro_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> MacrosResponse:
    """Look up the information for one macro.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Macro name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A response holding zero or one items.
    """
    ...
|
|
2357
|
+
|
|
2358
|
+
def catalog_macro_create(
    self,
    request: MacroCreateRequest,
) -> None:
    """Create a new macro.

    Args:
        request: The macro-create request.
    """
    ...
|
|
2361
|
+
|
|
2362
|
+
def catalog_macro_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    ignore_not_found: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop a macro.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Macro name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing macro (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2372
|
+
|
|
2373
|
+
def catalog_schema_contents_macros(
    self,
    attach_opaque_data: bytes,
    name: str,
    type: SchemaObjectType,
    transaction_opaque_data: bytes | None = None,
) -> MacrosResponse:
    """Enumerate the macros (scalar or table) contained in a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Presumably the name of the schema to list (the signature only
            calls it ``name``).
        type: The schema object type; presumably selects scalar versus table
            macros (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the macros.
    """
    ...
|
|
2382
|
+
|
|
2383
|
+
# ========== Catalog - Indexes ==========
|
|
2384
|
+
|
|
2385
|
+
def catalog_index_get(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> IndexesResponse:
    """Look up the information for one index.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Index name.
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        A response holding zero or one items.
    """
    ...
|
|
2394
|
+
|
|
2395
|
+
def catalog_index_create(
    self,
    request: IndexCreateRequest,
) -> None:
    """Create a new index.

    Args:
        request: The index-create request.
    """
    ...
|
|
2398
|
+
|
|
2399
|
+
def catalog_index_drop(
    self,
    attach_opaque_data: bytes,
    schema_name: str,
    name: str,
    ignore_not_found: bool = False,
    cascade: bool = False,
    transaction_opaque_data: bytes | None = None,
) -> None:
    """Drop an index.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        schema_name: Schema name.
        name: Index name.
        ignore_not_found: ``False`` by default; presumably tolerates a
            missing index (confirm against the implementation).
        cascade: ``False`` by default; presumably extends the drop to
            dependent objects (confirm against the implementation).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.
    """
    ...
|
|
2410
|
+
|
|
2411
|
+
def catalog_schema_contents_indexes(
    self,
    attach_opaque_data: bytes,
    name: str,
    transaction_opaque_data: bytes | None = None,
) -> IndexesResponse:
    """Enumerate the indexes contained in a schema.

    Args:
        attach_opaque_data: Opaque attach data (bytes).
        name: Presumably the name of the schema to list (the signature only
            calls it ``name``).
        transaction_opaque_data: Optional opaque transaction data; ``None``
            by default.

    Returns:
        The response listing the indexes.
    """
    ...
|