vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Framework for implementing table sink+source functions.
|
|
4
|
+
|
|
5
|
+
``TableBufferingFunction`` is the worker-side base for functions that must
|
|
6
|
+
see *every* input row before producing any output (e.g. buffer-then-emit,
|
|
7
|
+
global aggregations, sort-then-emit). Routed through the C++
|
|
8
|
+
``PhysicalVgiTableBuffering`` Sink+Source operator.
|
|
9
|
+
|
|
10
|
+
Three callbacks, mirroring the operator's three phases:
|
|
11
|
+
|
|
12
|
+
* ``process(batch, params) -> bytes`` — ingest one batch, return an opaque
|
|
13
|
+
state_id naming where the worker stored it.
|
|
14
|
+
* ``combine(state_ids, params) -> list[bytes]`` — once per query, on the
|
|
15
|
+
coordinator worker; group/merge/sort the per-batch state_ids and
|
|
16
|
+
return finalize_state_ids for the Source phase.
|
|
17
|
+
* ``finalize(params, finalize_state_id, state, out)`` — producer-mode
|
|
18
|
+
streaming RPC mirroring ``TableFunctionGenerator.process``: one tick
|
|
19
|
+
per call, emit one batch via ``out.emit(batch)`` (or ``out.finish()``
|
|
20
|
+
for EOS), state persists between ticks via wire-serialization.
|
|
21
|
+
|
|
22
|
+
State_ids are opaque ``bytes``. The worker picks the granularity (per-batch,
|
|
23
|
+
per-thread, custom partitioning); the framework just round-trips them.
|
|
24
|
+
|
|
25
|
+
INVARIANT: any state the worker stores in ``process()`` that ``finalize()``
|
|
26
|
+
will need MUST live in cross-process storage scoped by
|
|
27
|
+
``params.execution_id`` (``BoundStorage`` is the canonical choice). The
|
|
28
|
+
Source phase may route a given ``finalize_state_id`` to a worker process
|
|
29
|
+
that did NOT run the corresponding ``process()`` calls.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
from abc import abstractmethod
|
|
35
|
+
from collections.abc import Callable
|
|
36
|
+
from dataclasses import dataclass, field
|
|
37
|
+
from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, get_args, get_origin
|
|
38
|
+
|
|
39
|
+
import pyarrow as pa
|
|
40
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
41
|
+
from vgi_rpc.rpc import OutputCollector
|
|
42
|
+
|
|
43
|
+
from vgi.invocation import (
|
|
44
|
+
BindResponse,
|
|
45
|
+
)
|
|
46
|
+
from vgi.table_function import (
|
|
47
|
+
_ON_CANCEL_CAVEATS,
|
|
48
|
+
BindParams,
|
|
49
|
+
ProcessParams,
|
|
50
|
+
TableFunctionBase,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
if TYPE_CHECKING:
|
|
54
|
+
pass
|
|
55
|
+
|
|
56
|
+
# Public API of this module: the buffering-function base class and the
# params dataclass handed to its callbacks.
__all__ = [
    "TableBufferingFunction",
    "TableBufferingParams",
]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Sentinel meaning "no parameterization of TableBufferingFunction was found
# in the MRO walk; leave the existing class attribute alone (inherits via
# normal MRO lookup from a base that did resolve)". Distinguished from None
# (a valid resolved value meaning "no per-tick state").
_UNCHANGED: Any = object()


def _chase_substitution(value: Any, substitutions: dict[TypeVar, Any]) -> Any:
    """Follow TypeVar→binding links in ``substitutions`` without looping forever.

    Args:
        value: A type argument; may be a ``TypeVar`` or any other object.
        substitutions: TypeVar→binding map built so far by the MRO walk.

    Returns:
        The first value reached that is not a ``TypeVar`` with a recorded
        binding. If the bindings form a cycle (including ``T -> T``), the
        walk stops at the first TypeVar it would revisit and returns that
        TypeVar, which callers treat as "unresolved".
    """
    visited: set[TypeVar] = set()
    while isinstance(value, TypeVar) and value in substitutions and value not in visited:
        visited.add(value)
        value = substitutions[value]
    return value


def _resolve_finalize_state_class(
    cls: type,
) -> type[ArrowSerializableDataclass] | None | Any:
    """Walk ``cls.__mro__`` to resolve ``TFinalizeState`` to a concrete type.

    Handles generic-through chains by maintaining a TypeVar→concrete
    substitution map as we walk from most-derived to base. When an
    intermediate class binds a TypeVar to a concrete type, later levels
    that reference that TypeVar in their own bases get substituted.

    Args:
        cls: The class whose ``TableBufferingFunction`` parameterization
            should be resolved.

    Returns:
        * the resolved state class, when position 1 of a direct
          ``TableBufferingFunction[...]`` base resolves to a non-TypeVar;
        * ``None`` when that position is ``None`` / ``NoneType``, is still an
          unbound ``TypeVar`` (the class is itself still generic), or no
          direct parameterization with two arguments was found;
        * the ``_UNCHANGED`` sentinel when no parameterized
          ``TableBufferingFunction`` (sub)class appears anywhere in the MRO.
    """
    # ``TableBufferingFunction`` is defined later in this module; the name is
    # only looked up when this function runs (from ``__init_subclass__``), by
    # which time the class exists.
    substitutions: dict[TypeVar, Any] = {}
    saw_parameterization = False

    for klass in cls.__mro__:
        # __orig_bases__ is per-class (not inherited); look it up directly
        # on klass without falling back to attribute resolution.
        orig_bases = klass.__dict__.get("__orig_bases__", ())
        for base in orig_bases:
            origin = get_origin(base)
            if origin is None or not isinstance(origin, type):
                continue
            if not issubclass(origin, TableBufferingFunction):
                continue
            saw_parameterization = True
            type_args = get_args(base)

            if origin is TableBufferingFunction:
                # Direct parameterization: TableBufferingFunction[TArgs, TState].
                if len(type_args) < 2:
                    continue
                # Resolve transitively through prior substitutions.
                state = _chase_substitution(type_args[1], substitutions)
                if state is None or state is type(None):
                    return None
                if isinstance(state, TypeVar):
                    # Still unresolved: no more-derived class bound this
                    # TypeVar to a concrete type, so ``cls`` is itself still
                    # generic in the state position. Leave None.
                    return None
                return state

            # Intermediate parameterized base — record TypeVar substitutions
            # so the next iteration up the MRO can use them.
            type_params: tuple[TypeVar, ...] = getattr(origin, "__parameters__", ())
            # strict=False on purpose: an intermediate generic may declare
            # more TypeVars than the parameterization binds (callers can
            # leave trailing positions unbound by intent), in which case
            # ``zip`` should silently truncate.
            for tv, ta in zip(type_params, type_args, strict=False):
                # If ta itself is a TypeVar resolved earlier (deeper-nested
                # generic), chase the chain to its concrete binding.
                bound = _chase_substitution(ta, substitutions)
                if bound is tv:
                    # Identity binding, e.g. ``class Mid2(Mid[T])`` reusing a
                    # module-level ``T``. Recording ``T -> T`` carries no
                    # information and would make any later chase of ``T``
                    # spin forever, so skip it.
                    continue
                substitutions[tv] = bound

    return _UNCHANGED if not saw_parameterization else None
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
134
|
+
class TableBufferingParams[TArgs](ProcessParams[TArgs]):
|
|
135
|
+
"""Params for ``TableBufferingFunction`` callbacks.
|
|
136
|
+
|
|
137
|
+
Adds identity fields that the buffered API needs to scope worker-owned
|
|
138
|
+
storage and coordinate cross-process state. Other function shapes
|
|
139
|
+
(``TableFunctionGenerator``, ``TableInOutGenerator``, aggregates) keep
|
|
140
|
+
using the plain ``ProcessParams`` they always have.
|
|
141
|
+
|
|
142
|
+
Attributes:
|
|
143
|
+
execution_id: Stable across coordinator + secondary workers for one
|
|
144
|
+
DuckDB query execution. Key worker-owned storage by this.
|
|
145
|
+
attach_id: Catalog attach identity; pin attach-time config lookups
|
|
146
|
+
by this.
|
|
147
|
+
transaction_id: Hex-encoded VGI transaction id when running inside
|
|
148
|
+
a DuckDB transaction, ``None`` otherwise.
|
|
149
|
+
function_name: Convenience accessor — same as
|
|
150
|
+
``init_call.function_name``.
|
|
151
|
+
worker_path: Subprocess path / ``unix://`` / ``launch:`` argv. For
|
|
152
|
+
diagnostics.
|
|
153
|
+
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
execution_id: bytes
|
|
157
|
+
attach_id: bytes
|
|
158
|
+
transaction_id: bytes | None
|
|
159
|
+
function_name: str
|
|
160
|
+
worker_path: str | None = None
|
|
161
|
+
|
|
162
|
+
# In-band log sink — emits a 0-row log batch on the RPC response stream,
|
|
163
|
+
# which DuckDB surfaces as a row in ``duckdb_logs()`` with ``type='VGI'``.
|
|
164
|
+
# Use this from ``process()`` and ``combine()`` (which are unary RPCs and
|
|
165
|
+
# have no ``OutputCollector``). The streaming ``finalize(... out)``
|
|
166
|
+
# callback should use ``out.client_log(...)`` instead — it goes through
|
|
167
|
+
# the same wire mechanism but flows through the producer-mode stream.
|
|
168
|
+
#
|
|
169
|
+
# The worker handler wires this to ``ctx.client_log`` before invoking
|
|
170
|
+
# the user callback; the default no-op is a safety net for unit-test
|
|
171
|
+
# callers that build ``TableBufferingParams`` outside the RPC path.
|
|
172
|
+
client_log: Callable[..., None] = field(
|
|
173
|
+
default=lambda *_a, **_kw: None,
|
|
174
|
+
repr=False,
|
|
175
|
+
compare=False,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class TableBufferingFunction[TArgs, TFinalizeState = None](TableFunctionBase[TArgs]):
|
|
180
|
+
"""Base class for table sink+source functions.
|
|
181
|
+
|
|
182
|
+
Subclass to declare a function that must see every input row before
|
|
183
|
+
producing output. The C++ ``PhysicalVgiTableBuffering`` operator
|
|
184
|
+
routes calls through three phases:
|
|
185
|
+
|
|
186
|
+
1. **Sink** — ``process(batch, params) -> state_id`` is called per
|
|
187
|
+
input batch (parallel across DuckDB threads unless
|
|
188
|
+
``Meta.sink_order_dependent`` is set).
|
|
189
|
+
2. **Combine** — ``combine(state_ids, params) -> finalize_state_ids``
|
|
190
|
+
is called once on the coordinator worker after every ``process()``
|
|
191
|
+
completes.
|
|
192
|
+
3. **Source** — ``finalize(params, fid, state, out)`` is called per
|
|
193
|
+
tick by the framework, emitting one batch per call (parallel
|
|
194
|
+
across ``finalize_state_ids`` unless ``Meta.source_order_dependent``).
|
|
195
|
+
|
|
196
|
+
Cross-process invariant: any state the worker writes during
|
|
197
|
+
``process()`` that ``finalize()`` will read MUST live in cross-process
|
|
198
|
+
storage scoped by ``params.execution_id`` — ``BoundStorage`` is the
|
|
199
|
+
canonical choice. The Source phase routes a given ``finalize_state_id``
|
|
200
|
+
to whatever worker process the C++ scheduler picks; it is NOT
|
|
201
|
+
guaranteed to be the same process that ran ``process()``.
|
|
202
|
+
|
|
203
|
+
Type parameters:
|
|
204
|
+
TArgs: User-facing function arguments dataclass.
|
|
205
|
+
TFinalizeState: Wire-serializable state carried between
|
|
206
|
+
``finalize()`` ticks. Must subclass ``ArrowSerializableDataclass``
|
|
207
|
+
when set to anything other than ``None``.
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
# Resolved at class-definition time by ``__init_subclass__`` from the
|
|
211
|
+
# ``TFinalizeState`` generic parameter (position 1 in the parameterized
|
|
212
|
+
# base). ``None`` means "no per-tick state" (the user passed ``None`` as
|
|
213
|
+
# ``TFinalizeState`` or didn't parameterize). Inherits through subclassing,
|
|
214
|
+
# so ``class Foo(BufferInputFunction): ...`` reuses the parent's resolution
|
|
215
|
+
# without re-walking ``__orig_bases__``.
|
|
216
|
+
_finalize_state_class: ClassVar[type[ArrowSerializableDataclass] | None] = None
|
|
217
|
+
|
|
218
|
+
class Meta:
|
|
219
|
+
"""Per-class metadata for TableBufferingFunction."""
|
|
220
|
+
|
|
221
|
+
name: ClassVar[str]
|
|
222
|
+
# Output schema declared via Meta.return_schema or via on_bind().
|
|
223
|
+
# Sink-side ordering: forces ParallelSink=false in the C++ operator.
|
|
224
|
+
sink_order_dependent: ClassVar[bool] = False
|
|
225
|
+
# Source-side ordering: forces serial output in finalize_queue order.
|
|
226
|
+
source_order_dependent: ClassVar[bool] = False
|
|
227
|
+
# Threads DuckDB's per-chunk batch_index into every process() call.
|
|
228
|
+
# Mutually exclusive with sink_order_dependent (validated below).
|
|
229
|
+
requires_input_batch_index: ClassVar[bool] = False
|
|
230
|
+
|
|
231
|
+
def __init_subclass__(cls) -> None: # noqa: D105 — internal hook
|
|
232
|
+
super().__init_subclass__()
|
|
233
|
+
|
|
234
|
+
# Resolve ``TFinalizeState`` by walking the MRO chain of
|
|
235
|
+
# generic-parameterizations. The naive "look at cls.__orig_bases__"
|
|
236
|
+
# approach handles ``class Foo(TableBufferingFunction[Args, State])``
|
|
237
|
+
# but silently loses the state type on intermediate generics:
|
|
238
|
+
#
|
|
239
|
+
# class Mid[X](TableBufferingFunction[Args, X]): ...
|
|
240
|
+
# class Concrete(Mid[MyState]): # bug: TFinalizeState = None
|
|
241
|
+
#
|
|
242
|
+
# ``Concrete.__orig_bases__`` is ``(Mid[MyState],)``; the old loop
|
|
243
|
+
# saw origin=Mid (a TBF subclass), tried ``type_args[1]`` (out of
|
|
244
|
+
# range, only one arg), and bailed, leaving _finalize_state_class
|
|
245
|
+
# unset → MyState lost. We instead walk ``cls.__mro__``, build a
|
|
246
|
+
# TypeVar→concrete substitution map level by level, and resolve
|
|
247
|
+
# when we reach a base whose origin is TableBufferingFunction
|
|
248
|
+
# itself. ``TableFunctionBase.__init_subclass__`` (via super())
|
|
249
|
+
# has already validated state_type when it was first introduced.
|
|
250
|
+
resolved = _resolve_finalize_state_class(cls)
|
|
251
|
+
if resolved is _UNCHANGED:
|
|
252
|
+
# No parameterization found in the MRO walk — leave the
|
|
253
|
+
# inherited class-attribute value alone (covers
|
|
254
|
+
# ``class Foo(BufferInputFunction): ...`` where Foo doesn't
|
|
255
|
+
# re-parameterize and just inherits BufferInputFunction's
|
|
256
|
+
# resolved class).
|
|
257
|
+
pass
|
|
258
|
+
else:
|
|
259
|
+
cls._finalize_state_class = resolved
|
|
260
|
+
|
|
261
|
+
meta = getattr(cls, "Meta", None)
|
|
262
|
+
if meta is None:
|
|
263
|
+
return
|
|
264
|
+
sink_order = bool(getattr(meta, "sink_order_dependent", False))
|
|
265
|
+
requires_batch_index = bool(getattr(meta, "requires_input_batch_index", False))
|
|
266
|
+
if sink_order and requires_batch_index:
|
|
267
|
+
raise TypeError(
|
|
268
|
+
f"{cls.__name__}.Meta: sink_order_dependent and "
|
|
269
|
+
f"requires_input_batch_index are mutually exclusive — "
|
|
270
|
+
f"single-thread sink already orders input, batch_index is "
|
|
271
|
+
f"only meaningful under parallel ingest."
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
@classmethod
|
|
275
|
+
def on_bind(
|
|
276
|
+
cls,
|
|
277
|
+
params: BindParams[TArgs],
|
|
278
|
+
) -> BindResponse:
|
|
279
|
+
"""Pass-through default — output schema is the input schema.
|
|
280
|
+
|
|
281
|
+
Override to validate arguments, compute a dynamic output type, or
|
|
282
|
+
request secrets via ``SecretsAccessor``. See
|
|
283
|
+
``TableFunctionBase.on_bind`` for the broader contract.
|
|
284
|
+
"""
|
|
285
|
+
assert params.bind_call.input_schema is not None
|
|
286
|
+
return BindResponse(output_schema=params.bind_call.input_schema)
|
|
287
|
+
|
|
288
|
+
# bind / on_init / global_init are defined on TableFunctionBase.
|
|
289
|
+
|
|
290
|
+
# ------------------------------------------------------------------
|
|
291
|
+
# Sink phase
|
|
292
|
+
# ------------------------------------------------------------------
|
|
293
|
+
|
|
294
|
+
@classmethod
|
|
295
|
+
@abstractmethod
|
|
296
|
+
def process(
|
|
297
|
+
cls,
|
|
298
|
+
batch: pa.RecordBatch,
|
|
299
|
+
params: TableBufferingParams[TArgs],
|
|
300
|
+
) -> bytes:
|
|
301
|
+
"""Ingest one input batch and return an opaque ``state_id``.
|
|
302
|
+
|
|
303
|
+
The worker chooses both *where* to store the batch (BoundStorage,
|
|
304
|
+
external files, in-memory cross-process structures, etc.) and the
|
|
305
|
+
*granularity* of state_ids (per-batch, per-thread, custom
|
|
306
|
+
partitioning). The framework collects all returned state_ids and
|
|
307
|
+
passes them to ``combine()`` on the coordinator worker.
|
|
308
|
+
|
|
309
|
+
Common pattern for "one bucket per execution" is to return
|
|
310
|
+
``params.execution_id``; ``combine()`` then collapses the list of
|
|
311
|
+
identical state_ids to a single finalize stream.
|
|
312
|
+
|
|
313
|
+
Cross-process invariant: any state the worker stores here that
|
|
314
|
+
``finalize()`` will need MUST live in cross-process storage scoped
|
|
315
|
+
by ``params.execution_id``. The Source phase may route the
|
|
316
|
+
corresponding finalize_state_id to a different worker process.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
batch: One input batch from DuckDB. Schema matches the
|
|
320
|
+
function's declared ``input_schema``.
|
|
321
|
+
params: Process-time params, including identity fields
|
|
322
|
+
(``execution_id``, ``attach_id``, ``transaction_id``,
|
|
323
|
+
``function_name``) and ``params.batch_index`` when
|
|
324
|
+
``Meta.requires_input_batch_index=True``.
|
|
325
|
+
|
|
326
|
+
Returns:
|
|
327
|
+
Opaque state_id naming where the batch was stored.
|
|
328
|
+
|
|
329
|
+
"""
|
|
330
|
+
|
|
331
|
+
# ------------------------------------------------------------------
|
|
332
|
+
# Combine phase
|
|
333
|
+
# ------------------------------------------------------------------
|
|
334
|
+
|
|
335
|
+
@classmethod
|
|
336
|
+
@abstractmethod
|
|
337
|
+
def combine(
|
|
338
|
+
cls,
|
|
339
|
+
state_ids: list[bytes],
|
|
340
|
+
params: TableBufferingParams[TArgs],
|
|
341
|
+
) -> list[bytes]:
|
|
342
|
+
"""Group / merge / sort state_ids; return finalize_state_ids.
|
|
343
|
+
|
|
344
|
+
Called once on the coordinator worker after every ``process()``
|
|
345
|
+
completes. State_ids are opaque bytes — the framework does not
|
|
346
|
+
inspect, dedup, or transform them. ``combine`` returns the exact
|
|
347
|
+
list of finalize_state_ids the Source phase will iterate; one
|
|
348
|
+
finalize stream per returned id.
|
|
349
|
+
|
|
350
|
+
Typical patterns:
|
|
351
|
+
|
|
352
|
+
* **Single-bucket execution** — process() returns ``params.execution_id``
|
|
353
|
+
for every call; combine() returns ``[params.execution_id]`` so
|
|
354
|
+
one finalize stream drains the single accumulator.
|
|
355
|
+
* **Per-shard fan-out** — process() returns a per-shard
|
|
356
|
+
identifier; combine() returns the list of unique shard ids
|
|
357
|
+
for parallel finalize.
|
|
358
|
+
* **Global sort under ``Meta.sink_order_dependent``** — process()
|
|
359
|
+
returns per-batch ids; combine() reads each, sorts globally,
|
|
360
|
+
returns ``[sentinel]`` so a single ordered finalize stream
|
|
361
|
+
emits the merged result.
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
state_ids: Every state_id returned from every ``process()``
|
|
365
|
+
call across every DuckDB thread, in arbitrary order.
|
|
366
|
+
Duplicates from multiple Sink threads using the same
|
|
367
|
+
state_id are NOT dedup'd by the framework.
|
|
368
|
+
params: Process-time params (same identity fields as
|
|
369
|
+
``process()``).
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
finalize_state_ids — keys the Source phase will iterate.
|
|
373
|
+
|
|
374
|
+
"""
|
|
375
|
+
|
|
376
|
+
# ------------------------------------------------------------------
|
|
377
|
+
# Source phase — mirrors TableFunctionGenerator.process producer-mode
|
|
378
|
+
# ------------------------------------------------------------------
|
|
379
|
+
|
|
380
|
+
@classmethod
|
|
381
|
+
def initial_finalize_state(
|
|
382
|
+
cls,
|
|
383
|
+
finalize_state_id: bytes,
|
|
384
|
+
params: TableBufferingParams[TArgs],
|
|
385
|
+
) -> TFinalizeState | None:
|
|
386
|
+
"""Build the initial wire-serializable state for a finalize stream.
|
|
387
|
+
|
|
388
|
+
Called once per finalize_state_id at stream init time. The
|
|
389
|
+
returned state is passed to the first ``finalize()`` tick; the
|
|
390
|
+
framework serializes it between ticks so the stream survives
|
|
391
|
+
worker process boundaries (HTTP transport).
|
|
392
|
+
|
|
393
|
+
Default returns ``None`` (suitable when ``TFinalizeState = None``).
|
|
394
|
+
Override and declare a concrete ``TFinalizeState`` subclass of
|
|
395
|
+
``ArrowSerializableDataclass`` to carry cursor / progress state
|
|
396
|
+
between ticks.
|
|
397
|
+
"""
|
|
398
|
+
return None
|
|
399
|
+
|
|
400
|
+
@classmethod
|
|
401
|
+
@abstractmethod
|
|
402
|
+
def finalize(
|
|
403
|
+
cls,
|
|
404
|
+
params: TableBufferingParams[TArgs],
|
|
405
|
+
finalize_state_id: bytes,
|
|
406
|
+
state: TFinalizeState,
|
|
407
|
+
out: OutputCollector,
|
|
408
|
+
) -> None:
|
|
409
|
+
"""Produce one batch's worth of output for ``finalize_state_id``.
|
|
410
|
+
|
|
411
|
+
Called repeatedly by the framework (one call per tick). Each call
|
|
412
|
+
should either:
|
|
413
|
+
|
|
414
|
+
* ``out.emit(batch)`` to produce one output batch and mutate
|
|
415
|
+
``state`` in place — ``state`` is wire-serialized after the
|
|
416
|
+
call so the next tick (possibly on a different worker)
|
|
417
|
+
resumes from the updated value.
|
|
418
|
+
* ``out.finish()`` to signal EOS for this ``finalize_state_id``.
|
|
419
|
+
|
|
420
|
+
Mirrors ``TableFunctionGenerator.process`` exactly — the only
|
|
421
|
+
difference is the parameterization by ``finalize_state_id``
|
|
422
|
+
instead of free function arguments.
|
|
423
|
+
"""
|
|
424
|
+
|
|
425
|
+
@classmethod
|
|
426
|
+
def on_cancel(
|
|
427
|
+
cls,
|
|
428
|
+
params: TableBufferingParams[TArgs],
|
|
429
|
+
finalize_state_id: bytes,
|
|
430
|
+
state: TFinalizeState,
|
|
431
|
+
) -> None:
|
|
432
|
+
"""No-op default; runtime docstring set below via __func__.__doc__."""
|
|
433
|
+
|
|
434
|
+
on_cancel.__func__.__doc__ = ( # type: ignore[attr-defined]
|
|
435
|
+
f"""Release resources when a finalize stream is cancelled before EOS.
|
|
436
|
+
|
|
437
|
+
Fired when DuckDB tears down a scan early (LIMIT clause, user
|
|
438
|
+
break, exception unwind). Override to release expensive resources
|
|
439
|
+
held in ``state`` (DB connections, large buffers, etc.).
|
|
440
|
+
|
|
441
|
+
{_ON_CANCEL_CAVEATS}
|
|
442
|
+
"""
|
|
443
|
+
)
|