vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,443 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Framework for implementing table sink+source functions.
4
+
5
+ ``TableBufferingFunction`` is the worker-side base for functions that must
6
+ see *every* input row before producing any output (e.g. buffer-then-emit,
7
+ global aggregations, sort-then-emit). Routed through the C++
8
+ ``PhysicalVgiTableBuffering`` Sink+Source operator.
9
+
10
+ Three callbacks, mirroring the operator's three phases:
11
+
12
+ * ``process(batch, params) -> bytes`` — ingest one batch, return an opaque
13
+ state_id naming where the worker stored it.
14
+ * ``combine(state_ids, params) -> list[bytes]`` — once per query, on the
15
+ coordinator worker; group/merge/sort the per-batch state_ids and
16
+ return finalize_state_ids for the Source phase.
17
+ * ``finalize(params, finalize_state_id, state, out)`` — producer-mode
18
+ streaming RPC mirroring ``TableFunctionGenerator.process``: one tick
19
+ per call, emit one batch via ``out.emit(batch)`` (or ``out.finish()``
20
+ for EOS), state persists between ticks via wire-serialization.
21
+
22
+ State_ids are opaque ``bytes``. The worker picks the granularity (per-batch,
23
+ per-thread, custom partitioning); the framework just round-trips them.
24
+
25
+ INVARIANT: any state the worker stores in ``process()`` that ``finalize()``
26
+ will need MUST live in cross-process storage scoped by
27
+ ``params.execution_id`` (``BoundStorage`` is the canonical choice). The
28
+ Source phase may route a given ``finalize_state_id`` to a worker process
29
+ that did NOT run the corresponding ``process()`` calls.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ from abc import abstractmethod
35
+ from collections.abc import Callable
36
+ from dataclasses import dataclass, field
37
+ from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, get_args, get_origin
38
+
39
+ import pyarrow as pa
40
+ from vgi_rpc import ArrowSerializableDataclass
41
+ from vgi_rpc.rpc import OutputCollector
42
+
43
+ from vgi.invocation import (
44
+ BindResponse,
45
+ )
46
+ from vgi.table_function import (
47
+ _ON_CANCEL_CAVEATS,
48
+ BindParams,
49
+ ProcessParams,
50
+ TableFunctionBase,
51
+ )
52
+
53
# Placeholder guard: the body is empty, so no type-checking-only imports
# are currently pulled in here.
if TYPE_CHECKING:
    pass

# Names exported by a star-import of this module.
__all__ = [
    "TableBufferingFunction",
    "TableBufferingParams",
]


# Sentinel returned by ``_resolve_finalize_state_class`` when no
# parameterization of ``TableBufferingFunction`` was found in the MRO walk.
# ``TableBufferingFunction.__init_subclass__`` then leaves the existing
# ``_finalize_state_class`` attribute alone, so the value is inherited via
# normal MRO lookup from a base that did resolve. Kept distinct from ``None``,
# which is a valid resolved value meaning "no per-tick state".
_UNCHANGED: Any = object()
67
+
68
+
69
def _chase_typevar(value: Any, substitutions: dict[TypeVar, Any]) -> Any:
    """Follow ``value`` through ``substitutions`` until it is no longer a bound TypeVar.

    Args:
        value: A type argument; may be a concrete type, ``None`` or a TypeVar.
        substitutions: TypeVar -> binding map collected so far.

    Returns:
        The first value in the chain that is not a TypeVar with a binding.
        If the chain loops back on itself (for example ``{X: X}``, which is
        recorded when the same TypeVar object is reused by an intermediate
        generic such as ``class Mid2(Mid[X])``), the TypeVar at which the loop
        was detected is returned instead of spinning forever.
    """
    visited: set[TypeVar] = set()
    while isinstance(value, TypeVar) and value in substitutions and value not in visited:
        visited.add(value)
        value = substitutions[value]
    return value


def _resolve_finalize_state_class(
    cls: type,
) -> type[ArrowSerializableDataclass] | None | Any:
    """Walk ``cls.__mro__`` to resolve ``TFinalizeState`` to a concrete type.

    Generic-through chains are handled by keeping a TypeVar -> binding map
    while walking from the most-derived class towards the base. When an
    intermediate class binds a TypeVar, later levels that mention that
    TypeVar in their own bases are substituted.

    Args:
        cls: The class whose finalize-state type should be resolved.

    Returns:
        * the resolved state class, when a direct
          ``TableBufferingFunction[TArgs, TState]`` base yields a concrete type;
        * ``None`` when the state is explicitly ``None``/``NoneType``, when the
          TypeVar is still unbound, or when a parameterization was seen but
          never reached a two-argument ``TableBufferingFunction[...]`` base;
        * the ``_UNCHANGED`` sentinel when no parameterization of
          ``TableBufferingFunction`` (or a subclass) appears anywhere in the MRO.
    """
    # ``TableBufferingFunction`` is defined further down this module; the name
    # is looked up when this function runs, not when it is defined.
    substitutions: dict[TypeVar, Any] = {}
    saw_parameterization = False

    for klass in cls.__mro__:
        # __orig_bases__ is per-class (not inherited); read it straight from
        # the class dict so attribute resolution never returns a parent's.
        for base in klass.__dict__.get("__orig_bases__", ()):
            origin = get_origin(base)
            # Non-generic bases have no origin; skip anything that is not a
            # parameterized TableBufferingFunction (sub)class.
            if not isinstance(origin, type):
                continue
            if not issubclass(origin, TableBufferingFunction):
                continue
            saw_parameterization = True
            type_args = get_args(base)

            if origin is TableBufferingFunction:
                # Direct parameterization: TableBufferingFunction[TArgs, TState].
                if len(type_args) < 2:
                    continue
                # Resolve transitively through bindings recorded on more
                # derived classes; guarded against self-referential chains.
                state = _chase_typevar(type_args[1], substitutions)
                if state is None or state is type(None):
                    return None
                if isinstance(state, TypeVar):
                    # Still unbound: no more-derived class supplied a
                    # concrete type for it, so there is no state class.
                    return None
                return state

            # Intermediate parameterized base: record its TypeVar bindings so
            # classes further up the MRO can be substituted.
            type_params: tuple[TypeVar, ...] = getattr(origin, "__parameters__", ())
            # strict=False on purpose: an intermediate generic may declare
            # more TypeVars than the parameterization binds, in which case
            # ``zip`` should silently truncate.
            for tv, ta in zip(type_params, type_args, strict=False):
                substitutions[tv] = _chase_typevar(ta, substitutions)

    return None if saw_parameterization else _UNCHANGED
131
+
132
+
133
@dataclass(slots=True, frozen=True, kw_only=True)
class TableBufferingParams[TArgs](ProcessParams[TArgs]):
    """Params for ``TableBufferingFunction`` callbacks.

    Adds identity fields that the buffered API needs to scope worker-owned
    storage and coordinate cross-process state. Other function shapes
    (``TableFunctionGenerator``, ``TableInOutGenerator``, aggregates) keep
    using the plain ``ProcessParams`` they always have.

    Instances are immutable (``frozen=True``) and every field must be passed
    by keyword (``kw_only=True``).

    Attributes:
        execution_id: Stable across coordinator + secondary workers for one
            DuckDB query execution. Key worker-owned storage by this.
        attach_id: Catalog attach identity; pin attach-time config lookups
            by this.
        transaction_id: Hex-encoded VGI transaction id when running inside
            a DuckDB transaction, ``None`` otherwise.
        function_name: Convenience accessor — same as
            ``init_call.function_name``.
        worker_path: Subprocess path / ``unix://`` / ``launch:`` argv. For
            diagnostics.
        client_log: In-band log sink for ``process()`` and ``combine()``.
            Defaults to a callable that accepts any arguments and does
            nothing; excluded from ``repr`` and from equality comparison.

    """

    execution_id: bytes
    attach_id: bytes
    transaction_id: bytes | None
    function_name: str
    worker_path: str | None = None

    # In-band log sink — emits a 0-row log batch on the RPC response stream,
    # which DuckDB surfaces as a row in ``duckdb_logs()`` with ``type='VGI'``.
    # Use this from ``process()`` and ``combine()`` (which are unary RPCs and
    # have no ``OutputCollector``). The streaming ``finalize(... out)``
    # callback should use ``out.client_log(...)`` instead — it goes through
    # the same wire mechanism but flows through the producer-mode stream.
    #
    # The worker handler wires this to ``ctx.client_log`` before invoking
    # the user callback; the default no-op is a safety net for unit-test
    # callers that build ``TableBufferingParams`` outside the RPC path.
    client_log: Callable[..., None] = field(
        default=lambda *_a, **_kw: None,
        repr=False,
        compare=False,
    )
177
+
178
+
179
+ class TableBufferingFunction[TArgs, TFinalizeState = None](TableFunctionBase[TArgs]):
180
+ """Base class for table sink+source functions.
181
+
182
+ Subclass to declare a function that must see every input row before
183
+ producing output. The C++ ``PhysicalVgiTableBuffering`` operator
184
+ routes calls through three phases:
185
+
186
+ 1. **Sink** — ``process(batch, params) -> state_id`` is called per
187
+ input batch (parallel across DuckDB threads unless
188
+ ``Meta.sink_order_dependent`` is set).
189
+ 2. **Combine** — ``combine(state_ids, params) -> finalize_state_ids``
190
+ is called once on the coordinator worker after every ``process()``
191
+ completes.
192
+ 3. **Source** — ``finalize(params, fid, state, out)`` is called per
193
+ tick by the framework, emitting one batch per call (parallel
194
+ across ``finalize_state_ids`` unless ``Meta.source_order_dependent``).
195
+
196
+ Cross-process invariant: any state the worker writes during
197
+ ``process()`` that ``finalize()`` will read MUST live in cross-process
198
+ storage scoped by ``params.execution_id`` — ``BoundStorage`` is the
199
+ canonical choice. The Source phase routes a given ``finalize_state_id``
200
+ to whatever worker process the C++ scheduler picks; it is NOT
201
+ guaranteed to be the same process that ran ``process()``.
202
+
203
+ Type parameters:
204
+ TArgs: User-facing function arguments dataclass.
205
+ TFinalizeState: Wire-serializable state carried between
206
+ ``finalize()`` ticks. Must subclass ``ArrowSerializableDataclass``
207
+ when set to anything other than ``None``.
208
+ """
209
+
210
+ # Resolved at class-definition time by ``__init_subclass__`` from the
211
+ # ``TFinalizeState`` generic parameter (position 1 in the parameterized
212
+ # base). ``None`` means "no per-tick state" (the user passed ``None`` as
213
+ # ``TFinalizeState`` or didn't parameterize). Inherits through subclassing,
214
+ # so ``class Foo(BufferInputFunction): ...`` reuses the parent's resolution
215
+ # without re-walking ``__orig_bases__``.
216
+ _finalize_state_class: ClassVar[type[ArrowSerializableDataclass] | None] = None
217
+
218
+ class Meta:
219
+ """Per-class metadata for TableBufferingFunction."""
220
+
221
+ name: ClassVar[str]
222
+ # Output schema declared via Meta.return_schema or via on_bind().
223
+ # Sink-side ordering: forces ParallelSink=false in the C++ operator.
224
+ sink_order_dependent: ClassVar[bool] = False
225
+ # Source-side ordering: forces serial output in finalize_queue order.
226
+ source_order_dependent: ClassVar[bool] = False
227
+ # Threads DuckDB's per-chunk batch_index into every process() call.
228
+ # Mutually exclusive with sink_order_dependent (validated below).
229
+ requires_input_batch_index: ClassVar[bool] = False
230
+
231
+ def __init_subclass__(cls) -> None: # noqa: D105 — internal hook
232
+ super().__init_subclass__()
233
+
234
+ # Resolve ``TFinalizeState`` by walking the MRO chain of
235
+ # generic-parameterizations. The naive "look at cls.__orig_bases__"
236
+ # approach handles ``class Foo(TableBufferingFunction[Args, State])``
237
+ # but silently loses the state type on intermediate generics:
238
+ #
239
+ # class Mid[X](TableBufferingFunction[Args, X]): ...
240
+ # class Concrete(Mid[MyState]): # bug: TFinalizeState = None
241
+ #
242
+ # ``Concrete.__orig_bases__`` is ``(Mid[MyState],)``; the old loop
243
+ # saw origin=Mid (a TBF subclass), tried ``type_args[1]`` (out of
244
+ # range, only one arg), and bailed, leaving _finalize_state_class
245
+ # unset → MyState lost. We instead walk ``cls.__mro__``, build a
246
+ # TypeVar→concrete substitution map level by level, and resolve
247
+ # when we reach a base whose origin is TableBufferingFunction
248
+ # itself. ``TableFunctionBase.__init_subclass__`` (via super())
249
+ # has already validated state_type when it was first introduced.
250
+ resolved = _resolve_finalize_state_class(cls)
251
+ if resolved is _UNCHANGED:
252
+ # No parameterization found in the MRO walk — leave the
253
+ # inherited class-attribute value alone (covers
254
+ # ``class Foo(BufferInputFunction): ...`` where Foo doesn't
255
+ # re-parameterize and just inherits BufferInputFunction's
256
+ # resolved class).
257
+ pass
258
+ else:
259
+ cls._finalize_state_class = resolved
260
+
261
+ meta = getattr(cls, "Meta", None)
262
+ if meta is None:
263
+ return
264
+ sink_order = bool(getattr(meta, "sink_order_dependent", False))
265
+ requires_batch_index = bool(getattr(meta, "requires_input_batch_index", False))
266
+ if sink_order and requires_batch_index:
267
+ raise TypeError(
268
+ f"{cls.__name__}.Meta: sink_order_dependent and "
269
+ f"requires_input_batch_index are mutually exclusive — "
270
+ f"single-thread sink already orders input, batch_index is "
271
+ f"only meaningful under parallel ingest."
272
+ )
273
+
274
+ @classmethod
275
+ def on_bind(
276
+ cls,
277
+ params: BindParams[TArgs],
278
+ ) -> BindResponse:
279
+ """Pass-through default — output schema is the input schema.
280
+
281
+ Override to validate arguments, compute a dynamic output type, or
282
+ request secrets via ``SecretsAccessor``. See
283
+ ``TableFunctionBase.on_bind`` for the broader contract.
284
+ """
285
+ assert params.bind_call.input_schema is not None
286
+ return BindResponse(output_schema=params.bind_call.input_schema)
287
+
288
+ # bind / on_init / global_init are defined on TableFunctionBase.
289
+
290
+ # ------------------------------------------------------------------
291
+ # Sink phase
292
+ # ------------------------------------------------------------------
293
+
294
+ @classmethod
295
+ @abstractmethod
296
+ def process(
297
+ cls,
298
+ batch: pa.RecordBatch,
299
+ params: TableBufferingParams[TArgs],
300
+ ) -> bytes:
301
+ """Ingest one input batch and return an opaque ``state_id``.
302
+
303
+ The worker chooses both *where* to store the batch (BoundStorage,
304
+ external files, in-memory cross-process structures, etc.) and the
305
+ *granularity* of state_ids (per-batch, per-thread, custom
306
+ partitioning). The framework collects all returned state_ids and
307
+ passes them to ``combine()`` on the coordinator worker.
308
+
309
+ Common pattern for "one bucket per execution" is to return
310
+ ``params.execution_id``; ``combine()`` then collapses the list of
311
+ identical state_ids to a single finalize stream.
312
+
313
+ Cross-process invariant: any state the worker stores here that
314
+ ``finalize()`` will need MUST live in cross-process storage scoped
315
+ by ``params.execution_id``. The Source phase may route the
316
+ corresponding finalize_state_id to a different worker process.
317
+
318
+ Args:
319
+ batch: One input batch from DuckDB. Schema matches the
320
+ function's declared ``input_schema``.
321
+ params: Process-time params, including identity fields
322
+ (``execution_id``, ``attach_id``, ``transaction_id``,
323
+ ``function_name``) and ``params.batch_index`` when
324
+ ``Meta.requires_input_batch_index=True``.
325
+
326
+ Returns:
327
+ Opaque state_id naming where the batch was stored.
328
+
329
+ """
330
+
331
+ # ------------------------------------------------------------------
332
+ # Combine phase
333
+ # ------------------------------------------------------------------
334
+
335
+ @classmethod
336
+ @abstractmethod
337
+ def combine(
338
+ cls,
339
+ state_ids: list[bytes],
340
+ params: TableBufferingParams[TArgs],
341
+ ) -> list[bytes]:
342
+ """Group / merge / sort state_ids; return finalize_state_ids.
343
+
344
+ Called once on the coordinator worker after every ``process()``
345
+ completes. State_ids are opaque bytes — the framework does not
346
+ inspect, dedup, or transform them. ``combine`` returns the exact
347
+ list of finalize_state_ids the Source phase will iterate; one
348
+ finalize stream per returned id.
349
+
350
+ Typical patterns:
351
+
352
+ * **Single-bucket execution** — process() returns ``params.execution_id``
353
+ for every call; combine() returns ``[params.execution_id]`` so
354
+ one finalize stream drains the single accumulator.
355
+ * **Per-shard fan-out** — process() returns a per-shard
356
+ identifier; combine() returns the list of unique shard ids
357
+ for parallel finalize.
358
+ * **Global sort under ``Meta.sink_order_dependent``** — process()
359
+ returns per-batch ids; combine() reads each, sorts globally,
360
+ returns ``[sentinel]`` so a single ordered finalize stream
361
+ emits the merged result.
362
+
363
+ Args:
364
+ state_ids: Every state_id returned from every ``process()``
365
+ call across every DuckDB thread, in arbitrary order.
366
+ Duplicates from multiple Sink threads using the same
367
+ state_id are NOT dedup'd by the framework.
368
+ params: Process-time params (same identity fields as
369
+ ``process()``).
370
+
371
+ Returns:
372
+ finalize_state_ids — keys the Source phase will iterate.
373
+
374
+ """
375
+
376
+ # ------------------------------------------------------------------
377
+ # Source phase — mirrors TableFunctionGenerator.process producer-mode
378
+ # ------------------------------------------------------------------
379
+
380
+ @classmethod
381
+ def initial_finalize_state(
382
+ cls,
383
+ finalize_state_id: bytes,
384
+ params: TableBufferingParams[TArgs],
385
+ ) -> TFinalizeState | None:
386
+ """Build the initial wire-serializable state for a finalize stream.
387
+
388
+ Called once per finalize_state_id at stream init time. The
389
+ returned state is passed to the first ``finalize()`` tick; the
390
+ framework serializes it between ticks so the stream survives
391
+ worker process boundaries (HTTP transport).
392
+
393
+ Default returns ``None`` (suitable when ``TFinalizeState = None``).
394
+ Override and declare a concrete ``TFinalizeState`` subclass of
395
+ ``ArrowSerializableDataclass`` to carry cursor / progress state
396
+ between ticks.
397
+ """
398
+ return None
399
+
400
+ @classmethod
401
+ @abstractmethod
402
+ def finalize(
403
+ cls,
404
+ params: TableBufferingParams[TArgs],
405
+ finalize_state_id: bytes,
406
+ state: TFinalizeState,
407
+ out: OutputCollector,
408
+ ) -> None:
409
+ """Produce one batch's worth of output for ``finalize_state_id``.
410
+
411
+ Called repeatedly by the framework (one call per tick). Each call
412
+ should either:
413
+
414
+ * ``out.emit(batch)`` to produce one output batch and mutate
415
+ ``state`` in place — ``state`` is wire-serialized after the
416
+ call so the next tick (possibly on a different worker)
417
+ resumes from the updated value.
418
+ * ``out.finish()`` to signal EOS for this ``finalize_state_id``.
419
+
420
+ Mirrors ``TableFunctionGenerator.process`` exactly — the only
421
+ difference is the parameterization by ``finalize_state_id``
422
+ instead of free function arguments.
423
+ """
424
+
425
+ @classmethod
426
+ def on_cancel(
427
+ cls,
428
+ params: TableBufferingParams[TArgs],
429
+ finalize_state_id: bytes,
430
+ state: TFinalizeState,
431
+ ) -> None:
432
+ """No-op default; runtime docstring set below via __func__.__doc__."""
433
+
434
+ on_cancel.__func__.__doc__ = ( # type: ignore[attr-defined]
435
+ f"""Release resources when a finalize stream is cancelled before EOS.
436
+
437
+ Fired when DuckDB tears down a scan early (LIMIT clause, user
438
+ break, exception unwind). Override to release expensive resources
439
+ held in ``state`` (DB connections, large buffers, etc.).
440
+
441
+ {_ON_CANCEL_CAVEATS}
442
+ """
443
+ )