vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/client/client.py ADDED
@@ -0,0 +1,2183 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """VGI reference client — canonical implementation for other-language ports.
4
+
5
+ ``vgi-python`` is the authoritative VGI implementation. Real users invoke
6
+ VGI from DuckDB via the C++ extension; this module exists for two other
7
+ audiences:
8
+
9
+ 1. **Non-DuckDB callers.** A TypeScript port that wants to browse catalog
10
+ contents, invoke a scalar, or feed an HTTP worker from outside DuckDB.
11
+ The HTTP transport below is the canonical path for those callers.
12
+ 2. **Porters.** TS/Go/Rust teams reading this file to understand what
13
+ their client must do. Every HTTP-relevant code path aims to be
14
+ plain enough to translate.
15
+
16
+ Protocol sequence (HTTP)::
17
+
18
+ capabilities → GET /capabilities — upload-URL caps
19
+ connect → http_connect(base_url, auth) — typed proxy
20
+ catalogs → proxy.catalog_catalogs() — discover
21
+ attach → proxy.catalog_attach(req) — open a catalog
22
+ bind → proxy.bind(BindRequest) — resolve schema
23
+ init → proxy.init(InitRequest) — open a stream
24
+ exchange loop → stream.exchange(AnnotatedBatch)
25
+ • oversize input → request_upload_urls + PUT + pointer batch
26
+ • pointer output → auto-resolve via external_location config
27
+ detach → proxy.catalog_detach(attach_opaque_data)
28
+
29
+ The subprocess transport (``_spawn_subprocess_connection``, ``WorkerPool``,
30
+ ``shell=True``) is a Python-only convenience for running tests against a
31
+ local worker. Other-language ports do not need to mirror it — implement
32
+ the HTTP flow and skip the subprocess branch.
33
+
34
+ Parallel processing
35
+ -------------------
36
+ When a bind returns ``max_workers > 1`` the client spawns additional
37
+ worker connections and distributes input batches round-robin. Output
38
+ order is non-deterministic in parallel mode. This is an optimization; a
39
+ minimal port can ignore it and always use one connection.
40
+
41
+ Key classes
42
+ -----------
43
+ Client — main entry point; ``Client.from_http(...)`` for HTTP
44
+ ClientError — raised on communication errors
45
+ WorkerConnection — internal; one per transport-level connection
46
+
47
+ Key methods
48
+ -----------
49
+ client.catalogs() — discover catalogs
50
+ client.catalog_attach(...) — open a catalog
51
+ client.schemas(...) — list schemas
52
+ client.schema_contents(...) — list tables/views/functions/macros
53
+ client.scalar_function(...) — invoke a scalar
54
+ client.table_function(...) — invoke a table function
55
+ client.table_in_out_function(...) — invoke a table-in-out function
56
+ client.server_capabilities() — HTTP only; upload-URL caps
57
+
58
+ See Also
59
+ --------
60
+ vgi.protocol.VgiProtocol — the RPC interface this client exercises
61
+ vgi.protocol.BindRequest — request types
62
+ vgi.arguments.Arguments — positional/named argument container
63
+ vgi_rpc.http.http_connect — transport primitive this client wraps
64
+
65
+ """
66
+
67
+ from __future__ import annotations
68
+
69
+ import io
70
+ import itertools
71
+ import logging
72
+ import os
73
+ import shlex
74
+ import subprocess
75
+ import sys
76
+ import threading
77
+ from collections.abc import Callable, Generator, Iterator
78
+ from contextlib import AbstractContextManager
79
+ from dataclasses import dataclass, field
80
+ from queue import Queue
81
+ from typing import IO, Any, Literal, cast
82
+
83
+ import pyarrow as pa
84
+ from vgi_rpc import WorkerPool
85
+ from vgi_rpc.log import Message
86
+ from vgi_rpc.rpc import (
87
+ AnnotatedBatch,
88
+ PipeTransport,
89
+ RpcConnection,
90
+ RpcError,
91
+ StreamSession,
92
+ )
93
+
94
+ from vgi.arguments import Arguments
95
+ from vgi.client.catalog_mixin import CatalogClientMixin
96
+ from vgi.invocation import (
97
+ BindResponse,
98
+ FunctionType,
99
+ GlobalInitResponse,
100
+ )
101
+ from vgi.protocol import (
102
+ BindRequest,
103
+ InitRequest,
104
+ TableBufferingCombineRequest,
105
+ TableBufferingDestructorRequest,
106
+ TableBufferingProcessRequest,
107
+ VgiProtocol,
108
+ )
109
+ from vgi.table_function import TableInOutFunctionInitPhase
110
+
111
+ _logger = logging.getLogger("vgi.client")
112
+ _worker_logger = logging.getLogger("vgi.client.worker")
113
+
114
+
115
class ClientError(Exception):
    """Exception type raised by ``Client`` operations.

    ``str(ClientError)`` begins with the remote exception as the worker
    raised it (``{error_type}: {error_message}``), so the text a user put
    in their own ``raise ValueError(...)`` is the first thing shown rather
    than VGI framing. A remote traceback and worker-stderr excerpt, when
    available, come afterwards, separated by a blank line.
    """

    @classmethod
    def from_rpc_error(cls, e: RpcError) -> ClientError:
        """Build a ``ClientError`` from an ``RpcError``.

        The message leads with ``str(e)`` and appends a ``Remote traceback``
        section only when ``e`` carries a non-empty ``remote_traceback``.

        Args:
            e: The RPC error to convert.

        Returns:
            A new instance of ``cls`` whose message is described above.
        """
        # RpcError.__init__ already renders str(e) as "error_type: error_message".
        headline = str(e)
        remote_tb = getattr(e, "remote_traceback", "")
        if not remote_tb:
            return cls(headline)
        return cls("\n\n".join([headline, f"Remote traceback:\n{remote_tb}"]))
139
+
140
+
141
class ResumeUnsupported(ClientError):
    """Signals that a resumable scan was requested on a transport that cannot resume.

    Producer state is round-tripped in continuation tokens only by the HTTP
    transport, so :meth:`Client.table_scan_resumable` can only be driven by
    HTTP clients. A pipe/subprocess stream is a live connection without a
    serializable resume point; callers on that transport have to keep the
    live stream in-process instead.
    """
150
+
151
+
152
class ResumableTableScan:
    """Cursor over an upstream table-function scan that yields one batch per call.

    :meth:`Client.table_function` is a live generator that hides the server's
    continuation token. Here every :meth:`next` call hands back
    ``(batch, token)``, where ``token`` is the worker's serialized producer
    state AFTER ``batch``. A stateless caller (for example a load-balanced
    proxy) may store ``token``, drop the connection, and pick the scan up on
    another node with ``Client.table_scan_resumable(resume_token=token, ...)``.

    Only the primary stream is read (single worker): parallel
    ``max_workers>1`` reads are unordered and cannot be resumed from one token.
    """

    def __init__(self, client: Client, stream: StreamSession) -> None:
        """Store the owning client and the already-started single-worker stream."""
        self._client = client
        self._stream = stream

    def next(self) -> tuple[pa.RecordBatch | None, bytes | None]:
        """Fetch the next ``(batch, resume_token)`` pair.

        Returns:
            The batch together with the token that resumes the scan AFTER it;
            ``(None, None)`` marks end-of-stream.

        Raises:
            ClientError: When the underlying stream raises ``RpcError``.
        """
        try:
            annotated, resume_token = self._stream.next_with_token()  # type: ignore[attr-defined]
        except RpcError as rpc_err:
            raise ClientError.from_rpc_error(rpc_err) from rpc_err
        if annotated is None:
            return None, resume_token
        return annotated.batch, resume_token

    def close(self) -> None:
        """Close the wrapped stream (a no-op over HTTP, which is stateless)."""
        self._stream.close()
185
+
186
+
187
+ # Module-level worker pool shared across all Client instances.
188
+ # Reuses idle worker subprocesses between Client sessions, avoiding
189
+ # repeated spawn/teardown overhead (especially valuable in tests).
190
+ _default_pool = WorkerPool(max_idle=8, idle_timeout=30.0)
191
+
192
+ # True once the HTTP transport is wired end-to-end. Used by the
193
+ # parametrized ``client_transport`` fixture in tests/conftest.py to decide
194
+ # whether to skip the HTTP leg of the matrix.
195
+ _HTTP_TRANSPORT_READY = True
196
+
197
+
198
@dataclass
class WorkerConnection:
    """State for one worker connection, whether subprocess-backed or HTTP-backed.

    Per connection exactly one of {proc+connection, _pool_ctx, _http_ctx} is
    active; transport-specific teardown looks at these fields to decide what
    to release.
    """

    # RPC proxy used to talk to the worker.
    proxy: VgiProtocol
    # Position of this connection among the client's workers.
    worker_index: int = 0
    # Stream session attached to this connection, if any.
    stream: StreamSession | None = None
    # Direct (non-pooled) subprocess transport: the process and its RPC connection.
    proc: subprocess.Popen[bytes] | None = None
    connection: RpcConnection[VgiProtocol] | None = None
    # Pooled subprocess transport: the context manager borrowed from the pool.
    _pool_ctx: AbstractContextManager[Any] | None = field(default=None, repr=False)
    # HTTP transport: the context manager returned by vgi_rpc.http.http_connect.
    _http_ctx: AbstractContextManager[Any] | None = field(default=None, repr=False)
216
+
217
+
218
+ class Client(CatalogClientMixin):
219
+ """Canonical VGI client — HTTP is the path other-language ports mirror.
220
+
221
+ Two transports:
222
+
223
+ * **HTTP** (``Client.from_http(base_url, bearer_token=...)``). The
224
+ canonical non-DuckDB path. Uses ``vgi_rpc.http.http_connect`` under
225
+ the hood; transparently resolves pointer batches returned by workers
226
+ that externalize large outputs (demo storage, S3). Transparently
227
+ externalizes large input batches when the server advertises upload-URL
228
+ support.
229
+ * **Subprocess** (``Client(server_path)``). Python-only convenience for
230
+ local workers. Uses shell subprocesses + a ``WorkerPool`` for reuse.
231
+ Ports don't need to mirror this.
232
+
233
+ Catalog operations (``catalogs()``, ``schema_contents()``, etc.) are
234
+ provided by ``CatalogClientMixin`` and don't require ``start()``. They
235
+ open a short-lived connection per call (HTTP) or borrow a pooled
236
+ subprocess worker.
237
+
238
+ Function invocation (``scalar_function``, ``table_function``,
239
+ ``table_in_out_function``) requires ``start()`` — typically via the
240
+ context-manager protocol::
241
+
242
+ with Client.from_http("http://host:port", bearer_token="...") as c:
243
+ for batch in c.table_function(function_name="sequence", ...):
244
+ ...
245
+ """
246
+
247
+ # Timeout for thread join operations (seconds)
248
+ THREAD_JOIN_TIMEOUT: float = 5.0
249
+
250
+ # Timeout for worker process wait operations (seconds)
251
+ PROCESS_WAIT_TIMEOUT: float = 5.0
252
+
253
+ @staticmethod
254
+ def _combine_batches(batches: list[pa.RecordBatch]) -> pa.RecordBatch | None:
255
+ """Combine multiple RecordBatches into a single RecordBatch.
256
+
257
+ Converts the batches to a PyArrow Table, combines chunks, and converts
258
+ back to a single batch. When all input batches have zero rows, PyArrow's
259
+ combine_chunks returns an empty list; in that case, the first original
260
+ batch is returned to preserve the schema.
261
+
262
+ Args:
263
+ batches: List of RecordBatches to combine. All batches must have
264
+ compatible schemas.
265
+
266
+ Returns:
267
+ A single combined RecordBatch, or None if the input list is empty.
268
+
269
+ """
270
+ if not batches:
271
+ return None
272
+
273
+ combined = list(pa.Table.from_batches(batches).combine_chunks().to_batches())
274
+ # If all batches were empty, combine_chunks returns empty list
275
+ if len(combined) == 0:
276
+ return batches[0]
277
+ return combined[0]
278
+
279
@staticmethod
def _on_worker_log(msg: Message) -> None:
    """Relay a vgi_rpc log message to the worker logger.

    The message's level name is upper-cased and looked up on the ``logging``
    module; names with no matching attribute fall back to ``logging.INFO``.
    """
    level_name = msg.level.name.upper()
    severity = getattr(logging, level_name, logging.INFO)
    _worker_logger.log(severity, "%s", msg.message)
284
+
285
+ def _determine_max_workers(self, requested: int) -> int:
286
+ """Apply system and user limits to the function's requested max_workers.
287
+
288
+ Clamps the requested parallelism to the lower of:
289
+ 1. The system's CPU count (from os.cpu_count(), defaulting to 1)
290
+ 2. The user-specified worker_limit (if set via Client constructor)
291
+
292
+ Args:
293
+ requested: The max_workers value requested by the function,
294
+ typically from the init response header.
295
+
296
+ Returns:
297
+ The effective max_workers after applying all limits. Always >= 1.
298
+
299
+ """
300
+ max_workers = requested
301
+
302
+ # Limit to CPU count
303
+ cpu_count = os.cpu_count() or 1
304
+ if max_workers > cpu_count:
305
+ _logger.debug("limiting_max_workers_to_cpu_count requested=%s cpu_count=%s", max_workers, cpu_count)
306
+ max_workers = cpu_count
307
+
308
+ # Limit to user-specified worker_limit
309
+ if self._worker_limit is not None and max_workers > self._worker_limit:
310
+ _logger.debug(
311
+ "limiting_max_workers_to_worker_limit requested=%s worker_limit=%s",
312
+ max_workers,
313
+ self._worker_limit,
314
+ )
315
+ max_workers = self._worker_limit
316
+
317
+ return max_workers
318
+
319
+ @staticmethod
320
+ def _settings_to_batch(settings: dict[str, Any] | None) -> pa.RecordBatch | None:
321
+ """Convert settings dict to RecordBatch for protocol.
322
+
323
+ Args:
324
+ settings: Dictionary of setting name to value pairs.
325
+
326
+ Returns:
327
+ A single-row RecordBatch with one column per setting, or None.
328
+
329
+ """
330
+ if settings is None:
331
+ return None
332
+ return pa.RecordBatch.from_pydict({k: [v] for k, v in settings.items()})
333
+
334
+ @staticmethod
335
+ def _secrets_to_batch(secrets: dict[str, Any] | None) -> pa.RecordBatch | None:
336
+ """Convert secrets dict to RecordBatch for protocol.
337
+
338
+ Args:
339
+ secrets: Dictionary of secret name to value pairs. Values can be
340
+ simple scalars or dicts (for struct-typed secrets).
341
+
342
+ Returns:
343
+ A single-row RecordBatch with one column per secret, or None.
344
+
345
+ """
346
+ if secrets is None:
347
+ return None
348
+ return pa.RecordBatch.from_pydict({k: [v] for k, v in secrets.items()})
349
+
350
+ @staticmethod
351
+ def _deserialize_pushdown_filters(filters_bytes: bytes | None) -> pa.RecordBatch | None:
352
+ """Deserialize pushdown filter bytes to RecordBatch.
353
+
354
+ Args:
355
+ filters_bytes: IPC-serialized RecordBatch bytes, or None.
356
+
357
+ Returns:
358
+ Deserialized RecordBatch, or None.
359
+
360
+ """
361
+ if filters_bytes is None:
362
+ return None
363
+ reader = pa.ipc.open_stream(pa.BufferReader(filters_bytes))
364
+ return reader.read_next_batch()
365
+
366
def __init__(
    self,
    server_path: str | None = None,
    passthrough_stderr: bool = False,
    worker_limit: int | None = None,
    attach_opaque_data: bytes | None = None,
    pool: WorkerPool | None = _default_pool,
    *,
    transport: Literal["subprocess", "http"] = "subprocess",
    base_url: str | None = None,
    bearer_token: str | None = None,
    httpx_client: Any | None = None,
    external_location: Any | None = None,
):
    """Configure a VGI client without contacting the worker.

    Nothing is spawned or connected here; the worker is first contacted by
    ``start()`` or by using the client as a context manager.

    Transport selection: give ``server_path`` (the default transport) to
    run a local subprocess worker, or ``transport="http"`` together with
    ``base_url=...`` (or the ``Client.from_http(...)`` factory) to reach a
    remote HTTP worker. Subprocess is Python-specific; HTTP is the
    canonical path other-language clients mirror.

    Args:
        server_path: Subprocess-only. Shell command or path of the VGI
            worker executable. Executed via shell=True.
        passthrough_stderr: Subprocess-only. When True, worker stderr goes
            straight to the parent process's stderr in real time. Also
            forced on when the ``VGI_WORKER_DEBUG`` environment variable
            is ``1``, ``true`` or ``yes`` (case-insensitive).
        worker_limit: Upper bound on the number of parallel workers.
        attach_opaque_data: Optional unique identifier of the DuckDB
            database attachment, allowing calls to be traced back to that
            attachment.
        pool: Subprocess-only. WorkerPool used for subprocess reuse; pass
            None to disable pooling and manage subprocesses directly.
        transport: ``"subprocess"`` (default) spawns a local subprocess per
            worker; ``"http"`` connects to a running worker through
            ``vgi_rpc.http.http_connect``.
        base_url: HTTP-only. Base URL of the running worker, e.g.
            ``"http://127.0.0.1:8765"``.
        bearer_token: HTTP-only. When set, requests carry an
            ``Authorization: Bearer <token>`` header. Static token only —
            no JWT / OAuth flows.
        httpx_client: HTTP-only escape hatch. When given it overrides
            ``bearer_token`` and is used verbatim (for mTLS or a custom
            auth scheme). Not the canonical path.
        external_location: HTTP-only. ``ExternalLocationConfig`` governing
            how pointer batches are fetched (workers that externalize
            large outputs return empty batches carrying
            ``vgi_rpc.location`` metadata). For HTTP it defaults to a
            plain ``ExternalLocationConfig()`` so pointer batches resolve
            automatically; the subprocess transport ignores it.

    Raises:
        ValueError: If the transport / server_path / base_url combination
            is inconsistent.

    """
    # Reject unknown transports first, then check the per-transport arguments.
    if transport not in ("subprocess", "http"):
        raise ValueError(f"unknown transport {transport!r}")
    if transport == "subprocess":
        if server_path is None:
            raise ValueError("subprocess transport requires server_path")
        if base_url is not None:
            raise ValueError("base_url is only meaningful for transport='http'")
    else:
        if base_url is None:
            raise ValueError("transport='http' requires base_url")
        if server_path is not None:
            raise ValueError("server_path is only meaningful for transport='subprocess'")

    # Pointer-batch resolution is switched on for HTTP unless the caller
    # supplied their own configuration.
    if external_location is None and transport == "http":
        from vgi_rpc.external import ExternalLocationConfig

        external_location = ExternalLocationConfig()

    debug_requested = os.environ.get("VGI_WORKER_DEBUG", "").lower() in ("1", "true", "yes")

    # Transport configuration.
    self.server_path = server_path or ""
    self._transport = transport
    self._base_url = base_url
    self._bearer_token = bearer_token
    self._httpx_client = httpx_client
    # Becomes True only when ``_get_or_create_httpx_client`` builds the
    # client itself and must therefore close it on ``stop()``; a client
    # passed in via ``httpx_client=`` stays owned by the caller.
    self._httpx_client_owned = False
    self._external_location = external_location
    # Lazily filled cache of HTTP server capabilities
    # (``_get_http_capabilities``): one round-trip per Client, used for
    # upload-URL externalization decisions.
    self._http_capabilities: Any | None = None

    # Worker management.
    self.passthrough_stderr = passthrough_stderr or debug_requested
    self._worker_limit = worker_limit
    self._attach_opaque_data = attach_opaque_data
    self._pool = pool
    self._primary: WorkerConnection | None = None
    # Extra connections used for multi-worker support.
    self._additional_workers: list[WorkerConnection] = []

    # Captured worker stderr, its lock, and the threads draining it.
    self._stderr_buffer: list[bytes] = []
    self._stderr_lock = threading.Lock()
    self._stderr_threads: list[threading.Thread] = []
473
+
474
@classmethod
def from_http(
    cls,
    base_url: str,
    *,
    bearer_token: str | None = None,
    httpx_client: Any | None = None,
    external_location: Any | None = None,
    worker_limit: int | None = None,
    attach_opaque_data: bytes | None = None,
) -> Client:
    """Construct a ``Client`` that talks to a remote HTTP VGI worker.

    This is the canonical entry point for non-DuckDB callers (for example
    a TypeScript port browsing catalog contents). It takes no
    subprocess-specific keyword arguments, and pooling is disabled
    (``pool=None``) because pool/stderr semantics do not apply to HTTP.
    """
    http_kwargs: dict[str, Any] = {
        "transport": "http",
        "base_url": base_url,
        "bearer_token": bearer_token,
        "httpx_client": httpx_client,
        "external_location": external_location,
        "worker_limit": worker_limit,
        "attach_opaque_data": attach_opaque_data,
        "pool": None,
    }
    return cls(**http_kwargs)
501
+
502
+ def _drain_stderr(self, stderr: IO[bytes]) -> None:
503
+ """Background thread that continuously reads stderr.
504
+
505
+ This is necessary when using pipes because if stderr
506
+ fills up the entire process will be blocked even writing
507
+ to stdout.
508
+ """
509
+ while True:
510
+ line = stderr.readline()
511
+ if not line:
512
+ break
513
+ with self._stderr_lock:
514
+ self._stderr_buffer.append(line)
515
+
516
def get_worker_stderr(self) -> str:
    """Return everything captured so far from the workers' stderr.

    The shared buffer collects stderr from the primary worker and from any
    additional workers spawned for parallel processing, for the lifetime
    of the client.

    The buffer is read under its lock, so this may be called while
    processing is still running; the result may then be incomplete until
    the workers have finished.

    Returns:
        The captured bytes decoded as UTF-8, with invalid sequences
        replaced by the Unicode replacement character.

    Note:
        Data is only captured when ``passthrough_stderr`` is False. With
        ``passthrough_stderr=True`` stderr goes directly to the parent
        process's stderr and this method returns an empty string.

    """
    with self._stderr_lock:
        raw = b"".join(self._stderr_buffer)
    return raw.decode("utf-8", errors="replace")
539
+
540
+ def _client_error_with_stderr(self, error: ClientError) -> ClientError:
541
+ """Enrich a ClientError with captured worker stderr, if available.
542
+
543
+ When passthrough_stderr is enabled, stderr already went to the terminal
544
+ so we return the error unchanged. Otherwise we append the last 50 lines
545
+ of captured stderr *after* the existing message — so the user's actual
546
+ exception (first line of ``str(error)``) stays at the top of the
547
+ rendered traceback and operational log noise trails.
548
+ """
549
+ if self.passthrough_stderr:
550
+ return error
551
+ stderr = self.get_worker_stderr()
552
+ if not stderr.strip():
553
+ return error
554
+ lines = stderr.strip().splitlines()
555
+ excerpt = "\n".join(lines[-50:]) if len(lines) > 50 else "\n".join(lines)
556
+ new_error = ClientError(f"{error}\n\nWorker stderr (last {len(excerpt.splitlines())} lines):\n{excerpt}")
557
+ new_error.__cause__ = error.__cause__
558
+ return new_error
559
+
560
+ def _spawn_worker(self, worker_index: int) -> WorkerConnection:
561
+ """Create a ``WorkerConnection`` for the configured transport.
562
+
563
+ Dispatches to ``_spawn_subprocess_connection`` (Python-specific) or
564
+ ``_spawn_http_connection`` (the canonical path other-language ports
565
+ mirror). Keeping the two bodies separate makes the HTTP path easy
566
+ to read in isolation.
567
+ """
568
+ if self._transport == "http":
569
+ return self._spawn_http_connection(worker_index)
570
+ return self._spawn_subprocess_connection(worker_index)
571
+
572
def _spawn_http_connection(self, worker_index: int) -> WorkerConnection:
    """Open an RPC proxy to the remote HTTP worker via ``vgi_rpc.http.http_connect``.

    This is the canonical path that non-DuckDB clients implement;
    subprocess is a Python convenience. Each ``worker_index`` gets its own
    RPC proxy, but all of them share the single ``httpx.Client`` of this
    Client (and therefore the same base URL and auth configuration).

    The context manager returned by ``http_connect`` is entered here and
    stored on the returned ``WorkerConnection`` as ``_http_ctx``.
    """
    from vgi_rpc.http import http_connect

    shared_client = self._get_or_create_httpx_client()
    http_ctx: AbstractContextManager[VgiProtocol] = http_connect(
        VgiProtocol,  # type: ignore[type-abstract]
        base_url=self._base_url,
        client=shared_client,
        on_log=self._on_worker_log,
        external_location=self._external_location,
    )
    rpc_proxy = http_ctx.__enter__()
    _logger.debug("http_connection_opened worker_index=%s base_url=%s", worker_index, self._base_url)
    return WorkerConnection(proxy=rpc_proxy, worker_index=worker_index, _http_ctx=http_ctx)
597
+
598
+ def _get_or_create_httpx_client(self) -> Any:
599
+ """Return the shared httpx.Client for this Client's HTTP transport.
600
+
601
+ Lazily constructs one bound to ``self._base_url`` (so RPC requests
602
+ resolve against the remote worker) with an ``Authorization: Bearer
603
+ <token>`` header when ``bearer_token`` was supplied. When the
604
+ caller passes ``httpx_client=`` directly, they're responsible for
605
+ configuring ``base_url`` and auth on it — we use it verbatim.
606
+ """
607
+ if self._httpx_client is not None:
608
+ return self._httpx_client
609
+
610
+ import httpx
611
+
612
+ headers: dict[str, str] = {}
613
+ if self._bearer_token is not None:
614
+ headers["Authorization"] = f"Bearer {self._bearer_token}"
615
+ self._httpx_client = httpx.Client(
616
+ base_url=self._base_url or "",
617
+ follow_redirects=True,
618
+ headers=headers,
619
+ # httpx's 5s default read timeout is too aggressive for RPC
620
+ # calls that do real server-side work (scans, cold workers).
621
+ timeout=httpx.Timeout(60.0, connect=10.0),
622
+ )
623
+ self._httpx_client_owned = True
624
+ return self._httpx_client
625
+
626
def _spawn_subprocess_connection(self, worker_index: int) -> WorkerConnection:
    """Spawn or borrow a subprocess worker and wrap it in an RPC proxy.

    When a pool is configured, borrows an idle worker (or spawns a new
    one) from the pool. Otherwise creates a subprocess directly.

    Python-specific: subprocess management relies on ``shell=True``
    semantics and the ``WorkerPool`` abstraction that other languages
    don't need to mirror.

    If anything fails after the direct subprocess has been spawned (a
    missing pipe, or the RPC connection failing to open), the subprocess
    is killed and reaped before the error propagates, so it is not leaked.

    Args:
        worker_index: Index recorded on the returned connection and used
            in log messages.

    Returns:
        A ``WorkerConnection`` holding the RPC proxy plus either the pool
        context (pooled) or the process and RPC connection (direct).

    Raises:
        ClientError: If the stdin, stdout, or stderr pipe of a directly
            spawned subprocess was not created.
    """
    if self._pool is not None:
        _logger.debug("borrowing_worker worker_index=%s", worker_index)
        cmd = shlex.split(self.server_path, posix=sys.platform != "win32")
        ctx = self._pool.connect(
            VgiProtocol,  # type: ignore[type-abstract]
            cmd,
            on_log=self._on_worker_log,
        )
        proxy = ctx.__enter__()
        _logger.debug("worker_borrowed worker_index=%s", worker_index)
        return WorkerConnection(
            proxy=proxy,
            worker_index=worker_index,
            _pool_ctx=ctx,
        )

    _logger.debug("spawning_worker worker_index=%s", worker_index)
    proc = subprocess.Popen(
        self.server_path,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=None if self.passthrough_stderr else subprocess.PIPE,
        text=False,
        bufsize=0,
        shell=True,
    )
    _logger.debug("worker_spawned worker_index=%s pid=%s", worker_index, proc.pid)

    try:
        if proc.stdout is None:
            raise ClientError("Failed to create stdout pipe for worker subprocess")

        if not self.passthrough_stderr:
            if proc.stderr is None:
                raise ClientError("Failed to create stderr pipe for worker subprocess")
            stderr_thread = threading.Thread(target=self._drain_stderr, args=(proc.stderr,), daemon=True)
            stderr_thread.start()
            self._stderr_threads.append(stderr_thread)

        # Raise explicitly rather than assert: asserts vanish under ``-O``
        # and the sibling pipe checks above already use ClientError.
        if proc.stdin is None:
            raise ClientError("Failed to create stdin pipe for worker subprocess")

        stdout_buffered = io.BufferedReader(cast(io.RawIOBase, proc.stdout))
        transport = PipeTransport(reader=stdout_buffered, writer=cast(io.IOBase, proc.stdin))
        connection: RpcConnection[VgiProtocol] = RpcConnection(
            VgiProtocol,  # type: ignore[type-abstract]
            transport,
            on_log=self._on_worker_log,
        )
        proxy = connection.__enter__()
    except BaseException:
        # Setup failed after the process was started: nothing else holds a
        # reference to it yet, so kill and reap it here to avoid a leak.
        proc.kill()
        proc.wait()
        raise

    return WorkerConnection(
        proxy=proxy,
        worker_index=worker_index,
        proc=proc,
        connection=connection,
    )
690
+
691
def _stop_worker(self, worker: WorkerConnection) -> int:
    """Stop a worker subprocess or return it to the pool.

    Closes the worker's stream session (if open), then either closes the
    HTTP RPC proxy, returns the worker to the pool, or exits the RPC
    connection and waits for the directly managed subprocess to terminate.

    A directly managed subprocess that has not exited within
    ``PROCESS_WAIT_TIMEOUT`` seconds is killed and reaped rather than left
    running with ``subprocess.TimeoutExpired`` escaping to the caller.

    Args:
        worker: The worker connection to stop.

    Returns:
        The subprocess exit code. Returns 0 for HTTP and pooled workers;
        for a direct subprocess, the process's return code (non-zero for
        errors, including a worker that had to be killed).
    """
    if worker.stream is not None:
        worker.stream.close()
        worker.stream = None

    if worker._http_ctx is not None:
        # HTTP transport — close the RPC proxy. The underlying httpx
        # client is shared across workers and closed in Client.stop().
        worker._http_ctx.__exit__(None, None, None)
        _logger.debug("http_connection_closed worker_index=%s", worker.worker_index)
        return 0

    if worker._pool_ctx is not None:
        # Return to pool — pool handles subprocess lifecycle
        worker._pool_ctx.__exit__(None, None, None)
        _logger.debug("worker_returned_to_pool worker_index=%s", worker.worker_index)
        return 0

    # Direct subprocess management
    assert worker.connection is not None
    assert worker.proc is not None
    worker.connection.__exit__(None, None, None)
    try:
        worker.proc.wait(timeout=self.PROCESS_WAIT_TIMEOUT)
    except subprocess.TimeoutExpired:
        # The worker ignored the closed connection; force it down so the
        # process is not orphaned, then reap it to collect the exit code.
        _logger.warning(
            "worker_did_not_exit_killing worker_index=%s pid=%s timeout=%s",
            worker.worker_index,
            worker.proc.pid,
            self.PROCESS_WAIT_TIMEOUT,
        )
        worker.proc.kill()
        worker.proc.wait()

    returncode = worker.proc.returncode
    if returncode != 0:
        _logger.error(
            "worker_exited_with_error worker_index=%s pid=%s returncode=%s",
            worker.worker_index,
            worker.proc.pid,
            returncode,
        )
    else:
        _logger.debug(
            "worker_exited worker_index=%s pid=%s returncode=%s",
            worker.worker_index,
            worker.proc.pid,
            returncode,
        )
    return returncode
744
+
745
+ def _close_secondary_workers(self) -> None:
746
+ """Close and stop all secondary (additional) workers."""
747
+ for worker in self._additional_workers:
748
+ self._stop_worker(worker)
749
+ self._additional_workers = []
750
+
751
+ def _join_threads(self, threads: list[threading.Thread]) -> None:
752
+ """Wait for all threads to complete with timeout.
753
+
754
+ Joins each thread with a timeout of THREAD_JOIN_TIMEOUT seconds.
755
+ Logs a warning for any thread that does not terminate within the
756
+ timeout period but does not raise an exception.
757
+
758
+ Args:
759
+ threads: List of Thread objects to wait for. Threads that have
760
+ already completed will return immediately from join().
761
+
762
+ """
763
+ for thread in threads:
764
+ thread.join(timeout=self.THREAD_JOIN_TIMEOUT)
765
+ if thread.is_alive():
766
+ _logger.warning("worker_thread_did_not_terminate")
767
+
768
def start(self) -> None:
    """Start the primary worker.

    Clears the stderr buffer (output from previous runs is discarded),
    obtains worker 0 through ``self._spawn_worker`` and stores it as the
    primary worker. The context manager protocol calls this method on
    entry.

    Raises:
        ClientError: If the client is already started (call stop() first).
            Errors raised while spawning the worker propagate as well.
    """
    if self._primary is not None:
        raise ClientError("Client already started")

    self._stderr_buffer = []
    _logger.debug("starting_server server_path=%s", self.server_path)
    primary = self._primary = self._spawn_worker(0)

    # Pick whatever identifies this worker best for the log line: a pid for
    # a direct subprocess, the base URL for HTTP, otherwise it is pooled.
    id_repr: Any = "pooled"
    if primary.proc is not None:
        id_repr = primary.proc.pid
    elif primary._http_ctx is not None:
        id_repr = f"http({self._base_url})"
    _logger.debug("server_started id=%s", id_repr)
801
+
802
def stop(self) -> int:
    """Stop all workers and clean up resources.

    Terminates workers in the following order:
    1. Stops all additional workers (spawned for parallel processing)
    2. Stops the primary worker
    3. Waits for all stderr drain threads to complete (with timeout)
    4. Closes the shared httpx client if this instance created it

    Cleanup is exception-safe: the primary worker is still stopped when
    stopping the additional workers fails, and the internal state (primary
    worker, stderr threads, owned httpx client) is reset even when stopping
    a worker raises. The client can therefore be started again with
    start() afterwards. The context manager protocol calls this method on
    exit.

    Returns:
        The value ``_stop_worker`` reports for the primary worker: 0 for
        normal termination, non-zero values indicate errors.

    Raises:
        ClientError: If the client was not started (call start() first).
    """
    primary = self._primary
    if primary is None:
        raise ClientError("Client not started")

    try:
        try:
            # Stop additional workers first
            self._close_secondary_workers()
        finally:
            # Clear the reference before stopping so a failure here cannot
            # leave the client stuck in the "started" state.
            self._primary = None
            returncode = self._stop_worker(primary)
    finally:
        # Wait for stderr threads to finish draining
        for stderr_thread in self._stderr_threads:
            stderr_thread.join(timeout=self.THREAD_JOIN_TIMEOUT)
            if stderr_thread.is_alive():
                _logger.warning("stderr_thread_did_not_terminate")
        self._stderr_threads = []

        # Close the shared httpx.Client if we created it ourselves.
        if self._httpx_client_owned and self._httpx_client is not None:
            try:
                self._httpx_client.close()
            finally:
                self._httpx_client = None
                self._httpx_client_owned = False

    return returncode
850
+
851
def server_capabilities(self) -> Any:
    """Fetch the HTTP server's advertised capabilities.

    Only valid for HTTP-mode clients. The result comes from
    ``vgi_rpc.http.http_capabilities`` called with this client's base URL
    and shared httpx client. The returned ``HttpServerCapabilities``
    carries ``max_request_bytes``, ``upload_url_support``, and
    ``max_upload_bytes`` — the fields consulted before deciding to
    externalize large input batches via upload URLs.

    Raises:
        ClientError: If the client does not use the HTTP transport.
    """
    if self._transport == "http":
        from vgi_rpc.http import http_capabilities

        shared_client = self._get_or_create_httpx_client()
        return http_capabilities(base_url=self._base_url, client=shared_client)

    raise ClientError("server_capabilities() is only available for HTTP transport")
866
+
867
+ def __enter__(self) -> Client:
868
+ """Enter the context manager by starting the worker subprocess."""
869
+ self.start()
870
+ return self
871
+
872
+ def __exit__(self, _exc_type: Any, _exc_val: Any, _exc_tb: Any) -> None:
873
+ """Exit the context manager by stopping all worker subprocesses."""
874
+ self.stop()
875
+
876
+ # -----------------------------------------------------------------------
877
+ # RPC helpers
878
+ # -----------------------------------------------------------------------
879
+
880
def _make_bind_request(
    self,
    *,
    function_name: str,
    arguments: Arguments,
    function_type: FunctionType,
    input_schema: pa.Schema | None = None,
    settings: dict[str, Any] | None = None,
    secrets: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> BindRequest:
    """Assemble the ``BindRequest`` for one function invocation.

    ``settings`` and ``secrets`` are converted with
    ``self._settings_to_batch`` and ``self._secrets_to_batch``; the attach
    opaque data is taken from this client instance. All other values are
    forwarded unchanged.
    """
    settings_batch = self._settings_to_batch(settings)
    secrets_batch = self._secrets_to_batch(secrets)
    return BindRequest(
        function_name=function_name,
        arguments=arguments,
        function_type=function_type,
        input_schema=input_schema,
        settings=settings_batch,
        secrets=secrets_batch,
        attach_opaque_data=self._attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
    )
902
+
903
@staticmethod
def _do_bind(
    proxy: VgiProtocol,
    bind_request: BindRequest,
    bind_result_callback: Callable[[BindResponse], None] | None = None,
) -> BindResponse:
    """Send ``bind`` to a worker proxy and hand back its response.

    Args:
        proxy: VgiProtocol proxy to call ``bind`` on.
        bind_request: The bind request to send.
        bind_result_callback: Optional observer; when given it is called
            with the response before this method returns.

    Returns:
        The ``BindResponse`` returned by the worker.

    Raises:
        ClientError: If the ``bind`` call raises ``RpcError``.
    """
    try:
        response: BindResponse = proxy.bind(request=bind_request)
    except RpcError as e:
        raise ClientError.from_rpc_error(e) from e
    else:
        # Runs outside the RpcError handler: errors raised by the callback
        # propagate unchanged.
        if bind_result_callback is not None:
            bind_result_callback(response)
        return response
933
+
934
@staticmethod
def _do_init(
    proxy: VgiProtocol,
    bind_request: BindRequest,
    bind_response: BindResponse,
    *,
    projection_ids: list[int] | None = None,
    pushdown_filters_batch: pa.RecordBatch | None = None,
    phase: TableInOutFunctionInitPhase | None = None,
    execution_id: bytes | None = None,
    init_opaque_data: bytes | None = None,
    finalize_state_id: bytes | None = None,
) -> StreamSession:
    """Send ``init`` to a worker proxy and return the resulting stream.

    The ``InitRequest`` embeds the original bind call together with the
    output schema and opaque data taken from the bind response.

    Args:
        proxy: VgiProtocol proxy to call ``init`` on.
        bind_request: The original bind request.
        bind_response: The bind response supplying ``output_schema`` and
            ``opaque_data``.
        projection_ids: Optional column indices for projection.
        pushdown_filters_batch: Optional deserialized filter predicates.
        phase: Table-in-out function phase (INPUT or FINALIZE).
        execution_id: For secondary init, the execution ID from the
            primary worker's init response.
        init_opaque_data: For secondary init, the opaque data from the
            primary worker's init response.
        finalize_state_id: For ``TABLE_BUFFERING_FINALIZE`` init, the
            opaque finalize partition key this producer stream serves.

    Returns:
        StreamSession for data exchange or production.

    Raises:
        ClientError: If the ``init`` call raises ``RpcError``.
    """
    request = InitRequest(
        bind_call=bind_request,
        output_schema=bind_response.output_schema,
        bind_opaque_data=bind_response.opaque_data,
        projection_ids=projection_ids,
        pushdown_filters=pushdown_filters_batch,
        phase=phase,
        execution_id=execution_id,
        init_opaque_data=init_opaque_data,
        finalize_state_id=finalize_state_id,
    )
    try:
        session: StreamSession = proxy.init(request=request)  # type: ignore[assignment]
    except RpcError as e:
        raise ClientError.from_rpc_error(e) from e
    return session
986
+
987
def _initialize_stream_common(
    self,
    *,
    function_name: str,
    arguments: Arguments,
    function_type: FunctionType,
    input_schema: pa.Schema | None,
    settings: dict[str, Any] | None,
    secrets: dict[str, Any] | None,
    transaction_opaque_data: bytes | None,
    projection_ids: list[int] | None,
    pushdown_filters_batch: pa.RecordBatch | None,
    phase: TableInOutFunctionInitPhase | None,
    bind_result_callback: Callable[[BindResponse], None] | None,
) -> tuple[BindRequest, BindResponse, GlobalInitResponse]:
    """Run the shared bind → init → fan-out-workers sequence.

    Steps, in order:

    1. Build a ``BindRequest`` from the call parameters.
    2. ``bind`` against the primary worker proxy.
    3. ``init`` against the primary and store the resulting
       ``StreamSession`` on the primary worker connection.
    4. Read the ``GlobalInitResponse`` header from that stream.
    5. Spawn any additional workers, passing them the primary's init
       response so their ``init`` joins the same execution.

    Keeping the sequence in one place means transport differences and
    protocol changes only need to be handled here.

    Returns:
        The bind request, the bind response and the primary worker's
        ``GlobalInitResponse``.
    """
    primary = self._primary
    assert primary is not None, "primary worker not started"

    bind_request = self._make_bind_request(
        function_name=function_name,
        arguments=arguments,
        function_type=function_type,
        input_schema=input_schema,
        settings=settings,
        secrets=secrets,
        transaction_opaque_data=transaction_opaque_data,
    )
    bind_response = self._do_bind(primary.proxy, bind_request, bind_result_callback)

    primary_stream = self._do_init(
        primary.proxy,
        bind_request,
        bind_response,
        projection_ids=projection_ids,
        pushdown_filters_batch=pushdown_filters_batch,
        phase=phase,
    )
    primary.stream = primary_stream

    init_response = primary_stream.typed_header(GlobalInitResponse)
    worker_count = self._determine_max_workers(init_response.max_workers)

    self._spawn_additional_workers(
        worker_count,
        bind_request,
        bind_response,
        init_response,
        projection_ids=projection_ids,
        pushdown_filters_batch=pushdown_filters_batch,
        phase=phase,
    )

    return bind_request, bind_response, init_response
1057
+
1058
+ def _spawn_additional_workers(
1059
+ self,
1060
+ max_workers: int,
1061
+ bind_request: BindRequest,
1062
+ bind_response: BindResponse,
1063
+ global_init_response: GlobalInitResponse,
1064
+ *,
1065
+ projection_ids: list[int] | None = None,
1066
+ pushdown_filters_batch: pa.RecordBatch | None = None,
1067
+ phase: TableInOutFunctionInitPhase | None = None,
1068
+ ) -> None:
1069
+ """Spawn and initialize additional worker subprocesses in parallel.
1070
+
1071
+ First spawns all worker subprocesses sequentially (fast operation), then
1072
+ initializes all workers in parallel using threads. Each additional worker
1073
+ receives a secondary init with the execution_id from the primary worker.
1074
+
1075
+ The spawned workers are appended to self._additional_workers list.
1076
+
1077
+ If max_workers is 1 or less, this method returns immediately without
1078
+ spawning any workers.
1079
+
1080
+ Args:
1081
+ max_workers: Total number of workers desired (including the primary
1082
+ worker). For example, if max_workers=4, this method spawns
1083
+ 3 additional workers (indices 1, 2, 3).
1084
+ bind_request: The original bind request to embed in init.
1085
+ bind_response: The bind response with output schema.
1086
+ global_init_response: The primary worker's init response containing
1087
+ execution_id and opaque_data for secondary init.
1088
+ projection_ids: Optional column indices for projection.
1089
+ pushdown_filters_batch: Optional deserialized filter predicates.
1090
+ phase: Table-in-out function phase (INPUT or FINALIZE).
1091
+
1092
+ Raises:
1093
+ ClientError: If any worker fails to initialize. The exception wraps
1094
+ the first initialization error encountered.
1095
+
1096
+ """
1097
+ if max_workers <= 1:
1098
+ return
1099
+
1100
+ # Spawn all worker subprocesses first (fast)
1101
+ new_workers: list[WorkerConnection] = []
1102
+ for worker_index in range(1, max_workers):
1103
+ worker = self._spawn_worker(worker_index)
1104
+ new_workers.append(worker)
1105
+ self._additional_workers.append(worker)
1106
+
1107
+ # Initialize all workers in parallel (overlaps Python startup time)
1108
+ init_errors: list[Exception] = []
1109
+
1110
+ def do_init(worker: WorkerConnection) -> None:
1111
+ try:
1112
+ stream = self._do_init(
1113
+ worker.proxy,
1114
+ bind_request,
1115
+ bind_response,
1116
+ projection_ids=projection_ids,
1117
+ pushdown_filters_batch=pushdown_filters_batch,
1118
+ phase=phase,
1119
+ execution_id=global_init_response.execution_id,
1120
+ init_opaque_data=global_init_response.opaque_data,
1121
+ )
1122
+ worker.stream = stream
1123
+ except Exception as e:
1124
+ init_errors.append(e)
1125
+
1126
+ init_threads: list[threading.Thread] = []
1127
+ for worker in new_workers:
1128
+ t = threading.Thread(target=do_init, args=(worker,))
1129
+ t.start()
1130
+ init_threads.append(t)
1131
+
1132
+ for t in init_threads:
1133
+ t.join()
1134
+
1135
+ if init_errors:
1136
+ error_msgs = [str(e) for e in init_errors]
1137
+ raise ClientError(
1138
+ f"Failed to initialize {len(init_errors)} worker(s):\n" + "\n".join(f" - {msg}" for msg in error_msgs)
1139
+ ) from init_errors[0]
1140
+
1141
+ _logger.debug("additional_workers_spawned count=%s", len(new_workers))
1142
+
1143
+ # -----------------------------------------------------------------------
1144
+ # Batch processing helpers
1145
+ # -----------------------------------------------------------------------
1146
+
1147
+ # -----------------------------------------------------------------------
1148
+ # HTTP upload-URL externalization (Phase 4)
1149
+ #
1150
+ # Non-DuckDB clients send IPC bytes inline on each exchange() call.
1151
+ # Servers can advertise a maximum request size via VGI-Max-Request-Bytes
1152
+ # (surfaced as HttpServerCapabilities.max_request_bytes). When an input
1153
+ # batch would exceed it AND the server supports upload URLs, we:
1154
+ # 1. request_upload_urls(count=1) → {upload_url, download_url}
1155
+ # 2. PUT the IPC bytes to upload_url
1156
+ # 3. replace the batch with an empty one + vgi_rpc.location metadata
1157
+ # pointing at download_url
1158
+ # The worker resolves the pointer batch on its end (mirror of the
1159
+ # client's own external-location resolution on outputs).
1160
+ # -----------------------------------------------------------------------
1161
+
1162
+ def _get_http_capabilities(self) -> Any:
1163
+ """Return cached ``HttpServerCapabilities`` (HTTP transport only)."""
1164
+ if self._http_capabilities is not None:
1165
+ return self._http_capabilities
1166
+ from vgi_rpc.http import http_capabilities
1167
+
1168
+ httpx_client = self._get_or_create_httpx_client()
1169
+ self._http_capabilities = http_capabilities(base_url=self._base_url, client=httpx_client)
1170
+ return self._http_capabilities
1171
+
1172
@staticmethod
def _serialize_batch_ipc(batch: pa.RecordBatch) -> bytes:
    """Encode one ``RecordBatch`` as an Arrow IPC stream and return its bytes."""
    buffer_stream = pa.BufferOutputStream()
    with pa.ipc.new_stream(buffer_stream, batch.schema) as ipc_writer:
        ipc_writer.write_batch(batch)
    # Read the buffer only after the writer has been closed by the ``with``.
    encoded = buffer_stream.getvalue()
    return encoded.to_pybytes()
1179
+
1180
def _maybe_externalize_input_batch(self, batch: pa.RecordBatch) -> AnnotatedBatch:
    """Move an oversized input batch to an upload URL, if the server allows it.

    The batch is sent inline (wrapped unchanged) when any of these hold:
    the transport is not HTTP; the server capabilities do not report
    ``upload_url_support``; ``max_request_bytes`` is missing or not
    positive; the serialized batch fits within ``max_request_bytes``; or
    the server returns no upload URLs.

    Otherwise the IPC bytes are PUT to the upload URL and a zero-row
    pointer batch with the same schema is returned, carrying the download
    URL under the ``vgi_rpc.location`` metadata key.

    Args:
        batch: The input batch about to be sent to a worker.

    Returns:
        An ``AnnotatedBatch`` wrapping either the original batch or the
        pointer batch.

    Raises:
        httpx.HTTPStatusError: If the upload PUT returns an error status
            (raised by ``raise_for_status()``).
    """

    def inline() -> AnnotatedBatch:
        # Send the batch as-is, without externalization.
        return AnnotatedBatch(batch=batch)

    if self._transport != "http":
        return inline()

    capabilities = self._get_http_capabilities()
    if not getattr(capabilities, "upload_url_support", False):
        return inline()

    limit = getattr(capabilities, "max_request_bytes", None)
    if limit is None or limit <= 0:
        return inline()

    payload = self._serialize_batch_ipc(batch)
    if len(payload) <= limit:
        return inline()

    from vgi_rpc.http import request_upload_urls
    from vgi_rpc.metadata import LOCATION_KEY

    httpx_client = self._get_or_create_httpx_client()
    vended = request_upload_urls(base_url=self._base_url, count=1, client=httpx_client)
    if not vended:
        # The server advertised upload support but vended no URLs: fall
        # back to sending the batch inline.
        return inline()
    target = vended[0]

    # NOTE(review): this PUT goes through the shared client, so that
    # client's default headers (including any bearer token) are sent to the
    # upload URL too — confirm that is intended for the upload endpoint.
    upload_response = httpx_client.put(target.upload_url, content=payload, timeout=30.0)
    upload_response.raise_for_status()

    # Zero-row batch with the original schema; the data now lives behind
    # the download URL recorded in the metadata.
    empty_columns = {field.name: [] for field in batch.schema}
    pointer = pa.RecordBatch.from_pydict(empty_columns, schema=batch.schema)
    location_metadata = pa.KeyValueMetadata({LOCATION_KEY: target.download_url.encode()})
    _logger.debug(
        "externalized_input_batch size_bytes=%s download_url=%s",
        len(payload),
        target.download_url,
    )
    return AnnotatedBatch(batch=pointer, custom_metadata=location_metadata)
1227
+
1228
def _process_batch_on_worker(
    self,
    worker: WorkerConnection,
    input_batch: pa.RecordBatch,
    batch_index: int,
) -> list[pa.RecordBatch]:
    """Exchange one input batch with a worker until it asks for more input.

    The batch is passed through ``_maybe_externalize_input_batch`` and sent
    with ``stream.exchange()``. The ``vgi.status`` entry of the reply
    metadata decides what happens next: ``HAVE_MORE_OUTPUT`` sends the same
    input again, while ``NEED_MORE_INPUT`` or no status at all (scalar
    functions) ends the exchange.

    Args:
        worker: The worker connection to use. Must have stream initialized.
        input_batch: The input RecordBatch to send to the worker.
        batch_index: Index of this batch in the input sequence (for logging).

    Returns:
        Every output batch received for this input, in arrival order.

    Raises:
        ClientError: If ``worker.stream`` is None, the exchange raises
            ``RpcError``, or the worker reports an unexpected status.
    """
    if worker.stream is None:
        raise ClientError(f"Worker {worker.worker_index} stream not initialized")

    collected: list[pa.RecordBatch] = []

    while True:
        _logger.debug(
            "sending_batch_to_worker worker_index=%s batch_index=%s num_rows=%s",
            worker.worker_index,
            batch_index,
            input_batch.num_rows,
        )

        try:
            outgoing = self._maybe_externalize_input_batch(input_batch)
            reply = worker.stream.exchange(outgoing)
        except RpcError as e:
            raise ClientError.from_rpc_error(e) from e

        _logger.debug(
            "received_output_from_worker worker_index=%s num_rows=%s",
            worker.worker_index,
            reply.batch.num_rows,
        )
        collected.append(reply.batch)

        # Missing or empty metadata means no status (scalar functions).
        status = reply.custom_metadata.get(b"vgi.status") if reply.custom_metadata else None

        if status == b"HAVE_MORE_OUTPUT":
            continue
        if status is None or status == b"NEED_MORE_INPUT":
            return collected
        raise ClientError(f"Unexpected status from worker {worker.worker_index}: {status!r}")
1294
+
1295
+ def _worker_thread_loop(
1296
+ self,
1297
+ worker: WorkerConnection,
1298
+ input_queue: Queue[tuple[int, pa.RecordBatch] | None],
1299
+ output_queue: Queue[tuple[int, list[pa.RecordBatch]] | BaseException],
1300
+ ) -> None:
1301
+ """Thread function that processes batches for a single worker.
1302
+
1303
+ Runs in a dedicated thread, pulling (batch_index, batch) tuples from
1304
+ the input queue, processing them via _process_batch_on_worker, and
1305
+ pushing (batch_index, output_batches) tuples to the output queue.
1306
+
1307
+ When None is received from input_queue, signals thread completion by
1308
+ pushing (-1, []) to output_queue and exits.
1309
+
1310
+ If an exception occurs during processing, it is caught, logged, and
1311
+ pushed to output_queue as the exception object itself.
1312
+
1313
+ Args:
1314
+ worker: The worker connection to use for processing batches.
1315
+ input_queue: Thread-safe queue providing (batch_index, RecordBatch)
1316
+ tuples for processing. A None value signals end of input.
1317
+ output_queue: Thread-safe queue for results.
1318
+
1319
+ """
1320
+ try:
1321
+ while True:
1322
+ item = input_queue.get()
1323
+ if item is None:
1324
+ # End of input - signal thread completion
1325
+ output_queue.put((-1, []))
1326
+ break
1327
+
1328
+ batch_index, input_batch = item
1329
+ outputs = self._process_batch_on_worker(worker, input_batch, batch_index)
1330
+ output_queue.put((batch_index, outputs))
1331
+ except Exception as e:
1332
+ _logger.exception("worker_thread_error worker_index=%s", worker.worker_index)
1333
+ output_queue.put(e)
1334
+
1335
def _distribute_and_collect(
    self,
    *,
    all_workers: list[WorkerConnection],
    first_batch: pa.RecordBatch,
    remaining_input: Iterator[pa.RecordBatch],
) -> Generator[pa.RecordBatch]:
    """Fan input batches out round-robin across workers and yield the output.

    Single-worker and multi-worker runs take the same path. Each worker
    gets a daemon thread running ``_worker_thread_loop`` with its own input
    queue; all threads report to one shared output queue. Every input
    batch is queued before any output is collected.

    Args:
        all_workers: List of all workers (primary + additional).
        first_batch: The first input batch, already consumed from the
            iterator by the calling method.
        remaining_input: Iterator for remaining input batches.

    Yields:
        One batch per queue result, produced by ``_combine_batches`` from
        the batches returned for that input; results for which it returns
        ``None`` are skipped. Batches are yielded in the order results
        arrive on the shared queue.

    Raises:
        ClientError: If a worker thread reports an exception.
    """
    num_workers = len(all_workers)

    _logger.debug("starting_parallel_processing num_workers=%s", num_workers)

    # One input queue per worker, one output queue shared by all of them.
    input_queues: list[Queue[tuple[int, pa.RecordBatch] | None]] = [Queue() for _ in range(num_workers)]
    output_queue: Queue[tuple[int, list[pa.RecordBatch]] | BaseException] = Queue()

    threads: list[threading.Thread] = []
    for worker, worker_queue in zip(all_workers, input_queues):
        runner = threading.Thread(
            target=self._worker_thread_loop,
            args=(worker, worker_queue, output_queue),
            daemon=True,
        )
        runner.start()
        threads.append(runner)

    def every_batch() -> Iterator[pa.RecordBatch]:
        # The already-consumed first batch, then the rest of the input.
        yield first_batch
        yield from remaining_input

    # Batch i goes to worker i % num_workers.
    batches_sent = 0
    for batch_index, input_batch in enumerate(every_batch()):
        input_queues[batch_index % num_workers].put((batch_index, input_batch))
        batches_sent = batch_index + 1

    # Signal end of input to all workers
    for worker_queue in input_queues:
        worker_queue.put(None)

    _logger.debug("all_batches_distributed total_batches=%s", batches_sent)

    # One result per batch sent plus one completion signal per worker.
    outputs_expected = batches_sent + num_workers

    for outputs_received in range(1, outputs_expected + 1):
        result = output_queue.get()

        # A worker thread reports failure by queueing the exception itself.
        if isinstance(result, BaseException):
            if isinstance(result, RpcError):
                raise ClientError.from_rpc_error(result) from result
            raise ClientError(f"Worker thread failed: {result}") from result

        batch_idx, output_batches = result

        combined = self._combine_batches(output_batches)
        if combined is not None:
            yield combined

        _logger.debug(
            "output_received batch_index=%s outputs_received=%s outputs_expected=%s",
            batch_idx,
            outputs_received,
            outputs_expected,
        )

    self._join_threads(threads)
    _logger.debug("all_worker_threads_complete")
1436
+
1437
+ # -----------------------------------------------------------------------
1438
+ # Function methods
1439
+ # -----------------------------------------------------------------------
1440
+
1441
def table_in_out_function(
    self,
    *,
    function_name: str,
    input: Iterator[pa.RecordBatch],
    arguments: Arguments | None = None,
    bind_result_callback: Callable[[BindResponse], None] | None = None,
    projection_ids: list[int] | None = None,
    pushdown_filters: bytes | None = None,
    settings: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> Generator[pa.RecordBatch]:
    """Invoke a table-in-out function on the worker and stream results.

    For parallel processing (max_workers > 1), input batches are distributed
    round-robin across workers using dedicated threads. Output order may not
    match input order in parallel mode. Only the primary worker receives the
    FINALIZE phase and produces final aggregated output.

    Args:
        function_name: Name of the function to invoke. Must exist in the
            worker's registry.
        input: Iterator yielding input RecordBatches. Must yield at least one
            batch. The first batch's schema is used to initialize the IPC
            stream. Raises ClientError if the iterator is empty. A
            re-iterable container (e.g. a list) is also accepted: a single
            iterator is taken from it up front, so the first batch is never
            handed to the workers twice.
        arguments: Optional Arguments container with positional and named
            arguments to pass to the function. Defaults to empty Arguments().
        bind_result_callback: Optional callback invoked with the BindResponse
            before processing begins.
        projection_ids: Optional list of column indices for column projection.
        pushdown_filters: Optional byte string containing filter predicates
            to push down to the function.
        settings: Optional dictionary of settings/pragmas to
            pass to the function.
        transaction_opaque_data: Optional unique identifier for the DuckDB transaction.

    Yields:
        Output RecordBatches from the function. In single-worker mode, output
        order corresponds to input order. In parallel mode (max_workers > 1),
        output order is non-deterministic due to round-robin distribution.
        Final output from finalize is always yielded last.

    Raises:
        ClientError: If the client is not started, input iterator is empty,
            input iterator yields non-RecordBatch objects, communication
            with the worker fails, or the worker returns an unexpected
            status or exception.

    """
    if arguments is None:
        arguments = Arguments()

    if self._primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    try:
        # Pin ONE iterator over the input. Peeking the first batch from a
        # fresh iterator and then forwarding the original object would replay
        # the first batch whenever the caller passed a re-iterable container.
        batches = iter(input)

        # A private sentinel distinguishes "exhausted" from any yielded value
        # (including None, which must still be reported as a non-RecordBatch).
        exhausted = object()
        first_batch: Any = next(batches, exhausted)

        if first_batch is exhausted:
            # Input iterator was empty - table-in-out functions require input
            raise ClientError(
                f"table_in_out_function requires at least one input batch. "
                f"The input iterator for function '{function_name}' was empty. "
                f"Use table_function() for functions that generate data without input."
            )

        if not isinstance(first_batch, pa.RecordBatch):
            raise ClientError("Input iterator must yield RecordBatches")

        # The first batch determines the schema used for bind/init.
        input_schema = first_batch.schema
        pushdown_filters_batch = self._deserialize_pushdown_filters(pushdown_filters)

        bind_request, bind_response, init_response = self._initialize_stream_common(
            function_name=function_name,
            arguments=arguments,
            function_type=FunctionType.TABLE,
            input_schema=input_schema,
            settings=settings,
            secrets=None,
            transaction_opaque_data=transaction_opaque_data,
            projection_ids=projection_ids,
            pushdown_filters_batch=pushdown_filters_batch,
            phase=TableInOutFunctionInitPhase.INPUT,
            bind_result_callback=bind_result_callback,
        )

        # Process input batches across all workers
        all_workers = [self._primary] + self._additional_workers
        yield from self._distribute_and_collect(
            all_workers=all_workers,
            first_batch=first_batch,
            remaining_input=batches,
        )

        # Close all input streams
        for worker in all_workers:
            if worker.stream is not None:
                worker.stream.close()
                worker.stream = None

        # Close secondary workers
        self._close_secondary_workers()

        # Finalize on primary worker
        _logger.debug("finalizing_primary_worker")
        yield from self._finalize_primary_worker(
            bind_request,
            bind_response,
            input_schema,
            init_response,
        )
        _logger.debug("parallel_processing_complete")
    except ClientError as e:
        # Re-raise enriched via _client_error_with_stderr, keeping the
        # original cause chain.
        raise self._client_error_with_stderr(e) from e.__cause__
1555
+
1556
def table_buffering_function(
    self,
    *,
    function_name: str,
    input: Iterator[pa.RecordBatch],
    arguments: Arguments | None = None,
    bind_result_callback: Callable[[BindResponse], None] | None = None,
    projection_ids: list[int] | None = None,
    pushdown_filters: bytes | None = None,
    settings: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> Generator[pa.RecordBatch]:
    """Invoke a ``TableBufferingFunction`` (Sink+Source) and stream results.

    This mirrors the C++ ``PhysicalVgiTableBufferingFunction`` operator
    rather than the streaming INPUT/FINALIZE path used by
    :meth:`table_in_out_function`. The sequence is:

    1. ``bind`` → ``init(phase=TABLE_BUFFERING)`` on the primary worker.
       The sink init persists init metadata to cold storage so any pool
       worker can serve subsequent process/combine RPCs; its stream
       carries no data, so it is closed immediately after the header.
    2. ``table_buffering_process`` (unary) per input batch — the worker
       sinks the batch and returns an opaque ``state_id``.
    3. ``table_buffering_combine`` (unary) once at end-of-input — the
       worker hands all ``state_id``s to user ``combine()`` and returns
       opaque ``finalize_state_id``s (the source-side partition keys).
    4. ``init(phase=TABLE_BUFFERING_FINALIZE, finalize_state_id=...)`` per
       finalize key — a producer stream driving user ``finalize()`` per
       tick. Output batches are yielded in finalize-key order.
    5. ``table_buffering_destructor`` (unary, best-effort) for cleanup.

    Unlike :meth:`table_in_out_function` this driver runs entirely on the
    primary worker connection (process/combine are unary RPCs); the
    worker buffers all input regardless, so the aggregate result is
    identical to the distributed C++ path.

    Args:
        function_name: Name of the ``TableBufferingFunction`` to invoke.
        input: Iterator yielding input RecordBatches. May be empty —
            buffering aggregations still produce a result for zero rows.
            A re-iterable container (e.g. a list) is also accepted: a
            single iterator is taken from it up front, so the peeked first
            batch is never sunk twice.
        arguments: Optional Arguments container. Defaults to empty.
        bind_result_callback: Optional callback invoked with the
            BindResponse before processing begins.
        projection_ids: Optional column indices for projection.
        pushdown_filters: Optional serialized filter predicates.
        settings: Optional settings/pragmas to pass to the function.
        transaction_opaque_data: Optional DuckDB transaction identifier.

    Yields:
        Output RecordBatches produced by the finalize (source) phase.

    Raises:
        ClientError: If the client is not started or any RPC fails.

    """
    if arguments is None:
        arguments = Arguments()

    if self._primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    proxy = self._primary.proxy
    attach = self._attach_opaque_data

    try:
        # Deserialize inside the try so a ClientError raised here gets the
        # same stderr enrichment as every other failure of this call.
        pushdown_filters_batch = self._deserialize_pushdown_filters(pushdown_filters)

        # Pin ONE iterator over the input; chaining the peeked batch back in
        # front of the original object would replay the first batch whenever
        # the caller passed a re-iterable container.
        batches = iter(input)

        # Peek the first batch to learn the input schema for bind/init.
        first_batch: pa.RecordBatch | None = None
        for batch in batches:
            if not isinstance(batch, pa.RecordBatch):
                raise ClientError("Input iterator must yield RecordBatches")
            first_batch = batch
            break

        # Empty input is allowed: bind then proceeds without an input schema.
        input_schema = first_batch.schema if first_batch is not None else None

        bind_request = self._make_bind_request(
            function_name=function_name,
            arguments=arguments,
            function_type=FunctionType.TABLE_BUFFERING,
            input_schema=input_schema,
            settings=settings,
            secrets=None,
            transaction_opaque_data=transaction_opaque_data,
        )
        bind_response = self._do_bind(proxy, bind_request, bind_result_callback)

        # Sink init: persists init metadata; the stream carries no data.
        sink_stream = self._do_init(
            proxy,
            bind_request,
            bind_response,
            projection_ids=projection_ids,
            pushdown_filters_batch=pushdown_filters_batch,
            phase=TableInOutFunctionInitPhase.TABLE_BUFFERING,
        )
        try:
            init_response = sink_stream.typed_header(GlobalInitResponse)
        finally:
            # Close even when reading the header fails, so the stream is
            # never left open.
            sink_stream.close()
        execution_id = init_response.execution_id

        try:
            # Sink each input batch via the unary process RPC.
            state_ids: list[bytes] = []
            remaining: Iterator[pa.RecordBatch] = (
                itertools.chain([first_batch], batches) if first_batch is not None else iter(())
            )
            for batch_index, batch in enumerate(remaining):
                if not isinstance(batch, pa.RecordBatch):
                    raise ClientError("Input iterator must yield RecordBatches")
                try:
                    process_response = proxy.table_buffering_process(
                        request=TableBufferingProcessRequest(
                            function_name=function_name,
                            execution_id=execution_id,
                            input_batch=self._serialize_batch_ipc(batch),
                            attach_opaque_data=attach,
                            transaction_id=transaction_opaque_data,
                            batch_index=batch_index,
                        )
                    )
                except RpcError as e:
                    raise ClientError.from_rpc_error(e) from e
                state_ids.append(process_response.state_id)

            # End-of-input: combine → finalize partition keys.
            try:
                combine_response = proxy.table_buffering_combine(
                    request=TableBufferingCombineRequest(
                        function_name=function_name,
                        execution_id=execution_id,
                        state_ids=state_ids,
                        attach_opaque_data=attach,
                        transaction_id=transaction_opaque_data,
                    )
                )
            except RpcError as e:
                raise ClientError.from_rpc_error(e) from e

            # Source: one producer stream per finalize partition key.
            for finalize_state_id in combine_response.finalize_state_ids:
                finalize_stream = self._do_init(
                    proxy,
                    bind_request,
                    bind_response,
                    projection_ids=projection_ids,
                    pushdown_filters_batch=pushdown_filters_batch,
                    phase=TableInOutFunctionInitPhase.TABLE_BUFFERING_FINALIZE,
                    execution_id=execution_id,
                    finalize_state_id=finalize_state_id,
                )
                try:
                    while True:
                        try:
                            output = finalize_stream.tick()
                        except StopIteration:
                            # Producer stream for this key is exhausted.
                            break
                        except RpcError as e:
                            raise ClientError.from_rpc_error(e) from e
                        # Zero-row batches are not forwarded to the caller.
                        if output.batch.num_rows > 0:
                            yield output.batch
                finally:
                    finalize_stream.close()
        finally:
            # Best-effort cleanup, mirroring the C++ destructor. An RpcError
            # here is logged and swallowed so it cannot mask an in-flight error.
            try:
                proxy.table_buffering_destructor(
                    request=TableBufferingDestructorRequest(
                        function_name=function_name,
                        execution_id=execution_id,
                        attach_opaque_data=attach,
                        transaction_id=transaction_opaque_data,
                    )
                )
            except RpcError:
                _logger.debug("table_buffering_destructor failed (ignored)", exc_info=True)
    except ClientError as e:
        # Re-raise enriched via _client_error_with_stderr, keeping the
        # original cause chain.
        raise self._client_error_with_stderr(e) from e.__cause__
1734
+
1735
def _finalize_primary_worker(
    self,
    bind_request: BindRequest,
    bind_response: BindResponse,
    input_schema: pa.Schema,
    init_response: GlobalInitResponse,
) -> Generator[pa.RecordBatch]:
    """Run the FINALIZE phase on the primary worker and yield its output.

    Opens a fresh ``init(phase=FINALIZE)`` producer stream against the
    primary worker and ticks it until the stream signals exhaustion. The
    stream is closed when the loop ends, whether normally or by error.

    Args:
        bind_request: The bind request originally sent for this call.
        bind_response: The matching bind response.
        input_schema: Not read by this method; retained so the signature
            stays compatible with existing callers.
        init_response: Init response from the INPUT phase. Its
            ``execution_id`` and ``opaque_data`` are forwarded to the
            FINALIZE init.

    Yields:
        Each non-empty RecordBatch produced by the finalize stream.

    Raises:
        ClientError: If ticking the stream raises an ``RpcError``.

    """
    primary = self._primary
    assert primary is not None

    # FINALIZE is a producer stream: it is driven with tick(), and the
    # execution_id from the INPUT phase is passed along with the init.
    producer = self._do_init(
        primary.proxy,
        bind_request,
        bind_response,
        phase=TableInOutFunctionInitPhase.FINALIZE,
        execution_id=init_response.execution_id,
        init_opaque_data=init_response.opaque_data,
    )

    try:
        while True:
            try:
                produced = producer.tick()
            except StopIteration:
                # Stream exhausted; the finally clause closes it.
                return
            except RpcError as rpc_failure:
                raise ClientError.from_rpc_error(rpc_failure) from rpc_failure

            final_batch = produced.batch
            _logger.debug("received_finalize_from_worker num_rows=%s", final_batch.num_rows)

            # Empty batches are logged above but not forwarded.
            if final_batch.num_rows > 0:
                yield final_batch
    finally:
        producer.close()
1789
+
1790
def table_function(
    self,
    *,
    function_name: str,
    arguments: Arguments | None = None,
    bind_result_callback: Callable[[BindResponse], None] | None = None,
    projection_ids: list[int] | None = None,
    pushdown_filters: bytes | None = None,
    settings: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> Generator[pa.RecordBatch]:
    """Run a source-style table function and stream the batches it emits.

    A table function takes no input batches; everything it produces is
    derived from its arguments. After the common bind/init step, output is
    read through :meth:`_table_function_parallel`, which serves the
    single-worker and multi-worker (max_workers > 1) cases alike. With
    several workers the order of yielded batches is non-deterministic.

    Args:
        function_name: Name of the table function to invoke.
        arguments: Positional/named arguments for the function. An empty
            ``Arguments()`` is substituted when omitted.
        bind_result_callback: Optional hook that receives the BindResponse
            before any output is read.
        projection_ids: Optional column indices to project.
        pushdown_filters: Optional serialized filter predicates to push
            down to the function.
        settings: Optional settings/pragmas forwarded to the function.
        transaction_opaque_data: Optional identifier of the DuckDB
            transaction.

    Yields:
        Output RecordBatches from the function.

    Raises:
        ClientError: If the client has not been started, or a ClientError
            surfaces during initialization or while reading output.

    """
    call_arguments = Arguments() if arguments is None else arguments

    if self._primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    try:
        filters_batch = self._deserialize_pushdown_filters(pushdown_filters)

        # A source function has no input schema and no init phase.
        self._initialize_stream_common(
            function_name=function_name,
            function_type=FunctionType.TABLE,
            arguments=call_arguments,
            input_schema=None,
            phase=None,
            projection_ids=projection_ids,
            pushdown_filters_batch=filters_batch,
            settings=settings,
            secrets=None,
            transaction_opaque_data=transaction_opaque_data,
            bind_result_callback=bind_result_callback,
        )

        # Hand over to the threaded reader for all worker streams.
        yield from self._table_function_parallel()
    except ClientError as failure:
        raise self._client_error_with_stderr(failure) from failure.__cause__
1860
+
1861
@property
def supports_resumable_scan(self) -> bool:
    """Report whether :meth:`table_scan_resumable` can be used on this client.

    Only the HTTP transport qualifies: its producer streams round-trip
    their state in continuation tokens. The pipe/subprocess transport
    keeps a live stream that has no serializable resume point.
    """
    transport = self._transport
    return transport == "http"
1870
+
1871
def table_scan_resumable(
    self,
    *,
    function_name: str,
    arguments: Arguments | None = None,
    projection_ids: list[int] | None = None,
    pushdown_filters: bytes | None = None,
    settings: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
    resume_token: bytes | None = None,
) -> ResumableTableScan:
    """Start a resumable table-function scan, or pick one up from a token.

    This is the resumable sibling of :meth:`table_function`. The returned
    :class:`ResumableTableScan` hands out ``(batch, token)`` one batch at a
    time, exposing the worker's continuation token so that a stateless
    caller can store it and continue on another process or node.

    With ``resume_token=None`` a fresh scan begins. With a token, bind and
    init are still issued (the upstream's first turn is produced and then
    discarded), so the caller must pass the same
    ``function_name``/projection/filters as for the original scan; the
    stream is then positioned at the token.

    Raises:
        ResumeUnsupported: If the transport is not HTTP.
        ClientError: If the client is not started or the worker errors.

    """
    if not self.supports_resumable_scan:
        raise ResumeUnsupported(
            f"table_scan_resumable requires the HTTP transport; this client uses {self._transport!r}. "
            "Use table_function() and keep the live stream in-process."
        )

    scan_arguments = Arguments() if arguments is None else arguments

    primary = self._primary
    if primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    try:
        filters_batch = self._deserialize_pushdown_filters(pushdown_filters)

        # Only the PRIMARY worker is bound and initialized: a resumable scan
        # is single-worker, since parallel (max_workers>1) reads are
        # unordered and cannot be resumed from a token.
        request = self._make_bind_request(
            function_name=function_name,
            arguments=scan_arguments,
            function_type=FunctionType.TABLE,
            input_schema=None,
            settings=settings,
            secrets=None,
            transaction_opaque_data=transaction_opaque_data,
        )
        response = self._do_bind(primary.proxy, request, None)

        scan_stream = self._do_init(
            primary.proxy,
            request,
            response,
            projection_ids=projection_ids,
            pushdown_filters_batch=filters_batch,
            phase=None,
        )
        primary.stream = scan_stream

        # Read past the init header; its value is not needed here.
        scan_stream.typed_header(GlobalInitResponse)

        if resume_token is not None:
            # Drop the freshly produced first turn and jump to the token.
            scan_stream.seek_to_token(resume_token)  # type: ignore[attr-defined]

        return ResumableTableScan(self, scan_stream)
    except ClientError as failure:
        raise self._client_error_with_stderr(failure) from failure.__cause__
1939
+
1940
def table_scan_continue(
    self,
    *,
    resume_token: bytes,
    output_schema: pa.Schema | None = None,
) -> ResumableTableScan:
    """Continue a producer table scan from a token, skipping bind and init.

    Lightweight alternative to ``table_scan_resumable(resume_token=...)``.
    A continuation token is a signed, self-describing snapshot of the
    worker's producer state, so the server can rebuild state, schemas and
    function identity from the token by itself. That avoids the
    ``bind``/``init`` round-trip, and the discarded first turn, which
    ``table_scan_resumable`` incurs. It suits a stateless relay that keeps
    one token per batch and may resume on a different node each time.

    The client has to be started and connected to a worker that honours
    the token (the token is verified against the caller's auth identity and
    routed by the same ``init`` stream method that minted it). HTTP
    transport only.

    Args:
        resume_token: A token previously returned by ``ResumableTableScan.next()``.
        output_schema: Unused on the producer-continuation path (each
            response carries its own schema); accepted for symmetry with
            ``table_scan_resumable``. It is forwarded to ``resume_stream``
            unchanged.

    Returns:
        A ``ResumableTableScan`` positioned AFTER the token; ``next()``
        continues the stream, yielding ``(batch, token)`` per call.

    Raises:
        ResumeUnsupported: If the transport is not HTTP.
        ClientError: If the client is not started.

    """
    if not self.supports_resumable_scan:
        raise ResumeUnsupported(
            f"table_scan_continue requires the HTTP transport; this client uses {self._transport!r}. "
            "Use table_function() and keep the live stream in-process."
        )

    primary = self._primary
    if primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    try:
        # Producer continuation tokens are minted by the VgiProtocol ``init``
        # stream method, and continuations resume at ``POST /init/exchange``.
        # The server is stateless per token, so this connection needs no
        # earlier bind/init.
        resumed = primary.proxy.resume_stream(  # type: ignore[attr-defined]
            "init", resume_token, output_schema=output_schema
        )
        primary.stream = resumed
        return ResumableTableScan(self, resumed)
    except ClientError as failure:
        raise self._client_error_with_stderr(failure) from failure.__cause__
1991
+
1992
def _table_function_parallel(self) -> Generator[pa.RecordBatch]:
    """Read output from table function workers using parallel threads.

    Handles both single-worker and multi-worker cases uniformly: the primary
    worker and every additional worker each get one daemon reader thread.
    A reader iterates its worker's stream, pushes every non-empty batch onto
    a shared queue, and finally pushes ``None`` as a "this worker is done"
    marker. If a reader hits an exception it pushes the exception object
    instead. This generator drains the queue until it has seen one ``None``
    per worker, then joins the threads, closes all streams and closes the
    secondary workers.

    Yields:
        Output RecordBatches from all workers in non-deterministic order.
        Zero-row batches are never yielded.

    Raises:
        ClientError: If a worker thread fails with an exception. An
            ``RpcError`` is converted with ``ClientError.from_rpc_error``;
            any other exception is wrapped in a generic ClientError.

    """
    # Internal invariant: public callers check for a started client first.
    assert self._primary is not None
    all_workers = [self._primary] + self._additional_workers
    num_workers = len(all_workers)

    _logger.debug("starting_parallel_table_function num_workers=%s", num_workers)

    # Queue for collecting output from all workers. Three kinds of items
    # travel through it: a RecordBatch (data), None (one worker finished),
    # or an exception instance (one worker failed).
    output_queue: Queue[pa.RecordBatch | BaseException | None] = Queue()

    def read_worker_output(worker: WorkerConnection) -> None:
        """Thread function that reads all output from a single worker."""
        try:
            # A worker without a stream has nothing to read; report it as
            # finished so the collector's count still reaches num_workers.
            if worker.stream is None:
                output_queue.put(None)
                return

            for output in worker.stream:
                _logger.debug(
                    "received_output_from_worker worker_index=%s num_rows=%s",
                    worker.worker_index,
                    output.batch.num_rows,
                )
                # Empty batches are logged but not forwarded.
                if output.batch.num_rows > 0:
                    output_queue.put(output.batch)

            output_queue.put(None)  # Signal completion
        except StopIteration:
            # NOTE(review): a ``for`` loop consumes the iterator's own
            # StopIteration, so this branch looks unreachable in normal
            # operation — confirm before removing it.
            output_queue.put(None)
        except Exception as e:
            # Log here (with traceback) and hand the exception to the
            # collector, which re-raises it as a ClientError.
            _logger.exception("table_function_worker_thread_error worker_index=%s", worker.worker_index)
            output_queue.put(e)

    # Start reader threads for all workers
    threads: list[threading.Thread] = []
    for worker in all_workers:
        thread = threading.Thread(
            target=read_worker_output,
            args=(worker,),
            daemon=True,
        )
        thread.start()
        threads.append(thread)

    # Collect outputs from all workers until all are done
    workers_finished = 0
    while workers_finished < num_workers:
        # Blocks until some reader thread has produced an item.
        result = output_queue.get()

        # Check for exceptions from worker threads
        # NOTE(review): raising here leaves the generator before the join
        # and stream-close steps below run, so on this path the remaining
        # reader threads and streams are not cleaned up by this method.
        if isinstance(result, BaseException):
            if isinstance(result, RpcError):
                raise ClientError.from_rpc_error(result) from result
            raise ClientError(f"Worker thread failed: {result}") from result

        # None signals a worker finished
        if result is None:
            workers_finished += 1
            _logger.debug(
                "worker_finished workers_finished=%s total_workers=%s",
                workers_finished,
                num_workers,
            )
            continue

        yield result

    # Every worker has reported completion; wait for the threads to exit.
    self._join_threads(threads)
    _logger.debug("all_table_function_workers_complete")

    # Close streams and secondary workers
    for worker in all_workers:
        if worker.stream is not None:
            worker.stream.close()
            worker.stream = None
    self._close_secondary_workers()
    _logger.debug("parallel_table_function_complete")
2082
+
2083
def scalar_function(
    self,
    *,
    function_name: str,
    input: Iterator[pa.RecordBatch],
    arguments: Arguments | None = None,
    bind_result_callback: Callable[[BindResponse], None] | None = None,
    settings: dict[str, Any] | None = None,
    secrets: dict[str, Any] | None = None,
    transaction_opaque_data: bytes | None = None,
) -> Generator[pa.RecordBatch]:
    """Invoke a scalar function on the worker and stream results.

    Scalar functions transform input batches to single-column output with
    1:1 row mapping. Processing ends when input is exhausted.

    For parallel processing (max_workers > 1), input batches are distributed
    round-robin across workers using dedicated threads. Output order may not
    match input order in parallel mode.

    Args:
        function_name: Name of the function to invoke. Must exist in the
            worker's registry.
        input: Iterator yielding input RecordBatches. Must yield at least one
            batch. The first batch's schema is used to initialize the IPC
            stream. Raises ClientError if the iterator is empty. A
            re-iterable container (e.g. a list) is also accepted: a single
            iterator is taken from it up front, so the first batch is never
            handed to the workers twice.
        arguments: Optional Arguments container with positional and named
            arguments to pass to the function. Defaults to empty Arguments().
        bind_result_callback: Optional callback invoked with the BindResponse
            before processing begins.
        settings: Optional dictionary of settings/pragmas to
            pass to the function.
        secrets: Optional dictionary of secret name to value pairs.
            Values can be simple scalars or dicts (for struct-typed secrets).
        transaction_opaque_data: Optional unique identifier for the DuckDB transaction.

    Yields:
        Output RecordBatches from the function. Each output batch has a single
        column and the same number of rows as its corresponding input batch.
        In single-worker mode, output order corresponds to input order.
        In parallel mode (max_workers > 1), output order is non-deterministic.

    Raises:
        ClientError: If the client is not started, input iterator is empty,
            input iterator yields non-RecordBatch objects, communication
            with the worker fails, or the worker returns an unexpected
            status or exception.

    """
    if arguments is None:
        arguments = Arguments()

    if self._primary is None:
        raise ClientError("Client not started. Call start() or use context manager.")

    try:
        # Pin ONE iterator over the input. Peeking the first batch from a
        # fresh iterator and then forwarding the original object would replay
        # the first batch whenever the caller passed a re-iterable container.
        batches = iter(input)

        # A private sentinel distinguishes "exhausted" from any yielded value
        # (including None, which must still be reported as a non-RecordBatch).
        exhausted = object()
        first_batch: Any = next(batches, exhausted)

        if first_batch is exhausted:
            # Input iterator was empty - scalar functions require input
            raise ClientError(
                f"scalar_function requires at least one input batch. "
                f"The input iterator for function '{function_name}' was empty. "
                f"Use table_function() for functions that generate data without input."
            )

        if not isinstance(first_batch, pa.RecordBatch):
            raise ClientError("Input iterator must yield RecordBatches")

        # The first batch determines the schema used for bind/init.
        input_schema = first_batch.schema

        self._initialize_stream_common(
            function_name=function_name,
            arguments=arguments,
            function_type=FunctionType.SCALAR,
            input_schema=input_schema,
            settings=settings,
            secrets=secrets,
            transaction_opaque_data=transaction_opaque_data,
            projection_ids=None,
            pushdown_filters_batch=None,
            phase=None,
            bind_result_callback=bind_result_callback,
        )

        # Process batches across all workers
        all_workers = [self._primary] + self._additional_workers
        yield from self._distribute_and_collect(
            all_workers=all_workers,
            first_batch=first_batch,
            remaining_input=batches,
        )

        # Close streams and secondary workers
        for worker in all_workers:
            if worker.stream is not None:
                worker.stream.close()
                worker.stream = None
        self._close_secondary_workers()
    except ClientError as e:
        # Re-raise enriched via _client_error_with_stderr, keeping the
        # original cause chain.
        raise self._client_error_with_stderr(e) from e.__cause__