vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Framework for implementing streaming table-in-table-out functions.
|
|
4
|
+
|
|
5
|
+
TableInOutGenerator processes input batches via a per-batch callback.
|
|
6
|
+
Each call to process() emits one output batch via out.emit().
|
|
7
|
+
|
|
8
|
+
TableInOutFunction provides a simpler callback API (transform/finish)
|
|
9
|
+
with automatic state serialization for distributed processing.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import TYPE_CHECKING, final, get_args, get_origin
|
|
17
|
+
|
|
18
|
+
import pyarrow as pa
|
|
19
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
20
|
+
from vgi_rpc.rpc import OutputCollector
|
|
21
|
+
from vgi_rpc.utils import empty_batch
|
|
22
|
+
|
|
23
|
+
from vgi.function_storage import BoundStorage, FrameworkNS
|
|
24
|
+
from vgi.invocation import (
|
|
25
|
+
BindResponse,
|
|
26
|
+
)
|
|
27
|
+
from vgi.table_function import (
|
|
28
|
+
_ON_CANCEL_CAVEATS,
|
|
29
|
+
BindParams,
|
|
30
|
+
ProcessParams,
|
|
31
|
+
TableFunctionBase,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"TableInOutGenerator",
|
|
39
|
+
"TableInOutFunction",
|
|
40
|
+
"TableInOutFunctionStateNoOp",
|
|
41
|
+
"pack_int_cursor",
|
|
42
|
+
"unpack_int_cursor",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# --- Cursor helpers for cursor-based finalize streams -----------------------
|
|
47
|
+
#
|
|
48
|
+
# The framework's BufferedFinalizeState carries an opaque ``cursor: bytes``
|
|
49
|
+
# wire-state field. The canonical encoding is the int64 of the last
|
|
50
|
+
# state_log id consumed; these helpers make that intent explicit at
|
|
51
|
+
# call sites without coupling user code to struct layout.
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def pack_int_cursor(value: int) -> bytes:
    """Serialize ``value`` as an 8-byte, little-endian, signed integer cursor.

    Typical use is recording the last log_id consumed. ``int.to_bytes``
    raises ``OverflowError`` when ``value`` does not fit in a signed int64.
    """
    encoded = value.to_bytes(length=8, byteorder="little", signed=True)
    return encoded
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def unpack_int_cursor(cursor: bytes, default: int = -1) -> int:
    """Turn a packed cursor back into an int, falling back to ``default``.

    An empty cursor (``b""``) means no prior position was recorded, so
    ``default`` is returned. The default of ``-1`` acts as a before-first
    sentinel, so a state_log is read from its beginning.

    Non-empty cursors are decoded as little-endian signed integers.
    """
    if cursor:
        return int.from_bytes(cursor, byteorder="little", signed=True)
    return default
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TableInOutGenerator[TArgs, TState = None](TableFunctionBase[TArgs]):
|
|
71
|
+
"""Base class for streaming table functions that transform Arrow RecordBatches.
|
|
72
|
+
|
|
73
|
+
Each call to process() should emit exactly one output batch via out.emit().
|
|
74
|
+
Use TState to persist state between process() calls.
|
|
75
|
+
|
|
76
|
+
For functions that need a finalize phase (e.g., aggregation), override
|
|
77
|
+
finalize() to return the final output batches.
|
|
78
|
+
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
# Subclasses opt into framework-managed state by setting this to a
|
|
82
|
+
# concrete ArrowSerializableDataclass type. Default None means
|
|
83
|
+
# process()/finalize() get state=None and the framework skips its
|
|
84
|
+
# round-trip. TableInOutFunction's __init_subclass__ infers this from
|
|
85
|
+
# the TState type parameter when a subclass declares one. Constrained
|
|
86
|
+
# to ArrowSerializableDataclass so the framework can call
|
|
87
|
+
# serialize_to_bytes / deserialize_from_bytes on instances without
|
|
88
|
+
# further type narrowing at the call site.
|
|
89
|
+
state_class: type[ArrowSerializableDataclass] | None = None
|
|
90
|
+
|
|
91
|
+
@classmethod
|
|
92
|
+
def has_finalize_override(cls) -> bool:
|
|
93
|
+
"""Whether this class's ``finalize``/``finish`` represents real work.
|
|
94
|
+
|
|
95
|
+
Returns True iff either:
|
|
96
|
+
|
|
97
|
+
- The class's ``Meta`` declares ``has_finalize`` as ``True`` or ``False``
|
|
98
|
+
(explicit override — the declared value wins, even if it disagrees
|
|
99
|
+
with the auto-detection).
|
|
100
|
+
- Auto-detection finds a user subclass (one that is itself a
|
|
101
|
+
``TableInOutGenerator`` subclass) strictly above the VGI bases in
|
|
102
|
+
the MRO defining a callable ``finish`` or ``finalize`` attribute.
|
|
103
|
+
|
|
104
|
+
The framework uses this to decide whether to advertise a finalize
|
|
105
|
+
callback to DuckDB; DuckDB rejects LATERAL with correlated input on
|
|
106
|
+
table functions that register ``in_out_function_final``.
|
|
107
|
+
"""
|
|
108
|
+
# Explicit Meta override.
|
|
109
|
+
meta = getattr(cls, "Meta", None)
|
|
110
|
+
explicit = getattr(meta, "has_finalize", None) if meta is not None else None
|
|
111
|
+
if explicit is not None:
|
|
112
|
+
return bool(explicit)
|
|
113
|
+
|
|
114
|
+
# Auto-detect.
|
|
115
|
+
bases: set[type] = {TableInOutGenerator, TableInOutFunction}
|
|
116
|
+
for klass in cls.__mro__:
|
|
117
|
+
if klass in bases:
|
|
118
|
+
return False
|
|
119
|
+
# Only count overrides defined on an actual TableInOut subclass, so
|
|
120
|
+
# an unrelated mixin with an identically-named attribute can't
|
|
121
|
+
# trigger a false positive.
|
|
122
|
+
if not (isinstance(klass, type) and issubclass(klass, TableInOutGenerator)):
|
|
123
|
+
continue
|
|
124
|
+
for attr_name in ("finish", "finalize"):
|
|
125
|
+
raw = klass.__dict__.get(attr_name)
|
|
126
|
+
if raw is None:
|
|
127
|
+
continue
|
|
128
|
+
if isinstance(raw, (classmethod, staticmethod)):
|
|
129
|
+
raw = raw.__func__
|
|
130
|
+
if callable(raw):
|
|
131
|
+
return True
|
|
132
|
+
return False
|
|
133
|
+
|
|
134
|
+
@classmethod
|
|
135
|
+
def on_bind(
|
|
136
|
+
cls,
|
|
137
|
+
params: BindParams[TArgs],
|
|
138
|
+
) -> BindResponse:
|
|
139
|
+
"""Pass-through default — output schema is the input schema.
|
|
140
|
+
|
|
141
|
+
Override to compute a dynamic output type or validate arguments.
|
|
142
|
+
See ``TableFunctionBase.on_bind`` for the broader contract.
|
|
143
|
+
"""
|
|
144
|
+
assert params.bind_call.input_schema is not None
|
|
145
|
+
return BindResponse(output_schema=params.bind_call.input_schema)
|
|
146
|
+
|
|
147
|
+
# bind / on_init / global_init are defined on TableFunctionBase.
|
|
148
|
+
|
|
149
|
+
@classmethod
|
|
150
|
+
def initial_state(cls, params: ProcessParams[TArgs]) -> TState | None:
|
|
151
|
+
"""Create initial processing state. Override when TState is used.
|
|
152
|
+
|
|
153
|
+
Called once during init to create the state object that will be
|
|
154
|
+
passed to process() on each input batch.
|
|
155
|
+
|
|
156
|
+
Args:
|
|
157
|
+
params: Process parameters including arguments and schemas.
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
Initial state, or None if no state is needed.
|
|
161
|
+
|
|
162
|
+
"""
|
|
163
|
+
return None
|
|
164
|
+
|
|
165
|
+
@classmethod
|
|
166
|
+
def process(
|
|
167
|
+
cls,
|
|
168
|
+
params: ProcessParams[TArgs],
|
|
169
|
+
state: TState,
|
|
170
|
+
batch: pa.RecordBatch,
|
|
171
|
+
out: OutputCollector,
|
|
172
|
+
) -> None:
|
|
173
|
+
"""Process one input batch.
|
|
174
|
+
|
|
175
|
+
Called once per input batch during the INPUT phase. Must call
|
|
176
|
+
out.emit(batch) exactly once to produce output.
|
|
177
|
+
|
|
178
|
+
Use out.client_log(level, message) for in-band logging.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
params: Process parameters including arguments and schemas.
|
|
182
|
+
state: Mutable state persisted between calls. None if TState not used.
|
|
183
|
+
batch: The input RecordBatch to process.
|
|
184
|
+
out: OutputCollector for emitting output and logging.
|
|
185
|
+
|
|
186
|
+
"""
|
|
187
|
+
out.emit(batch)
|
|
188
|
+
|
|
189
|
+
@classmethod
|
|
190
|
+
def finalize(cls, params: ProcessParams[TArgs]) -> list[pa.RecordBatch]:
|
|
191
|
+
"""Finalize processing and produce any remaining output.
|
|
192
|
+
|
|
193
|
+
Called after all input batches have been processed during the
|
|
194
|
+
FINALIZE phase. Override to emit buffered or aggregated results.
|
|
195
|
+
|
|
196
|
+
Args:
|
|
197
|
+
params: Process parameters including arguments and schemas.
|
|
198
|
+
|
|
199
|
+
Returns:
|
|
200
|
+
List of output RecordBatches, or empty list if no finalization needed.
|
|
201
|
+
|
|
202
|
+
"""
|
|
203
|
+
return []
|
|
204
|
+
|
|
205
|
+
@classmethod
|
|
206
|
+
def on_cancel(cls, params: ProcessParams[TArgs], state: TState | None) -> None: # noqa: D102
|
|
207
|
+
pass
|
|
208
|
+
|
|
209
|
+
on_cancel.__func__.__doc__ = ( # type: ignore[attr-defined]
|
|
210
|
+
f"""Release resources when the stream is cancelled before natural end.
|
|
211
|
+
|
|
212
|
+
The VGI C++ extension fires this hook when a DuckDB query tears
|
|
213
|
+
down a VGI table-in-out scan early (LIMIT clause upstream, user
|
|
214
|
+
break, Ctrl-C, exception unwind). Override to release expensive
|
|
215
|
+
per-stream resources the function was holding in ``state``
|
|
216
|
+
(database cursors, LLM streaming sessions, file handles, GPU
|
|
217
|
+
buffers).
|
|
218
|
+
|
|
219
|
+
{_ON_CANCEL_CAVEATS}
|
|
220
|
+
|
|
221
|
+
Args:
|
|
222
|
+
params: Process parameters (same as ``process()`` received).
|
|
223
|
+
state: The current user state; ``None`` when state is unused.
|
|
224
|
+
"""
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
class TableInOutFunctionStateNoOp(ArrowSerializableDataclass):
    """Field-less placeholder state for a TableInOutFunction that keeps no state.

    It is the default for TableInOutFunction's TState type parameter.
    """
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class TableInOutFunction[
|
|
234
|
+
TArgs,
|
|
235
|
+
TState: ArrowSerializableDataclass = TableInOutFunctionStateNoOp,
|
|
236
|
+
](TableInOutGenerator[TArgs, TState]):
|
|
237
|
+
"""Simplified base class using transform/finish callbacks.
|
|
238
|
+
|
|
239
|
+
This class provides a simpler API for common use cases where you don't need
|
|
240
|
+
to work directly with OutputCollector. Instead of implementing process()
|
|
241
|
+
directly, you override transform() and optionally finish() as regular methods.
|
|
242
|
+
|
|
243
|
+
TState is optional. If not provided, state management is disabled and
|
|
244
|
+
transform() will always receive state=None. When TState is an
|
|
245
|
+
ArrowSerializableDataclass, state is automatically saved to storage
|
|
246
|
+
after each transform() call for distributed processing.
|
|
247
|
+
|
|
248
|
+
"""
|
|
249
|
+
|
|
250
|
+
state_class: type[TState] | None = None
|
|
251
|
+
|
|
252
|
+
def __init_subclass__(cls, **kwargs: object) -> None:
|
|
253
|
+
"""Automatically infer the state_class from the generic type parameters."""
|
|
254
|
+
super().__init_subclass__(**kwargs)
|
|
255
|
+
|
|
256
|
+
# Iterate over the original bases to find the generic parameters
|
|
257
|
+
orig_bases = getattr(cls, "__orig_bases__", ())
|
|
258
|
+
for base in orig_bases:
|
|
259
|
+
origin = get_origin(base)
|
|
260
|
+
if origin is None:
|
|
261
|
+
continue # not a generic base
|
|
262
|
+
args = get_args(base)
|
|
263
|
+
if len(args) >= 2:
|
|
264
|
+
# Assign the second type parameter to state_class
|
|
265
|
+
cls.state_class = args[1]
|
|
266
|
+
break
|
|
267
|
+
|
|
268
|
+
@classmethod
|
|
269
|
+
def transform(
|
|
270
|
+
cls,
|
|
271
|
+
batch: pa.RecordBatch,
|
|
272
|
+
params: ProcessParams[TArgs],
|
|
273
|
+
state: TState | None,
|
|
274
|
+
) -> pa.RecordBatch | list[pa.RecordBatch]:
|
|
275
|
+
"""Transform a single input batch.
|
|
276
|
+
|
|
277
|
+
Override this method to implement your transformation logic. This is called
|
|
278
|
+
once for each input batch.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
batch: Input RecordBatch to transform.
|
|
282
|
+
params: ProcessParams containing arguments, schemas, and settings.
|
|
283
|
+
state: Mutable state that should be updated and will be serialized as needed.
|
|
284
|
+
|
|
285
|
+
Returns:
|
|
286
|
+
Either:
|
|
287
|
+
- A single pa.RecordBatch: The transformed output
|
|
288
|
+
- A list of pa.RecordBatch: Multiple outputs (will be concatenated)
|
|
289
|
+
|
|
290
|
+
"""
|
|
291
|
+
return batch
|
|
292
|
+
|
|
293
|
+
@classmethod
|
|
294
|
+
def finish(
|
|
295
|
+
cls,
|
|
296
|
+
params: ProcessParams[TArgs],
|
|
297
|
+
states: list[TState],
|
|
298
|
+
) -> list[pa.RecordBatch]:
|
|
299
|
+
"""Return final batches after all input is processed.
|
|
300
|
+
|
|
301
|
+
Override this method to emit results after all input batches have been
|
|
302
|
+
processed. This is useful for aggregations, sorting, or any operation
|
|
303
|
+
that needs to see all data before producing output.
|
|
304
|
+
|
|
305
|
+
Returns:
|
|
306
|
+
List of pa.RecordBatch to emit as final output.
|
|
307
|
+
Return an empty list if no finalization output is needed.
|
|
308
|
+
|
|
309
|
+
"""
|
|
310
|
+
return []
|
|
311
|
+
|
|
312
|
+
@classmethod
|
|
313
|
+
def initial_state(
|
|
314
|
+
cls,
|
|
315
|
+
params: ProcessParams[TArgs],
|
|
316
|
+
) -> TState | None:
|
|
317
|
+
"""Create the initial state for processing.
|
|
318
|
+
|
|
319
|
+
Override this method to initialize the state object before processing
|
|
320
|
+
begins.
|
|
321
|
+
|
|
322
|
+
Args:
|
|
323
|
+
params: ProcessParams containing arguments, schemas, and settings.
|
|
324
|
+
|
|
325
|
+
Returns:
|
|
326
|
+
An instance of TState representing the initial state.
|
|
327
|
+
|
|
328
|
+
"""
|
|
329
|
+
return None
|
|
330
|
+
|
|
331
|
+
@final
|
|
332
|
+
@classmethod
|
|
333
|
+
def process(
|
|
334
|
+
cls,
|
|
335
|
+
params: ProcessParams[TArgs],
|
|
336
|
+
state: TState,
|
|
337
|
+
batch: pa.RecordBatch,
|
|
338
|
+
out: OutputCollector,
|
|
339
|
+
) -> None:
|
|
340
|
+
"""Process input batches by calling transform(). Do not override.
|
|
341
|
+
|
|
342
|
+
This method implements the exchange protocol by calling your transform()
|
|
343
|
+
method for each input batch. State is automatically saved to storage
|
|
344
|
+
after each call for distributed processing.
|
|
345
|
+
|
|
346
|
+
"""
|
|
347
|
+
result = cls.transform(batch, params, state)
|
|
348
|
+
|
|
349
|
+
# Save state for distributed processing (upsert semantics)
|
|
350
|
+
if state is not None:
|
|
351
|
+
params.storage.state_put(
|
|
352
|
+
FrameworkNS.TIO_STATE, BoundStorage.pack_int_key(os.getpid()), state.serialize_to_bytes()
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
# Handle single batch or list of batches — exchange must emit exactly one
|
|
356
|
+
if isinstance(result, list):
|
|
357
|
+
if not result:
|
|
358
|
+
out.emit(empty_batch(params.output_schema))
|
|
359
|
+
elif len(result) == 1:
|
|
360
|
+
out.emit(result[0])
|
|
361
|
+
else:
|
|
362
|
+
combined = pa.Table.from_batches(result).combine_chunks()
|
|
363
|
+
out.emit(combined.to_batches()[0])
|
|
364
|
+
else:
|
|
365
|
+
out.emit(result)
|
|
366
|
+
|
|
367
|
+
@final
|
|
368
|
+
@classmethod
|
|
369
|
+
def finalize(cls, params: ProcessParams[TArgs]) -> list[pa.RecordBatch]:
|
|
370
|
+
"""Emit final batches by calling finish(). Do not override.
|
|
371
|
+
|
|
372
|
+
This method collects serialized states from all workers, deserializes
|
|
373
|
+
them, and passes them to your finish() method.
|
|
374
|
+
|
|
375
|
+
"""
|
|
376
|
+
if cls.state_class is not None and cls.state_class is not TableInOutFunctionStateNoOp:
|
|
377
|
+
states = [
|
|
378
|
+
cls.state_class.deserialize_from_bytes(v) for _k, v in params.storage.state_drain(FrameworkNS.TIO_STATE)
|
|
379
|
+
]
|
|
380
|
+
else:
|
|
381
|
+
states = []
|
|
382
|
+
|
|
383
|
+
return cls.finish(params, states)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""db-transactor — transactional database access for VGI workers.
|
|
4
|
+
|
|
5
|
+
The transactor is a long-lived subprocess that owns a single DuckDB
|
|
6
|
+
connection. VGI worker processes communicate with it via ``vgi_rpc``
|
|
7
|
+
over Unix domain sockets, using the same streaming exchange patterns
|
|
8
|
+
that DuckDB uses with VGI workers.
|
|
9
|
+
|
|
10
|
+
Architecture::
|
|
11
|
+
|
|
12
|
+
VGI Worker(s) ──── vgi_rpc (Unix socket) ──── db-transactor
|
|
13
|
+
│
|
|
14
|
+
DuckDB file
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from vgi.transactor.client import TransactorClient
|
|
19
|
+
from vgi.transactor.protocol import TransactorProtocol
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"TransactorClient",
|
|
23
|
+
"TransactorProtocol",
|
|
24
|
+
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Type-safe shim for VGI's ``subcursor()`` extension to duckdb-python.
|
|
4
|
+
|
|
5
|
+
The VGI fork of duckdb-python adds ``DuckDBPyConnection.subcursor()`` so
|
|
6
|
+
callers can issue reads inside an open write transaction. That change has
|
|
7
|
+
not yet been merged into haybarn or upstreamed to duckdb — only local fork
|
|
8
|
+
builds provide it. The upstream type stubs don't know about it, so we cast
|
|
9
|
+
through a small Protocol here rather than scatter ``# type: ignore``
|
|
10
|
+
across the codebase.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import TYPE_CHECKING, Protocol, cast
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import duckdb
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class _SupportsSubcursor(Protocol):
    """Structural type for a connection that exposes ``subcursor()``."""

    def subcursor(self) -> duckdb.DuckDBPyConnection:
        """Return a cursor from the connection."""
        ...


def subcursor(conn: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyConnection:
    """Return a read cursor that shares ``conn``'s transaction context.

    The cast only informs the type checker that ``conn`` has a
    ``subcursor()`` method; at runtime the method is called on ``conn``
    as-is.
    """
    fork_conn = cast(_SupportsSubcursor, conn)
    return fork_conn.subcursor()
|
vgi/transactor/client.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""TransactorClient — connects to a db-transactor subprocess.
|
|
4
|
+
|
|
5
|
+
Handles auto-spawning the transactor process if one isn't running,
|
|
6
|
+
and provides a typed ``vgi_rpc`` proxy for RPC calls.
|
|
7
|
+
|
|
8
|
+
The transactor manages multiple databases internally (one per attach_opaque_data),
|
|
9
|
+
so a single transactor process serves all catalog attachments.
|
|
10
|
+
|
|
11
|
+
Usage::
|
|
12
|
+
|
|
13
|
+
client = TransactorClient()
|
|
14
|
+
proxy = client.get_proxy()
|
|
15
|
+
proxy.register(attach_opaque_data)
|
|
16
|
+
tx_id = proxy.begin(attach_opaque_data)
|
|
17
|
+
# ... use proxy.insert(), proxy.scan(), etc.
|
|
18
|
+
proxy.commit(attach_opaque_data, tx_id)
|
|
19
|
+
client.close()
|
|
20
|
+
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import logging
|
|
26
|
+
import os
|
|
27
|
+
import subprocess
|
|
28
|
+
import sys
|
|
29
|
+
import time
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
from vgi_rpc.rpc import RpcConnection, UnixTransport
|
|
34
|
+
|
|
35
|
+
from vgi.transactor.protocol import TransactorProtocol
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger("vgi.transactor.client")
|
|
38
|
+
|
|
39
|
+
_MAX_SPAWN_RETRIES = 50
|
|
40
|
+
_SPAWN_RETRY_DELAY = 0.1 # seconds
|
|
41
|
+
_DEFAULT_SOCKET_PATH = "/tmp/vgi-transactor.sock" # noqa: S108
|
|
42
|
+
_DEFAULT_DB_DIR = str(Path("~/.local/state/vgi/databases").expanduser())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TransactorClient:
    """Client that connects to (and optionally spawns) a db-transactor.

    The transactor process is auto-spawned on first use if not already
    running. A single transactor serves all databases.

    The socket path is taken from the ``VGI_TRANSACTOR_SOCKET`` environment
    variable, falling back to the module default.
    """

    def __init__(self) -> None:
        """Initialize client state; no connection is made until ``get_proxy()``."""
        self._socket_path = os.environ.get("VGI_TRANSACTOR_SOCKET", _DEFAULT_SOCKET_PATH)
        self._transport: UnixTransport | None = None
        self._connection: RpcConnection[TransactorProtocol] | None = None
        self._proxy: Any = None
        self._process: subprocess.Popen | None = None  # type: ignore[type-arg]

    def get_proxy(self) -> Any:
        """Get the typed RPC proxy, spawning the transactor if needed.

        Raises:
            RuntimeError: If no connection could be established even after
                spawning a transactor.
        """
        if self._proxy is None:
            self._ensure_server()
        return self._proxy

    def _ensure_server(self) -> None:
        """Connect to an existing transactor, or spawn one and wait for it.

        After spawning, the connection is retried ``_MAX_SPAWN_RETRIES``
        times with ``_SPAWN_RETRY_DELAY`` seconds between attempts.

        Raises:
            RuntimeError: If every connection attempt fails.
        """
        if self._try_connect():
            return

        self._spawn_server()

        for _ in range(_MAX_SPAWN_RETRIES):
            time.sleep(_SPAWN_RETRY_DELAY)
            if self._try_connect():
                return

        raise RuntimeError(f"Failed to connect to transactor after spawning (socket: {self._socket_path})")

    def _try_connect(self) -> bool:
        """Try to connect to an existing transactor socket.

        Returns:
            True when connected (transport, connection and proxy are then
            stored on the instance); False when the socket is missing or an
            ``OSError`` occurs. On any failure the half-open socket is
            closed and the instance is left unchanged.
        """
        import socket

        if sys.platform == "win32":  # pragma: no cover - AF_UNIX transactor is POSIX-only
            return False

        if not os.path.exists(self._socket_path):
            return False

        try:
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        except OSError:
            return False

        connected = False
        try:
            sock.connect(self._socket_path)
            transport = UnixTransport(sock)
            connection = RpcConnection(TransactorProtocol, transport)  # type: ignore[type-abstract, unused-ignore]
            proxy = connection.__enter__()
            connected = True
        except OSError:
            # ConnectionRefusedError and FileNotFoundError are OSError
            # subclasses, e.g. a stale socket file left by a dead transactor.
            return False
        finally:
            if not connected:
                # Don't leak a file descriptor per failed attempt; the spawn
                # loop can call this up to _MAX_SPAWN_RETRIES times.
                sock.close()

        # Publish state only once the whole handshake has succeeded.
        self._transport = transport
        self._connection = connection
        self._proxy = proxy
        logger.info("Connected to transactor: %s", self._socket_path)
        return True

    def _spawn_server(self) -> None:
        """Spawn a new transactor subprocess in its own session.

        The database directory comes from ``VGI_TRANSACTOR_DB_DIR`` (or the
        module default) and is created if missing.
        """
        db_dir = os.environ.get("VGI_TRANSACTOR_DB_DIR", _DEFAULT_DB_DIR)
        os.makedirs(db_dir, exist_ok=True)

        cmd = [
            sys.executable,
            "-m",
            "vgi.transactor.server",
            "--db-dir",
            db_dir,
            "--socket",
            self._socket_path,
        ]
        logger.info("Spawning transactor: %s", " ".join(cmd))
        self._process = subprocess.Popen(  # noqa: S603
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )

    def close(self) -> None:
        """Close the connection; errors raised while closing are suppressed.

        Only this client's connection and transport are released; a spawned
        transactor process is not terminated here. The cached proxy is
        dropped as well, so a later ``get_proxy()`` reconnects instead of
        returning a proxy bound to the closed connection.
        """
        import contextlib

        if self._connection is not None:
            with contextlib.suppress(Exception):
                self._connection.__exit__(None, None, None)
            self._connection = None
        if self._transport is not None:
            with contextlib.suppress(Exception):
                self._transport.close()
            self._transport = None
        self._proxy = None
|