vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/table_function.py
ADDED
|
@@ -0,0 +1,1130 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Base classes for table functions with cardinality hints and callback-based processing.
|
|
4
|
+
|
|
5
|
+
TableFunctionGenerator produces output batches via a per-tick callback. Each call
|
|
6
|
+
to process() either emits a batch via out.emit() or signals completion via out.finish().
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import uuid
|
|
12
|
+
from abc import abstractmethod
|
|
13
|
+
from collections.abc import Mapping
|
|
14
|
+
from dataclasses import dataclass, is_dataclass
|
|
15
|
+
from enum import Enum, auto
|
|
16
|
+
from typing import (
|
|
17
|
+
TYPE_CHECKING,
|
|
18
|
+
Annotated,
|
|
19
|
+
Any,
|
|
20
|
+
ClassVar,
|
|
21
|
+
TypeVar,
|
|
22
|
+
final,
|
|
23
|
+
get_args,
|
|
24
|
+
get_origin,
|
|
25
|
+
get_type_hints,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
import pyarrow as pa
|
|
29
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
30
|
+
from vgi_rpc.rpc import AuthContext, CallContext, OutputCollector
|
|
31
|
+
|
|
32
|
+
import vgi.function
|
|
33
|
+
from vgi.arguments import (
|
|
34
|
+
Arg,
|
|
35
|
+
Arguments,
|
|
36
|
+
Secret,
|
|
37
|
+
SecretLookupEntry,
|
|
38
|
+
TableInput,
|
|
39
|
+
_accepts_none,
|
|
40
|
+
_extract_setting_secret_params,
|
|
41
|
+
)
|
|
42
|
+
from vgi.function_storage import BoundStorage, TransactionBoundStorage, attach_catalog_bytes
|
|
43
|
+
from vgi.invocation import (
|
|
44
|
+
BaseInitResponse,
|
|
45
|
+
BindResponse,
|
|
46
|
+
GlobalInitResponse,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
if TYPE_CHECKING:
|
|
50
|
+
from vgi.catalog.catalog_interface import ColumnStatistics
|
|
51
|
+
from vgi.protocol import BindRequest, InitRequest
|
|
52
|
+
from vgi.table_filter_pushdown import PushdownFilters
|
|
53
|
+
|
|
54
|
+
_ON_CANCEL_CAVEATS = """\
|
|
55
|
+
**Best-effort only.** This hook does not fire in every
|
|
56
|
+
cancellation path — process kills, network partitions, and
|
|
57
|
+
some error-on-error unwinds skip it. Never rely on
|
|
58
|
+
``on_cancel`` for correctness-critical cleanup; treat it as a
|
|
59
|
+
resource-release optimization.
|
|
60
|
+
|
|
61
|
+
Under HTTP pooling with ``max_workers > 1``, ``on_cancel`` may
|
|
62
|
+
fire on a different worker process than the one that produced
|
|
63
|
+
batches for this stream. Process-local resources held in a
|
|
64
|
+
specific worker's memory cannot be reliably released from
|
|
65
|
+
another worker's ``on_cancel``; prefer shared infrastructure
|
|
66
|
+
whose handle is re-derivable from the serialized state."""
|
|
67
|
+
|
|
68
|
+
# Names exported by ``from vgi.table_function import *``.
# NOTE(review): two underscore-prefixed helpers are exported below, while
# ``project_schema``, ``OrderByDirection`` and ``OrderByNullOrder`` (all
# defined in this module) are not listed — confirm this is intentional.
__all__ = [
    "TableCardinality",
    "BindParams",
    "InitParams",
    "ProcessParams",
    "SecretsAccessor",
    "TableFunctionBase",
    "TableFunctionGenerator",
    "TableInOutFunctionInitPhase",
    "init_single_worker",
    "bind_fixed_schema",
    "_struct_scalar_to_dict",
    "_extract_setting_secret_params",
]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass(slots=True, frozen=True)
class TableCardinality(ArrowSerializableDataclass):
    """Row-count hints a table function can offer to the query planner.

    Both values are optional; the planner can use them when choosing join
    order, sizing memory, and deciding how much to parallelize.

    Attributes:
        estimate: Expected number of output rows, or None when unknown.
        max: Upper bound on output rows, or None when unbounded.

    """

    estimate: int | None
    max: int | None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _batch_to_scalar_dict(batch: pa.RecordBatch | None) -> dict[str, pa.Scalar[Any]]:
|
|
102
|
+
"""Extract a single-row RecordBatch into a dict of column-name to scalar value."""
|
|
103
|
+
if batch is None:
|
|
104
|
+
return {}
|
|
105
|
+
return {name: batch.column(i)[0] for i, name in enumerate(batch.schema.names)}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _struct_scalar_to_dict(scalar: pa.StructScalar) -> dict[str, pa.Scalar[Any]]:
|
|
109
|
+
"""Expand a struct scalar into a dict of field name to scalar."""
|
|
110
|
+
return {key: scalar[key] for key in scalar}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class SecretsAccessor:
    """Single entry point for secrets, whether already resolved or still pending.

    Secrets delivered in the secrets batch are usable straight away. A lookup
    that cannot be answered on the first pass is recorded as a pending
    ``SecretLookupEntry``; the framework then retries the bind (two-phase
    bind) with ``is_retry=True`` and the resolved values attached.
    """

    __slots__ = ("_unscoped", "_scoped", "_is_retry", "_pending_lookups")

    def __init__(self, secrets_batch: pa.RecordBatch | None, *, is_retry: bool = False) -> None:
        """Split the first row of ``secrets_batch`` into unscoped and scoped secrets.

        Args:
            secrets_batch: Batch with one column per secret, or None for no secrets.
            is_retry: True on the second bind pass, when misses are final.

        """
        self._is_retry = is_retry
        self._pending_lookups: list[SecretLookupEntry] = []
        # Keyed by column name, which is the secret_type for unscoped secrets.
        self._unscoped: dict[str, dict[str, pa.Scalar[Any]]] = {}
        # One (field metadata, payload-or-None) pair per "secret_*" column.
        self._scoped: list[tuple[dict[str, str], dict[str, pa.Scalar[Any]] | None]] = []

        if secrets_batch is None:
            return

        for index, column_name in enumerate(secrets_batch.schema.names):
            arrow_field = secrets_batch.schema.field(index)
            value = secrets_batch.column(index)[0]

            if not column_name.startswith("secret_"):
                # Unscoped: null values are simply not recorded.
                if value.is_valid:
                    self._unscoped[column_name] = _struct_scalar_to_dict(value)
                continue

            # Scoped: the lookup criteria travel as Arrow field metadata,
            # which may arrive as bytes and is normalised to str here.
            decoded_meta = {
                (key.decode() if isinstance(key, bytes) else key): (
                    val.decode() if isinstance(val, bytes) else val
                )
                for key, val in (arrow_field.metadata or {}).items()
            }
            payload = _struct_scalar_to_dict(value) if value.is_valid else None
            self._scoped.append((decoded_meta, payload))

    def get(
        self,
        secret_type: str,
        *,
        name: str | None = None,
        scope: str | None = None,
        required: bool = False,
    ) -> dict[str, pa.Scalar[Any]] | None:
        """Look up a secret by type, optionally narrowed by name and/or scope.

        Args:
            secret_type: The secret type (e.g., "vgi_example", "s3").
            name: Optional secret name to match.
            scope: Optional scope to match. This class compares it to the
                ``scope`` field metadata by equality; any prefix matching
                presumably happens before the batch is built — TODO confirm.
            required: If True, a miss on the retry pass raises instead of
                returning None. It has no effect on the first pass.

        Returns:
            The secret as a dict of field name to Arrow scalar, or None when
            it is missing or still pending resolution.

        Raises:
            ValueError: On the retry pass, if ``required`` is True and the
                secret was not found.

        """
        if not scope and not name:
            # Plain lookup by type only.
            found = self._unscoped.get(secret_type)
            if found is not None:
                return found
            if not self._is_retry:
                # First pass: ask the framework to resolve it and retry.
                self._pending_lookups.append(SecretLookupEntry(secret_type=secret_type))
                return None
            if required:
                raise ValueError(f"Required secret '{secret_type}' not found")
            return None

        if not self._is_retry:
            # First pass with a scope/name: always deferred to the retry.
            self._pending_lookups.append(SecretLookupEntry(secret_type=secret_type, scope=scope, secret_name=name))
            return None

        found = self._find_scoped(secret_type, name, scope)
        if found is None and required:
            raise ValueError(f"Required secret '{secret_type}' not found (scope={scope!r}, name={name!r})")
        return found

    @property
    def all_resolved(self) -> bool:
        """True when no lookup is waiting for resolution.

        Lets callers that do not pass ``required=True`` to ``get()`` tell a
        pending secret apart from one that is really absent.
        """
        return not self._pending_lookups

    @property
    def needs_resolution(self) -> bool:
        """True when at least one lookup is waiting for resolution."""
        return bool(self._pending_lookups)

    @property
    def pending_lookups(self) -> list[SecretLookupEntry]:
        """Return a copy of the pending secret lookups."""
        return self._pending_lookups.copy()

    def to_dict(self) -> dict[str, dict[str, pa.Scalar[Any]]]:
        """Return every resolved secret in one dict keyed by secret_type.

        Unscoped entries come first; scoped entries are then added under the
        ``secret_type`` from their field metadata, overwriting an equal key.
        Scoped entries with no payload or no ``secret_type`` are left out.
        """
        merged = dict(self._unscoped)
        for field_meta, payload in self._scoped:
            if payload is None:
                continue
            type_key = field_meta.get("secret_type", "")
            if type_key:
                merged[type_key] = payload
        return merged

    def _find_scoped(
        self,
        secret_type: str,
        name: str | None,
        scope: str | None,
    ) -> dict[str, pa.Scalar[Any]] | None:
        """Return the payload of the first scoped entry matching all given criteria.

        ``scope`` and ``name`` are only compared when not None. The matching
        entry's payload is returned as-is, so it may itself be None.
        """
        for field_meta, payload in self._scoped:
            if (
                field_meta.get("secret_type") == secret_type
                and (scope is None or field_meta.get("scope") == scope)
                and (name is None or field_meta.get("secret_name") == name)
            ):
                return payload
        return None
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def project_schema(projection_ids: list[int] | None, schema: pa.Schema) -> pa.Schema:
    """Return ``schema`` narrowed to ``projection_ids``, or unchanged when None.

    Args:
        projection_ids: Field indexes to keep, in output order, or None.
        schema: The full schema.

    Returns:
        ``schema`` itself when ``projection_ids`` is None; otherwise a new
        schema built from the selected fields.

    """
    if projection_ids is None:
        return schema
    selected_fields = [schema.field(column_id) for column_id in projection_ids]
    return pa.schema(selected_fields)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _effective_projection_ids(func_cls: Any, projection_ids: list[int] | None) -> list[int] | None:
|
|
261
|
+
"""Return projection_ids only if the function supports projection pushdown."""
|
|
262
|
+
if projection_ids is not None and func_cls.get_metadata().projection_pushdown:
|
|
263
|
+
return projection_ids
|
|
264
|
+
return None
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class TableInOutFunctionInitPhase(Enum):
    """Which phase an init call for a table function belongs to.

    ``INPUT`` and ``FINALIZE`` drive the streaming TableInOutGenerator path.
    ``TABLE_BUFFERING`` is the Sink+Source init phase for
    ``TableBufferingFunction``; after init, traffic moves to
    ``table_buffering_process`` / ``_combine`` (unary), and
    ``TABLE_BUFFERING_FINALIZE`` opens a producer-mode finalize stream per
    finalize_state_id.
    """

    # Explicit values, numbered consecutively from 1.
    INPUT = 1
    FINALIZE = 2
    TABLE_BUFFERING = 3
    TABLE_BUFFERING_FINALIZE = 4
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class OrderByDirection(Enum):
    """Sort direction of an ORDER BY pushed down by DuckDB's RowGroupPruner optimizer."""

    # Explicit values, numbered consecutively from 1.
    ASC = 1
    DESC = 2
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class OrderByNullOrder(Enum):
    """NULL placement of an ORDER BY pushed down by DuckDB's RowGroupPruner optimizer."""

    # Explicit values, numbered consecutively from 1.
    NULLS_FIRST = 1
    NULLS_LAST = 2
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
299
|
+
class BindParams[TArgs]:
|
|
300
|
+
"""Parameters passed to on_bind()."""
|
|
301
|
+
|
|
302
|
+
args: TArgs
|
|
303
|
+
bind_call: BindRequest
|
|
304
|
+
# Convenient access to settings and secrets, extracted from the bind_call.
|
|
305
|
+
settings: dict[str, pa.Scalar[Any]]
|
|
306
|
+
secrets: SecretsAccessor
|
|
307
|
+
# Transaction-scoped storage view. Lets ``cardinality()`` and
|
|
308
|
+
# ``statistics()`` cache expensive lookups (e.g. Kafka watermarks)
|
|
309
|
+
# in the same store ``on_init`` reads/writes for snapshot isolation
|
|
310
|
+
# — so a topic's row count is fetched once per SQL transaction
|
|
311
|
+
# rather than once per bind/cardinality/statistics/init phase.
|
|
312
|
+
# ``None`` when ``bind_call.transaction_opaque_data`` is unset.
|
|
313
|
+
transaction_storage: TransactionBoundStorage | None = None
|
|
314
|
+
# Execution-scoped storage view. Populated only on call paths that
|
|
315
|
+
# carry a ``global_execution_id`` — currently just
|
|
316
|
+
# ``dynamic_to_string``. ``None`` for ``bind`` / ``cardinality`` /
|
|
317
|
+
# ``statistics`` (they predate execution and have no
|
|
318
|
+
# execution_id).
|
|
319
|
+
storage: BoundStorage | None = None
|
|
320
|
+
auth_context: AuthContext = AuthContext.anonymous()
|
|
321
|
+
# The catalog's attach bytes, unwrapped by the framework (encryption is the
|
|
322
|
+
# framework's concern, not the user's). This is what the catalog returned at
|
|
323
|
+
# ``catalog_attach`` — the framework shard-UUID prefix is already stripped.
|
|
324
|
+
# None when invoked without an ATTACH. Storage shards on that UUID separately.
|
|
325
|
+
attach_opaque_data: bytes | None = None
|
|
326
|
+
|
|
327
|
+
@property
|
|
328
|
+
def at_unit(self) -> str | None:
|
|
329
|
+
"""The AT (TIMESTAMP|VERSION) unit for this scan, or None without an AT clause.
|
|
330
|
+
|
|
331
|
+
NOTE: for inline-bound (function-backed) tables on_bind runs once
|
|
332
|
+
at attach with no AT, so this is None here — read AT at init/process via
|
|
333
|
+
``ProcessParams.at_value``. See ``BindRequest.at_unit``.
|
|
334
|
+
"""
|
|
335
|
+
return self.bind_call.at_unit
|
|
336
|
+
|
|
337
|
+
@property
|
|
338
|
+
def at_value(self) -> str | None:
|
|
339
|
+
"""The AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
|
|
340
|
+
return self.bind_call.at_value
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
344
|
+
class InitParams[TArgs]:
|
|
345
|
+
"""Parameters passed to on_init()."""
|
|
346
|
+
|
|
347
|
+
args: TArgs
|
|
348
|
+
init_call: InitRequest
|
|
349
|
+
|
|
350
|
+
execution_id: bytes
|
|
351
|
+
|
|
352
|
+
# This is the projected schema based on projection_ids,
|
|
353
|
+
# which is what the function should produce.
|
|
354
|
+
output_schema: pa.Schema
|
|
355
|
+
|
|
356
|
+
# Convenient access to settings and secrets as dicts, extracted from the bind_call.
|
|
357
|
+
settings: dict[str, pa.Scalar[Any]]
|
|
358
|
+
secrets: dict[str, dict[str, pa.Scalar[Any]]]
|
|
359
|
+
|
|
360
|
+
storage: BoundStorage
|
|
361
|
+
auth_context: AuthContext = AuthContext.anonymous()
|
|
362
|
+
# Catalog's attach bytes, unwrapped by the framework (uuid prefix stripped);
|
|
363
|
+
# None without an ATTACH. See ``BindParams``.
|
|
364
|
+
attach_opaque_data: bytes | None = None
|
|
365
|
+
|
|
366
|
+
@property
|
|
367
|
+
def at_unit(self) -> str | None:
|
|
368
|
+
"""AT (TIMESTAMP|VERSION) unit for this scan, or None.
|
|
369
|
+
|
|
370
|
+
Carried on the per-scan bind embedded in the init request.
|
|
371
|
+
See ``BindRequest.at_unit``.
|
|
372
|
+
"""
|
|
373
|
+
return self.init_call.bind_call.at_unit
|
|
374
|
+
|
|
375
|
+
@property
|
|
376
|
+
def at_value(self) -> str | None:
|
|
377
|
+
"""AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
|
|
378
|
+
return self.init_call.bind_call.at_value
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
382
|
+
class ProcessParams[TArgs]:
|
|
383
|
+
"""Parameters passed to process() and finalize()."""
|
|
384
|
+
|
|
385
|
+
args: TArgs
|
|
386
|
+
init_call: InitRequest | None # None for aggregate functions
|
|
387
|
+
init_response: BaseInitResponse | None # None for aggregate functions
|
|
388
|
+
|
|
389
|
+
# This is the projected schema based on projection_ids,
|
|
390
|
+
# which is what the function should produce.
|
|
391
|
+
output_schema: pa.Schema
|
|
392
|
+
|
|
393
|
+
# Convenient access to settings and secrets as dicts, extracted from the bind_call.
|
|
394
|
+
settings: dict[str, pa.Scalar[Any]]
|
|
395
|
+
secrets: dict[str, dict[str, pa.Scalar[Any]]]
|
|
396
|
+
|
|
397
|
+
storage: BoundStorage
|
|
398
|
+
auth_context: AuthContext = AuthContext.anonymous()
|
|
399
|
+
|
|
400
|
+
# Current pushdown filters (updated dynamically from tick metadata for Top-N queries).
|
|
401
|
+
# None if no filters have been received. Updated before each process() call.
|
|
402
|
+
current_pushdown_filters: Any = None # PushdownFilters | None
|
|
403
|
+
|
|
404
|
+
# Globally-unique monotonic batch index for this process() call. Populated
|
|
405
|
+
# ONLY for TableBufferingFunction subclasses with
|
|
406
|
+
# Meta.requires_input_batch_index=True — the C++ Sink reads it from DuckDB's
|
|
407
|
+
# per-chunk OperatorPartitionInfo and forwards it. Workers can accumulate
|
|
408
|
+
# (batch_index, payload) tuples and sort in combine() to reconstruct source
|
|
409
|
+
# order under parallel ingest. None for every other call path.
|
|
410
|
+
batch_index: int | None = None
|
|
411
|
+
|
|
412
|
+
# Catalog's attach bytes, unwrapped by the framework (uuid prefix stripped);
|
|
413
|
+
# None without an ATTACH. See ``BindParams``.
|
|
414
|
+
attach_opaque_data: bytes | None = None
|
|
415
|
+
|
|
416
|
+
@property
|
|
417
|
+
def at_unit(self) -> str | None:
|
|
418
|
+
"""AT (TIMESTAMP|VERSION) unit for this scan, or None.
|
|
419
|
+
|
|
420
|
+
Carried on the per-scan bind embedded in the init request; None for
|
|
421
|
+
aggregate functions (no init_call). See ``BindRequest.at_unit``.
|
|
422
|
+
"""
|
|
423
|
+
return self.init_call.bind_call.at_unit if self.init_call is not None else None
|
|
424
|
+
|
|
425
|
+
@property
|
|
426
|
+
def at_value(self) -> str | None:
|
|
427
|
+
"""AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
|
|
428
|
+
return self.init_call.bind_call.at_value if self.init_call is not None else None
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class TableFunctionBase[TArgs](vgi.function.Function):
|
|
432
|
+
"""Base class for table functions with cardinality and schema validation.
|
|
433
|
+
|
|
434
|
+
Extends Function with:
|
|
435
|
+
- Cardinality hints for query optimization
|
|
436
|
+
- Projection pushdown support
|
|
437
|
+
|
|
438
|
+
This class is not meant to be used directly. Subclass either:
|
|
439
|
+
- TableFunctionGenerator: For simple generators that produce output
|
|
440
|
+
- TableInOutGenerator: For functions that transform input batches
|
|
441
|
+
|
|
442
|
+
See Also:
|
|
443
|
+
TableFunctionGenerator: Simple generator base class
|
|
444
|
+
TableInOutGenerator: Full streaming with input batches
|
|
445
|
+
|
|
446
|
+
"""
|
|
447
|
+
|
|
448
|
+
FunctionArguments: ClassVar[type]
|
|
449
|
+
_setting_params: ClassVar[dict[str, str]]
|
|
450
|
+
_secret_params: ClassVar[dict[str, Secret]]
|
|
451
|
+
|
|
452
|
+
def __init_subclass__(cls) -> None:
    """Validate FunctionArguments, auto-extracting from generic parameter if needed.

    Runs once per subclass, at class-creation time, in this order:

    1. Reject a ``TState`` (second type argument of the first
       ``TableFunctionBase``-derived generic base) that is a class but not an
       ``ArrowSerializableDataclass``.
    2. If ``FunctionArguments`` is neither set nor inherited, take it from the
       first type argument of that base (``None`` becomes an empty generated
       dataclass; an unresolved ``TypeVar`` is ignored).
    3. Return early, with empty ``_setting_params`` / ``_secret_params``, for
       abstract classes and for classes without ``FunctionArguments`` whose
       generic base still has a ``TypeVar`` as its first argument.
    4. Otherwise default ``FunctionArguments`` to an empty dataclass, or check
       that it is a dataclass whose every type hint is
       ``Annotated[T, Arg(...)]``.
    5. Fill ``_setting_params`` / ``_secret_params`` from ``on_bind`` when
       this class defines it in its own body; otherwise keep whatever is
       inherited (or empty dicts).

    Raises:
        TypeError: If ``TState`` is a non-serializable class, if
            ``FunctionArguments`` is not a dataclass, or if one of its hints
            is not ``Annotated`` with an ``Arg`` in its metadata.

    """
    super().__init_subclass__()

    # Validate TState (second generic type parameter) is serializable.
    #
    # This runs unconditionally — independently of the FunctionArguments
    # auto-extraction below. The check used to be nested inside the
    # ``not hasattr(cls, "FunctionArguments")`` branch, so any class that
    # set ``FunctionArguments`` explicitly in its body silently skipped
    # TState validation. That let non-serializable state slip through: it
    # appears to work on subprocess transport (the worker process is
    # long-lived, so the live state object survives between ``process()``
    # ticks) but breaks on HTTP, where each tick is an independent request
    # and state must round-trip through the stream-state token.
    #
    # Only the bases written in this class's own body are inspected
    # (``cls.__dict__``), and only the first TableFunctionBase-derived one.
    for base in cls.__dict__.get("__orig_bases__", ()):
        origin = get_origin(base)
        if origin is not None and issubclass(origin, TableFunctionBase):
            type_args = get_args(base)
            if len(type_args) >= 2:
                state_type = type_args[1]
                # Non-class type arguments (TypeVars, None, typing constructs)
                # are deliberately let through; only real classes are checked.
                if (
                    state_type is not None
                    and state_type is not type(None)
                    and not isinstance(state_type, TypeVar)
                    and isinstance(state_type, type)
                    and not issubclass(state_type, ArrowSerializableDataclass)
                ):
                    raise TypeError(
                        f"{cls.__name__}: TState type {state_type.__name__} must extend "
                        f"ArrowSerializableDataclass for HTTP state serialization. "
                        f"Use @dataclass(kw_only=True) and inherit from ArrowSerializableDataclass."
                    )
            break

    # Auto-extract FunctionArguments from generic type parameter if not explicitly set.
    # e.g., class MyFunc(TableFunctionGenerator[MyArgs]) -> cls.FunctionArguments = MyArgs
    # hasattr() also sees inherited values, so a subclass of a concrete
    # function keeps its parent's FunctionArguments.
    if not hasattr(cls, "FunctionArguments"):
        for base in cls.__dict__.get("__orig_bases__", ()):
            origin = get_origin(base)
            if origin is not None and issubclass(origin, TableFunctionBase):
                type_args = get_args(base)
                if type_args and not isinstance(type_args[0], TypeVar):
                    if type_args[0] is type(None):
                        # None means no arguments — create empty dataclass
                        from dataclasses import make_dataclass

                        cls.FunctionArguments = make_dataclass(f"_{cls.__name__}Args", [])
                    else:
                        cls.FunctionArguments = type_args[0]
                break

    # Skip validation for abstract base classes
    # (any attribute still flagged with __isabstractmethod__).
    is_abstract = any(getattr(getattr(cls, name, None), "__isabstractmethod__", False) for name in dir(cls))
    if is_abstract:
        cls._setting_params = {}
        cls._secret_params = {}
        return

    # Skip intermediate base classes that still have unresolved type parameters
    # Unlike the loops above, this one looks at every generic base, not only
    # TableFunctionBase-derived ones.
    if not hasattr(cls, "FunctionArguments"):
        has_unresolved = False
        for base in cls.__dict__.get("__orig_bases__", ()):
            type_args = get_args(base)
            if type_args and isinstance(type_args[0], TypeVar):
                has_unresolved = True
                break
        if has_unresolved:
            cls._setting_params = {}
            cls._secret_params = {}
            return

    if not hasattr(cls, "FunctionArguments"):
        # Provide a default empty FunctionArguments for classes that use
        # class-level Arg descriptors (e.g., TableInOutFunction subclasses
        # without type parameters). This preserves backward compatibility.
        from dataclasses import make_dataclass

        cls.FunctionArguments = make_dataclass(f"_{cls.__name__}Args", [])
    else:
        args_class = cls.FunctionArguments

        # Validate FunctionArguments is a dataclass
        if not is_dataclass(args_class):
            raise TypeError(
                f"{cls.__name__}.FunctionArguments must be a dataclass. "
                f"Add @dataclass decorator to {args_class.__name__}"
            )

        # Validate all fields are Annotated with Arg
        # (include_extras=True keeps the Annotated wrapper in the hints).
        hints = get_type_hints(args_class, include_extras=True)
        for field_name, hint in hints.items():
            if get_origin(hint) is not Annotated:
                raise TypeError(
                    f"{cls.__name__}.FunctionArguments.{field_name} must use Annotated[T, Arg(...)], got {hint}"
                )

            # Check that Arg is in the metadata
            metadata = get_args(hint)[1:]
            has_arg = any(isinstance(meta, Arg) for meta in metadata)
            if not has_arg:
                raise TypeError(
                    f"{cls.__name__}.FunctionArguments.{field_name} must have Arg(...) in Annotated metadata"
                )

    # Parse on_bind() signature for Setting/Secret annotations
    # Only when on_bind is defined in this class's own body; an inherited
    # on_bind keeps the parent's already-extracted params (same dict objects).
    on_bind_method = getattr(cls, "on_bind", None)
    if on_bind_method is not None and "on_bind" in cls.__dict__:
        cls._setting_params, cls._secret_params = _extract_setting_secret_params(on_bind_method)
    else:
        cls._setting_params = getattr(cls, "_setting_params", {})
        cls._secret_params = getattr(cls, "_secret_params", {})
|
|
564
|
+
|
|
565
|
+
@final
@staticmethod
def _parse_arguments(args_class: type[TArgs], arguments: Arguments) -> TArgs:
    """Build a typed ``args_class`` instance from the raw call ``arguments``.

    Only fields whose resolved hint is ``Annotated[...]`` contribute:

    * a field whose base type is ``TableInput`` receives a fresh
      ``TableInput()`` sentinel;
    * a field whose first ``Arg`` metadata has ``varargs`` set receives a
      tuple of the positional arguments from ``Arg.position`` onwards;
    * any other field with ``Arg`` metadata is looked up through
      ``arguments.get`` (falling back to ``Arg.default``) and validated.

    Args:
        args_class: The FunctionArguments class to instantiate.
        arguments: The raw arguments received with the call.

    Returns:
        ``args_class`` constructed from the collected keyword values.

    Raises:
        The exception produced by ``Arg._reject_none()`` when the looked-up
        value is ``None`` and ``_accepts_none`` rejects the field's base type.
    """
    collected: dict[str, Any] = {}

    for field_name, annotation in get_type_hints(args_class, include_extras=True).items():
        if get_origin(annotation) is not Annotated:
            continue

        declared_type, *extras = get_args(annotation)

        # TableInput is a sentinel: it carries no argument data of its own.
        if declared_type is TableInput:
            collected[field_name] = TableInput()
            continue

        # Only the first Arg in the Annotated metadata is honoured.
        spec = next((extra for extra in extras if isinstance(extra, Arg)), None)
        if spec is None:
            continue

        if spec.varargs:
            # Varargs swallow every remaining positional argument, kept as
            # the raw pa.Scalar objects.
            assert isinstance(spec.position, int)
            collected[field_name] = tuple(arguments.positional[spec.position :])
            continue

        value = arguments.get(spec.position, default=spec.default)
        if value is None:
            # A SQL NULL for a non-Optional Arg is refused here. Letting it
            # through would surface much later as an opaque Python
            # ``TypeError`` inside the user's process()/update(), reported
            # by the C++ extension without any hint at the cause.
            if not _accepts_none(declared_type):
                raise spec._reject_none()
        else:
            # Constraint checks (ge/le/gt/lt/choices/pattern) apply only to
            # real values; None was accepted via Optional[T] above.
            spec._validate(value)
        collected[field_name] = value

    return args_class(**collected)
|
|
605
|
+
|
|
606
|
+
@final
@staticmethod
def _validate_arg_type_bounds(
    args_class: type,
    args: Any,
    input_schema: pa.Schema,
) -> None:
    """Check column types named by bounded ``Arg`` fields against a schema.

    For every ``Annotated`` field of ``args_class``, the first ``Arg``
    metadata whose ``type_bound`` is set is used. The matching attribute
    on ``args`` is read; when it is a string, or a tuple containing
    strings, each string is treated as a column name of ``input_schema``
    and that column's Arrow type is passed to ``Arg.validate_type_bound``.
    Values of any other kind are ignored.

    Args:
        args_class: The FunctionArguments class with Annotated type hints.
        args: The resolved FunctionArguments dataclass instance.
        input_schema: The input schema to validate column types against.

    """
    for field_name, annotation in get_type_hints(args_class, include_extras=True).items():
        if get_origin(annotation) is not Annotated:
            continue

        bounded = next(
            (
                extra
                for extra in get_args(annotation)[1:]
                if isinstance(extra, Arg) and extra.type_bound is not None
            ),
            None,
        )
        if bounded is None:
            continue

        resolved = getattr(args, field_name)
        # Normalise to a tuple so single names and varargs share one loop.
        candidates = resolved if isinstance(resolved, tuple) else (resolved,)
        for column_name in candidates:
            if isinstance(column_name, str):
                bounded.validate_type_bound(input_schema.field(column_name).type)
|
|
639
|
+
|
|
640
|
+
@classmethod
def _extract_bind_kwargs(cls, input: BindRequest) -> dict[str, Any]:
    """Collect the Setting/Secret keyword arguments to pass to ``on_bind()``.

    Settings are only consulted when ``input.settings`` is present and
    ``cls._setting_params`` is non-empty; likewise for secrets. Within a
    consulted batch, a parameter whose column is absent maps to ``None``.

    Args:
        input: The bind request carrying the settings and secrets batches.

    Returns:
        Mapping of ``on_bind()`` keyword name to the first-row value of the
        matching settings column, or to the dict built from the first-row
        struct of the matching secrets column.
    """
    extra: dict[str, Any] = {}

    # Settings: one column per setting key, value taken from row 0.
    settings_batch = input.settings
    if settings_batch is not None and cls._setting_params:
        schema = settings_batch.schema
        for param_name, setting_key in cls._setting_params.items():
            position = schema.get_field_index(setting_key)
            if position >= 0:
                extra[param_name] = settings_batch.column(position)[0]
            else:
                extra[param_name] = None

    # Secrets: one struct column per secret type, converted to a dict.
    secrets_batch = input.secrets
    if secrets_batch is not None and cls._secret_params:
        schema = secrets_batch.schema
        for param_name, secret in cls._secret_params.items():
            position = schema.get_field_index(secret.secret_type)
            if position >= 0:
                extra[param_name] = _struct_scalar_to_dict(secrets_batch.column(position)[0])
            else:
                extra[param_name] = None

    return extra
|
|
664
|
+
|
|
665
|
+
@final
@classmethod
def _make_bind_params(
    cls,
    input: BindRequest,
    *,
    auth_context: AuthContext | None = None,
    execution_id: bytes | None = None,
    attach_plaintext: bytes | None = None,
) -> BindParams[TArgs]:
    """Assemble the ``BindParams`` handed to user hooks from a ``BindRequest``.

    Shared by bind() and table_function_cardinality() so the construction
    logic lives in one place.

    Args:
        input: The bind request received from the caller.
        auth_context: Caller identity; an anonymous context is substituted
            when omitted.
        execution_id: Only supplied on call paths that have one (currently
            just ``dynamic_to_string``). When truthy, ``BindParams.storage``
            is a ``BoundStorage`` view keyed by it; otherwise it is ``None``.
        attach_plaintext: The full framework plaintext (``uuid(16) ||
            catalog_bytes``) the worker unwrapped. Storage shards on its
            UUID; bodies see only the catalog bytes via
            ``attach_opaque_data``.

    Returns:
        The populated ``BindParams``. ``transaction_storage`` is ``None``
        when the request carries no transaction opaque data.
    """
    txn_id = input.transaction_opaque_data

    # Values are computed in the same order the fields are declared below.
    parsed_args = cls._parse_arguments(cls.FunctionArguments, input.arguments)
    settings = _batch_to_scalar_dict(input.settings)
    secrets = SecretsAccessor(input.secrets, is_retry=input.resolved_secrets_provided)

    transaction_storage = None
    if txn_id:
        transaction_storage = TransactionBoundStorage(
            cls.storage,
            txn_id,
            request=input,
            attach_plaintext=attach_plaintext,
        )

    execution_storage = None
    if execution_id:
        execution_storage = BoundStorage(
            cls.storage,
            execution_id,
            request=input,
            attach_plaintext=attach_plaintext,
        )

    if auth_context is None:
        auth_context = AuthContext.anonymous()

    return BindParams[TArgs](
        args=parsed_args,
        bind_call=input,
        settings=settings,
        secrets=secrets,
        transaction_storage=transaction_storage,
        storage=execution_storage,
        auth_context=auth_context,
        attach_opaque_data=attach_catalog_bytes(attach_plaintext),
    )
|
|
711
|
+
|
|
712
|
+
# ------------------------------------------------------------------
|
|
713
|
+
# Bind / global_init — shared framework hooks for every table function.
|
|
714
|
+
#
|
|
715
|
+
# Subclasses define ``on_bind`` (and optionally ``on_init``) for the
|
|
716
|
+
# user-facing behavior; the framework's wire entry points ``bind`` and
|
|
717
|
+
# ``global_init`` are ``@final`` and live here so we have a single
|
|
718
|
+
# source of truth across TableFunctionGenerator / TableInOutGenerator /
|
|
719
|
+
# TableBufferingFunction.
|
|
720
|
+
# ------------------------------------------------------------------
|
|
721
|
+
|
|
722
|
+
@classmethod
@abstractmethod
def on_bind(
    cls,
    params: BindParams[TArgs],
) -> BindResponse:
    """Decide the output schema and run any other bind-time logic.

    Abstract: every concrete table function supplies its own
    implementation. Typical shapes are:

    * Pass through the input:
      ``return BindResponse(output_schema=params.bind_call.input_schema)``
    * Custom shape: derive a ``pa.Schema`` from ``params.args`` and return it.
    * Dynamic secrets: either declare
      ``*, my_secret: Annotated[..., Secret()] = None`` on the override, or
      call ``params.secrets.get(...)``; the framework then issues the
      secret-scope retry automatically.

    Args:
        params: Bind parameters including arguments and schema.

    Returns:
        BindResponse with output_schema and optional opaque_data.

    """
|
|
745
|
+
|
|
746
|
+
@final
@classmethod
def bind(
    cls,
    input: BindRequest,
    *,
    ctx: CallContext | None = None,
    attach_plaintext: bytes | None = None,
) -> BindResponse:
    """Bind protocol entry point. Do not override; use ``on_bind()``.

    Steps, in order:

    1. Build ``BindParams`` (anonymous auth when no ``ctx`` is given).
    2. When the request carries an input schema (table-input functions),
       validate the ``Arg`` type bounds against it.
    3. Call ``on_bind()`` with the Setting/Secret keyword arguments
       extracted from the request.
    4. If ``on_bind()`` triggered dynamic secret lookups through the
       ``SecretsAccessor``, answer with a secret-scope request instead of
       its result, which starts the two-phase bind.

    Secrets are deliberately NOT requested before ``on_bind()`` runs: table
    functions obtain them via ``on_bind`` kwargs (``Secret()`` annotations)
    and ``SecretsAccessor.get()`` calls, which may use dynamic scopes
    computed from function arguments.

    Args:
        input: The bind request.
        ctx: Call context supplying the caller's auth, if any.
        attach_plaintext: Framework plaintext forwarded to ``BindParams``.

    Returns:
        The ``on_bind()`` result, or a secret-scope request when secret
        resolution is still pending.
    """
    if ctx is None:
        auth = AuthContext.anonymous()
    else:
        auth = ctx.auth

    params = cls._make_bind_params(input, auth_context=auth, attach_plaintext=attach_plaintext)

    if input.input_schema is not None:
        cls._validate_arg_type_bounds(cls.FunctionArguments, params.args, input.input_schema)

    extra_kwargs = cls._extract_bind_kwargs(input)
    response = cls.on_bind(params, **extra_kwargs)

    accessor = params.secrets
    if not accessor.needs_resolution:
        return response
    return BindResponse.secret_scope_request(accessor.pending_lookups)
|
|
780
|
+
|
|
781
|
+
@classmethod
def on_init(
    cls,
    params: InitParams[TArgs],
) -> GlobalInitResponse:
    """Hook for one-time setup after bind and before batches are processed.

    Subclasses override this for per-execution setup such as opening
    external resources or allocating caches. The base implementation does
    nothing and returns a default ``GlobalInitResponse``.

    Args:
        params: Init parameters for this execution (unused by the default).

    Returns:
        A default-constructed ``GlobalInitResponse``.
    """
    default_response = GlobalInitResponse()
    return default_response
|
|
792
|
+
|
|
793
|
+
@final
@classmethod
def global_init(
    cls,
    input: InitRequest,
    *,
    ctx: CallContext | None = None,
    attach_plaintext: bytes | None = None,
) -> GlobalInitResponse:
    """Global init protocol entry point. Do not override; use ``on_init()``.

    Generates a fresh random execution id, builds ``InitParams`` from the
    request (re-parsing the arguments of the original bind call), invokes
    ``on_init()``, and returns its ``max_workers`` / ``opaque_data``
    together with the generated execution id.

    Args:
        input: The init request; its ``bind_call`` holds the original bind.
        ctx: Call context supplying the caller's auth, if any.
        attach_plaintext: The full framework plaintext (uuid||catalog
            bytes); storage shards on its UUID, the body sees the catalog
            bytes.

    Returns:
        ``GlobalInitResponse`` carrying the new ``execution_id``.
    """
    execution_id = uuid.uuid4().bytes

    if ctx is None:
        auth = AuthContext.anonymous()
    else:
        auth = ctx.auth

    bind_call = input.bind_call

    # Values are computed in the same order the fields are declared below.
    parsed_args = cls._parse_arguments(cls.FunctionArguments, bind_call.arguments)
    projected_schema = project_schema(
        _effective_projection_ids(cls, input.projection_ids),
        input.output_schema,
    )
    settings = _batch_to_scalar_dict(bind_call.settings)
    secrets = SecretsAccessor(bind_call.secrets).to_dict()
    storage = BoundStorage(cls.storage, execution_id, request=input, attach_plaintext=attach_plaintext)

    params = InitParams[TArgs](
        args=parsed_args,
        init_call=input,
        output_schema=projected_schema,
        settings=settings,
        secrets=secrets,
        execution_id=execution_id,
        storage=storage,
        auth_context=auth,
        attach_opaque_data=attach_catalog_bytes(attach_plaintext),
    )

    user_response = cls.on_init(params)

    # Re-wrap so the framework-generated execution id is always the one sent.
    return GlobalInitResponse(
        max_workers=user_response.max_workers,
        execution_id=execution_id,
        opaque_data=user_response.opaque_data,
    )
|
|
829
|
+
|
|
830
|
+
@classmethod
def cardinality(cls, params: BindParams[TArgs]) -> TableCardinality:
    """Estimate how many rows this invocation will produce.

    Subclasses override this to give query planners row-count estimates
    that improve join ordering and memory allocation decisions.

    Args:
        params: Bind parameters for this invocation (unused by the default).

    Returns:
        A ``TableCardinality``. The default leaves both ``estimate`` and
        ``max`` as ``None``.

    """
    unknown = TableCardinality(estimate=None, max=None)
    return unknown
|
|
842
|
+
|
|
843
|
+
@classmethod
def dynamic_to_string(
    cls,
    params: BindParams[TArgs],
    execution_id: bytes,
) -> Mapping[str, str]:
    """Supply diagnostics shown as Extra Info under EXPLAIN ANALYZE.

    Invoked once per parallel scan thread at end-of-stream. The function
    class itself must persist whatever diagnostics it wants during
    ``process()`` (shared storage, an external service, or in-memory class
    state for single-worker setups) and look them up here by
    ``execution_id``.

    DuckDB merges the per-thread maps with last-write-wins semantics, so
    the *last* thread to finish, by which time every thread has
    persisted, supplies the visible final view.

    This hook is best-effort and must not raise. The dispatcher catches
    exceptions and returns an empty map so EXPLAIN ANALYZE never breaks
    the query.

    Args:
        params: The same ``BindParams`` that ``cardinality`` and
            ``statistics`` receive: function args, settings, secrets.
        execution_id: ``VgiTableFunctionGlobalState::global_execution_id``,
            stable for the duration of the query.

    Returns:
        Ordered key/value pairs. Insertion order is preserved on the wire
        and re-emitted into the C++ profiler's
        ``InsertionOrderPreservingMap``. The C++ wrapper appends intrinsic
        keys (``Worker``, ``Function``, ``Rows Read``, ``Threads``) after
        this map; user keys override on conflict. The default is empty.

    """
    no_diagnostics: dict[str, str] = {}
    return no_diagnostics
|
|
880
|
+
|
|
881
|
+
@classmethod
def statistics(cls, params: BindParams[TArgs]) -> list[ColumnStatistics] | None:
    """Supply per-output-column statistics for this invocation.

    Subclasses override this to report min/max/distinct/null stats.
    DuckDB's optimizer can use them for filter elimination (for example
    pruning a scan entirely when the filter is out of range), better join
    ordering, and folding always-true/always-false predicates at plan time.

    ``params`` is the same ``BindParams[TArgs]`` that ``cardinality``
    receives, so stats can be derived directly from user-supplied
    arguments.

    Args:
        params: Bind parameters for this invocation (unused by the default).

    Returns:
        A list with one ``ColumnStatistics`` per column whose stats are
        known (unlisted columns get unknown stats), or ``None`` when no
        stats are available, in which case the optimizer receives no
        column stats. The default returns ``None``.

    """
    return None
|
|
902
|
+
|
|
903
|
+
@staticmethod
|
|
904
|
+
def pushdown_filters(
|
|
905
|
+
pushdown_filters: pa.RecordBatch,
|
|
906
|
+
join_keys: list[pa.RecordBatch] | None = None,
|
|
907
|
+
) -> PushdownFilters | None:
|
|
908
|
+
"""Get deserialized pushdown filters, or None if not present.
|
|
909
|
+
|
|
910
|
+
Use this property to access the filter AST for:
|
|
911
|
+
- Custom filter handling (push to SQL, APIs, etc.)
|
|
912
|
+
- Extracting column bounds for partition pruning
|
|
913
|
+
- Checking column constants for optimized lookups
|
|
914
|
+
|
|
915
|
+
For automatic filtering, set auto_apply_filters=True in Meta.
|
|
916
|
+
|
|
917
|
+
Args:
|
|
918
|
+
pushdown_filters: Arrow RecordBatch containing serialized filters.
|
|
919
|
+
join_keys: Optional list of single-column Arrow RecordBatches,
|
|
920
|
+
one per IN filter column. Available via
|
|
921
|
+
``get_join_keys_batch()`` / ``get_join_keys_batches()``
|
|
922
|
+
on the returned ``PushdownFilters``.
|
|
923
|
+
|
|
924
|
+
Returns:
|
|
925
|
+
PushdownFilters container with parsed filter AST, or None.
|
|
926
|
+
|
|
927
|
+
"""
|
|
928
|
+
if pushdown_filters is None:
|
|
929
|
+
return None
|
|
930
|
+
from vgi.table_filter_pushdown import deserialize_filters
|
|
931
|
+
|
|
932
|
+
return deserialize_filters(pushdown_filters, join_keys=join_keys)
|
|
933
|
+
|
|
934
|
+
@classmethod
def _should_auto_apply_filters(cls) -> bool:
    """Report whether ``Meta.auto_apply_filters`` is truthy.

    Returns ``False`` when the class has no ``Meta`` or ``Meta`` lacks the
    attribute.
    """
    options = getattr(cls, "Meta", None)
    flag = getattr(options, "auto_apply_filters", False)
    return bool(flag)
|
|
939
|
+
|
|
940
|
+
@classmethod
def _supports_batch_index(cls) -> bool:
    """Report whether ``Meta.supports_batch_index`` is truthy.

    The flag drives the ``batch_index=`` kwarg validation on ``out.emit()``
    in the table-producer harness (see
    vgi.protocol._TrackingOutputCollector). A missing ``Meta`` or missing
    attribute counts as ``False``.
    """
    options = getattr(cls, "Meta", None)
    flag = getattr(options, "supports_batch_index", False)
    return bool(flag)
|
|
949
|
+
|
|
950
|
+
@classmethod
def _partition_kind(cls) -> Any:
    """Look up ``Meta.partition_kind``, falling back to ``NOT_PARTITIONED``.

    The value drives the ``partition_values=`` kwarg validation on
    ``out.emit()`` in the table-producer harness. ``vgi.metadata`` is
    imported inside the method so the base class does not pull it in at
    module load time.
    """
    from vgi.metadata import PartitionKind

    fallback = PartitionKind.NOT_PARTITIONED
    options = getattr(cls, "Meta", None)
    return getattr(options, "partition_kind", fallback)
|
|
962
|
+
|
|
963
|
+
@staticmethod
def _apply_pushdown_filter(batch: pa.RecordBatch, pushdown_filters: PushdownFilters | None) -> pa.RecordBatch:
    """Run ``pushdown_filters`` over ``batch`` when there is something to do.

    Args:
        batch: RecordBatch to filter.
        pushdown_filters: The PushdownFilters to apply, or None.

    Returns:
        ``batch`` itself when it has zero rows or when ``pushdown_filters``
        is falsy; otherwise whatever ``pushdown_filters.apply(batch)``
        returns.

    """
    # The row-count check comes first so an empty batch never evaluates
    # the truthiness of the filters object.
    if batch.num_rows == 0 or not pushdown_filters:
        return batch
    return pushdown_filters.apply(batch)
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
class TableFunctionGenerator[TArgs, TState = None](TableFunctionBase[TArgs]):
|
|
984
|
+
"""Callback-based table function that produces output batches.
|
|
985
|
+
|
|
986
|
+
Each call to process() should either:
|
|
987
|
+
- Emit a batch via out.emit(batch)
|
|
988
|
+
- Signal completion via out.finish()
|
|
989
|
+
|
|
990
|
+
Use TState to persist state between process() calls.
|
|
991
|
+
|
|
992
|
+
For functions that transform input batches, use TableInOutGenerator.
|
|
993
|
+
|
|
994
|
+
"""
|
|
995
|
+
|
|
996
|
+
# bind / on_bind / on_init / global_init are defined on TableFunctionBase.
|
|
997
|
+
# TableFunctionGenerator subclasses must override the abstract on_bind
|
|
998
|
+
# to declare an output schema (TFG has no input schema to default to).
|
|
999
|
+
|
|
1000
|
+
@classmethod
|
|
1001
|
+
def initial_state(cls, params: ProcessParams[TArgs]) -> TState | None:
|
|
1002
|
+
"""Create initial processing state. Override when TState is used.
|
|
1003
|
+
|
|
1004
|
+
Called once during init to create the state object that will be
|
|
1005
|
+
passed to process() on each tick.
|
|
1006
|
+
|
|
1007
|
+
Args:
|
|
1008
|
+
params: Process parameters including arguments and schemas.
|
|
1009
|
+
|
|
1010
|
+
Returns:
|
|
1011
|
+
Initial state, or None if no state is needed.
|
|
1012
|
+
|
|
1013
|
+
"""
|
|
1014
|
+
return None
|
|
1015
|
+
|
|
1016
|
+
@classmethod
|
|
1017
|
+
@abstractmethod
|
|
1018
|
+
def process(
|
|
1019
|
+
cls,
|
|
1020
|
+
params: ProcessParams[TArgs],
|
|
1021
|
+
state: TState,
|
|
1022
|
+
out: OutputCollector,
|
|
1023
|
+
) -> None:
|
|
1024
|
+
"""Produce output for one tick.
|
|
1025
|
+
|
|
1026
|
+
Called repeatedly by the framework. Each call should either:
|
|
1027
|
+
- Call out.emit(batch) to produce one output batch
|
|
1028
|
+
- Call out.finish() to signal that generation is complete
|
|
1029
|
+
|
|
1030
|
+
Use out.client_log(level, message) for in-band logging.
|
|
1031
|
+
|
|
1032
|
+
Args:
|
|
1033
|
+
params: Process parameters including arguments and schemas.
|
|
1034
|
+
state: Mutable state persisted between calls. None if TState not used.
|
|
1035
|
+
out: OutputCollector for emitting batches, logging, and signaling finish.
|
|
1036
|
+
|
|
1037
|
+
"""
|
|
1038
|
+
|
|
1039
|
+
@classmethod
|
|
1040
|
+
def on_cancel(cls, params: ProcessParams[TArgs], state: TState) -> None: # noqa: D102
|
|
1041
|
+
pass
|
|
1042
|
+
|
|
1043
|
+
on_cancel.__func__.__doc__ = ( # type: ignore[attr-defined]
|
|
1044
|
+
f"""Release resources when the stream is cancelled before natural end.
|
|
1045
|
+
|
|
1046
|
+
The VGI C++ extension fires this hook when a DuckDB query tears
|
|
1047
|
+
down a VGI scan early (LIMIT clause, user break, Ctrl-C,
|
|
1048
|
+
exception unwind). Override to release expensive per-stream
|
|
1049
|
+
resources the function was holding in ``state`` (database
|
|
1050
|
+
cursors, LLM streaming sessions, file handles, GPU buffers).
|
|
1051
|
+
|
|
1052
|
+
{_ON_CANCEL_CAVEATS}
|
|
1053
|
+
|
|
1054
|
+
The stream has already been torn down by the time this fires;
|
|
1055
|
+
no further batches may be emitted.
|
|
1056
|
+
|
|
1057
|
+
Args:
|
|
1058
|
+
params: Process parameters (same as ``process()`` received).
|
|
1059
|
+
state: The current user state, possibly deserialized from a
|
|
1060
|
+
state-token on a different worker than the one that
|
|
1061
|
+
originally built it.
|
|
1062
|
+
"""
|
|
1063
|
+
)
|
|
1064
|
+
|
|
1065
|
+
|
|
1066
|
+
def init_single_worker[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
|
|
1067
|
+
"""Class decorator to set max_workers=1 for a TableFunctionGenerator subclass."""
|
|
1068
|
+
if "on_init" not in cls.__dict__:
|
|
1069
|
+
|
|
1070
|
+
def on_init_impl(cls_: type[T], params: Any) -> GlobalInitResponse:
|
|
1071
|
+
return GlobalInitResponse(max_workers=1)
|
|
1072
|
+
|
|
1073
|
+
cls.on_init = classmethod(on_init_impl) # type: ignore[assignment]
|
|
1074
|
+
|
|
1075
|
+
# Clear 'on_init' from __abstractmethods__ — the metaclass set it
|
|
1076
|
+
# before decorators ran, so we must update it manually.
|
|
1077
|
+
if hasattr(cls, "__abstractmethods__") and "on_init" in cls.__abstractmethods__:
|
|
1078
|
+
cls.__abstractmethods__ = cls.__abstractmethods__ - {"on_init"}
|
|
1079
|
+
|
|
1080
|
+
return cls
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
def bind_fixed_schema[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
|
|
1084
|
+
"""Class decorator to return FIXED_SCHEMA from on_bind for a TableFunctionGenerator subclass.
|
|
1085
|
+
|
|
1086
|
+
Sets ``cls._inline_bind_safe = True`` *only when* the decorator actually
|
|
1087
|
+
installs its own ``on_bind``. The catalog framework reads this marker to
|
|
1088
|
+
decide whether `Table(inline_bind=True)` is allowed — the contract is "the
|
|
1089
|
+
decorator's bind is in control, output is exactly ``cls.FIXED_SCHEMA``,
|
|
1090
|
+
no kwargs inspected." If the class already defined its own ``on_bind``,
|
|
1091
|
+
the decorator silently leaves it alone and we *must not* set the marker;
|
|
1092
|
+
otherwise the framework would inline a bind it doesn't actually control.
|
|
1093
|
+
|
|
1094
|
+
Subclasses inherit the marker via Python attribute lookup. A subclass
|
|
1095
|
+
that overrides ``on_bind`` adds it to its own ``__dict__``; the catalog
|
|
1096
|
+
framework's eligibility check is
|
|
1097
|
+
``getattr(cls, "_inline_bind_safe", False) and "on_bind" not in cls.__dict__``,
|
|
1098
|
+
which correctly excludes such subclasses.
|
|
1099
|
+
"""
|
|
1100
|
+
if "on_bind" not in cls.__dict__: # only inject if subclass hasn't overridden
|
|
1101
|
+
if not hasattr(cls, "FIXED_SCHEMA"):
|
|
1102
|
+
raise ValueError(f"Class {cls.__name__} must define FIXED_SCHEMA to use @bind_fixed_schema")
|
|
1103
|
+
|
|
1104
|
+
def on_bind_impl(cls_: type[T], params: Any) -> BindResponse:
|
|
1105
|
+
value = getattr(cls_, "FIXED_SCHEMA", None)
|
|
1106
|
+
|
|
1107
|
+
if value is None or not isinstance(value, pa.Schema):
|
|
1108
|
+
raise TypeError(f"Class {cls_.__name__}.FIXED_SCHEMA must be a pyarrow.Schema")
|
|
1109
|
+
return BindResponse(output_schema=value)
|
|
1110
|
+
|
|
1111
|
+
# Mark the function itself so we can later distinguish "decorator
|
|
1112
|
+
# installed this on_bind" from "user overrode on_bind" — useful for
|
|
1113
|
+
# downstream callers (e.g. catalog inline-bind) that need to confirm
|
|
1114
|
+
# the bind logic in effect is the decorator's, not a subclass override.
|
|
1115
|
+
on_bind_impl._is_bind_fixed_schema = True # type: ignore[attr-defined]
|
|
1116
|
+
|
|
1117
|
+
# assign as classmethod
|
|
1118
|
+
cls.on_bind = classmethod(on_bind_impl) # type: ignore[assignment]
|
|
1119
|
+
|
|
1120
|
+
# Clear 'on_bind' from __abstractmethods__ — the metaclass set it
|
|
1121
|
+
# before decorators ran, so we must update it manually.
|
|
1122
|
+
if hasattr(cls, "__abstractmethods__") and "on_bind" in cls.__abstractmethods__:
|
|
1123
|
+
cls.__abstractmethods__ = cls.__abstractmethods__ - {"on_bind"}
|
|
1124
|
+
|
|
1125
|
+
# Mark the class as inline-bind-safe *only when* we actually installed
|
|
1126
|
+
# the on_bind. If the class had a pre-existing custom on_bind, we left
|
|
1127
|
+
# it alone and have no claim about its purity — the marker stays unset.
|
|
1128
|
+
cls._inline_bind_safe = True # type: ignore[attr-defined]
|
|
1129
|
+
|
|
1130
|
+
return cls
|