vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/metadata.py
ADDED
|
@@ -0,0 +1,1403 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Function metadata for introspection, documentation, and DuckDB registration.
|
|
4
|
+
|
|
5
|
+
This module provides declarative metadata classes that enable functions to
|
|
6
|
+
describe themselves. Metadata is used for:
|
|
7
|
+
|
|
8
|
+
1. Documentation generation
|
|
9
|
+
2. Worker registration (serialized to Arrow for IPC)
|
|
10
|
+
3. DuckDB function catalog integration
|
|
11
|
+
4. Tooling and discovery
|
|
12
|
+
|
|
13
|
+
DESIGN
|
|
14
|
+
------
|
|
15
|
+
Users define a nested `Meta` class with attributes. No inheritance required.
|
|
16
|
+
|
|
17
|
+
The system automatically:
|
|
18
|
+
- Resolves metadata from the class hierarchy (inheritance works)
|
|
19
|
+
- Extracts parameter info from Arg descriptors
|
|
20
|
+
- Infers function name from class name if not specified
|
|
21
|
+
- Uses docstring as description fallback
|
|
22
|
+
|
|
23
|
+
ARROW SERIALIZATION
|
|
24
|
+
-------------------
|
|
25
|
+
For worker registration, metadata can be serialized to Arrow:
|
|
26
|
+
|
|
27
|
+
from vgi.metadata import functions_to_arrow, arrow_to_functions
|
|
28
|
+
|
|
29
|
+
# Worker sends available functions to client
|
|
30
|
+
batch = functions_to_arrow([MyFunction, OtherFunction])
|
|
31
|
+
|
|
32
|
+
# Client receives and deserializes
|
|
33
|
+
function_infos = arrow_to_functions(batch)
|
|
34
|
+
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import functools
|
|
40
|
+
import json
|
|
41
|
+
import re
|
|
42
|
+
import warnings
|
|
43
|
+
from collections.abc import Sequence
|
|
44
|
+
from dataclasses import dataclass, field
|
|
45
|
+
from enum import Enum, auto
|
|
46
|
+
from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints
|
|
47
|
+
|
|
48
|
+
import pyarrow as pa
|
|
49
|
+
|
|
50
|
+
from vgi.arguments import _MISSING, AnyArrow, Secret, SecretLookupEntry, TableInput
|
|
51
|
+
|
|
52
|
+
if TYPE_CHECKING:
|
|
53
|
+
from vgi.arguments import Arg
|
|
54
|
+
|
|
55
|
+
# Default max_workers when not explicitly specified (effectively unlimited)
DEFAULT_MAX_WORKERS = 99999

# Names exported by ``from vgi.metadata import *``, grouped by kind.
# ``PartitionKind`` is listed with the other enums because it is the type of
# the public ``ResolvedMetadata.partition_kind`` field.
__all__ = [
    # Constants
    "DEFAULT_MAX_WORKERS",
    # Enums
    "FunctionStability",
    "CatalogFunctionType",
    "NullHandling",
    "OrderPreservation",
    "PartitionKind",
    "OrderDependence",
    "DistinctDependence",
    # Data classes
    "ParameterInfo",
    "FunctionExample",
    "ResolvedMetadata",
    # Resolution
    "resolve_metadata",
    "extract_parameters",
    # Exceptions
    "FunctionTypeError",
    "TableInputValidationError",
    "VarargsValidationError",
    # Arrow serialization
    "metadata_to_arrow",
    "metadatas_to_arrow",
    "arrow_to_metadata",
    "functions_to_arrow",
    "arrow_to_functions",
    # Mixin
    "MetadataMixin",
]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# =============================================================================
|
|
91
|
+
# Enums
|
|
92
|
+
# =============================================================================
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class CatalogFunctionType(Enum):
    """Kind of function being registered with DuckDB."""

    SCALAR = auto()
    """Scalar function: yields one output value for every input row."""

    AGGREGATE = auto()
    """Aggregate function: folds many input rows into a single output."""

    TABLE = auto()
    """Table function: produces a table (streaming producer or streaming exchange)."""

    TABLE_BUFFERING = auto()
    """Buffered table function: a Sink+Source PhysicalOperator that consumes
    all of its input before emitting any output. It is dispatched to the
    custom ``PhysicalVgiTableBufferingFunction`` operator rather than the
    streaming ``in_out_function`` registration. Dispatch is keyed on the
    class hierarchy — this value is set automatically for
    ``TableBufferingFunction`` subclasses."""
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class FunctionStability(Enum):
    """How stable a function's output is for a given input.

    Corresponds to DuckDB's FunctionStability enum.
    """

    CONSISTENT = auto()
    """Deterministic: identical input always yields identical output."""

    VOLATILE = auto()
    """Output may differ from row to row even for identical input (e.g., random())."""

    CONSISTENT_WITHIN_QUERY = auto()
    """Stable for the duration of one query, but may differ between queries (e.g., now())."""
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class NullHandling(Enum):
    """How a function treats NULL inputs.

    Corresponds to DuckDB's FunctionNullHandling enum.
    """

    DEFAULT = auto()
    """Standard SQL behavior: a NULL input produces a NULL output."""

    SPECIAL = auto()
    """The function deals with NULLs itself (e.g., COALESCE, IFNULL)."""
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class OrderPreservation(Enum):
    """Whether and how a function preserves row order.

    Each member corresponds to a value of DuckDB's ``OrderPreservationType``:

    * ``PRESERVES_ORDER`` → ``OrderPreservationType::INSERTION_ORDER``
      (the DuckDB default — the operator keeps its child operator's order).
    * ``NO_ORDER_GUARANTEE`` → ``OrderPreservationType::NO_ORDER``
      (the operator is free to reorder its input/output).
    * ``FIXED_ORDER`` → ``OrderPreservationType::FIXED_ORDER``
      (the operator emits rows in a fixed, mandatory order — DuckDB
      serializes the pipeline so that a single worker produces all rows).
    """

    PRESERVES_ORDER = auto()
    """Rows come out in the same order they went in (DuckDB INSERTION_ORDER)."""

    NO_ORDER_GUARANTEE = auto()
    """Output order is unspecified and rows may be reordered (DuckDB NO_ORDER)."""

    FIXED_ORDER = auto()
    """Output follows a fixed mandatory order; DuckDB serializes the pipeline
    (single worker) in order to keep it (DuckDB FIXED_ORDER)."""
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class PartitionKind(Enum):
    """Shape of the partitioning a table function declares.

    The declaration applies to the bind-schema fields annotated with
    ``vgi.partition_column``.

    This mirrors DuckDB's ``TablePartitionInfo`` at
    ``duckdb/src/include/duckdb/function/partition_stats.hpp:20``.

    The C++ extension returns the value from
    ``TableFunction::get_partition_info``. At present DuckDB's planner only
    consumes ``SINGLE_VALUE_PARTITIONS`` (to plan
    ``PhysicalPartitionedAggregate`` over ``PhysicalHashAggregate``; see
    ``plan_aggregate.cpp:109``). The remaining values can be declared so the
    protocol is future-proof; today they fall back to ``HASH_GROUP_BY``.

    Use a non-default value only when at least one bind-schema field is
    annotated with ``{b"vgi.partition_column": b"true"}`` (such fields can be
    built with :func:`vgi.schema_utils.partition_field`). The converse is
    required as well — annotated fields without a matching
    ``partition_kind`` raise at worker startup.
    """

    NOT_PARTITIONED = auto()
    """No partitioning is declared over the annotated columns (the default;
    equivalent to leaving the fields un-annotated)."""

    SINGLE_VALUE_PARTITIONS = auto()
    """Every emitted chunk holds exactly one distinct value per partition
    column. Unlocks ``PhysicalPartitionedAggregate`` for ``GROUP BY`` over
    those columns."""

    OVERLAPPING_PARTITIONS = auto()
    """Partitions overlap only at their boundaries (bounds = [1,2][2,3][3,4]).
    Declarable at the wire level; DuckDB has no consumer today."""

    DISJOINT_PARTITIONS = auto()
    """Partitions are pairwise disjoint (bounds = [1,2][3,4][5,6]).
    Declarable at the wire level; DuckDB has no consumer today."""
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class OrderDependence(Enum):
    """Whether an aggregate's result is sensitive to row order.

    Corresponds to DuckDB's AggregateOrderDependent enum.
    """

    ORDER_DEPENDENT = auto()
    """The result varies with row order (e.g., FIRST, LAST, LISTAGG)."""

    NOT_ORDER_DEPENDENT = auto()
    """The result does not depend on row order (e.g., SUM, COUNT)."""
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class DistinctDependence(Enum):
    """Whether an aggregate's result is sensitive to the DISTINCT modifier.

    Corresponds to DuckDB's AggregateDistinctDependent enum.
    """

    DISTINCT_DEPENDENT = auto()
    """Applying DISTINCT alters the result (e.g., COUNT DISTINCT)."""

    NOT_DISTINCT_DEPENDENT = auto()
    """Applying DISTINCT makes no difference (e.g., MAX, MIN)."""
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# =============================================================================
|
|
237
|
+
# Data Classes
|
|
238
|
+
# =============================================================================
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@dataclass(frozen=True)
class ParameterInfo:
    """Description of a single function parameter.

    Instances are derived automatically from Arg descriptors.

    Attributes:
        name: Parameter name (the attribute name on the class).
        position: Positional index (int) or named key (str).
        type_name: Type name as a string (e.g., "int", "str", "TableInput").
        description: Documentation taken from Arg.doc.
        required: True when the parameter has no default value.
        default: Default value, or None when the parameter is required.
        constraints: Validation constraints as a dict.
        is_table_input: True when this is the table input parameter.
        is_varargs: True when this accepts multiple trailing values.
        is_const: True when this is a constant parameter (ConstParam).

    """

    name: str
    position: int | str
    type_name: str | None = None
    description: str = ""
    required: bool = True
    default: Any = None
    constraints: dict[str, Any] = field(default_factory=dict)
    is_table_input: bool = False
    is_varargs: bool = False
    is_const: bool = False

    def to_dict(self) -> dict[str, str | int | bool | None]:
        """Flatten this parameter into a dictionary for serialization.

        The position is split across two keys: ``position`` carries an int
        position and ``position_name`` a str position; the other is None.
        The default is stored as its ``repr()`` and the constraints as a JSON
        string; both become None when unset/empty.
        """
        pos = self.position
        index = pos if isinstance(pos, int) else None
        key = pos if isinstance(pos, str) else None

        default_repr = None
        if self.default is not None:
            default_repr = repr(self.default)

        constraints_json = None
        if self.constraints:
            constraints_json = json.dumps(self.constraints)

        return {
            "name": self.name,
            "position": index,
            "position_name": key,
            "type_name": self.type_name,
            "description": self.description,
            "required": self.required,
            "default": default_repr,
            "constraints": constraints_json,
            "is_table_input": self.is_table_input,
            "is_varargs": self.is_varargs,
            "is_const": self.is_const,
        }

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ParameterInfo:
        """Build a ParameterInfo from a dictionary shaped like ``to_dict()`` output.

        ``position`` takes precedence over ``position_name``; when neither is
        set the position is 0. The ``default`` value is passed through as
        stored (it is not evaluated back from its repr).
        """
        # Prefer the integer slot, then the named slot, then fall back to 0.
        position: int | str | None = d.get("position")
        if position is None:
            position = d.get("position_name")
        if position is None:
            position = 0

        encoded = d.get("constraints")
        constraints = json.loads(encoded) if encoded else {}

        return ParameterInfo(
            name=d["name"],
            position=position,
            type_name=d.get("type_name"),
            description=d.get("description", ""),
            required=d.get("required", True),
            default=d.get("default"),
            constraints=constraints,
            is_table_input=d.get("is_table_input", False),
            is_varargs=d.get("is_varargs", False),
            is_const=d.get("is_const", False),
        )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
@dataclass(frozen=True)
class FunctionExample:
    """A usage example attached to a function.

    Attributes:
        sql: SQL query that demonstrates the function.
        description: What the example is meant to show.
        expected_output: Optional description of the expected result.

    """

    sql: str
    description: str = ""
    expected_output: str | None = None

    def to_dict(self) -> dict[str, str | None]:
        """Return the example as a plain dictionary for serialization."""
        payload: dict[str, str | None] = {}
        payload["sql"] = self.sql
        payload["description"] = self.description
        payload["expected_output"] = self.expected_output
        return payload

    @staticmethod
    def from_dict(d: dict[str, Any]) -> FunctionExample:
        """Build an example from a dictionary; only ``sql`` is mandatory."""
        sql = d["sql"]
        description = d.get("description", "")
        expected = d.get("expected_output")
        return FunctionExample(sql=sql, description=description, expected_output=expected)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
@dataclass
class ResolvedMetadata:
    """Fully resolved metadata for a function.

    This is the result of resolving a Meta class hierarchy and extracting
    parameter information from Arg descriptors.

    ``to_dict()`` and ``from_dict()`` convert to and from a plain dictionary
    in which enum values are stored by member name. Both directions copy the
    list/dict fields, so the dictionary and the instance never share mutable
    containers.

    """

    # Identity
    name: str
    class_name: str
    function_type: CatalogFunctionType

    # Documentation
    description: str = ""
    examples: list[FunctionExample] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)
    tags: dict[str, str] = field(default_factory=dict)
    parameters: list[ParameterInfo] = field(default_factory=list)

    # Behavior (all functions)
    stability: FunctionStability = FunctionStability.CONSISTENT
    null_handling: NullHandling = NullHandling.DEFAULT

    # settings required by the function
    required_settings: list[str] = field(default_factory=list)

    # secrets required by the function (each entry has secret_type, secret_name, scope)
    required_secrets: list[SecretLookupEntry] = field(default_factory=list)

    # Table function specific
    projection_pushdown: bool = False
    filter_pushdown: bool = False
    sampling_pushdown: bool = False
    # When True, the table function participates in DuckDB's late-materialization
    # optimizer: TOP_N/LIMIT/SAMPLE over the scan is rewritten into a SEMI join on
    # the rowid virtual column, and surviving rowids are pushed back to the wide
    # scan as a filter. Requires a unique, deterministic, snapshot-stable rowid
    # column (is_row_id) plus projection_pushdown + filter_pushdown. See the C++
    # extension's late-materialization gating for the worker contract.
    late_materialization: bool = False
    supported_expression_filters: list[str] = field(default_factory=list)
    preserves_order: OrderPreservation = OrderPreservation.PRESERVES_ORDER
    max_workers: int | None = None
    supports_batch_index: bool = False
    partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED

    # Aggregate function specific
    order_dependent: OrderDependence = OrderDependence.NOT_ORDER_DEPENDENT
    distinct_dependent: DistinctDependence = DistinctDependence.NOT_DISTINCT_DEPENDENT
    supports_window: bool = False
    streaming_partitioned: bool = False

    # Table-in-out specific: True if the function has a meaningful finalize phase
    # (override of finalize()/finish()). Used by the C++ extension to decide
    # whether to register in_out_function_final, which DuckDB disallows alongside
    # LATERAL-projected input.
    has_finalize: bool = False

    # When True (only meaningful when ``function_type == TABLE_BUFFERING``),
    # the source phase is single-threaded and finalize_state_ids are drained
    # in the order combine() returned them. The default (False) enables
    # parallel finalize.
    source_order_dependent: bool = False

    # When True (only meaningful when ``function_type == TABLE_BUFFERING``),
    # the SINK phase runs single-threaded — every process() call arrives in
    # source order on one worker. The default (False) parallelizes ingest.
    # Mutually exclusive with requires_input_batch_index (single-thread
    # already orders; no batch_index needed).
    sink_order_dependent: bool = False

    # When True (only meaningful when ``function_type == TABLE_BUFFERING``), the C++ Sink
    # operator declares RequiredPartitionInfo()=BatchIndex(), causing DuckDB
    # to thread a globally-unique monotonic batch_index from the source
    # into every process() call. Workers can accumulate (batch_index,
    # payload) tuples and sort in combine() to reconstruct source order
    # under parallel ingest. Requires the source to support batch_index
    # (parquet/csv/temp-table-scan do; range() does not — bind fails).
    # Mutually exclusive with sink_order_dependent.
    requires_input_batch_index: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Enum fields are emitted as member names and nested objects through
        their own ``to_dict()``. List and dict fields are copied so that
        mutating the returned dictionary cannot alter this instance.
        """
        return {
            "name": self.name,
            "class_name": self.class_name,
            "function_type": self.function_type.name,
            "description": self.description,
            "examples": [ex.to_dict() for ex in self.examples],
            "categories": list(self.categories),
            "tags": dict(self.tags),
            "parameters": [p.to_dict() for p in self.parameters],
            "stability": self.stability.name,
            "null_handling": self.null_handling.name,
            "required_settings": list(self.required_settings),
            "required_secrets": [e.to_dict() for e in self.required_secrets],
            "projection_pushdown": self.projection_pushdown,
            "filter_pushdown": self.filter_pushdown,
            "sampling_pushdown": self.sampling_pushdown,
            "late_materialization": self.late_materialization,
            "supported_expression_filters": list(self.supported_expression_filters),
            "preserves_order": self.preserves_order.name,
            "max_workers": self.max_workers,
            "supports_batch_index": self.supports_batch_index,
            "partition_kind": self.partition_kind.name,
            "order_dependent": self.order_dependent.name,
            "distinct_dependent": self.distinct_dependent.name,
            "supports_window": self.supports_window,
            "streaming_partitioned": self.streaming_partitioned,
            "has_finalize": self.has_finalize,
            "source_order_dependent": self.source_order_dependent,
            "sink_order_dependent": self.sink_order_dependent,
            "requires_input_batch_index": self.requires_input_batch_index,
        }

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ResolvedMetadata:
        """Create from dictionary.

        ``name``, ``class_name`` and ``function_type`` are required; every
        other key falls back to the field's default when absent. Collection
        fields that are absent or null become empty, and are always copied so
        the new instance does not alias containers owned by ``d``.

        Raises:
            KeyError: If a required key is missing or an enum name is unknown.

        """
        return ResolvedMetadata(
            name=d["name"],
            class_name=d["class_name"],
            function_type=CatalogFunctionType[d["function_type"]],
            description=d.get("description", ""),
            examples=[FunctionExample.from_dict(ex) for ex in d.get("examples") or []],
            categories=list(d.get("categories") or []),
            tags=dict(d.get("tags") or {}),
            parameters=[ParameterInfo.from_dict(p) for p in d.get("parameters") or []],
            stability=FunctionStability[d.get("stability", "CONSISTENT")],
            null_handling=NullHandling[d.get("null_handling", "DEFAULT")],
            required_settings=list(d.get("required_settings") or []),
            required_secrets=[SecretLookupEntry.from_dict(e) for e in d.get("required_secrets") or []],
            projection_pushdown=d.get("projection_pushdown", False),
            filter_pushdown=d.get("filter_pushdown", False),
            sampling_pushdown=d.get("sampling_pushdown", False),
            late_materialization=d.get("late_materialization", False),
            supported_expression_filters=list(d.get("supported_expression_filters") or []),
            preserves_order=OrderPreservation[d.get("preserves_order", "PRESERVES_ORDER")],
            max_workers=d.get("max_workers"),
            supports_batch_index=d.get("supports_batch_index", False),
            partition_kind=PartitionKind[d.get("partition_kind", "NOT_PARTITIONED")],
            order_dependent=OrderDependence[d.get("order_dependent", "NOT_ORDER_DEPENDENT")],
            distinct_dependent=DistinctDependence[d.get("distinct_dependent", "NOT_DISTINCT_DEPENDENT")],
            supports_window=d.get("supports_window", False),
            streaming_partitioned=d.get("streaming_partitioned", False),
            has_finalize=d.get("has_finalize", False),
            source_order_dependent=d.get("source_order_dependent", False),
            sink_order_dependent=d.get("sink_order_dependent", False),
            requires_input_batch_index=d.get("requires_input_batch_index", False),
        )
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
# =============================================================================
|
|
504
|
+
# Parameter Extraction from Arg Descriptors
|
|
505
|
+
# =============================================================================
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _get_arg_type_info(cls: type, attr_name: str) -> tuple[str | None, bool]:
|
|
509
|
+
"""Extract type name and TableInput status from type hints for an Arg attribute.
|
|
510
|
+
|
|
511
|
+
Returns:
|
|
512
|
+
Tuple of (type_name, is_table_input).
|
|
513
|
+
|
|
514
|
+
"""
|
|
515
|
+
try:
|
|
516
|
+
hints = get_type_hints(cls)
|
|
517
|
+
except (NameError, AttributeError):
|
|
518
|
+
# NameError: Forward references can't be resolved (common with TYPE_CHECKING)
|
|
519
|
+
# AttributeError: Issues accessing class attributes during resolution
|
|
520
|
+
return (None, False)
|
|
521
|
+
|
|
522
|
+
if attr_name not in hints:
|
|
523
|
+
return (None, False)
|
|
524
|
+
|
|
525
|
+
hint = hints[attr_name]
|
|
526
|
+
|
|
527
|
+
# Check if it's TableInput
|
|
528
|
+
if hint is TableInput:
|
|
529
|
+
return ("TableInput", True)
|
|
530
|
+
|
|
531
|
+
# Check if it's AnyArrow (any Arrow type accepted)
|
|
532
|
+
if hint is AnyArrow:
|
|
533
|
+
return ("AnyArrow", False)
|
|
534
|
+
|
|
535
|
+
# Extract type name
|
|
536
|
+
if hasattr(hint, "__name__"):
|
|
537
|
+
return (hint.__name__, False)
|
|
538
|
+
|
|
539
|
+
return (str(hint), False)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
class TableInputValidationError(ValueError):
    """Signals that validation of a TableInput parameter failed."""
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
class VarargsValidationError(ValueError):
    """Signals that validation of a varargs parameter failed."""
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _build_constraints(arg: Arg[Any]) -> dict[str, Any]:
|
|
551
|
+
"""Extract validation constraints from an Arg descriptor."""
|
|
552
|
+
constraints: dict[str, Any] = {}
|
|
553
|
+
|
|
554
|
+
# Numeric bounds
|
|
555
|
+
for name in ("ge", "le", "gt", "lt"):
|
|
556
|
+
value = getattr(arg, name)
|
|
557
|
+
if value is not None:
|
|
558
|
+
constraints[name] = value
|
|
559
|
+
|
|
560
|
+
# Other constraints
|
|
561
|
+
if arg.choices is not None:
|
|
562
|
+
constraints["choices"] = list(arg.choices)
|
|
563
|
+
if arg.pattern is not None:
|
|
564
|
+
constraints["pattern"] = arg.pattern
|
|
565
|
+
|
|
566
|
+
return constraints
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def extract_parameters(cls: type, *, validate_table_input: bool = True) -> list[ParameterInfo]:
    """Extract parameter information from Arg descriptors on a class.

    Parameters are collected from three sources, in this order. Once a name
    has been recorded, later sources skip it:

    1. The ``_compute_params`` and ``_const_params`` class attributes
       (the newer Param/ConstParam API).
    2. The type hints of a ``FunctionArguments`` class attribute, for fields
       declared as ``Annotated[<type>, Arg(...)]``.
    3. ``Arg`` instances stored as public class attributes anywhere in the
       MRO (legacy API).

    Args:
        cls: The function class to extract parameters from.
        validate_table_input: If True, runs both the TableInput validation
            and the varargs validation on the extracted parameters. If
            False, both checks are skipped.

    Returns:
        List of ParameterInfo objects. Parameters with an integer position
        come first, ordered by position; parameters with a string position
        follow, ordered alphabetically by that string.

    Raises:
        TableInputValidationError: If TableInput validation fails.
        VarargsValidationError: If varargs validation fails.

    """
    # Import here to avoid circular imports
    from vgi.arguments import Arg

    parameters: list[ParameterInfo] = []
    # Names already recorded; used to give earlier sources precedence.
    seen_names: set[str] = set()

    # Check for new Param/ConstParam API (ScalarFunction and AggregateFunction subclasses)
    # These are stored in _compute_params and _const_params class attributes
    compute_params: dict[str, Arg[Any]] = getattr(cls, "_compute_params", {})
    const_params: dict[str, Arg[Any]] = getattr(cls, "_const_params", {})

    for name, arg in compute_params.items():
        seen_names.add(name)
        # A parameter is required exactly when no default was supplied.
        required = arg.default is _MISSING
        # For new API, use arrow_type if available
        compute_type_name = str(arg.arrow_type) if arg.arrow_type else "any"

        # Note: is_const is not passed here, so ParameterInfo's own default applies.
        parameters.append(
            ParameterInfo(
                name=name,
                position=arg.position,
                type_name=compute_type_name,
                description=arg.doc,
                required=required,
                default=None if required else arg.default,
                constraints=_build_constraints(arg),
                is_table_input=False,
                is_varargs=arg.varargs,
            )
        )

    for name, arg in const_params.items():
        seen_names.add(name)
        required = arg.default is _MISSING
        const_type_name = str(arg.arrow_type) if arg.arrow_type else "any"

        # Same as the compute-param case, plus the descriptor's const flag.
        parameters.append(
            ParameterInfo(
                name=name,
                position=arg.position,
                type_name=const_type_name,
                description=arg.doc,
                required=required,
                default=None if required else arg.default,
                constraints=_build_constraints(arg),
                is_table_input=False,
                is_varargs=arg.varargs,
                is_const=arg.const,
            )
        )

    # Check for FunctionArguments dataclass (typed generic pattern)
    # e.g., class MyFunc(TableFunctionGenerator[MyArgs]):
    # where MyArgs has fields like: count: Annotated[int, Arg(0, doc="...")]
    func_args_class = getattr(cls, "FunctionArguments", None)
    if func_args_class is not None:
        try:
            # include_extras=True keeps the Annotated[...] metadata that carries the Arg.
            func_args_hints = get_type_hints(func_args_class, include_extras=True)
        except (NameError, AttributeError):
            # Best effort: hints that cannot be resolved contribute no parameters.
            func_args_hints = {}

        for field_name, field_hint in func_args_hints.items():
            if field_name.startswith("_") or field_name in seen_names:
                continue

            # Only Annotated[...] fields can carry an Arg.
            if get_origin(field_hint) is not Annotated:
                continue

            # Extract Arg from Annotated metadata
            type_args = get_args(field_hint)
            base_type = type_args[0]
            arg_instance: Arg[Any] | None = None
            # The first Arg found in the metadata wins.
            for meta in type_args[1:]:
                if isinstance(meta, Arg):
                    arg_instance = meta
                    break

            if arg_instance is None:
                continue

            seen_names.add(field_name)

            is_table_input = base_type is TableInput
            if base_type is TableInput:
                type_name = "TableInput"
            elif base_type is AnyArrow:
                type_name = "AnyArrow"
            elif hasattr(base_type, "__name__"):
                type_name = base_type.__name__
            else:
                # Objects without __name__ fall back to their str() form.
                type_name = str(base_type)

            required = arg_instance.default is _MISSING

            parameters.append(
                ParameterInfo(
                    name=field_name,
                    position=arg_instance.position,
                    type_name=type_name,
                    description=arg_instance.doc,
                    required=required,
                    default=None if required else arg_instance.default,
                    constraints=_build_constraints(arg_instance),
                    is_table_input=is_table_input,
                    is_varargs=arg_instance.varargs,
                )
            )

    # Walk MRO to find all Arg descriptors (legacy API)
    # The MRO is walked most-derived first, so a derived class's Arg shadows
    # a same-named Arg on a base class.
    for klass in cls.__mro__:
        if klass is object:
            continue

        for attr_name, attr_value in vars(klass).items():
            if attr_name.startswith("_"):
                continue
            if attr_name in seen_names:
                continue

            if isinstance(attr_value, Arg):
                seen_names.add(attr_name)
                arg = attr_value
                required = arg.default is _MISSING
                # Type info is looked up on ``cls`` (the class being resolved),
                # not on the ancestor ``klass`` that defines the attribute.
                legacy_type_name, is_table_input = _get_arg_type_info(cls, attr_name)

                parameters.append(
                    ParameterInfo(
                        name=attr_name,
                        position=arg.position,
                        type_name=legacy_type_name or "any",
                        description=arg.doc,
                        required=required,
                        default=None if required else arg.default,
                        constraints=_build_constraints(arg),
                        is_table_input=is_table_input,
                        is_varargs=arg.varargs,
                    )
                )

    # Sort: positional args by index first, then named args alphabetically
    # The leading 0/1 keeps int and str positions from being compared directly.
    def sort_key(p: ParameterInfo) -> tuple[int, int | str]:
        if isinstance(p.position, int):
            return (0, p.position)
        return (1, p.position)

    sorted_params = sorted(parameters, key=sort_key)

    # Validate TableInput and varargs constraints
    # (the single ``validate_table_input`` flag controls both checks)
    if validate_table_input:
        _validate_table_input(cls, sorted_params)
        _validate_varargs(cls, sorted_params)

    return sorted_params
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def _validate_table_input(cls: type, parameters: list[ParameterInfo]) -> None:
|
|
744
|
+
"""Validate TableInput parameter constraints.
|
|
745
|
+
|
|
746
|
+
If a function has TableInput parameters, validates that:
|
|
747
|
+
- There is exactly one TableInput parameter
|
|
748
|
+
- The TableInput parameter is positional (not named)
|
|
749
|
+
|
|
750
|
+
Args:
|
|
751
|
+
cls: The function class being validated.
|
|
752
|
+
parameters: Extracted parameters.
|
|
753
|
+
|
|
754
|
+
Raises:
|
|
755
|
+
TableInputValidationError: If validation fails.
|
|
756
|
+
|
|
757
|
+
"""
|
|
758
|
+
table_inputs = [p for p in parameters if p.is_table_input]
|
|
759
|
+
|
|
760
|
+
if len(table_inputs) == 0:
|
|
761
|
+
return # No TableInput parameters, nothing to validate
|
|
762
|
+
|
|
763
|
+
if len(table_inputs) > 1:
|
|
764
|
+
names = [p.name for p in table_inputs]
|
|
765
|
+
raise TableInputValidationError(
|
|
766
|
+
f"{cls.__name__}: Functions can have at most one Arg[TableInput] "
|
|
767
|
+
f"parameter, but found {len(table_inputs)}: {names}"
|
|
768
|
+
)
|
|
769
|
+
|
|
770
|
+
table_input = table_inputs[0]
|
|
771
|
+
|
|
772
|
+
# TableInput must be positional (not named)
|
|
773
|
+
if isinstance(table_input.position, str):
|
|
774
|
+
raise TableInputValidationError(
|
|
775
|
+
f"{cls.__name__}: TableInput parameter '{table_input.name}' must be "
|
|
776
|
+
f"positional (int), not named. Change from "
|
|
777
|
+
f"Arg[TableInput]('{table_input.position}') to "
|
|
778
|
+
f"Arg[TableInput](<position_index>)"
|
|
779
|
+
)
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
def _validate_varargs(cls: type, parameters: list[ParameterInfo]) -> None:
|
|
783
|
+
"""Validate varargs parameter constraints.
|
|
784
|
+
|
|
785
|
+
If a function has varargs parameters, validates that:
|
|
786
|
+
- There is at most one varargs parameter
|
|
787
|
+
- The varargs parameter is positional (not named) - enforced by Arg.__init__
|
|
788
|
+
- The varargs parameter is the last positional arg (before TableInput if present)
|
|
789
|
+
|
|
790
|
+
Args:
|
|
791
|
+
cls: The function class being validated.
|
|
792
|
+
parameters: Extracted parameters.
|
|
793
|
+
|
|
794
|
+
Raises:
|
|
795
|
+
VarargsValidationError: If validation fails.
|
|
796
|
+
|
|
797
|
+
"""
|
|
798
|
+
varargs_params = [p for p in parameters if p.is_varargs]
|
|
799
|
+
|
|
800
|
+
if len(varargs_params) == 0:
|
|
801
|
+
return # No varargs parameters, nothing to validate
|
|
802
|
+
|
|
803
|
+
if len(varargs_params) > 1:
|
|
804
|
+
names = [p.name for p in varargs_params]
|
|
805
|
+
raise VarargsValidationError(
|
|
806
|
+
f"{cls.__name__}: Functions can have at most one varargs parameter, "
|
|
807
|
+
f"but found {len(varargs_params)}: {names}"
|
|
808
|
+
)
|
|
809
|
+
|
|
810
|
+
varargs_param = varargs_params[0]
|
|
811
|
+
|
|
812
|
+
# Get all positional parameters (excluding TableInput)
|
|
813
|
+
positional_params = [p for p in parameters if isinstance(p.position, int) and not p.is_table_input]
|
|
814
|
+
|
|
815
|
+
if not positional_params:
|
|
816
|
+
return # Should not happen if varargs exists, but be safe
|
|
817
|
+
|
|
818
|
+
# Find the maximum position among non-varargs positional params
|
|
819
|
+
# All positions here are int (filtered above), but mypy doesn't know
|
|
820
|
+
non_varargs_positional = [p for p in positional_params if not p.is_varargs]
|
|
821
|
+
if non_varargs_positional:
|
|
822
|
+
# All positions are int (filtered by isinstance(p.position, int) above)
|
|
823
|
+
int_positions = [p.position for p in non_varargs_positional if isinstance(p.position, int)]
|
|
824
|
+
max_non_varargs_pos = max(int_positions)
|
|
825
|
+
# varargs position must be int (enforced by Arg.__init__)
|
|
826
|
+
assert isinstance(varargs_param.position, int)
|
|
827
|
+
if varargs_param.position < max_non_varargs_pos:
|
|
828
|
+
raise VarargsValidationError(
|
|
829
|
+
f"{cls.__name__}: Varargs parameter '{varargs_param.name}' at "
|
|
830
|
+
f"position {varargs_param.position} must be the last positional "
|
|
831
|
+
f"argument, but there are positional arguments after it"
|
|
832
|
+
)
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
# =============================================================================
|
|
836
|
+
# Metadata Resolution
|
|
837
|
+
# =============================================================================
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
def _normalize_examples(
|
|
841
|
+
examples: list[FunctionExample | str],
|
|
842
|
+
) -> list[FunctionExample]:
|
|
843
|
+
"""Convert string examples to FunctionExample objects."""
|
|
844
|
+
return [FunctionExample(sql=ex) if isinstance(ex, str) else ex for ex in examples]
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
# Mapping from base class names to CatalogFunctionType.
# Using a dict avoids typos and provides O(1) lookup.
# Class names are used (not classes) to avoid circular imports.
# Note: Functions with an Arg[TableInput] parameter receive table input.
#
# ``_infer_function_type`` walks the class's MRO and returns the entry for the
# first ancestor whose ``__name__`` is a key here. The winner is therefore
# decided by MRO order, not by the order of the entries in this dict.
_CLASS_NAME_TO_FUNCTION_TYPE: dict[str, CatalogFunctionType] = {
    # Buffered table function (Sink+Source). For a TableBufferingFunction
    # subclass this name is expected to appear before "TableFunctionBase" in
    # the MRO, so the more-specific entry wins.
    # NOTE(review): assumes TableBufferingFunction derives from
    # TableFunctionBase — confirm against the class definitions.
    "TableBufferingFunction": CatalogFunctionType.TABLE_BUFFERING,
    # Streaming table functions (TableFunctionGenerator + TableInOutGenerator).
    "TableFunctionBase": CatalogFunctionType.TABLE,
    "AggregateFunction": CatalogFunctionType.AGGREGATE,
    "ScalarFunction": CatalogFunctionType.SCALAR,
    "ScalarFunctionGenerator": CatalogFunctionType.SCALAR,
}
|
|
862
|
+
|
|
863
|
+
# Valid Meta class attribute names (for typo detection).
# ``resolve_metadata`` compares the merged Meta attributes against this set
# and emits a warning (it does not raise) for any name that is not listed.
_VALID_META_ATTRIBUTES: frozenset[str] = frozenset(
    {
        # Common
        "name",
        "description",
        "examples",
        "categories",
        "tags",
        "stability",
        "null_handling",
        "required_settings",  # settings/pragmas required by function
        "required_secrets",  # secrets required by function
        # Table function specific
        "projection_pushdown",
        "filter_pushdown",
        "sampling_pushdown",
        "late_materialization",  # Participate in DuckDB late-materialization rewrite
        "supported_expression_filters",
        "auto_apply_filters",  # Auto-apply pushdown filters to output batches
        "preserves_order",
        "max_workers",
        "supports_batch_index",  # opt-in to per-batch batch_index tagging (parallel + ordered sink)
        "partition_kind",  # opt-in to PartitionColumns mode for Hive-style partitioning
        # Table-in-out specific: explicit override for the has_finalize auto-detection.
        # Set to True or False to force the emitted ``in_out_function_final``
        # registration bit; leave unset (None) to auto-detect from finish/finalize.
        "has_finalize",
        # Buffered table function knobs (only meaningful when the class is a
        # TableBufferingFunction subclass — function_type == TABLE_BUFFERING).
        # When True, source phase is single-threaded and finalize_state_ids
        # drain in combine-returned order.
        "source_order_dependent",
        # When True, the SINK phase runs single-threaded — process() calls
        # arrive in source order on one worker.
        "sink_order_dependent",
        # When True, DuckDB threads a globally-unique monotonic batch_index
        # from the source into every process() call. Worker can reconstruct
        # source order in combine() by sorting accumulated (batch_index,
        # payload) tuples.
        "requires_input_batch_index",
        # Aggregate function specific
        "order_dependent",
        "distinct_dependent",
        "supports_window",
        "streaming_partitioned",
        # Scalar function specific
        "output_type",  # pa.DataType | type[AnyArrow] for scalar functions
    }
)
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
class FunctionTypeError(TypeError):
    """Raised when a function's type cannot be determined from its class hierarchy.

    ``_infer_function_type`` raises this when no class in the MRO has a name
    listed in ``_CLASS_NAME_TO_FUNCTION_TYPE``.
    """
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def _infer_function_type(cls: type) -> CatalogFunctionType:
    """Determine the catalog function type of a class from its ancestors.

    The MRO is scanned from the class itself outward, and the first ancestor
    whose ``__name__`` is a key of ``_CLASS_NAME_TO_FUNCTION_TYPE`` decides
    the result. Matching is by class name only.

    Args:
        cls: The function class to classify.

    Returns:
        The CatalogFunctionType mapped to the first recognized ancestor.

    Raises:
        FunctionTypeError: If no recognized base class is found in the MRO.

    """
    for ancestor_name in (ancestor.__name__ for ancestor in cls.__mro__):
        try:
            return _CLASS_NAME_TO_FUNCTION_TYPE[ancestor_name]
        except KeyError:
            continue

    recognized_bases = sorted(_CLASS_NAME_TO_FUNCTION_TYPE)
    raise FunctionTypeError(
        f"Cannot determine function type for {cls.__name__}. Class must inherit from one of: {recognized_bases}"
    )
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
@functools.lru_cache(maxsize=256)
def resolve_metadata(cls: type) -> ResolvedMetadata:
    """Build the ResolvedMetadata for a function class.

    Results are memoized per class by ``functools.lru_cache``.

    The resolution proceeds as follows:

    1. Merge the public, non-method attributes of every ``Meta`` class that
       is declared directly on a class in the MRO; the most-derived
       declaration of an attribute wins.
    2. Warn about merged attributes that are not in
       ``_VALID_META_ATTRIBUTES``.
    3. Infer the function type and validate the buffered-table flags.
    4. Take the name from ``Meta.name`` or, when that is missing or falsy,
       from the snake_cased class name with one trailing ``_function`` or
       ``_func`` removed.
    5. Take the description from ``Meta.description`` or, when that is
       missing or falsy, from the first line of the class docstring.
    6. Extract the parameters and combine the required settings and secrets
       declared on Meta with those found in the ``_setting_params`` and
       ``_secret_params`` class attributes.

    Args:
        cls: The function class to resolve metadata for.

    Returns:
        ResolvedMetadata with all resolved values.

    Raises:
        FunctionTypeError: If no recognized base class is found in the MRO.
        TypeError: If ``source_order_dependent``, ``sink_order_dependent`` or
            ``requires_input_batch_index`` is set on a class that is not a
            buffered table function, or if ``sink_order_dependent`` and
            ``requires_input_batch_index`` are both set.

    Errors raised by ``extract_parameters`` and ``_validate_partition_kind``
    propagate unchanged.

    """
    # Merge Meta attributes base-first so that derived classes override.
    attrs: dict[str, Any] = {}
    for ancestor in reversed(cls.__mro__):
        # Only a Meta declared directly on this ancestor counts here; an
        # inherited Meta is handled when its own class is visited.
        if ancestor is object or "Meta" not in ancestor.__dict__:
            continue
        declared = vars(ancestor.__dict__["Meta"])
        attrs.update(
            {
                key: value
                for key, value in declared.items()
                # Keep public names; drop methods but keep nested classes.
                if not key.startswith("_") and (isinstance(value, type) or not callable(value))
            }
        )

    # Unknown names are most likely typos: warn, but carry on.
    unknown_attrs = attrs.keys() - _VALID_META_ATTRIBUTES
    if unknown_attrs:
        warnings.warn(
            f"{cls.__name__}.Meta has unknown attributes: {sorted(unknown_attrs)}. "
            f"Valid attributes are: {sorted(_VALID_META_ATTRIBUTES)}",
            stacklevel=2,
        )

    # The function type comes from the class hierarchy alone; whether the
    # class is a buffered table function is derived from it, not from Meta.
    function_type = _infer_function_type(cls)
    is_buffering = function_type is CatalogFunctionType.TABLE_BUFFERING

    # Flags that only make sense on the buffered table path, checked in a
    # fixed order so the first offending flag is the one reported.
    for buffering_only_flag in (
        "source_order_dependent",
        "sink_order_dependent",
        "requires_input_batch_index",
    ):
        if attrs.get(buffering_only_flag) and not is_buffering:
            raise TypeError(
                f"{cls.__name__}: Meta.{buffering_only_flag} is only meaningful on TableBufferingFunction subclasses"
            )
    if attrs.get("sink_order_dependent") and attrs.get("requires_input_batch_index"):
        raise TypeError(
            f"{cls.__name__}: Meta.sink_order_dependent and "
            f"Meta.requires_input_batch_index are mutually exclusive — "
            f"single-threaded sink already orders process() calls; "
            f"batch_index is only useful under parallel ingest"
        )

    # Name: explicit Meta.name, else the class name converted to snake_case.
    class_name = cls.__name__
    name = attrs.get("name")
    if not name:
        # Insert "_" before every capital letter except the first character.
        name = re.sub(r"(?<!^)(?=[A-Z])", "_", class_name).lower()
        # Drop at most one common suffix.
        for suffix in ("_function", "_func"):
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break

    # Description: explicit Meta.description, else first docstring line.
    description = attrs.get("description", "")
    docstring = cls.__doc__
    if docstring and not description:
        description = docstring.strip().partition("\n")[0]

    examples = _normalize_examples(attrs.get("examples", []))

    parameters = extract_parameters(cls)

    # Required settings declared on Meta (copied so Meta is not mutated).
    required_settings: list[str] = list(attrs.get("required_settings", []))

    # Required secrets declared on Meta, accepted as entries or as dicts.
    # NOTE(review): entries of any other type are skipped without a warning.
    required_secrets: list[SecretLookupEntry] = []
    for entry in attrs.get("required_secrets", []):
        if isinstance(entry, SecretLookupEntry):
            required_secrets.append(entry)
        elif isinstance(entry, dict):
            required_secrets.append(SecretLookupEntry.from_dict(entry))

    # Setting / secret requirements recorded on the class itself.
    setting_params: dict[str, str] = getattr(cls, "_setting_params", {})
    secret_params: dict[str, Secret] = getattr(cls, "_secret_params", {})

    # Append the setting keys Meta did not already list, in sorted order,
    # after the Meta-declared ones.
    annotation_setting_keys = set(setting_params.values())
    required_settings.extend(sorted(annotation_setting_keys - set(required_settings)))

    # Append one entry per secret type that is not covered yet.
    known_secret_types = {entry.secret_type for entry in required_secrets}
    for secret in secret_params.values():
        if secret.secret_type in known_secret_types:
            continue
        required_secrets.append(
            SecretLookupEntry(
                secret_type=secret.secret_type,
                secret_name=secret.name,
                scope=secret.scope,
            )
        )
        known_secret_types.add(secret.secret_type)

    return ResolvedMetadata(
        name=name,
        class_name=class_name,
        function_type=function_type,
        description=description,
        examples=examples,
        categories=attrs.get("categories", []),
        tags=dict(attrs.get("tags", {})),
        parameters=parameters,
        stability=attrs.get("stability", FunctionStability.CONSISTENT),
        null_handling=attrs.get("null_handling", NullHandling.DEFAULT),
        required_settings=required_settings,
        required_secrets=required_secrets,
        projection_pushdown=attrs.get("projection_pushdown", False),
        filter_pushdown=attrs.get("filter_pushdown", False),
        sampling_pushdown=attrs.get("sampling_pushdown", False),
        late_materialization=bool(attrs.get("late_materialization", False)),
        supported_expression_filters=attrs.get("supported_expression_filters", []),
        preserves_order=attrs.get("preserves_order", OrderPreservation.PRESERVES_ORDER),
        max_workers=attrs.get("max_workers"),
        supports_batch_index=bool(attrs.get("supports_batch_index", False)),
        partition_kind=_validate_partition_kind(cls, attrs.get("partition_kind", PartitionKind.NOT_PARTITIONED)),
        order_dependent=attrs.get("order_dependent", OrderDependence.NOT_ORDER_DEPENDENT),
        distinct_dependent=attrs.get("distinct_dependent", DistinctDependence.NOT_DISTINCT_DEPENDENT),
        supports_window=bool(attrs.get("supports_window", False)),
        streaming_partitioned=bool(attrs.get("streaming_partitioned", False)),
        # A buffered table function always has a finalize phase.
        has_finalize=(_detect_has_finalize(cls, function_type) or is_buffering),
        source_order_dependent=bool(attrs.get("source_order_dependent", False)),
        sink_order_dependent=bool(attrs.get("sink_order_dependent", False)),
        requires_input_batch_index=bool(attrs.get("requires_input_batch_index", False)),
    )
|
|
1109
|
+
|
|
1110
|
+
|
|
1111
|
+
def _validate_partition_kind(cls: type, kind: PartitionKind) -> PartitionKind:
    """Check ``Meta.partition_kind`` against a static bind schema, if any.

    The check only runs when ``cls.FIXED_SCHEMA`` is a ``pa.Schema``. In that
    case the partition kind and the schema's field metadata must agree:

    * a field whose metadata maps ``VGI_PARTITION_COLUMN_KEY`` to ``b"true"``
      requires ``kind`` to be something other than ``NOT_PARTITIONED``;
    * a ``kind`` other than ``NOT_PARTITIONED`` requires at least one such
      field.

    When ``FIXED_SCHEMA`` is missing or is not a ``pa.Schema``, nothing is
    checked here and ``kind`` is returned as given.

    Args:
        cls: The function class being resolved (used in error messages).
        kind: The partition kind to validate.

    Returns:
        ``kind``, unchanged.

    Raises:
        ValueError: If the partition kind and the schema annotations disagree.

    """
    fixed_schema = getattr(cls, "FIXED_SCHEMA", None)
    if not isinstance(fixed_schema, pa.Schema):
        # No static schema to inspect.
        return kind

    from vgi.schema_utils import VGI_PARTITION_COLUMN_KEY

    def _is_partition_column(fld: Any) -> bool:
        # A field counts when its metadata carries the partition marker.
        md = fld.metadata
        return md is not None and md.get(VGI_PARTITION_COLUMN_KEY) == b"true"

    annotated_fields: list[str] = [fld.name for fld in fixed_schema if _is_partition_column(fld)]

    unpartitioned = kind == PartitionKind.NOT_PARTITIONED
    if unpartitioned and annotated_fields:
        raise ValueError(
            f"{cls.__name__}: bind schema has partition-annotated field(s) "
            f"{annotated_fields!r} but Meta.partition_kind is NOT_PARTITIONED. "
            f"Set Meta.partition_kind to a non-default PartitionKind, or "
            f"remove the partition_field() annotations."
        )
    if not unpartitioned and not annotated_fields:
        raise ValueError(
            f"{cls.__name__}: Meta.partition_kind is {kind.name} but no bind "
            f"schema field is annotated with vgi.partition_column. Use "
            f"vgi.schema_utils.partition_field(name, type) to mark the "
            f"column(s) that satisfy the partition contract, or set "
            f"Meta.partition_kind back to NOT_PARTITIONED."
        )

    return kind
|
|
1165
|
+
|
|
1166
|
+
|
|
1167
|
+
def _detect_has_finalize(cls: type, function_type: CatalogFunctionType) -> bool:
    """Decide whether a function class has a finalize phase.

    * ``TABLE_BUFFERING`` functions: always ``True``.
    * ``TABLE`` functions that subclass ``TableInOutGenerator``: whatever
      ``cls.has_finalize_override()`` returns.
    * Everything else, including ``TABLE`` functions that are not
      ``TableInOutGenerator`` subclasses: ``False``.

    Args:
        cls: The function class being resolved.
        function_type: The function type already inferred for ``cls``.

    Returns:
        The has_finalize value as described above.

    """
    if function_type is CatalogFunctionType.TABLE_BUFFERING:
        # Decided by the function type alone; the class is not inspected.
        return True

    if function_type is CatalogFunctionType.TABLE:
        # Lazy import to avoid a circular dependency.
        try:
            from vgi.table_in_out_function import TableInOutGenerator
        except ImportError:  # pragma: no cover
            return False
        if issubclass(cls, TableInOutGenerator):
            return cls.has_finalize_override()

    return False
|
|
1190
|
+
|
|
1191
|
+
|
|
1192
|
+
# =============================================================================
|
|
1193
|
+
# Arrow Serialization
|
|
1194
|
+
# =============================================================================
|
|
1195
|
+
|
|
1196
|
+
# Nested struct type for function examples
# (element type of the ``examples`` list column in ``_METADATA_SCHEMA``).
_EXAMPLE_STRUCT = pa.struct(
    [
        pa.field("sql", pa.string()),
        pa.field("description", pa.string()),
        pa.field("expected_output", pa.string(), nullable=True),
    ]
)

# Nested struct type for secret requirements
# (element type of the ``required_secrets`` list column in ``_METADATA_SCHEMA``).
_SECRET_REQUIREMENT_STRUCT = pa.struct(
    [
        pa.field("secret_type", pa.string()),
        pa.field("secret_name", pa.string(), nullable=True),
        pa.field("scope", pa.string(), nullable=True),
    ]
)

# Nested struct type for function parameters
# (element type of the ``parameters`` list column in ``_METADATA_SCHEMA``).
_PARAMETER_STRUCT = pa.struct(
    [
        pa.field("name", pa.string()),
        # NOTE(review): presumably ``position`` carries integer positions and
        # ``position_name`` carries named (string) positions — confirm against
        # the ParameterInfo serialization.
        pa.field("position", pa.int32(), nullable=True),
        pa.field("position_name", pa.string(), nullable=True),
        pa.field("type_name", pa.string(), nullable=True),
        pa.field("description", pa.string()),
        pa.field("required", pa.bool_()),
        pa.field("default", pa.string(), nullable=True),
        pa.field("constraints", pa.string(), nullable=True),  # JSON for flexibility
        pa.field("is_table_input", pa.bool_()),
        pa.field("is_varargs", pa.bool_()),
        pa.field("is_const", pa.bool_()),
    ]
)
|
|
1230
|
+
|
|
1231
|
+
# Schema for serializing function metadata.
# ``metadata_to_arrow`` builds its single-row RecordBatch against this schema
# from ``ResolvedMetadata.to_dict()``.
# NOTE(review): the field names are expected to line up with the keys of that
# dict — confirm when adding or renaming a field.
_METADATA_SCHEMA = pa.schema(
    [
        pa.field("name", pa.string()),
        pa.field("class_name", pa.string()),
        pa.field("function_type", pa.string()),
        pa.field("description", pa.string()),
        pa.field("examples", pa.list_(_EXAMPLE_STRUCT)),
        pa.field("categories", pa.list_(pa.string())),
        pa.field("tags", pa.map_(pa.string(), pa.string())),
        pa.field("parameters", pa.list_(_PARAMETER_STRUCT)),
        pa.field("stability", pa.string()),
        pa.field("null_handling", pa.string()),
        pa.field("required_settings", pa.list_(pa.string())),
        pa.field("required_secrets", pa.list_(_SECRET_REQUIREMENT_STRUCT)),
        pa.field("projection_pushdown", pa.bool_()),
        pa.field("filter_pushdown", pa.bool_()),
        pa.field("sampling_pushdown", pa.bool_()),
        pa.field("late_materialization", pa.bool_()),
        pa.field("supported_expression_filters", pa.list_(pa.string())),
        pa.field("preserves_order", pa.string()),
        # The only explicitly nullable top-level field.
        pa.field("max_workers", pa.int32(), nullable=True),
        pa.field("supports_batch_index", pa.bool_()),
        pa.field("partition_kind", pa.string()),
        pa.field("order_dependent", pa.string()),
        pa.field("distinct_dependent", pa.string()),
        pa.field("supports_window", pa.bool_()),
        pa.field("streaming_partitioned", pa.bool_()),
        pa.field("has_finalize", pa.bool_()),
        pa.field("source_order_dependent", pa.bool_()),
        pa.field("sink_order_dependent", pa.bool_()),
        pa.field("requires_input_batch_index", pa.bool_()),
    ]
)
|
|
1265
|
+
|
|
1266
|
+
# Fields that contain lists and need None -> [] conversion during deserialization.
# Consulted by ``_extract_arrow_row``; these are the list-typed columns of
# ``_METADATA_SCHEMA``.
_LIST_FIELDS: frozenset[str] = frozenset(
    {"examples", "categories", "parameters", "required_settings", "required_secrets", "supported_expression_filters"}
)

# Fields that contain maps and need None -> {} conversion during deserialization.
# Consulted by ``_extract_arrow_row``; ``tags`` is the only map-typed column
# of ``_METADATA_SCHEMA``.
_MAP_FIELDS: frozenset[str] = frozenset({"tags"})
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
def _extract_arrow_row(columns: dict[str, list[Any]], index: int) -> dict[str, Any]:
|
|
1276
|
+
"""Extract a single row from Arrow columnar data as a dict.
|
|
1277
|
+
|
|
1278
|
+
Handles None values for list fields (converts None to [])
|
|
1279
|
+
and map fields (converts None to {}).
|
|
1280
|
+
"""
|
|
1281
|
+
result: dict[str, Any] = {}
|
|
1282
|
+
for field_name, values in columns.items():
|
|
1283
|
+
value = values[index]
|
|
1284
|
+
if value is None:
|
|
1285
|
+
if field_name in _LIST_FIELDS:
|
|
1286
|
+
result[field_name] = []
|
|
1287
|
+
elif field_name in _MAP_FIELDS:
|
|
1288
|
+
result[field_name] = {}
|
|
1289
|
+
else:
|
|
1290
|
+
result[field_name] = value
|
|
1291
|
+
else:
|
|
1292
|
+
result[field_name] = value
|
|
1293
|
+
return result
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
def metadata_to_arrow(metadata: ResolvedMetadata) -> pa.RecordBatch:
    """Encode one ResolvedMetadata as an Arrow RecordBatch.

    Args:
        metadata: The metadata object to encode.

    Returns:
        RecordBatch built against ``_METADATA_SCHEMA`` with a single row.

    """
    # Arrow data is columnar, so each value from to_dict() becomes a
    # one-element column.
    single_row_columns = {}
    for name, cell in metadata.to_dict().items():
        single_row_columns[name] = [cell]
    return pa.RecordBatch.from_pydict(single_row_columns, schema=_METADATA_SCHEMA)
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
def arrow_to_metadata(batch: pa.RecordBatch) -> ResolvedMetadata:
    """Decode a single-row Arrow RecordBatch into a ResolvedMetadata.

    Args:
        batch: RecordBatch that must hold exactly one row of metadata.

    Returns:
        The ResolvedMetadata rebuilt from that row.

    Raises:
        ValueError: If ``batch`` does not contain exactly one row.

    """
    if batch.num_rows == 1:
        # Row 0 is the only row; null list/map cells are normalized by the helper.
        only_row = _extract_arrow_row(batch.to_pydict(), 0)
        return ResolvedMetadata.from_dict(only_row)
    raise ValueError(f"Expected 1 row, got {batch.num_rows}")
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def metadatas_to_arrow(metadatas: Sequence[ResolvedMetadata]) -> pa.RecordBatch:
    """Encode several ResolvedMetadata objects as one Arrow RecordBatch.

    Args:
        metadatas: The metadata objects to encode, one output row each.

    Returns:
        RecordBatch built against ``_METADATA_SCHEMA`` with one row per
        input object; a zero-row batch when ``metadatas`` is empty.

    """
    # Start with one empty column per schema field so that an empty input
    # still yields a batch carrying every column.
    columns: dict[str, list[Any]] = {}
    for schema_field in _METADATA_SCHEMA:
        columns[schema_field.name] = []

    if metadatas:
        for item in metadatas:
            # Transpose each row dict into the per-field column lists.
            for name, cell in item.to_dict().items():
                columns[name].append(cell)

    return pa.RecordBatch.from_pydict(columns, schema=_METADATA_SCHEMA)
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
def functions_to_arrow(function_classes: Sequence[type]) -> pa.RecordBatch:
    """Encode several function classes as one Arrow RecordBatch.

    Shortcut that runs ``resolve_metadata`` on every class and hands the
    results to ``metadatas_to_arrow``. Callers that already hold resolved
    metadata should call ``metadatas_to_arrow`` themselves.

    Args:
        function_classes: The function classes to encode.

    Returns:
        RecordBatch with one row per function class.

    """
    resolved = list(map(resolve_metadata, function_classes))
    return metadatas_to_arrow(resolved)
|
|
1368
|
+
|
|
1369
|
+
|
|
1370
|
+
def arrow_to_functions(batch: pa.RecordBatch) -> list[ResolvedMetadata]:
    """Decode every row of an Arrow RecordBatch into ResolvedMetadata.

    Args:
        batch: RecordBatch holding one function's metadata per row.

    Returns:
        The decoded ResolvedMetadata objects, in row order.

    """
    # Convert to Python columns once, then slice out each row in turn.
    columns = batch.to_pydict()
    decoded = []
    for row_index in range(batch.num_rows):
        row = _extract_arrow_row(columns, row_index)
        decoded.append(ResolvedMetadata.from_dict(row))
    return decoded
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
# =============================================================================
|
|
1385
|
+
# Mixin for Function Classes
|
|
1386
|
+
# =============================================================================
|
|
1387
|
+
|
|
1388
|
+
|
|
1389
|
+
class MetadataMixin:
    """Mixin giving function classes access to their resolved metadata.

    Intended to be mixed into the base Function class so that metadata
    resolution is available on every function class.
    """

    @classmethod
    def get_metadata(cls) -> ResolvedMetadata:
        """Return the metadata that ``resolve_metadata`` produces for this class."""
        return resolve_metadata(cls)  # type: ignore[arg-type]

    @classmethod
    def describe(cls) -> dict[str, Any]:
        """Return this class's metadata as a plain dict (for JSON serialization)."""
        resolved = cls.get_metadata()
        return resolved.to_dict()
|