vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/metadata.py ADDED
@@ -0,0 +1,1403 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Function metadata for introspection, documentation, and DuckDB registration.
4
+
5
+ This module provides declarative metadata classes that enable functions to
6
+ describe themselves. Metadata is used for:
7
+
8
+ 1. Documentation generation
9
+ 2. Worker registration (serialized to Arrow for IPC)
10
+ 3. DuckDB function catalog integration
11
+ 4. Tooling and discovery
12
+
13
+ DESIGN
14
+ ------
15
+ Users define a nested `Meta` class with attributes; no inheritance is required.
16
+
17
+ The system automatically:
18
+ - Resolves metadata from the class hierarchy (inheritance works)
19
+ - Extracts parameter info from Arg descriptors
20
+ - Infers function name from class name if not specified
21
+ - Uses docstring as description fallback
22
+
23
+ ARROW SERIALIZATION
24
+ -------------------
25
+ For worker registration, metadata can be serialized to Arrow:
26
+
27
+ from vgi.metadata import functions_to_arrow, arrow_to_functions
28
+
29
+ # Worker sends available functions to client
30
+ batch = functions_to_arrow([MyFunction, OtherFunction])
31
+
32
+ # Client receives and deserializes
33
+ function_infos = arrow_to_functions(batch)
34
+
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import functools
40
+ import json
41
+ import re
42
+ import warnings
43
+ from collections.abc import Sequence
44
+ from dataclasses import dataclass, field
45
+ from enum import Enum, auto
46
+ from typing import TYPE_CHECKING, Annotated, Any, get_args, get_origin, get_type_hints
47
+
48
+ import pyarrow as pa
49
+
50
+ from vgi.arguments import _MISSING, AnyArrow, Secret, SecretLookupEntry, TableInput
51
+
52
+ if TYPE_CHECKING:
53
+ from vgi.arguments import Arg
54
+
55
# Value used for max_workers when a function does not specify one explicitly;
# large enough to be effectively unlimited.
DEFAULT_MAX_WORKERS = 99999

# Names exported by ``from vgi.metadata import *``. Every public enum defined
# in this module is listed, including PartitionKind, which is the type of
# ResolvedMetadata.partition_kind.
__all__ = [
    # Constants
    "DEFAULT_MAX_WORKERS",
    # Enums
    "FunctionStability",
    "CatalogFunctionType",
    "NullHandling",
    "OrderPreservation",
    "PartitionKind",
    "OrderDependence",
    "DistinctDependence",
    # Data classes
    "ParameterInfo",
    "FunctionExample",
    "ResolvedMetadata",
    # Resolution
    "resolve_metadata",
    "extract_parameters",
    # Exceptions
    "FunctionTypeError",
    "TableInputValidationError",
    "VarargsValidationError",
    # Arrow serialization
    "metadata_to_arrow",
    "metadatas_to_arrow",
    "arrow_to_metadata",
    "functions_to_arrow",
    "arrow_to_functions",
    # Mixin
    "MetadataMixin",
]
88
+
89
+
90
+ # =============================================================================
91
+ # Enums
92
+ # =============================================================================
93
+
94
+
95
class CatalogFunctionType(Enum):
    """Kind of function being registered with DuckDB."""

    SCALAR = 1
    """Scalar function: produces one output for each input row."""

    AGGREGATE = 2
    """Aggregate function: many inputs are reduced to one output."""

    TABLE = 3
    """Table function: yields a table (streaming producer or streaming exchange)."""

    TABLE_BUFFERING = 4
    """Buffered table function: a Sink+Source PhysicalOperator that observes
    all of its input before emitting any output. It is dispatched to the
    custom ``PhysicalVgiTableBufferingFunction`` operator rather than the
    streaming ``in_out_function`` registration. The class hierarchy is the
    dispatch key — set automatically for ``TableBufferingFunction``
    subclasses."""
113
+
114
+
115
class FunctionStability(Enum):
    """How stable a function's output is for a given input.

    Corresponds to DuckDB's FunctionStability enum.
    """

    CONSISTENT = 1
    """Deterministic: identical input always yields identical output."""

    VOLATILE = 2
    """Output can differ from row to row even for the same input (e.g., random())."""

    CONSISTENT_WITHIN_QUERY = 3
    """Stable inside a single query but may differ between queries (e.g., now())."""
129
+
130
+
131
class NullHandling(Enum):
    """How a function treats NULL inputs.

    Corresponds to DuckDB's FunctionNullHandling enum.
    """

    DEFAULT = 1
    """Standard SQL behavior: a NULL input produces a NULL output."""

    SPECIAL = 2
    """The function deals with NULLs itself (e.g., COALESCE, IFNULL)."""
142
+
143
+
144
class OrderPreservation(Enum):
    """Whether, and how, a function keeps rows in order.

    Each member corresponds to a value of DuckDB's ``OrderPreservationType``
    enum:

    * ``PRESERVES_ORDER`` → ``OrderPreservationType::INSERTION_ORDER``
      (the DuckDB default — the operator keeps its child operator's order).
    * ``NO_ORDER_GUARANTEE`` → ``OrderPreservationType::NO_ORDER``
      (the operator is free to reorder its input/output).
    * ``FIXED_ORDER`` → ``OrderPreservationType::FIXED_ORDER``
      (the operator emits rows in a fixed, mandatory order — DuckDB
      serializes the pipeline so that a single worker produces all rows).
    """

    PRESERVES_ORDER = 1
    """Output rows keep the order of the input rows (DuckDB INSERTION_ORDER)."""

    NO_ORDER_GUARANTEE = 2
    """Output order is undefined and rows may be reordered (DuckDB NO_ORDER)."""

    FIXED_ORDER = 3
    """Output follows a fixed mandatory order; DuckDB serializes the pipeline
    (single worker) to keep it (DuckDB FIXED_ORDER)."""
167
+
168
+
169
class PartitionKind(Enum):
    """Shape of the partitioning a table function declares.

    The declaration applies to the bind-schema fields annotated with
    ``vgi.partition_column``.

    This enum mirrors DuckDB's ``TablePartitionInfo`` at
    ``duckdb/src/include/duckdb/function/partition_stats.hpp:20``.

    The C++ extension returns the value from
    ``TableFunction::get_partition_info``. DuckDB's planner currently acts
    only on ``SINGLE_VALUE_PARTITIONS`` (to plan
    ``PhysicalPartitionedAggregate`` over ``PhysicalHashAggregate``; see
    ``plan_aggregate.cpp:109``). The remaining values can be declared so the
    protocol is future-proof; today they fall back to ``HASH_GROUP_BY``.

    Pick a non-default value only when at least one bind-schema field is
    annotated with ``{b"vgi.partition_column": b"true"}`` (build such fields
    with :func:`vgi.schema_utils.partition_field`). The reverse is required
    as well — annotated fields without a matching ``partition_kind`` raise
    at worker startup.
    """

    NOT_PARTITIONED = 1
    """No partitioning is declared over the annotated columns (the default;
    same effect as leaving the fields un-annotated)."""

    SINGLE_VALUE_PARTITIONS = 2
    """Every emitted chunk holds exactly one distinct value per partition
    column. Unlocks ``PhysicalPartitionedAggregate`` for ``GROUP BY`` over
    those columns."""

    OVERLAPPING_PARTITIONS = 3
    """Partitions overlap only at their boundaries (bounds = [1,2][2,3][3,4]).
    Declarable at the wire level; DuckDB has no consumer today."""

    DISJOINT_PARTITIONS = 4
    """Partitions are pairwise disjoint (bounds = [1,2][3,4][5,6]).
    Declarable at the wire level; DuckDB has no consumer today."""
208
+
209
+
210
class OrderDependence(Enum):
    """Whether an aggregate is sensitive to row order.

    Corresponds to DuckDB's AggregateOrderDependent enum.
    """

    ORDER_DEPENDENT = 1
    """The result depends on the order of the rows (e.g., FIRST, LAST, LISTAGG)."""

    NOT_ORDER_DEPENDENT = 2
    """The result does not depend on row order (e.g., SUM, COUNT)."""
221
+
222
+
223
class DistinctDependence(Enum):
    """Whether an aggregate is sensitive to the DISTINCT modifier.

    Corresponds to DuckDB's AggregateDistinctDependent enum.
    """

    DISTINCT_DEPENDENT = 1
    """Applying DISTINCT alters the result (e.g., COUNT DISTINCT)."""

    NOT_DISTINCT_DEPENDENT = 2
    """Applying DISTINCT makes no difference (e.g., MAX, MIN)."""
234
+
235
+
236
+ # =============================================================================
237
+ # Data Classes
238
+ # =============================================================================
239
+
240
+
241
@dataclass(frozen=True)
class ParameterInfo:
    """Immutable description of one function parameter.

    Instances are normally derived from Arg descriptors.

    Attributes:
        name: Parameter name (the attribute name on the class).
        position: Positional index (int) or named key (str).
        type_name: Type name as a string (e.g., "int", "str", "TableInput").
        description: Documentation text taken from Arg.doc.
        required: True when the parameter has no default value.
        default: Default value, or None when the parameter is required.
        constraints: Validation constraints as a dict.
        is_table_input: True when this is the table input parameter.
        is_varargs: True when this accepts multiple trailing values.
        is_const: True when this is a constant parameter (ConstParam).

    """

    name: str
    position: int | str
    type_name: str | None = None
    description: str = ""
    required: bool = True
    default: Any = None
    constraints: dict[str, Any] = field(default_factory=dict)
    is_table_input: bool = False
    is_varargs: bool = False
    is_const: bool = False

    def to_dict(self) -> dict[str, str | int | bool | None]:
        """Flatten this parameter into a dictionary for serialization.

        The position is split across two keys: ``position`` carries an int
        index and ``position_name`` carries a str key; whichever does not
        apply is None. A non-None default is stored as its ``repr`` and
        non-empty constraints are stored as a JSON string.
        """
        where = self.position
        index = where if isinstance(where, int) else None
        key = where if isinstance(where, str) else None
        default_text = None if self.default is None else repr(self.default)
        constraints_text = json.dumps(self.constraints) if self.constraints else None
        return {
            "name": self.name,
            "position": index,
            "position_name": key,
            "type_name": self.type_name,
            "description": self.description,
            "required": self.required,
            "default": default_text,
            "constraints": constraints_text,
            "is_table_input": self.is_table_input,
            "is_varargs": self.is_varargs,
            "is_const": self.is_const,
        }

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ParameterInfo:
        """Build a ParameterInfo from a dictionary shaped like ``to_dict`` output.

        ``position`` wins over ``position_name``; when neither is present the
        position falls back to 0. ``constraints`` is parsed from JSON when it
        is non-empty. ``default`` is taken as stored, without decoding.
        """
        index = d.get("position")
        key = d.get("position_name")
        position: int | str
        if index is not None:
            position = index
        elif key is not None:
            position = key
        else:
            position = 0

        encoded = d.get("constraints")
        constraints = json.loads(encoded) if encoded else {}

        return ParameterInfo(
            name=d["name"],
            position=position,
            type_name=d.get("type_name"),
            description=d.get("description", ""),
            required=d.get("required", True),
            default=d.get("default"),
            constraints=constraints,
            is_table_input=d.get("is_table_input", False),
            is_varargs=d.get("is_varargs", False),
            is_const=d.get("is_const", False),
        )
315
+
316
+
317
@dataclass(frozen=True)
class FunctionExample:
    """A single usage example for a function.

    Attributes:
        sql: SQL query that demonstrates the function.
        description: What the example is meant to show.
        expected_output: Optional description of the expected result.

    """

    sql: str
    description: str = ""
    expected_output: str | None = None

    def to_dict(self) -> dict[str, str | None]:
        """Return the example's three fields as a dictionary for serialization."""
        return dict(
            sql=self.sql,
            description=self.description,
            expected_output=self.expected_output,
        )

    @staticmethod
    def from_dict(d: dict[str, Any]) -> FunctionExample:
        """Build an example from a dictionary; only ``sql`` is mandatory."""
        get = d.get
        return FunctionExample(
            sql=d["sql"],
            description=get("description", ""),
            expected_output=get("expected_output"),
        )
348
+
349
+
350
@dataclass
class ResolvedMetadata:
    """Complete, resolved metadata for one function.

    Produced by resolving a Meta class hierarchy and pulling parameter
    information out of Arg descriptors.

    """

    # --- Identity ---
    name: str
    class_name: str
    function_type: CatalogFunctionType

    # --- Documentation ---
    description: str = ""
    examples: list[FunctionExample] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)
    tags: dict[str, str] = field(default_factory=dict)
    parameters: list[ParameterInfo] = field(default_factory=list)

    # --- Behavior shared by every function kind ---
    stability: FunctionStability = FunctionStability.CONSISTENT
    null_handling: NullHandling = NullHandling.DEFAULT

    # Settings the function requires.
    required_settings: list[str] = field(default_factory=list)

    # Secrets the function requires (each entry has secret_type, secret_name, scope).
    required_secrets: list[SecretLookupEntry] = field(default_factory=list)

    # --- Table functions ---
    projection_pushdown: bool = False
    filter_pushdown: bool = False
    sampling_pushdown: bool = False
    # late_materialization=True opts the table function into DuckDB's
    # late-materialization optimizer: a TOP_N/LIMIT/SAMPLE over the scan is
    # rewritten as a SEMI join on the rowid virtual column, and the surviving
    # rowids are pushed back into the wide scan as a filter. This needs a
    # unique, deterministic, snapshot-stable rowid column (is_row_id) together
    # with projection_pushdown + filter_pushdown. The worker contract is
    # described by the C++ extension's late-materialization gating.
    late_materialization: bool = False
    supported_expression_filters: list[str] = field(default_factory=list)
    preserves_order: OrderPreservation = OrderPreservation.PRESERVES_ORDER
    max_workers: int | None = None
    supports_batch_index: bool = False
    partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED

    # --- Aggregate functions ---
    order_dependent: OrderDependence = OrderDependence.NOT_ORDER_DEPENDENT
    distinct_dependent: DistinctDependence = DistinctDependence.NOT_DISTINCT_DEPENDENT
    supports_window: bool = False
    streaming_partitioned: bool = False

    # --- Table-in-out functions ---
    # True when the function has a meaningful finalize phase (it overrides
    # finalize()/finish()). The C++ extension uses this to decide whether to
    # register in_out_function_final, which DuckDB disallows alongside
    # LATERAL-projected input.
    has_finalize: bool = False

    # Only meaningful when ``function_type == TABLE_BUFFERING``. When True the
    # source phase is single-threaded and finalize_state_ids are drained in
    # the order combine() returned them. The default (False) enables parallel
    # finalize.
    source_order_dependent: bool = False

    # Only meaningful when ``function_type == TABLE_BUFFERING``. When True the
    # SINK phase runs single-threaded — every process() call arrives in source
    # order on one worker. The default (False) parallelizes ingest. Mutually
    # exclusive with requires_input_batch_index (a single thread already
    # orders; no batch_index is needed).
    sink_order_dependent: bool = False

    # Only meaningful when ``function_type == TABLE_BUFFERING``. When True the
    # C++ Sink operator declares RequiredPartitionInfo()=BatchIndex(), so
    # DuckDB threads a globally-unique monotonic batch_index from the source
    # into every process() call. Workers can accumulate (batch_index, payload)
    # tuples and sort them in combine() to reconstruct source order under
    # parallel ingest. The source must support batch_index
    # (parquet/csv/temp-table-scan do; range() does not — bind fails).
    # Mutually exclusive with sink_order_dependent.
    requires_input_batch_index: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary of every field, suitable for JSON serialization.

        Enum fields are emitted by member name; examples, parameters and
        required secrets are emitted through their own ``to_dict``.
        """
        identity = {
            "name": self.name,
            "class_name": self.class_name,
            "function_type": self.function_type.name,
        }
        documentation = {
            "description": self.description,
            "examples": [example.to_dict() for example in self.examples],
            "categories": self.categories,
            "tags": self.tags,
            "parameters": [param.to_dict() for param in self.parameters],
        }
        behavior = {
            "stability": self.stability.name,
            "null_handling": self.null_handling.name,
            "required_settings": self.required_settings,
            "required_secrets": [entry.to_dict() for entry in self.required_secrets],
        }
        table = {
            "projection_pushdown": self.projection_pushdown,
            "filter_pushdown": self.filter_pushdown,
            "sampling_pushdown": self.sampling_pushdown,
            "late_materialization": self.late_materialization,
            "supported_expression_filters": self.supported_expression_filters,
            "preserves_order": self.preserves_order.name,
            "max_workers": self.max_workers,
            "supports_batch_index": self.supports_batch_index,
            "partition_kind": self.partition_kind.name,
        }
        aggregate = {
            "order_dependent": self.order_dependent.name,
            "distinct_dependent": self.distinct_dependent.name,
            "supports_window": self.supports_window,
            "streaming_partitioned": self.streaming_partitioned,
        }
        buffering = {
            "has_finalize": self.has_finalize,
            "source_order_dependent": self.source_order_dependent,
            "sink_order_dependent": self.sink_order_dependent,
            "requires_input_batch_index": self.requires_input_batch_index,
        }
        # Merging the groups in this order keeps the key order of the result stable.
        return {**identity, **documentation, **behavior, **table, **aggregate, **buffering}

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ResolvedMetadata:
        """Build metadata from a dictionary shaped like ``to_dict`` output.

        ``name``, ``class_name`` and ``function_type`` are mandatory; every
        other key falls back to the field's default. Enum values are looked
        up by member name.
        """
        get = d.get

        def flag(key: str) -> bool:
            # Boolean options default to False when the key is absent.
            return get(key, False)

        return ResolvedMetadata(
            name=d["name"],
            class_name=d["class_name"],
            function_type=CatalogFunctionType[d["function_type"]],
            description=get("description", ""),
            examples=[FunctionExample.from_dict(item) for item in get("examples", [])],
            categories=get("categories", []),
            tags=dict(get("tags", {})),
            parameters=[ParameterInfo.from_dict(item) for item in get("parameters", [])],
            stability=FunctionStability[get("stability", "CONSISTENT")],
            null_handling=NullHandling[get("null_handling", "DEFAULT")],
            required_settings=get("required_settings", []),
            required_secrets=[SecretLookupEntry.from_dict(item) for item in get("required_secrets", [])],
            projection_pushdown=flag("projection_pushdown"),
            filter_pushdown=flag("filter_pushdown"),
            sampling_pushdown=flag("sampling_pushdown"),
            late_materialization=flag("late_materialization"),
            supported_expression_filters=get("supported_expression_filters", []),
            preserves_order=OrderPreservation[get("preserves_order", "PRESERVES_ORDER")],
            max_workers=get("max_workers"),
            supports_batch_index=flag("supports_batch_index"),
            partition_kind=PartitionKind[get("partition_kind", "NOT_PARTITIONED")],
            order_dependent=OrderDependence[get("order_dependent", "NOT_ORDER_DEPENDENT")],
            distinct_dependent=DistinctDependence[get("distinct_dependent", "NOT_DISTINCT_DEPENDENT")],
            supports_window=flag("supports_window"),
            streaming_partitioned=flag("streaming_partitioned"),
            has_finalize=flag("has_finalize"),
            source_order_dependent=flag("source_order_dependent"),
            sink_order_dependent=flag("sink_order_dependent"),
            requires_input_batch_index=flag("requires_input_batch_index"),
        )
501
+
502
+
503
+ # =============================================================================
504
+ # Parameter Extraction from Arg Descriptors
505
+ # =============================================================================
506
+
507
+
508
def _get_arg_type_info(cls: type, attr_name: str) -> tuple[str | None, bool]:
    """Look up the annotated type of an Arg attribute on ``cls``.

    Args:
        cls: Class whose type hints are inspected.
        attr_name: Name of the attribute to look up.

    Returns:
        Tuple of (type_name, is_table_input). ``(None, False)`` is returned
        when the hints cannot be resolved or the attribute has no hint.

    """
    unknown: tuple[str | None, bool] = (None, False)

    try:
        resolved = get_type_hints(cls)
    except (NameError, AttributeError):
        # NameError: a forward reference could not be resolved (common with TYPE_CHECKING).
        # AttributeError: class attributes could not be accessed during resolution.
        return unknown

    try:
        annotation = resolved[attr_name]
    except KeyError:
        return unknown

    # Identity checks for the two marker types come first.
    if annotation is TableInput:
        return ("TableInput", True)
    if annotation is AnyArrow:
        # AnyArrow means any Arrow type is accepted.
        return ("AnyArrow", False)

    # Otherwise prefer the hint's own __name__, falling back to its string form.
    label = annotation.__name__ if hasattr(annotation, "__name__") else str(annotation)
    return (label, False)
540
+
541
+
542
class TableInputValidationError(ValueError):
    """Signals that a TableInput parameter did not pass validation."""
544
+
545
+
546
class VarargsValidationError(ValueError):
    """Signals that a varargs parameter did not pass validation."""
548
+
549
+
550
def _build_constraints(arg: Arg[Any]) -> dict[str, Any]:
    """Collect the validation constraints that are set on an Arg descriptor.

    Args:
        arg: Descriptor exposing ``ge``, ``le``, ``gt``, ``lt``, ``choices``
            and ``pattern`` attributes.

    Returns:
        Dict holding only the constraints whose value is not None, in the
        order ge, le, gt, lt, choices, pattern. ``choices`` is copied into
        a list.

    """
    # Numeric bounds: keep the ones that are actually set.
    bounds = ((bound, getattr(arg, bound)) for bound in ("ge", "le", "gt", "lt"))
    constraints: dict[str, Any] = {bound: limit for bound, limit in bounds if limit is not None}

    # Remaining constraints.
    choices = arg.choices
    if choices is not None:
        constraints["choices"] = list(choices)

    pattern = arg.pattern
    if pattern is not None:
        constraints["pattern"] = pattern

    return constraints
567
+
568
+
569
def extract_parameters(cls: type, *, validate_table_input: bool = True) -> list[ParameterInfo]:
    """Collect ParameterInfo objects for every Arg declared on a function class.

    Parameters are gathered from four places, in this order; a name that has
    already been collected is skipped by the later FunctionArguments and
    legacy steps:

    1. ``cls._compute_params`` (Param API).
    2. ``cls._const_params`` (ConstParam API).
    3. ``Annotated[..., Arg(...)]`` fields of ``cls.FunctionArguments``.
    4. Arg descriptors found in the class ``__dict__`` of each class in the
       MRO (legacy API).

    Args:
        cls: The function class to extract parameters from.
        validate_table_input: If True, run the TableInput and varargs
            validators over the sorted result.

    Returns:
        List of ParameterInfo objects: positional parameters ordered by
        index first, then named parameters ordered alphabetically.

    Raises:
        TableInputValidationError: If TableInput validation fails.
        VarargsValidationError: Presumably raised when varargs validation
            fails (the validator itself is outside this function).

    """
    # Deferred import to avoid circular imports.
    from vgi.arguments import Arg

    def describe(
        name: str,
        arg: Arg[Any],
        type_name: str,
        *,
        is_table_input: bool = False,
        is_const: bool = False,
    ) -> ParameterInfo:
        # A parameter is required exactly when its Arg carries no default.
        is_required = arg.default is _MISSING
        return ParameterInfo(
            name=name,
            position=arg.position,
            type_name=type_name,
            description=arg.doc,
            required=is_required,
            default=None if is_required else arg.default,
            constraints=_build_constraints(arg),
            is_table_input=is_table_input,
            is_varargs=arg.varargs,
            is_const=is_const,
        )

    def arrow_type_name(arg: Arg[Any]) -> str:
        # The Param/ConstParam API reports the Arrow type when one is set.
        return str(arg.arrow_type) if arg.arrow_type else "any"

    collected: list[ParameterInfo] = []
    seen_names: set[str] = set()

    # Param/ConstParam API (ScalarFunction and AggregateFunction subclasses):
    # descriptors live in the _compute_params and _const_params class attributes.
    compute_params: dict[str, Arg[Any]] = getattr(cls, "_compute_params", {})
    const_params: dict[str, Arg[Any]] = getattr(cls, "_const_params", {})

    for name, arg in compute_params.items():
        seen_names.add(name)
        collected.append(describe(name, arg, arrow_type_name(arg)))

    for name, arg in const_params.items():
        seen_names.add(name)
        collected.append(describe(name, arg, arrow_type_name(arg), is_const=arg.const))

    # FunctionArguments dataclass (typed generic pattern), e.g.
    #   class MyFunc(TableFunctionGenerator[MyArgs]):
    # where MyArgs has fields like: count: Annotated[int, Arg(0, doc="...")]
    func_args_class = getattr(cls, "FunctionArguments", None)
    if func_args_class is not None:
        try:
            annotated_fields = get_type_hints(func_args_class, include_extras=True)
        except (NameError, AttributeError):
            annotated_fields = {}

        for field_name, field_hint in annotated_fields.items():
            if field_name.startswith("_") or field_name in seen_names:
                continue
            if get_origin(field_hint) is not Annotated:
                continue

            # The first Arg found in the Annotated metadata describes the field.
            base_type, *extras = get_args(field_hint)
            arg_instance: Arg[Any] | None = next(
                (extra for extra in extras if isinstance(extra, Arg)),
                None,
            )
            if arg_instance is None:
                continue

            seen_names.add(field_name)

            if base_type is TableInput:
                type_name = "TableInput"
            elif base_type is AnyArrow:
                type_name = "AnyArrow"
            elif hasattr(base_type, "__name__"):
                type_name = base_type.__name__
            else:
                type_name = str(base_type)

            collected.append(
                describe(
                    field_name,
                    arg_instance,
                    type_name,
                    is_table_input=base_type is TableInput,
                )
            )

    # Legacy API: Arg descriptors stored directly as class attributes anywhere in the MRO.
    for klass in cls.__mro__:
        if klass is object:
            continue

        for attr_name, attr_value in vars(klass).items():
            if attr_name.startswith("_") or attr_name in seen_names or not isinstance(attr_value, Arg):
                continue

            seen_names.add(attr_name)
            legacy_type_name, legacy_is_table_input = _get_arg_type_info(cls, attr_name)
            collected.append(
                describe(
                    attr_name,
                    attr_value,
                    legacy_type_name or "any",
                    is_table_input=legacy_is_table_input,
                )
            )

    # Positional parameters sort by index ahead of named ones, which sort alphabetically.
    def sort_key(p: ParameterInfo) -> tuple[int, int | str]:
        return (0, p.position) if isinstance(p.position, int) else (1, p.position)

    collected.sort(key=sort_key)

    # The single flag gates both the TableInput and the varargs checks.
    if validate_table_input:
        _validate_table_input(cls, collected)
        _validate_varargs(cls, collected)

    return collected
741
+
742
+
743
+ def _validate_table_input(cls: type, parameters: list[ParameterInfo]) -> None:
744
+ """Validate TableInput parameter constraints.
745
+
746
+ If a function has TableInput parameters, validates that:
747
+ - There is exactly one TableInput parameter
748
+ - The TableInput parameter is positional (not named)
749
+
750
+ Args:
751
+ cls: The function class being validated.
752
+ parameters: Extracted parameters.
753
+
754
+ Raises:
755
+ TableInputValidationError: If validation fails.
756
+
757
+ """
758
+ table_inputs = [p for p in parameters if p.is_table_input]
759
+
760
+ if len(table_inputs) == 0:
761
+ return # No TableInput parameters, nothing to validate
762
+
763
+ if len(table_inputs) > 1:
764
+ names = [p.name for p in table_inputs]
765
+ raise TableInputValidationError(
766
+ f"{cls.__name__}: Functions can have at most one Arg[TableInput] "
767
+ f"parameter, but found {len(table_inputs)}: {names}"
768
+ )
769
+
770
+ table_input = table_inputs[0]
771
+
772
+ # TableInput must be positional (not named)
773
+ if isinstance(table_input.position, str):
774
+ raise TableInputValidationError(
775
+ f"{cls.__name__}: TableInput parameter '{table_input.name}' must be "
776
+ f"positional (int), not named. Change from "
777
+ f"Arg[TableInput]('{table_input.position}') to "
778
+ f"Arg[TableInput](<position_index>)"
779
+ )
780
+
781
+
782
+ def _validate_varargs(cls: type, parameters: list[ParameterInfo]) -> None:
783
+ """Validate varargs parameter constraints.
784
+
785
+ If a function has varargs parameters, validates that:
786
+ - There is at most one varargs parameter
787
+ - The varargs parameter is positional (not named) - enforced by Arg.__init__
788
+ - The varargs parameter is the last positional arg (before TableInput if present)
789
+
790
+ Args:
791
+ cls: The function class being validated.
792
+ parameters: Extracted parameters.
793
+
794
+ Raises:
795
+ VarargsValidationError: If validation fails.
796
+
797
+ """
798
+ varargs_params = [p for p in parameters if p.is_varargs]
799
+
800
+ if len(varargs_params) == 0:
801
+ return # No varargs parameters, nothing to validate
802
+
803
+ if len(varargs_params) > 1:
804
+ names = [p.name for p in varargs_params]
805
+ raise VarargsValidationError(
806
+ f"{cls.__name__}: Functions can have at most one varargs parameter, "
807
+ f"but found {len(varargs_params)}: {names}"
808
+ )
809
+
810
+ varargs_param = varargs_params[0]
811
+
812
+ # Get all positional parameters (excluding TableInput)
813
+ positional_params = [p for p in parameters if isinstance(p.position, int) and not p.is_table_input]
814
+
815
+ if not positional_params:
816
+ return # Should not happen if varargs exists, but be safe
817
+
818
+ # Find the maximum position among non-varargs positional params
819
+ # All positions here are int (filtered above), but mypy doesn't know
820
+ non_varargs_positional = [p for p in positional_params if not p.is_varargs]
821
+ if non_varargs_positional:
822
+ # All positions are int (filtered by isinstance(p.position, int) above)
823
+ int_positions = [p.position for p in non_varargs_positional if isinstance(p.position, int)]
824
+ max_non_varargs_pos = max(int_positions)
825
+ # varargs position must be int (enforced by Arg.__init__)
826
+ assert isinstance(varargs_param.position, int)
827
+ if varargs_param.position < max_non_varargs_pos:
828
+ raise VarargsValidationError(
829
+ f"{cls.__name__}: Varargs parameter '{varargs_param.name}' at "
830
+ f"position {varargs_param.position} must be the last positional "
831
+ f"argument, but there are positional arguments after it"
832
+ )
833
+
834
+
835
+ # =============================================================================
836
+ # Metadata Resolution
837
+ # =============================================================================
838
+
839
+
840
+ def _normalize_examples(
841
+ examples: list[FunctionExample | str],
842
+ ) -> list[FunctionExample]:
843
+ """Convert string examples to FunctionExample objects."""
844
+ return [FunctionExample(sql=ex) if isinstance(ex, str) else ex for ex in examples]
845
+
846
+
847
# Lookup table: recognized base-class name -> CatalogFunctionType.
# Keyed by class *name* (not the class object) to avoid circular imports.
# A dict avoids typos and provides O(1) lookup.
# Note: Functions with an Arg[TableInput] parameter receive table input.
_CLASS_NAME_TO_FUNCTION_TYPE: dict[str, CatalogFunctionType] = {
    # Buffered table function (Sink+Source). ``_infer_function_type`` walks the
    # class MRO and returns on the first name found in this table, so for
    # TableBufferingFunction subclasses this more-specific entry wins only
    # because "TableBufferingFunction" is reached in the MRO before
    # "TableFunctionBase".
    "TableBufferingFunction": CatalogFunctionType.TABLE_BUFFERING,
    # Streaming table functions (TableFunctionGenerator + TableInOutGenerator).
    "TableFunctionBase": CatalogFunctionType.TABLE,
    "AggregateFunction": CatalogFunctionType.AGGREGATE,
    "ScalarFunction": CatalogFunctionType.SCALAR,
    "ScalarFunctionGenerator": CatalogFunctionType.SCALAR,
}
862
+
863
+ # Valid Meta class attribute names (for typo detection)
864
+ _VALID_META_ATTRIBUTES: frozenset[str] = frozenset(
865
+ {
866
+ # Common
867
+ "name",
868
+ "description",
869
+ "examples",
870
+ "categories",
871
+ "tags",
872
+ "stability",
873
+ "null_handling",
874
+ "required_settings", # settings/pragmas required by function
875
+ "required_secrets", # secrets required by function
876
+ # Table function specific
877
+ "projection_pushdown",
878
+ "filter_pushdown",
879
+ "sampling_pushdown",
880
+ "late_materialization", # Participate in DuckDB late-materialization rewrite
881
+ "supported_expression_filters",
882
+ "auto_apply_filters", # Auto-apply pushdown filters to output batches
883
+ "preserves_order",
884
+ "max_workers",
885
+ "supports_batch_index", # opt-in to per-batch batch_index tagging (parallel + ordered sink)
886
+ "partition_kind", # opt-in to PartitionColumns mode for Hive-style partitioning
887
+ # Table-in-out specific: explicit override for the has_finalize auto-detection.
888
+ # Set to True or False to force the emitted ``in_out_function_final``
889
+ # registration bit; leave unset (None) to auto-detect from finish/finalize.
890
+ "has_finalize",
891
+ # Buffered table function knobs (only meaningful when the class is a
892
+ # TableBufferingFunction subclass — function_type == TABLE_BUFFERING).
893
+ # When True, source phase is single-threaded and finalize_state_ids
894
+ # drain in combine-returned order.
895
+ "source_order_dependent",
896
+ # When True, the SINK phase runs single-threaded — process() calls
897
+ # arrive in source order on one worker.
898
+ "sink_order_dependent",
899
+ # When True, DuckDB threads a globally-unique monotonic batch_index
900
+ # from the source into every process() call. Worker can reconstruct
901
+ # source order in combine() by sorting accumulated (batch_index,
902
+ # payload) tuples.
903
+ "requires_input_batch_index",
904
+ # Aggregate function specific
905
+ "order_dependent",
906
+ "distinct_dependent",
907
+ "supports_window",
908
+ "streaming_partitioned",
909
+ # Scalar function specific
910
+ "output_type", # pa.DataType | type[AnyArrow] for scalar functions
911
+ }
912
+ )
913
+
914
+
915
class FunctionTypeError(TypeError):
    """Signals that no recognized base class was found when inferring a function's type."""
917
+
918
+
919
def _infer_function_type(cls: type) -> CatalogFunctionType:
    """Map a function class to its ``CatalogFunctionType`` via its MRO.

    The MRO is scanned in order and the first class whose ``__name__`` is a
    key of ``_CLASS_NAME_TO_FUNCTION_TYPE`` decides the result.

    Raises:
        FunctionTypeError: If no recognized base class is found in the MRO.

    """
    for ancestor in cls.__mro__:
        function_type = _CLASS_NAME_TO_FUNCTION_TYPE.get(ancestor.__name__)
        if function_type is not None:
            return function_type

    recognized_bases = sorted(_CLASS_NAME_TO_FUNCTION_TYPE)
    raise FunctionTypeError(
        f"Cannot determine function type for {cls.__name__}. Class must inherit from one of: {recognized_bases}"
    )
933
+
934
+
935
@functools.lru_cache(maxsize=256)
def resolve_metadata(cls: type) -> ResolvedMetadata:
    """Resolve metadata for a function class.

    Results are cached (``functools.lru_cache``), so repeated calls for the
    same class return the same ``ResolvedMetadata`` object. For that reason
    list-valued ``Meta`` attributes are copied before being stored on the
    result: the cached object must not alias lists owned by the user's
    ``Meta`` class.

    This function:
    1. Walks the class hierarchy to find and merge Meta classes
       (derived classes override base classes)
    2. Warns about unknown Meta attributes and cross-validates the
       buffered-table flags against the inferred function type
    3. Infers function name from class name if not specified
    4. Uses the first docstring line as description fallback
    5. Extracts parameter info from Arg descriptors
    6. Merges annotation-derived setting/secret requirements with the ones
       declared on Meta

    Args:
        cls: The function class to resolve metadata for.

    Returns:
        ResolvedMetadata with all resolved values.

    Raises:
        FunctionTypeError: If the function type cannot be inferred from the
            class hierarchy.
        TypeError: If a buffered-table-only Meta flag is set on a class that
            is not TABLE_BUFFERING, or if ``sink_order_dependent`` and
            ``requires_input_batch_index`` are both set.
        ValueError: If ``Meta.partition_kind`` disagrees with a static
            ``FIXED_SCHEMA`` (raised by ``_validate_partition_kind``).

    """
    # Collect all attributes from Meta classes in MRO
    attrs: dict[str, Any] = {}

    # Walk MRO in reverse so derived classes override base classes
    for klass in reversed(cls.__mro__):
        if klass is object:
            continue

        # Only a Meta defined directly on this class is considered here; an
        # inherited Meta is handled when its defining class is visited.
        if "Meta" not in klass.__dict__:
            continue

        meta_class = klass.__dict__["Meta"]

        # Extract class attributes defined directly on this Meta class
        for attr_name, value in vars(meta_class).items():
            if attr_name.startswith("_"):
                continue
            # Skip methods (but keep classes, which are callable too)
            if callable(value) and not isinstance(value, type):
                continue
            attrs[attr_name] = value

    # Warn about unknown Meta attributes (likely typos)
    unknown_attrs = set(attrs.keys()) - _VALID_META_ATTRIBUTES
    if unknown_attrs:
        warnings.warn(
            f"{cls.__name__}.Meta has unknown attributes: {sorted(unknown_attrs)}. "
            f"Valid attributes are: {sorted(_VALID_META_ATTRIBUTES)}",
            stacklevel=2,
        )

    # Infer function type from class hierarchy. TableBufferingFunction
    # subclasses resolve to ``CatalogFunctionType.TABLE_BUFFERING`` — that's
    # the single source of truth for the C++ optimizer rewriter, not a
    # separate Meta flag.
    function_type = _infer_function_type(cls)
    is_buffering = function_type is CatalogFunctionType.TABLE_BUFFERING

    # Cross-flag validation for the buffered table path.
    if attrs.get("source_order_dependent") and not is_buffering:
        raise TypeError(
            f"{cls.__name__}: Meta.source_order_dependent is only meaningful on TableBufferingFunction subclasses"
        )
    if attrs.get("sink_order_dependent") and not is_buffering:
        raise TypeError(
            f"{cls.__name__}: Meta.sink_order_dependent is only meaningful on TableBufferingFunction subclasses"
        )
    if attrs.get("requires_input_batch_index") and not is_buffering:
        raise TypeError(
            f"{cls.__name__}: Meta.requires_input_batch_index is only meaningful on TableBufferingFunction subclasses"
        )
    if attrs.get("sink_order_dependent") and attrs.get("requires_input_batch_index"):
        raise TypeError(
            f"{cls.__name__}: Meta.sink_order_dependent and "
            f"Meta.requires_input_batch_index are mutually exclusive — "
            f"single-threaded sink already orders process() calls; "
            f"batch_index is only useful under parallel ingest"
        )

    # Use class name as default name, converting to snake_case
    class_name = cls.__name__
    if "name" in attrs and attrs["name"]:
        name = attrs["name"]
    else:
        # Convert CamelCase to snake_case
        name = re.sub(r"(?<!^)(?=[A-Z])", "_", class_name).lower()
        # Remove common suffixes (only the first one that matches)
        for suffix in ["_function", "_func"]:
            if name.endswith(suffix):
                name = name[: -len(suffix)]
                break

    # Use docstring as fallback description (first line only)
    description = attrs.get("description", "")
    if not description and cls.__doc__:
        description = cls.__doc__.strip().split("\n")[0]

    # Normalize examples
    examples = _normalize_examples(attrs.get("examples", []))

    # Extract parameters from Arg descriptors
    parameters = extract_parameters(cls)

    # Merge annotation-derived setting/secret keys into required_settings/required_secrets
    meta_required_settings: list[str] = list(attrs.get("required_settings", []))

    # Build required_secrets from Meta and annotations. Entries that are
    # neither SecretLookupEntry nor dict are ignored.
    meta_required_secrets_raw = attrs.get("required_secrets", [])
    meta_required_secrets: list[SecretLookupEntry] = []
    for entry in meta_required_secrets_raw:
        if isinstance(entry, SecretLookupEntry):
            meta_required_secrets.append(entry)
        elif isinstance(entry, dict):
            meta_required_secrets.append(SecretLookupEntry.from_dict(entry))

    # Auto-populate from _setting_params / _secret_params class vars (set by __init_subclass__)
    annotation_setting_keys: set[str] = set()

    setting_params: dict[str, str] = getattr(cls, "_setting_params", {})
    secret_params: dict[str, Secret] = getattr(cls, "_secret_params", {})
    annotation_setting_keys.update(setting_params.values())

    # Union with Meta-declared keys, deduped: Meta order first, then the
    # annotation-derived keys in sorted order.
    existing_settings = set(meta_required_settings)
    for key in sorted(annotation_setting_keys):
        if key not in existing_settings:
            meta_required_settings.append(key)

    # Add annotation-derived secret requirements (one entry per secret type)
    existing_secret_types = {e.secret_type for e in meta_required_secrets}
    for secret in secret_params.values():
        if secret.secret_type not in existing_secret_types:
            meta_required_secrets.append(
                SecretLookupEntry(
                    secret_type=secret.secret_type,
                    secret_name=secret.name,
                    scope=secret.scope,
                )
            )
            existing_secret_types.add(secret.secret_type)

    return ResolvedMetadata(
        name=name,
        class_name=class_name,
        function_type=function_type,
        description=description,
        examples=examples,
        # Copied: the cached result must not share the Meta class's list.
        categories=list(attrs.get("categories") or ()),
        tags=dict(attrs.get("tags", {})),
        parameters=parameters,
        stability=attrs.get("stability", FunctionStability.CONSISTENT),
        null_handling=attrs.get("null_handling", NullHandling.DEFAULT),
        required_settings=meta_required_settings,
        required_secrets=meta_required_secrets,
        projection_pushdown=attrs.get("projection_pushdown", False),
        filter_pushdown=attrs.get("filter_pushdown", False),
        sampling_pushdown=attrs.get("sampling_pushdown", False),
        late_materialization=bool(attrs.get("late_materialization", False)),
        # Copied for the same reason as ``categories``.
        supported_expression_filters=list(attrs.get("supported_expression_filters") or ()),
        preserves_order=attrs.get("preserves_order", OrderPreservation.PRESERVES_ORDER),
        max_workers=attrs.get("max_workers"),
        supports_batch_index=bool(attrs.get("supports_batch_index", False)),
        partition_kind=_validate_partition_kind(cls, attrs.get("partition_kind", PartitionKind.NOT_PARTITIONED)),
        order_dependent=attrs.get("order_dependent", OrderDependence.NOT_ORDER_DEPENDENT),
        distinct_dependent=attrs.get("distinct_dependent", DistinctDependence.NOT_DISTINCT_DEPENDENT),
        supports_window=bool(attrs.get("supports_window", False)),
        streaming_partitioned=bool(attrs.get("streaming_partitioned", False)),
        # TABLE_BUFFERING implies has_finalize — the buffered path always
        # invokes the worker's finalize phase (it's the whole point).
        has_finalize=(_detect_has_finalize(cls, function_type) or is_buffering),
        source_order_dependent=bool(attrs.get("source_order_dependent", False)),
        sink_order_dependent=bool(attrs.get("sink_order_dependent", False)),
        requires_input_batch_index=bool(attrs.get("requires_input_batch_index", False)),
    )
1109
+
1110
+
1111
def _validate_partition_kind(cls: type, kind: PartitionKind) -> PartitionKind:
    """Cross-check ``Meta.partition_kind`` against a static bind schema.

    The check only runs when ``cls.FIXED_SCHEMA`` exists and is a
    ``pa.Schema``. In that case the two declarations must agree:

    * ``kind != NOT_PARTITIONED`` requires at least one field whose metadata
      maps ``VGI_PARTITION_COLUMN_KEY`` to ``b"true"``.
    * Any field annotated that way requires ``kind != NOT_PARTITIONED``.

    When there is no ``pa.Schema`` under ``FIXED_SCHEMA`` (dynamic schema, or
    not a table function) nothing is checked here; per the original notes the
    C++ extension validates at bind time instead.

    Args:
        cls: The function class; ``FIXED_SCHEMA`` and ``__name__`` are read.
        kind: The partition kind declared on Meta (or the default).

    Returns:
        ``kind``, unchanged.

    Raises:
        ValueError: If the schema annotations and ``kind`` disagree.

    """
    schema = getattr(cls, "FIXED_SCHEMA", None)
    if not isinstance(schema, pa.Schema):
        # Dynamic schema or not a table function — defer to bind-time
        # validation in the C++ extension.
        return kind

    # Imported lazily, only once a static schema is known to exist.
    from vgi.schema_utils import VGI_PARTITION_COLUMN_KEY

    annotated_fields: list[str] = [
        fld.name
        for fld in schema
        if fld.metadata is not None and fld.metadata.get(VGI_PARTITION_COLUMN_KEY) == b"true"
    ]

    if annotated_fields and kind == PartitionKind.NOT_PARTITIONED:
        raise ValueError(
            f"{cls.__name__}: bind schema has partition-annotated field(s) "
            f"{annotated_fields!r} but Meta.partition_kind is NOT_PARTITIONED. "
            f"Set Meta.partition_kind to a non-default PartitionKind, or "
            f"remove the partition_field() annotations."
        )
    if not annotated_fields and kind != PartitionKind.NOT_PARTITIONED:
        raise ValueError(
            f"{cls.__name__}: Meta.partition_kind is {kind.name} but no bind "
            f"schema field is annotated with vgi.partition_column. Use "
            f"vgi.schema_utils.partition_field(name, type) to mark the "
            f"column(s) that satisfy the partition contract, or set "
            f"Meta.partition_kind back to NOT_PARTITIONED."
        )

    return kind
1165
+
1166
+
1167
def _detect_has_finalize(cls: type, function_type: CatalogFunctionType) -> bool:
    """Decide the ``has_finalize`` flag for a function class.

    * ``TABLE_BUFFERING``: always ``True``. The Sink+Source path emits output
      exclusively in the Source phase, so nothing is detected from the class.
    * Any type other than ``TABLE``: ``False``.
    * ``TABLE``: the answer comes from ``cls.has_finalize_override()`` when
      ``cls`` is a ``TableInOutGenerator`` subclass, otherwise ``False``.
      ``False`` is also returned if ``TableInOutGenerator`` cannot be imported.
    """
    if function_type is CatalogFunctionType.TABLE_BUFFERING:
        return True
    if function_type is not CatalogFunctionType.TABLE:
        return False

    # Lazy import to avoid a circular dependency.
    try:
        from vgi.table_in_out_function import TableInOutGenerator
    except ImportError:  # pragma: no cover
        return False

    if issubclass(cls, TableInOutGenerator):
        return cls.has_finalize_override()
    return False
1190
+
1191
+
1192
+ # =============================================================================
1193
+ # Arrow Serialization
1194
+ # =============================================================================
1195
+
1196
# Struct layout of one entry in the "examples" list column
_EXAMPLE_STRUCT = pa.struct(
    [
        pa.field("sql", pa.string()),
        pa.field("description", pa.string()),
        pa.field("expected_output", pa.string(), nullable=True),
    ]
)

# Struct layout of one entry in the "required_secrets" list column
_SECRET_REQUIREMENT_STRUCT = pa.struct(
    [
        pa.field("secret_type", pa.string()),
        pa.field("secret_name", pa.string(), nullable=True),
        pa.field("scope", pa.string(), nullable=True),
    ]
)

# Struct layout of one entry in the "parameters" list column
_PARAMETER_STRUCT = pa.struct(
    [
        pa.field("name", pa.string()),
        pa.field("position", pa.int32(), nullable=True),
        pa.field("position_name", pa.string(), nullable=True),
        pa.field("type_name", pa.string(), nullable=True),
        pa.field("description", pa.string()),
        pa.field("required", pa.bool_()),
        pa.field("default", pa.string(), nullable=True),
        pa.field("constraints", pa.string(), nullable=True),  # JSON for flexibility
        pa.field("is_table_input", pa.bool_()),
        pa.field("is_varargs", pa.bool_()),
        pa.field("is_const", pa.bool_()),
    ]
)

# Arrow schema used when serializing function metadata (one row per function)
_METADATA_SCHEMA = pa.schema(
    [
        pa.field("name", pa.string()),
        pa.field("class_name", pa.string()),
        pa.field("function_type", pa.string()),
        pa.field("description", pa.string()),
        pa.field("examples", pa.list_(_EXAMPLE_STRUCT)),
        pa.field("categories", pa.list_(pa.string())),
        pa.field("tags", pa.map_(pa.string(), pa.string())),
        pa.field("parameters", pa.list_(_PARAMETER_STRUCT)),
        pa.field("stability", pa.string()),
        pa.field("null_handling", pa.string()),
        pa.field("required_settings", pa.list_(pa.string())),
        pa.field("required_secrets", pa.list_(_SECRET_REQUIREMENT_STRUCT)),
        pa.field("projection_pushdown", pa.bool_()),
        pa.field("filter_pushdown", pa.bool_()),
        pa.field("sampling_pushdown", pa.bool_()),
        pa.field("late_materialization", pa.bool_()),
        pa.field("supported_expression_filters", pa.list_(pa.string())),
        pa.field("preserves_order", pa.string()),
        pa.field("max_workers", pa.int32(), nullable=True),
        pa.field("supports_batch_index", pa.bool_()),
        pa.field("partition_kind", pa.string()),
        pa.field("order_dependent", pa.string()),
        pa.field("distinct_dependent", pa.string()),
        pa.field("supports_window", pa.bool_()),
        pa.field("streaming_partitioned", pa.bool_()),
        pa.field("has_finalize", pa.bool_()),
        pa.field("source_order_dependent", pa.bool_()),
        pa.field("sink_order_dependent", pa.bool_()),
        pa.field("requires_input_batch_index", pa.bool_()),
    ]
)

# List-typed columns: a None cell is turned into [] during deserialization
_LIST_FIELDS: frozenset[str] = frozenset(
    {
        "examples",
        "categories",
        "parameters",
        "required_settings",
        "required_secrets",
        "supported_expression_filters",
    }
)

# Map-typed columns: a None cell is turned into {} during deserialization
_MAP_FIELDS: frozenset[str] = frozenset({"tags"})
1273
+
1274
+
1275
+ def _extract_arrow_row(columns: dict[str, list[Any]], index: int) -> dict[str, Any]:
1276
+ """Extract a single row from Arrow columnar data as a dict.
1277
+
1278
+ Handles None values for list fields (converts None to [])
1279
+ and map fields (converts None to {}).
1280
+ """
1281
+ result: dict[str, Any] = {}
1282
+ for field_name, values in columns.items():
1283
+ value = values[index]
1284
+ if value is None:
1285
+ if field_name in _LIST_FIELDS:
1286
+ result[field_name] = []
1287
+ elif field_name in _MAP_FIELDS:
1288
+ result[field_name] = {}
1289
+ else:
1290
+ result[field_name] = value
1291
+ else:
1292
+ result[field_name] = value
1293
+ return result
1294
+
1295
+
1296
def metadata_to_arrow(metadata: ResolvedMetadata) -> pa.RecordBatch:
    """Serialize a single ResolvedMetadata to Arrow RecordBatch.

    Args:
        metadata: The metadata to serialize.

    Returns:
        RecordBatch with one row containing the metadata, built against
        ``_METADATA_SCHEMA``.

    """
    # Each column of a single-row batch is a one-element list.
    single_row_columns = {}
    for field, value in metadata.to_dict().items():
        single_row_columns[field] = [value]
    return pa.RecordBatch.from_pydict(single_row_columns, schema=_METADATA_SCHEMA)
1310
+
1311
+
1312
def arrow_to_metadata(batch: pa.RecordBatch) -> ResolvedMetadata:
    """Deserialize Arrow RecordBatch to ResolvedMetadata.

    Args:
        batch: RecordBatch with one row containing metadata.

    Returns:
        Deserialized ResolvedMetadata.

    Raises:
        ValueError: If the batch does not contain exactly one row.

    """
    row_count = batch.num_rows
    if row_count != 1:
        raise ValueError(f"Expected 1 row, got {row_count}")

    only_row = _extract_arrow_row(batch.to_pydict(), 0)
    return ResolvedMetadata.from_dict(only_row)
1328
+
1329
+
1330
def metadatas_to_arrow(metadatas: Sequence[ResolvedMetadata]) -> pa.RecordBatch:
    """Serialize multiple ResolvedMetadata objects to Arrow RecordBatch.

    Args:
        metadatas: Sequence of ResolvedMetadata objects to serialize.

    Returns:
        RecordBatch with one row per metadata object (zero rows for an
        empty sequence), built against ``_METADATA_SCHEMA``.

    """
    # One (initially empty) column per schema field.
    columns: dict[str, list[Any]] = {field.name: [] for field in _METADATA_SCHEMA}

    if metadatas:
        # Transpose the row dicts into the columnar lists.
        for item in metadatas:
            for key, value in item.to_dict().items():
                columns[key].append(value)

    return pa.RecordBatch.from_pydict(columns, schema=_METADATA_SCHEMA)
1352
+
1353
+
1354
def functions_to_arrow(function_classes: Sequence[type]) -> pa.RecordBatch:
    """Serialize multiple function classes to Arrow RecordBatch.

    Resolves each class with ``resolve_metadata`` and hands the results to
    ``metadatas_to_arrow``. For pre-resolved metadata, use
    metadatas_to_arrow() directly.

    Args:
        function_classes: Sequence of function classes to serialize.

    Returns:
        RecordBatch with one row per function.

    """
    resolved = []
    for function_class in function_classes:
        resolved.append(resolve_metadata(function_class))
    return metadatas_to_arrow(resolved)
1368
+
1369
+
1370
def arrow_to_functions(batch: pa.RecordBatch) -> list[ResolvedMetadata]:
    """Deserialize Arrow RecordBatch to list of ResolvedMetadata.

    Args:
        batch: RecordBatch with one row per function.

    Returns:
        List of deserialized ResolvedMetadata objects, in row order.

    """
    columns = batch.to_pydict()
    functions = []
    for row_index in range(batch.num_rows):
        row = _extract_arrow_row(columns, row_index)
        functions.append(ResolvedMetadata.from_dict(row))
    return functions
1382
+
1383
+
1384
+ # =============================================================================
1385
+ # Mixin for Function Classes
1386
+ # =============================================================================
1387
+
1388
+
1389
class MetadataMixin:
    """Mixin exposing a function class's resolved metadata via classmethods.

    Add this to the base Function class to enable metadata resolution.
    """

    @classmethod
    def get_metadata(cls) -> ResolvedMetadata:
        """Return what ``resolve_metadata`` produces for this class."""
        resolved = resolve_metadata(cls)  # type: ignore[arg-type]
        return resolved

    @classmethod
    def describe(cls) -> dict[str, Any]:
        """Return the resolved metadata as a dictionary (for JSON serialization)."""
        metadata = cls.get_metadata()
        return metadata.to_dict()