vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/table_function.py ADDED
@@ -0,0 +1,1130 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Base classes for table functions with cardinality hints and callback-based processing.
4
+
5
+ TableFunctionGenerator produces output batches via a per-tick callback. Each call
6
+ to process() either emits a batch via out.emit() or signals completion via out.finish().
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import uuid
12
+ from abc import abstractmethod
13
+ from collections.abc import Mapping
14
+ from dataclasses import dataclass, is_dataclass
15
+ from enum import Enum, auto
16
+ from typing import (
17
+ TYPE_CHECKING,
18
+ Annotated,
19
+ Any,
20
+ ClassVar,
21
+ TypeVar,
22
+ final,
23
+ get_args,
24
+ get_origin,
25
+ get_type_hints,
26
+ )
27
+
28
+ import pyarrow as pa
29
+ from vgi_rpc import ArrowSerializableDataclass
30
+ from vgi_rpc.rpc import AuthContext, CallContext, OutputCollector
31
+
32
+ import vgi.function
33
+ from vgi.arguments import (
34
+ Arg,
35
+ Arguments,
36
+ Secret,
37
+ SecretLookupEntry,
38
+ TableInput,
39
+ _accepts_none,
40
+ _extract_setting_secret_params,
41
+ )
42
+ from vgi.function_storage import BoundStorage, TransactionBoundStorage, attach_catalog_bytes
43
+ from vgi.invocation import (
44
+ BaseInitResponse,
45
+ BindResponse,
46
+ GlobalInitResponse,
47
+ )
48
+
49
+ if TYPE_CHECKING:
50
+ from vgi.catalog.catalog_interface import ColumnStatistics
51
+ from vgi.protocol import BindRequest, InitRequest
52
+ from vgi.table_filter_pushdown import PushdownFilters
53
+
54
+ _ON_CANCEL_CAVEATS = """\
55
+ **Best-effort only.** This hook does not fire in every
56
+ cancellation path — process kills, network partitions, and
57
+ some error-on-error unwinds skip it. Never rely on
58
+ ``on_cancel`` for correctness-critical cleanup; treat it as a
59
+ resource-release optimization.
60
+
61
+ Under HTTP pooling with ``max_workers > 1``, ``on_cancel`` may
62
+ fire on a different worker process than the one that produced
63
+ batches for this stream. Process-local resources held in a
64
+ specific worker's memory cannot be reliably released from
65
+ another worker's ``on_cancel``; prefer shared infrastructure
66
+ whose handle is re-derivable from the serialized state."""
67
+
68
+ __all__ = [
69
+ "TableCardinality",
70
+ "BindParams",
71
+ "InitParams",
72
+ "ProcessParams",
73
+ "SecretsAccessor",
74
+ "TableFunctionBase",
75
+ "TableFunctionGenerator",
76
+ "TableInOutFunctionInitPhase",
77
+ "init_single_worker",
78
+ "bind_fixed_schema",
79
+ "_struct_scalar_to_dict",
80
+ "_extract_setting_secret_params",
81
+ ]
82
+
83
+
84
@dataclass(frozen=True, slots=True)
class TableCardinality(ArrowSerializableDataclass):
    """Optional row-count hints a table function can report for query optimization.

    The figures are estimates that can help a query planner with decisions such
    as join ordering, memory allocation, and parallelization. Either value may
    be ``None`` when it is not known.
    """

    # Estimated number of output rows; None when no estimate is available.
    estimate: int | None
    # Maximum possible number of output rows; None when the output is unbounded.
    max: int | None
99
+
100
+
101
def _batch_to_scalar_dict(batch: pa.RecordBatch | None) -> dict[str, pa.Scalar[Any]]:
    """Map each column name of ``batch`` to the value in that column's first row.

    Args:
        batch: A RecordBatch expected to carry one row, or None.

    Returns:
        A dict keyed by column name; empty when ``batch`` is None.

    """
    first_row: dict[str, pa.Scalar[Any]] = {}
    if batch is not None:
        for position, column_name in enumerate(batch.schema.names):
            # Index 0 is the only row read; any further rows are ignored.
            first_row[column_name] = batch.column(position)[0]
    return first_row
106
+
107
+
108
def _struct_scalar_to_dict(scalar: pa.StructScalar) -> dict[str, pa.Scalar[Any]]:
    """Copy a struct scalar into a plain dict of field name to field scalar.

    Iterating ``scalar`` yields its keys; each key is then looked up on the
    scalar itself to obtain the value stored under it.
    """
    fields: dict[str, pa.Scalar[Any]] = {}
    for field_name in scalar:
        fields[field_name] = scalar[field_name]
    return fields
111
+
112
+
113
class SecretsAccessor:
    """Single entry point for secrets that are already resolved or still pending.

    Secrets delivered with the request (``Secret()`` annotations with a static
    scope/name, or unscoped lookups) can be read straight away. A lookup that
    cannot be answered on the first pass is recorded as a pending request; the
    framework uses those pending requests to drive a two-phase bind retry.
    """

    __slots__ = ("_unscoped", "_scoped", "_is_retry", "_pending_lookups")

    def __init__(self, secrets_batch: pa.RecordBatch | None, *, is_retry: bool = False) -> None:
        """Index the columns of ``secrets_batch`` (first row only).

        Args:
            secrets_batch: Secrets delivered with the request, or None.
            is_retry: True when this is the second bind pass, i.e. resolved
                secrets have already been supplied.

        """
        self._is_retry = is_retry
        self._pending_lookups: list[SecretLookupEntry] = []
        # Unscoped secrets: the column name is the secret type.
        self._unscoped: dict[str, dict[str, pa.Scalar[Any]]] = {}
        # Scoped secrets: "secret_N" columns, described by Arrow field metadata.
        # The second tuple element is None when the column value is null.
        self._scoped: list[tuple[dict[str, str], dict[str, pa.Scalar[Any]] | None]] = []

        if secrets_batch is None:
            return

        for position, column_name in enumerate(secrets_batch.schema.names):
            column_field = secrets_batch.schema.field(position)
            value = secrets_batch.column(position)[0]

            if not column_name.startswith("secret_"):
                # Unscoped column: null values are simply left out.
                if value.is_valid:
                    self._unscoped[column_name] = _struct_scalar_to_dict(value)
                continue

            # Scoped column: normalise the field metadata to str -> str.
            decoded_meta: dict[str, str] = {}
            for raw_key, raw_value in (column_field.metadata or {}).items():
                text_key = raw_key.decode() if isinstance(raw_key, bytes) else raw_key
                text_value = raw_value.decode() if isinstance(raw_value, bytes) else raw_value
                decoded_meta[text_key] = text_value

            expanded = _struct_scalar_to_dict(value) if value.is_valid else None
            self._scoped.append((decoded_meta, expanded))

    def get(
        self,
        secret_type: str,
        *,
        name: str | None = None,
        scope: str | None = None,
        required: bool = False,
    ) -> dict[str, pa.Scalar[Any]] | None:
        """Look up a secret by type, optionally narrowed by name and/or scope.

        On the first pass (``is_retry`` false) a lookup that cannot be answered
        is recorded in the pending list and None is returned, regardless of
        ``required``. On the retry pass nothing is recorded any more.

        Args:
            secret_type: The secret type (e.g., "vgi_example", "s3").
            name: Optional secret name; compared for equality with the
                ``secret_name`` metadata of scoped entries on the retry pass.
            scope: Optional scope; compared for equality with the ``scope``
                metadata of scoped entries on the retry pass.
            required: If True, a secret still missing on the retry pass raises
                instead of returning None.

        Returns:
            dict of string keys to Arrow scalars, or None if not found.

        Raises:
            ValueError: ``required`` is True and the secret is missing on the
                retry pass.

        """
        if scope or name:
            # Scoped / named lookup.
            if not self._is_retry:
                # First pass: ask the framework to resolve it.
                self._pending_lookups.append(SecretLookupEntry(secret_type=secret_type, scope=scope, secret_name=name))
                return None
            match = self._find_scoped(secret_type, name, scope)
            if match is None and required:
                raise ValueError(f"Required secret '{secret_type}' not found (scope={scope!r}, name={name!r})")
            return match

        # Plain unscoped lookup.
        match = self._unscoped.get(secret_type)
        if match is not None:
            return match
        if not self._is_retry:
            # First pass and nothing delivered yet: register a pending lookup.
            self._pending_lookups.append(SecretLookupEntry(secret_type=secret_type))
            return None
        # Retry pass and still absent: the secret genuinely does not exist.
        if required:
            raise ValueError(f"Required secret '{secret_type}' not found")
        return None

    @property
    def all_resolved(self) -> bool:
        """True when no lookup is pending.

        Lets callers that do not pass ``required=True`` to ``get()`` tell
        "not resolved yet" apart from "genuinely not found".
        """
        return not self._pending_lookups

    @property
    def needs_resolution(self) -> bool:
        """True when at least one lookup is pending."""
        return bool(self._pending_lookups)

    @property
    def pending_lookups(self) -> list[SecretLookupEntry]:
        """A copy of the pending lookups, in the order they were recorded."""
        return self._pending_lookups.copy()

    def to_dict(self) -> dict[str, dict[str, pa.Scalar[Any]]]:
        """Flatten every resolved secret into one dict keyed by secret type.

        Unscoped entries come first; scoped entries are then added under the
        ``secret_type`` found in their field metadata, replacing an unscoped
        entry of the same type. Scoped entries that are null or carry no
        ``secret_type`` are left out.
        """
        flattened = {**self._unscoped}
        for meta, fields in self._scoped:
            type_key = meta.get("secret_type", "")
            if fields is not None and type_key:
                flattened[type_key] = fields
        return flattened

    def _find_scoped(
        self,
        secret_type: str,
        name: str | None,
        scope: str | None,
    ) -> dict[str, pa.Scalar[Any]] | None:
        """Return the first scoped entry matching the type and any given name/scope.

        A criterion that is None is not checked. The matching entry's value is
        returned as stored, so the result can be None for a null entry.
        """
        for meta, fields in self._scoped:
            type_ok = meta.get("secret_type") == secret_type
            scope_ok = scope is None or meta.get("scope") == scope
            name_ok = name is None or meta.get("secret_name") == name
            if type_ok and scope_ok and name_ok:
                return fields
        return None
251
+
252
+
253
def project_schema(projection_ids: list[int] | None, schema: pa.Schema) -> pa.Schema:
    """Return ``schema`` narrowed to ``projection_ids``, or unchanged when there are none.

    Args:
        projection_ids: Field indices to keep, in output order, or None for
            no projection.
        schema: The full schema to project.

    Returns:
        ``schema`` itself when ``projection_ids`` is None; otherwise a new
        schema built from the selected fields.

    """
    if projection_ids is None:
        return schema
    selected_fields = [schema.field(field_index) for field_index in projection_ids]
    return pa.schema(selected_fields)
258
+
259
+
260
def _effective_projection_ids(func_cls: Any, projection_ids: list[int] | None) -> list[int] | None:
    """Pass ``projection_ids`` through only when ``func_cls`` supports projection pushdown.

    The function's metadata is consulted only if ``projection_ids`` is not
    None; in every other case the result is None.
    """
    if projection_ids is None:
        return None
    supports_pushdown = func_cls.get_metadata().projection_pushdown
    return projection_ids if supports_pushdown else None
265
+
266
+
267
class TableInOutFunctionInitPhase(Enum):
    """Which phase an init call for a table function belongs to.

    * ``INPUT`` and ``FINALIZE`` drive the streaming TableInOutGenerator path.
    * ``TABLE_BUFFERING`` is the Sink+Source init phase for
      ``TableBufferingFunction``; after init, traffic moves to
      ``table_buffering_process`` / ``_combine`` (unary).
    * ``TABLE_BUFFERING_FINALIZE`` opens a producer-mode finalize stream per
      finalize_state_id.
    """

    INPUT = auto()
    FINALIZE = auto()
    TABLE_BUFFERING = auto()
    TABLE_BUFFERING_FINALIZE = auto()
282
+
283
+
284
class OrderByDirection(Enum):
    """Sort direction of an ORDER BY pushed down from DuckDB's RowGroupPruner optimizer."""

    ASC = auto()
    DESC = auto()
289
+
290
+
291
class OrderByNullOrder(Enum):
    """Placement of NULLs in an ORDER BY pushed down from DuckDB's RowGroupPruner optimizer."""

    NULLS_FIRST = auto()
    NULLS_LAST = auto()
296
+
297
+
298
+ @dataclass(slots=True, frozen=True, kw_only=True)
299
+ class BindParams[TArgs]:
300
+ """Parameters passed to on_bind()."""
301
+
302
+ args: TArgs
303
+ bind_call: BindRequest
304
+ # Convenient access to settings and secrets, extracted from the bind_call.
305
+ settings: dict[str, pa.Scalar[Any]]
306
+ secrets: SecretsAccessor
307
+ # Transaction-scoped storage view. Lets ``cardinality()`` and
308
+ # ``statistics()`` cache expensive lookups (e.g. Kafka watermarks)
309
+ # in the same store ``on_init`` reads/writes for snapshot isolation
310
+ # — so a topic's row count is fetched once per SQL transaction
311
+ # rather than once per bind/cardinality/statistics/init phase.
312
+ # ``None`` when ``bind_call.transaction_opaque_data`` is unset.
313
+ transaction_storage: TransactionBoundStorage | None = None
314
+ # Execution-scoped storage view. Populated only on call paths that
315
+ # carry a ``global_execution_id`` — currently just
316
+ # ``dynamic_to_string``. ``None`` for ``bind`` / ``cardinality`` /
317
+ # ``statistics`` (they predate execution and have no
318
+ # execution_id).
319
+ storage: BoundStorage | None = None
320
+ auth_context: AuthContext = AuthContext.anonymous()
321
+ # The catalog's attach bytes, unwrapped by the framework (encryption is the
322
+ # framework's concern, not the user's). This is what the catalog returned at
323
+ # ``catalog_attach`` — the framework shard-UUID prefix is already stripped.
324
+ # None when invoked without an ATTACH. Storage shards on that UUID separately.
325
+ attach_opaque_data: bytes | None = None
326
+
327
+ @property
328
+ def at_unit(self) -> str | None:
329
+ """The AT (TIMESTAMP|VERSION) unit for this scan, or None without an AT clause.
330
+
331
+ NOTE: for inline-bound (function-backed) tables on_bind runs once
332
+ at attach with no AT, so this is None here — read AT at init/process via
333
+ ``ProcessParams.at_value``. See ``BindRequest.at_unit``.
334
+ """
335
+ return self.bind_call.at_unit
336
+
337
+ @property
338
+ def at_value(self) -> str | None:
339
+ """The AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
340
+ return self.bind_call.at_value
341
+
342
+
343
+ @dataclass(slots=True, frozen=True, kw_only=True)
344
+ class InitParams[TArgs]:
345
+ """Parameters passed to on_init()."""
346
+
347
+ args: TArgs
348
+ init_call: InitRequest
349
+
350
+ execution_id: bytes
351
+
352
+ # This is the projected schema based on projection_ids,
353
+ # which is what the function should produce.
354
+ output_schema: pa.Schema
355
+
356
+ # Convenient access to settings and secrets as dicts, extracted from the bind_call.
357
+ settings: dict[str, pa.Scalar[Any]]
358
+ secrets: dict[str, dict[str, pa.Scalar[Any]]]
359
+
360
+ storage: BoundStorage
361
+ auth_context: AuthContext = AuthContext.anonymous()
362
+ # Catalog's attach bytes, unwrapped by the framework (uuid prefix stripped);
363
+ # None without an ATTACH. See ``BindParams``.
364
+ attach_opaque_data: bytes | None = None
365
+
366
+ @property
367
+ def at_unit(self) -> str | None:
368
+ """AT (TIMESTAMP|VERSION) unit for this scan, or None.
369
+
370
+ Carried on the per-scan bind embedded in the init request.
371
+ See ``BindRequest.at_unit``.
372
+ """
373
+ return self.init_call.bind_call.at_unit
374
+
375
+ @property
376
+ def at_value(self) -> str | None:
377
+ """AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
378
+ return self.init_call.bind_call.at_value
379
+
380
+
381
+ @dataclass(slots=True, frozen=True, kw_only=True)
382
+ class ProcessParams[TArgs]:
383
+ """Parameters passed to process() and finalize()."""
384
+
385
+ args: TArgs
386
+ init_call: InitRequest | None # None for aggregate functions
387
+ init_response: BaseInitResponse | None # None for aggregate functions
388
+
389
+ # This is the projected schema based on projection_ids,
390
+ # which is what the function should produce.
391
+ output_schema: pa.Schema
392
+
393
+ # Convenient access to settings and secrets as dicts, extracted from the bind_call.
394
+ settings: dict[str, pa.Scalar[Any]]
395
+ secrets: dict[str, dict[str, pa.Scalar[Any]]]
396
+
397
+ storage: BoundStorage
398
+ auth_context: AuthContext = AuthContext.anonymous()
399
+
400
+ # Current pushdown filters (updated dynamically from tick metadata for Top-N queries).
401
+ # None if no filters have been received. Updated before each process() call.
402
+ current_pushdown_filters: Any = None # PushdownFilters | None
403
+
404
+ # Globally-unique monotonic batch index for this process() call. Populated
405
+ # ONLY for TableBufferingFunction subclasses with
406
+ # Meta.requires_input_batch_index=True — the C++ Sink reads it from DuckDB's
407
+ # per-chunk OperatorPartitionInfo and forwards it. Workers can accumulate
408
+ # (batch_index, payload) tuples and sort in combine() to reconstruct source
409
+ # order under parallel ingest. None for every other call path.
410
+ batch_index: int | None = None
411
+
412
+ # Catalog's attach bytes, unwrapped by the framework (uuid prefix stripped);
413
+ # None without an ATTACH. See ``BindParams``.
414
+ attach_opaque_data: bytes | None = None
415
+
416
+ @property
417
+ def at_unit(self) -> str | None:
418
+ """AT (TIMESTAMP|VERSION) unit for this scan, or None.
419
+
420
+ Carried on the per-scan bind embedded in the init request; None for
421
+ aggregate functions (no init_call). See ``BindRequest.at_unit``.
422
+ """
423
+ return self.init_call.bind_call.at_unit if self.init_call is not None else None
424
+
425
+ @property
426
+ def at_value(self) -> str | None:
427
+ """AT (TIMESTAMP|VERSION) value for this scan, or None. See ``at_unit``."""
428
+ return self.init_call.bind_call.at_value if self.init_call is not None else None
429
+
430
+
431
+ class TableFunctionBase[TArgs](vgi.function.Function):
432
+ """Base class for table functions with cardinality and schema validation.
433
+
434
+ Extends Function with:
435
+ - Cardinality hints for query optimization
436
+ - Projection pushdown support
437
+
438
+ This class is not meant to be used directly. Subclass either:
439
+ - TableFunctionGenerator: For simple generators that produce output
440
+ - TableInOutGenerator: For functions that transform input batches
441
+
442
+ See Also:
443
+ TableFunctionGenerator: Simple generator base class
444
+ TableInOutGenerator: Full streaming with input batches
445
+
446
+ """
447
+
448
+ FunctionArguments: ClassVar[type]
449
+ _setting_params: ClassVar[dict[str, str]]
450
+ _secret_params: ClassVar[dict[str, Secret]]
451
+
452
def __init_subclass__(cls) -> None:
    """Validate FunctionArguments, auto-extracting from generic parameter if needed.

    Runs for every subclass and performs, in order:

    1. Validation of the second generic argument (TState) on the first
       ``TableFunctionBase``-derived entry in the class's own
       ``__orig_bases__``.
    2. Auto-extraction of ``FunctionArguments`` from the first generic
       argument when the attribute is not already present.
    3. An early exit, with empty ``_setting_params`` / ``_secret_params``,
       for abstract classes and for intermediate bases whose first type
       argument is still a ``TypeVar``.
    4. Validation of ``FunctionArguments`` (or creation of an empty default).
    5. Extraction of Setting/Secret parameters from ``on_bind()``.

    Raises:
        TypeError: TState is a class that does not extend
            ``ArrowSerializableDataclass``; ``FunctionArguments`` is not a
            dataclass; or one of its fields is not ``Annotated`` with an
            ``Arg`` in its metadata.

    """
    super().__init_subclass__()

    # Validate TState (second generic type parameter) is serializable.
    #
    # This runs unconditionally — independently of the FunctionArguments
    # auto-extraction below. The check used to be nested inside the
    # ``not hasattr(cls, "FunctionArguments")`` branch, so any class that
    # set ``FunctionArguments`` explicitly in its body silently skipped
    # TState validation. That let non-serializable state slip through: it
    # appears to work on subprocess transport (the worker process is
    # long-lived, so the live state object survives between ``process()``
    # ticks) but breaks on HTTP, where each tick is an independent request
    # and state must round-trip through the stream-state token.
    #
    # Only the class's own ``__orig_bases__`` is read (via ``cls.__dict__``),
    # so bases recorded on parent classes are not re-checked here.
    for base in cls.__dict__.get("__orig_bases__", ()):
        origin = get_origin(base)
        if origin is not None and issubclass(origin, TableFunctionBase):
            type_args = get_args(base)
            if len(type_args) >= 2:
                state_type = type_args[1]
                # Only concrete classes are checked: None, NoneType, TypeVars
                # and non-class type expressions are let through.
                if (
                    state_type is not None
                    and state_type is not type(None)
                    and not isinstance(state_type, TypeVar)
                    and isinstance(state_type, type)
                    and not issubclass(state_type, ArrowSerializableDataclass)
                ):
                    raise TypeError(
                        f"{cls.__name__}: TState type {state_type.__name__} must extend "
                        f"ArrowSerializableDataclass for HTTP state serialization. "
                        f"Use @dataclass(kw_only=True) and inherit from ArrowSerializableDataclass."
                    )
            # Stop at the first TableFunctionBase-derived generic base.
            break

    # Auto-extract FunctionArguments from generic type parameter if not explicitly set.
    # e.g., class MyFunc(TableFunctionGenerator[MyArgs]) -> cls.FunctionArguments = MyArgs
    if not hasattr(cls, "FunctionArguments"):
        for base in cls.__dict__.get("__orig_bases__", ()):
            origin = get_origin(base)
            if origin is not None and issubclass(origin, TableFunctionBase):
                type_args = get_args(base)
                if type_args and not isinstance(type_args[0], TypeVar):
                    if type_args[0] is type(None):
                        # None means no arguments — create empty dataclass
                        from dataclasses import make_dataclass

                        cls.FunctionArguments = make_dataclass(f"_{cls.__name__}Args", [])
                    else:
                        cls.FunctionArguments = type_args[0]
                # Stop at the first TableFunctionBase-derived generic base.
                break

    # Skip validation for abstract base classes
    # A class counts as abstract here when any attribute listed by dir(cls)
    # carries a truthy ``__isabstractmethod__`` flag.
    is_abstract = any(getattr(getattr(cls, name, None), "__isabstractmethod__", False) for name in dir(cls))
    if is_abstract:
        cls._setting_params = {}
        cls._secret_params = {}
        return

    # Skip intermediate base classes that still have unresolved type parameters
    if not hasattr(cls, "FunctionArguments"):
        has_unresolved = False
        for base in cls.__dict__.get("__orig_bases__", ()):
            type_args = get_args(base)
            if type_args and isinstance(type_args[0], TypeVar):
                has_unresolved = True
                break
        if has_unresolved:
            cls._setting_params = {}
            cls._secret_params = {}
            return

    if not hasattr(cls, "FunctionArguments"):
        # Provide a default empty FunctionArguments for classes that use
        # class-level Arg descriptors (e.g., TableInOutFunction subclasses
        # without type parameters). This preserves backward compatibility.
        from dataclasses import make_dataclass

        cls.FunctionArguments = make_dataclass(f"_{cls.__name__}Args", [])
    else:
        args_class = cls.FunctionArguments

        # Validate FunctionArguments is a dataclass
        if not is_dataclass(args_class):
            raise TypeError(
                f"{cls.__name__}.FunctionArguments must be a dataclass. "
                f"Add @dataclass decorator to {args_class.__name__}"
            )

        # Validate all fields are Annotated with Arg
        # include_extras=True keeps the Annotated wrapper so its metadata
        # can be inspected.
        hints = get_type_hints(args_class, include_extras=True)
        for field_name, hint in hints.items():
            if get_origin(hint) is not Annotated:
                raise TypeError(
                    f"{cls.__name__}.FunctionArguments.{field_name} must use Annotated[T, Arg(...)], got {hint}"
                )

            # Check that Arg is in the metadata
            metadata = get_args(hint)[1:]
            has_arg = any(isinstance(meta, Arg) for meta in metadata)
            if not has_arg:
                raise TypeError(
                    f"{cls.__name__}.FunctionArguments.{field_name} must have Arg(...) in Annotated metadata"
                )

    # Parse on_bind() signature for Setting/Secret annotations
    # The signature is parsed only when this class defines on_bind() in its
    # own body; otherwise whatever _setting_params/_secret_params are already
    # reachable on the class (e.g. from a parent) are kept, defaulting to {}.
    on_bind_method = getattr(cls, "on_bind", None)
    if on_bind_method is not None and "on_bind" in cls.__dict__:
        cls._setting_params, cls._secret_params = _extract_setting_secret_params(on_bind_method)
    else:
        cls._setting_params = getattr(cls, "_setting_params", {})
        cls._secret_params = getattr(cls, "_secret_params", {})
564
+
565
@final
@staticmethod
def _parse_arguments(args_class: type[TArgs], arguments: Arguments) -> TArgs:
    """Build a typed ``args_class`` instance from the raw call ``arguments``.

    Every ``Annotated`` field of ``args_class`` is resolved as follows:

    * a field whose base type is ``TableInput`` receives a fresh
      ``TableInput()`` sentinel (it carries no real data);
    * otherwise the first ``Arg`` in the annotation metadata decides: a
      varargs ``Arg`` collects the positional arguments from its position
      onward as a tuple of raw scalars, any other ``Arg`` is fetched by
      position with the ``Arg``'s default as fallback.

    Fields that are not ``Annotated``, or that carry no ``Arg``, are left for
    ``args_class`` to default.

    Raises:
        Exception: whatever ``Arg._reject_none()`` builds when a value is
            None but the field's type does not accept None; anything
            ``Arg._validate()`` raises for a non-None value.

    """
    resolved: dict[str, Any] = {}

    for field_name, annotation in get_type_hints(args_class, include_extras=True).items():
        if get_origin(annotation) is not Annotated:
            continue

        declared_type, *extras = get_args(annotation)
        if declared_type is TableInput:
            # Sentinel parameter: nothing to read from the call.
            resolved[field_name] = TableInput()
            continue

        spec = next((extra for extra in extras if isinstance(extra, Arg)), None)
        if spec is None:
            continue

        if spec.varargs:
            # Varargs keep the remaining positional values as raw pa.Scalar objects.
            assert isinstance(spec.position, int)
            resolved[field_name] = tuple(arguments.positional[spec.position :])
            continue

        value = arguments.get(spec.position, default=spec.default)
        if value is None:
            # SQL NULL for a non-Optional Arg is rejected here. Letting it
            # through would only fail later, deep inside the user's
            # process()/update(), as an opaque Python ``TypeError`` that
            # reaches the C++ extension as a worker exception with no hint
            # at the cause.
            if not _accepts_none(declared_type):
                raise spec._reject_none()
        else:
            # Constraint checks (ge/le/gt/lt/choices/pattern) apply to real
            # values only; None is accepted via Optional[T].
            spec._validate(value)
        resolved[field_name] = value

    return args_class(**resolved)
605
+
606
@final
@staticmethod
def _validate_arg_type_bounds(
    args_class: type,
    args: Any,
    input_schema: pa.Schema,
) -> None:
    """Check column-typed arguments against the type bounds declared on their ``Arg``.

    For each ``Annotated`` field of ``args_class``, the first ``Arg`` in the
    metadata that has ``type_bound`` set is used. The field's resolved value
    on ``args`` is read as a column name (a ``str``) or a tuple of column
    names; each named column's Arrow type in ``input_schema`` is passed to
    ``Arg.validate_type_bound``. Values and tuple elements that are not
    strings are ignored.

    Args:
        args_class: The FunctionArguments class with Annotated type hints.
        args: The resolved FunctionArguments dataclass instance.
        input_schema: The input schema to validate column types against.

    """
    for field_name, annotation in get_type_hints(args_class, include_extras=True).items():
        if get_origin(annotation) is not Annotated:
            continue

        bounded = next(
            (extra for extra in get_args(annotation)[1:] if isinstance(extra, Arg) and extra.type_bound is not None),
            None,
        )
        if bounded is None:
            continue

        resolved = getattr(args, field_name)
        # Treat a single value as a one-element tuple so both shapes share a path.
        candidates = resolved if isinstance(resolved, tuple) else (resolved,)
        for column_name in candidates:
            if isinstance(column_name, str):
                bounded.validate_type_bound(input_schema.field(column_name).type)
639
+
640
@classmethod
def _extract_bind_kwargs(cls, input: BindRequest) -> dict[str, Any]:
    """Collect the keyword arguments that ``on_bind()`` declared via Setting/Secret.

    Setting parameters receive the first-row scalar of the matching settings
    column; Secret parameters receive the matching secrets column's first-row
    struct expanded into a dict. A parameter whose column is absent from the
    batch receives None. When the whole settings (or secrets) batch is None,
    the corresponding parameters are not added to the result at all.

    Returns:
        dict of parameter name to extracted value.

    """
    bound: dict[str, Any] = {}

    # Setting params: one pa.Scalar per declared setting.
    settings_batch = input.settings
    if settings_batch is not None and cls._setting_params:
        settings_schema = settings_batch.schema
        for param_name, setting_key in cls._setting_params.items():
            position = settings_schema.get_field_index(setting_key)
            if position < 0:
                bound[param_name] = None
            else:
                bound[param_name] = settings_batch.column(position)[0]

    # Secret params: one dict[str, pa.Scalar] per declared secret, looked up
    # by the secret's type.
    secrets_batch = input.secrets
    if secrets_batch is not None and cls._secret_params:
        secrets_schema = secrets_batch.schema
        for param_name, secret in cls._secret_params.items():
            position = secrets_schema.get_field_index(secret.secret_type)
            if position < 0:
                bound[param_name] = None
            else:
                bound[param_name] = _struct_scalar_to_dict(secrets_batch.column(position)[0])

    return bound
664
+
665
@final
@classmethod
def _make_bind_params(
    cls,
    input: BindRequest,
    *,
    auth_context: AuthContext | None = None,
    execution_id: bytes | None = None,
    attach_plaintext: bytes | None = None,
) -> BindParams[TArgs]:
    """Assemble a ``BindParams`` from a ``BindRequest``.

    Centralises BindParams construction so that bind() and
    table_function_cardinality() do not duplicate it. ``execution_id`` is
    only supplied on call paths that have one (currently just
    ``dynamic_to_string``); when it is truthy, ``BindParams.storage`` is a
    ``BoundStorage`` view keyed by it, otherwise ``None``. Likewise
    ``transaction_storage`` is only built when the request carries
    transaction opaque data. A missing ``auth_context`` falls back to
    ``AuthContext.anonymous()``.
    """
    txn_id = input.transaction_opaque_data

    parsed_args = cls._parse_arguments(cls.FunctionArguments, input.arguments)
    settings = _batch_to_scalar_dict(input.settings)
    secrets = SecretsAccessor(input.secrets, is_retry=input.resolved_secrets_provided)

    # ``attach_plaintext`` is the full framework plaintext (``uuid(16) ||
    # catalog_bytes``) the worker unwrapped. Storage shards on its UUID;
    # bodies see only the catalog bytes via ``attach_opaque_data``.
    transaction_storage = None
    if txn_id:
        transaction_storage = TransactionBoundStorage(
            cls.storage,
            txn_id,
            request=input,
            attach_plaintext=attach_plaintext,
        )

    storage = None
    if execution_id:
        storage = BoundStorage(
            cls.storage,
            execution_id,
            request=input,
            attach_plaintext=attach_plaintext,
        )

    if auth_context is None:
        auth_context = AuthContext.anonymous()

    return BindParams[TArgs](
        args=parsed_args,
        bind_call=input,
        settings=settings,
        secrets=secrets,
        transaction_storage=transaction_storage,
        storage=storage,
        auth_context=auth_context,
        attach_opaque_data=attach_catalog_bytes(attach_plaintext),
    )
711
+
712
+ # ------------------------------------------------------------------
713
+ # Bind / global_init — shared framework hooks for every table function.
714
+ #
715
+ # Subclasses define ``on_bind`` (and optionally ``on_init``) for the
716
+ # user-facing behavior; the framework's wire entry points ``bind`` and
717
+ # ``global_init`` are ``@final`` and live here so we have a single
718
+ # source of truth across TableFunctionGenerator / TableInOutGenerator /
719
+ # TableBufferingFunction.
720
+ # ------------------------------------------------------------------
721
+
722
@classmethod
@abstractmethod
def on_bind(
    cls,
    params: BindParams[TArgs],
) -> BindResponse:
    """Compute the output schema and run any other bind-time logic.

    Abstract: every subclass must supply an implementation. Typical shapes:

    * Pass through: ``return BindResponse(output_schema=params.bind_call.input_schema)``
    * Custom shape: derive a ``pa.Schema`` from ``params.args`` and return it.
    * Dynamic secrets: either declare ``*, my_secret: Annotated[..., Secret()] = None``
      or call ``params.secrets.get(...)``; the framework then issues the
      secret-scope retry automatically.

    Args:
        params: Bind parameters including arguments and schema.

    Returns:
        BindResponse with output_schema and optional opaque_data.

    """
745
+
746
@final
@classmethod
def bind(
    cls,
    input: BindRequest,
    *,
    ctx: CallContext | None = None,
    attach_plaintext: bytes | None = None,
) -> BindResponse:
    """Bind protocol entry point. Do not override; use ``on_bind()``.

    Steps, in order:

    1. Build ``BindParams`` (anonymous auth when no ``ctx`` is given).
    2. When the request carries an input schema (table-input functions),
       validate Arg type bounds against it.
    3. Call ``on_bind()`` with any Setting/Secret keyword arguments.
    4. If ``on_bind()`` triggered dynamic secret lookups through the
       SecretsAccessor, return a secret-scope request instead of its
       result, which starts the two-phase bind.

    Note: secrets are NOT auto-requested before ``on_bind()``. Table
    functions obtain secrets via ``on_bind`` kwargs (``Secret()``
    annotations) and ``SecretsAccessor.get()`` calls, which may use
    dynamic scopes computed from function arguments.
    """
    auth = AuthContext.anonymous() if ctx is None else ctx.auth
    params = cls._make_bind_params(input, auth_context=auth, attach_plaintext=attach_plaintext)

    schema = input.input_schema
    if schema is not None:
        cls._validate_arg_type_bounds(cls.FunctionArguments, params.args, schema)

    extra_kwargs = cls._extract_bind_kwargs(input)
    response = cls.on_bind(params, **extra_kwargs)

    # A pending lookup means on_bind ran without the secrets it asked for,
    # so its response is discarded in favour of the retry request.
    secrets = params.secrets
    if secrets.needs_resolution:
        return BindResponse.secret_scope_request(secrets.pending_lookups)
    return response
780
+
781
@classmethod
def on_init(
    cls,
    params: InitParams[TArgs],
) -> GlobalInitResponse:
    """Run one-time setup after bind and before any batch is processed.

    Subclasses override this for per-execution setup (opening external
    resources, allocating caches, and so on). The default does nothing
    and returns a ``GlobalInitResponse`` built with no arguments.
    """
    default_response = GlobalInitResponse()
    return default_response
792
+
793
@final
@classmethod
def global_init(
    cls,
    input: InitRequest,
    *,
    ctx: CallContext | None = None,
    attach_plaintext: bytes | None = None,
) -> GlobalInitResponse:
    """Global init protocol entry point. Do not override; use ``on_init()``.

    Generates a fresh random execution id, builds ``InitParams`` from the
    request (projected output schema, settings, secrets, storage bound to
    the execution id), calls ``on_init()``, and returns a response that
    carries ``on_init()``'s ``max_workers`` and ``opaque_data`` together
    with the generated execution id.
    """
    execution_id = uuid.uuid4().bytes
    auth = AuthContext.anonymous() if ctx is None else ctx.auth

    bind_call = input.bind_call
    parsed_args = cls._parse_arguments(cls.FunctionArguments, bind_call.arguments)
    projected_schema = project_schema(
        _effective_projection_ids(cls, input.projection_ids),
        input.output_schema,
    )
    settings = _batch_to_scalar_dict(bind_call.settings)
    secrets = SecretsAccessor(bind_call.secrets).to_dict()
    # ``attach_plaintext`` is the full framework plaintext (uuid||catalog
    # bytes); storage shards on its UUID, the body sees the catalog bytes.
    storage = BoundStorage(cls.storage, execution_id, request=input, attach_plaintext=attach_plaintext)

    params = InitParams[TArgs](
        args=parsed_args,
        init_call=input,
        output_schema=projected_schema,
        settings=settings,
        secrets=secrets,
        execution_id=execution_id,
        storage=storage,
        auth_context=auth,
        attach_opaque_data=attach_catalog_bytes(attach_plaintext),
    )

    user_response = cls.on_init(params)

    # The framework-generated execution id is what goes on the wire.
    return GlobalInitResponse(
        max_workers=user_response.max_workers,
        execution_id=execution_id,
        opaque_data=user_response.opaque_data,
    )
829
+
830
@classmethod
def cardinality(cls, params: BindParams[TArgs]) -> TableCardinality:
    """Report the expected output row count.

    Override to supply row count estimates; query planners use them for
    decisions such as join ordering and memory allocation.

    Returns:
        TableCardinality carrying an estimate and/or a max. The default
        leaves both as ``None``, i.e. unknown.

    """
    unknown = TableCardinality(estimate=None, max=None)
    return unknown
842
+
843
@classmethod
def dynamic_to_string(
    cls,
    params: BindParams[TArgs],
    execution_id: bytes,
) -> Mapping[str, str]:
    """Supply diagnostics shown as Extra Info under EXPLAIN ANALYZE.

    Invoked once per parallel scan thread at end-of-stream. It is up to
    the function class to persist whatever diagnostics it wants during
    ``process()`` (shared storage, an external service, in-memory class
    state for single-worker setups) and to look them up here by
    ``execution_id``.

    DuckDB merges the per-thread maps with last-write-wins semantics, so
    the *last* thread to finish (by which point every thread has
    persisted) provides the final visible view.

    Best-effort: must not raise. The dispatcher catches exceptions and
    returns an empty map so that EXPLAIN ANALYZE never breaks the query.

    Args:
        params: The same ``BindParams`` that ``cardinality`` and
            ``statistics`` receive: function args, settings, secrets.
        execution_id: ``VgiTableFunctionGlobalState::global_execution_id``,
            stable for the duration of the query.

    Returns:
        Ordered key/value pairs. Insertion order is kept on the wire and
        re-emitted into the C++ profiler's ``InsertionOrderPreservingMap``.
        The C++ wrapper appends intrinsic keys (``Worker``, ``Function``,
        ``Rows Read``, ``Threads``) after this map; user keys override on
        conflict. The default is an empty mapping.

    """
    no_diagnostics: dict[str, str] = {}
    return no_diagnostics
880
+
881
@classmethod
def statistics(cls, params: BindParams[TArgs]) -> list[ColumnStatistics] | None:
    """Report per-output-column statistics for this invocation.

    Override to supply min/max/distinct/null stats. DuckDB's optimizer can
    use them for filter elimination (e.g. pruning a scan entirely when the
    filter is out of range), better join ordering, and folding
    always-true/always-false predicates at plan time.

    ``params`` is the same ``BindParams[TArgs]`` that ``cardinality``
    receives, so stats can be derived directly from user-supplied
    arguments.

    Returns:
        A list of ColumnStatistics, one entry per column whose stats are
        known (unlisted columns get unknown stats), or None when no stats
        are available, in which case the optimizer receives no column
        stats. The default returns None.

    """
    no_stats: list[ColumnStatistics] | None = None
    return no_stats
902
+
903
@staticmethod
def pushdown_filters(
    pushdown_filters: pa.RecordBatch | None,
    join_keys: list[pa.RecordBatch] | None = None,
) -> PushdownFilters | None:
    """Deserialize pushdown filters, or return None when none were supplied.

    Call this static method to obtain the filter AST for:
    - Custom filter handling (push to SQL, APIs, etc.)
    - Extracting column bounds for partition pruning
    - Checking column constants for optimized lookups

    For automatic filtering, set auto_apply_filters=True in Meta.

    Args:
        pushdown_filters: Arrow RecordBatch containing serialized filters,
            or None when no filters were pushed down.
        join_keys: Optional list of single-column Arrow RecordBatches,
            one per IN filter column. Available via
            ``get_join_keys_batch()`` / ``get_join_keys_batches()``
            on the returned ``PushdownFilters``.

    Returns:
        PushdownFilters container with parsed filter AST, or None when
        ``pushdown_filters`` is None.

    """
    if pushdown_filters is None:
        return None

    # Imported at call time rather than at module load.
    from vgi.table_filter_pushdown import deserialize_filters

    return deserialize_filters(pushdown_filters, join_keys=join_keys)
933
+
934
@classmethod
def _should_auto_apply_filters(cls) -> bool:
    """Return True when ``Meta.auto_apply_filters`` is set to a truthy value.

    A missing ``Meta`` or a missing attribute counts as False.
    """
    return bool(getattr(getattr(cls, "Meta", None), "auto_apply_filters", False))
939
+
940
@classmethod
def _supports_batch_index(cls) -> bool:
    """Report whether ``Meta.supports_batch_index`` is set.

    Used for the ``batch_index=`` kwarg validation on ``out.emit()`` in the
    table-producer harness (see vgi.protocol._TrackingOutputCollector).
    A missing ``Meta`` or a missing attribute counts as False.
    """
    options = getattr(cls, "Meta", None)
    flag = getattr(options, "supports_batch_index", False)
    return bool(flag)
949
+
950
@classmethod
def _partition_kind(cls) -> Any:
    """Look up ``Meta.partition_kind``, falling back to ``NOT_PARTITIONED``.

    Used for the ``partition_values=`` kwarg validation on ``out.emit()``
    in the table-producer harness. ``PartitionKind`` is imported inside
    the method so the base class does not pull in ``vgi.metadata`` at
    module load time.
    """
    from vgi.metadata import PartitionKind

    return getattr(getattr(cls, "Meta", None), "partition_kind", PartitionKind.NOT_PARTITIONED)
962
+
963
@staticmethod
def _apply_pushdown_filter(batch: pa.RecordBatch, pushdown_filters: PushdownFilters | None) -> pa.RecordBatch:
    """Run ``batch`` through ``pushdown_filters`` when there is something to do.

    Args:
        batch: RecordBatch to filter.
        pushdown_filters: The PushdownFilters to apply, or None.

    Returns:
        The result of ``pushdown_filters.apply(batch)``; the original
        batch is returned untouched when it has zero rows or when
        ``pushdown_filters`` is falsy.

    """
    # The row-count check comes first so an empty batch never reaches the
    # filter object at all.
    if batch.num_rows == 0 or not pushdown_filters:
        return batch
    return pushdown_filters.apply(batch)
981
+
982
+
983
+ class TableFunctionGenerator[TArgs, TState = None](TableFunctionBase[TArgs]):
984
+ """Callback-based table function that produces output batches.
985
+
986
+ Each call to process() should either:
987
+ - Emit a batch via out.emit(batch)
988
+ - Signal completion via out.finish()
989
+
990
+ Use TState to persist state between process() calls.
991
+
992
+ For functions that transform input batches, use TableInOutGenerator.
993
+
994
+ """
995
+
996
+ # bind / on_bind / on_init / global_init are defined on TableFunctionBase.
997
+ # TableFunctionGenerator subclasses must override the abstract on_bind
998
+ # to declare an output schema (TFG has no input schema to default to).
999
+
1000
+ @classmethod
1001
+ def initial_state(cls, params: ProcessParams[TArgs]) -> TState | None:
1002
+ """Create initial processing state. Override when TState is used.
1003
+
1004
+ Called once during init to create the state object that will be
1005
+ passed to process() on each tick.
1006
+
1007
+ Args:
1008
+ params: Process parameters including arguments and schemas.
1009
+
1010
+ Returns:
1011
+ Initial state, or None if no state is needed.
1012
+
1013
+ """
1014
+ return None
1015
+
1016
+ @classmethod
1017
+ @abstractmethod
1018
+ def process(
1019
+ cls,
1020
+ params: ProcessParams[TArgs],
1021
+ state: TState,
1022
+ out: OutputCollector,
1023
+ ) -> None:
1024
+ """Produce output for one tick.
1025
+
1026
+ Called repeatedly by the framework. Each call should either:
1027
+ - Call out.emit(batch) to produce one output batch
1028
+ - Call out.finish() to signal that generation is complete
1029
+
1030
+ Use out.client_log(level, message) for in-band logging.
1031
+
1032
+ Args:
1033
+ params: Process parameters including arguments and schemas.
1034
+ state: Mutable state persisted between calls. None if TState not used.
1035
+ out: OutputCollector for emitting batches, logging, and signaling finish.
1036
+
1037
+ """
1038
+
1039
+ @classmethod
1040
+ def on_cancel(cls, params: ProcessParams[TArgs], state: TState) -> None: # noqa: D102
1041
+ pass
1042
+
1043
+ on_cancel.__func__.__doc__ = ( # type: ignore[attr-defined]
1044
+ f"""Release resources when the stream is cancelled before natural end.
1045
+
1046
+ The VGI C++ extension fires this hook when a DuckDB query tears
1047
+ down a VGI scan early (LIMIT clause, user break, Ctrl-C,
1048
+ exception unwind). Override to release expensive per-stream
1049
+ resources the function was holding in ``state`` (database
1050
+ cursors, LLM streaming sessions, file handles, GPU buffers).
1051
+
1052
+ {_ON_CANCEL_CAVEATS}
1053
+
1054
+ The stream has already been torn down by the time this fires;
1055
+ no further batches may be emitted.
1056
+
1057
+ Args:
1058
+ params: Process parameters (same as ``process()`` received).
1059
+ state: The current user state, possibly deserialized from a
1060
+ state-token on a different worker than the one that
1061
+ originally built it.
1062
+ """
1063
+ )
1064
+
1065
+
1066
+ def init_single_worker[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
1067
+ """Class decorator to set max_workers=1 for a TableFunctionGenerator subclass."""
1068
+ if "on_init" not in cls.__dict__:
1069
+
1070
+ def on_init_impl(cls_: type[T], params: Any) -> GlobalInitResponse:
1071
+ return GlobalInitResponse(max_workers=1)
1072
+
1073
+ cls.on_init = classmethod(on_init_impl) # type: ignore[assignment]
1074
+
1075
+ # Clear 'on_init' from __abstractmethods__ — the metaclass set it
1076
+ # before decorators ran, so we must update it manually.
1077
+ if hasattr(cls, "__abstractmethods__") and "on_init" in cls.__abstractmethods__:
1078
+ cls.__abstractmethods__ = cls.__abstractmethods__ - {"on_init"}
1079
+
1080
+ return cls
1081
+
1082
+
1083
+ def bind_fixed_schema[T: TableFunctionGenerator[Any, Any]](cls: type[T]) -> type[T]:
1084
+ """Class decorator to return FIXED_SCHEMA from on_bind for a TableFunctionGenerator subclass.
1085
+
1086
+ Sets ``cls._inline_bind_safe = True`` *only when* the decorator actually
1087
+ installs its own ``on_bind``. The catalog framework reads this marker to
1088
+ decide whether `Table(inline_bind=True)` is allowed — the contract is "the
1089
+ decorator's bind is in control, output is exactly ``cls.FIXED_SCHEMA``,
1090
+ no kwargs inspected." If the class already defined its own ``on_bind``,
1091
+ the decorator silently leaves it alone and we *must not* set the marker;
1092
+ otherwise the framework would inline a bind it doesn't actually control.
1093
+
1094
+ Subclasses inherit the marker via Python attribute lookup. A subclass
1095
+ that overrides ``on_bind`` adds it to its own ``__dict__``; the catalog
1096
+ framework's eligibility check is
1097
+ ``getattr(cls, "_inline_bind_safe", False) and "on_bind" not in cls.__dict__``,
1098
+ which correctly excludes such subclasses.
1099
+ """
1100
+ if "on_bind" not in cls.__dict__: # only inject if subclass hasn't overridden
1101
+ if not hasattr(cls, "FIXED_SCHEMA"):
1102
+ raise ValueError(f"Class {cls.__name__} must define FIXED_SCHEMA to use @bind_fixed_schema")
1103
+
1104
+ def on_bind_impl(cls_: type[T], params: Any) -> BindResponse:
1105
+ value = getattr(cls_, "FIXED_SCHEMA", None)
1106
+
1107
+ if value is None or not isinstance(value, pa.Schema):
1108
+ raise TypeError(f"Class {cls_.__name__}.FIXED_SCHEMA must be a pyarrow.Schema")
1109
+ return BindResponse(output_schema=value)
1110
+
1111
+ # Mark the function itself so we can later distinguish "decorator
1112
+ # installed this on_bind" from "user overrode on_bind" — useful for
1113
+ # downstream callers (e.g. catalog inline-bind) that need to confirm
1114
+ # the bind logic in effect is the decorator's, not a subclass override.
1115
+ on_bind_impl._is_bind_fixed_schema = True # type: ignore[attr-defined]
1116
+
1117
+ # assign as classmethod
1118
+ cls.on_bind = classmethod(on_bind_impl) # type: ignore[assignment]
1119
+
1120
+ # Clear 'on_bind' from __abstractmethods__ — the metaclass set it
1121
+ # before decorators ran, so we must update it manually.
1122
+ if hasattr(cls, "__abstractmethods__") and "on_bind" in cls.__abstractmethods__:
1123
+ cls.__abstractmethods__ = cls.__abstractmethods__ - {"on_bind"}
1124
+
1125
+ # Mark the class as inline-bind-safe *only when* we actually installed
1126
+ # the on_bind. If the class had a pre-existing custom on_bind, we left
1127
+ # it alone and have no claim about its purity — the marker stays unset.
1128
+ cls._inline_bind_safe = True # type: ignore[attr-defined]
1129
+
1130
+ return cls