vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,2767 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """VGI Catalog Interface for exposing catalogs, schemas, tables, and views.
4
+
5
+ This module provides the abstract base class and data types for implementing
6
+ catalog interfaces in VGI workers, enabling DuckDB ATTACH support.
7
+ """
8
+
9
+ import dataclasses
10
+ import threading
11
+ from abc import ABC, abstractmethod
12
+ from collections.abc import Mapping, Sequence
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime
15
+ from enum import Enum
16
+ from typing import (
17
+ TYPE_CHECKING,
18
+ Annotated,
19
+ Any,
20
+ ClassVar,
21
+ Literal,
22
+ NewType,
23
+ Self,
24
+ cast,
25
+ overload,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from vgi_rpc.rpc import CallContext
30
+
31
+ from vgi.catalog.attach_option import AttachOptionSpec
32
+ from vgi.catalog.descriptors import Catalog, Index, Macro, Schema, Table, View
33
+ from vgi.catalog.secret_type import SecretTypeSpec
34
+ from vgi.catalog.setting import SettingSpec
35
+
36
+ import pyarrow as pa
37
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
38
+ from vgi_rpc.utils import deserialize_record_batch, serialize_record_batch_bytes
39
+
40
+ from vgi.arguments import SecretLookupEntry
41
+ from vgi.exceptions import CatalogReadOnlyError
42
+ from vgi.metadata import (
43
+ DistinctDependence,
44
+ FunctionStability,
45
+ NullHandling,
46
+ OrderDependence,
47
+ OrderPreservation,
48
+ PartitionKind,
49
+ )
50
+
51
# Public names exported by ``from <this module> import *``.
__all__ = [
    # Re-exported from vgi.metadata
    "DistinctDependence",
    "FunctionStability",
    "NullHandling",
    "OrderDependence",
    "OrderPreservation",
    "PartitionKind",
    # Catalog-specific
    "CatalogDataVersionRelease",
    "CatalogExample",
    "CatalogInfo",
    "ColumnStatistics",
    "IndexConstraintType",
    "IndexInfo",
    "SecretLookupEntry",
    "MacroType",
    "SchemaObjectType",
    "TableColumnStatisticsResult",
    "WriteFunctionResult",
]
72
+
73
+
74
+ def _validate_at_params(at_unit: str | None, at_value: str | None) -> None:
75
+ """Validate that at_unit and at_value are both provided or both absent."""
76
+ if bool(at_unit) != bool(at_value):
77
+ raise ValueError("at_unit and at_value must both be provided or both be None")
78
+
79
+
80
@dataclass(frozen=True)
class CatalogExample(ArrowSerializableDataclass):
    """An example usage of a function for catalog serialization.

    Attributes:
        sql: SQL query demonstrating the function.
        description: What this example demonstrates.
        expected_output: Optional expected result description.

    """

    sql: str
    description: str = ""
    expected_output: str | None = None
94
+
95
+
96
# Distinct nominal types for improved code clarity and type checking.
# ``NewType`` makes static checkers treat each name as its own subtype of the
# underlying type; at runtime, calling one returns its argument unchanged, so
# values are plain ``bytes`` / ``str`` objects.
AttachOpaqueData = NewType("AttachOpaqueData", bytes)
TransactionOpaqueData = NewType("TransactionOpaqueData", bytes)
SerializedSchema = NewType("SerializedSchema", bytes)
SqlExpression = NewType("SqlExpression", str)
102
+
103
+
104
@dataclass(frozen=True)
class CatalogDataVersionRelease(ArrowSerializableDataclass):
    """One published data version of a catalog.

    ``data_version_spec`` advertises a *compatibility range*; this record
    advertises *what's actually been published*. Together they let a
    client (the describe page, Cupola, programmatic consumers) render a
    discoverable release timeline without scraping the worker's repo.

    Contracts on the ``CatalogInfo.releases`` list this belongs to:

    * **Ordering** — entries MUST appear newest-first. Unspecified order
      would force consumers to sort by ``version`` string, which requires
      a comparator the protocol does not define (semver vs. calver vs.
      date-stamped vs. RC tags are all valid).
    * **Uniqueness** — each ``version`` MUST appear at most once. Mirrors
      the same invariant on ``attach_option_specs``'s ``name``. Consumers
      defend against duplicates (log-and-skip later entries) since Arrow
      cannot enforce key uniqueness at the wire level.

    Long-form release notes do not live here — link to a CHANGELOG anchor,
    GitHub release page, PR, or migration guide via ``notes_url``.
    """

    # Concrete version, not a spec. e.g. "1.0.0", "2.4.1". Semver carries
    # the breaking-change signal directly — major bumps are breaking,
    # minor/patch are not.
    version: str

    # Release date (UTC). ``None`` when the worker doesn't track dates.
    released_at: Annotated[datetime | None, ArrowType(pa.timestamp("us", tz="UTC"))] = None

    # One-line human summary. Empty string when unknown.
    summary: str = ""

    # Optional per-release link to detailed notes. Distinct from
    # ``CatalogInfo.source_url`` (which points at the repo as a whole):
    # this points at what changed in *this* release.
    notes_url: str | None = None
143
+
144
+
145
@dataclass(frozen=True)
class CatalogInfo(ArrowSerializableDataclass):
    """Discovery record for a catalog exposed by a worker.

    Returned by catalog_catalogs() so clients can inspect per-catalog version
    metadata before attaching.
    """

    # Catalog name — pass to catalog_attach() to open it.
    name: str
    # Worker software version (singular per worker). ``None`` = worker declares
    # no implementation version.
    implementation_version: str | None
    # Semver range the catalog serves (e.g. ">=1.0.0,<2.0.0"). ``None`` = worker
    # declares no data-version opinion.
    data_version_spec: str | None
    # Attach-time options the catalog accepts (distinct from session settings).
    # Each AttachOptionSpec is serialized as bytes for Arrow compatibility.
    # Enables pre-attach discovery via the catalogs() RPC.
    attach_option_specs: list[bytes] = field(default_factory=list)
    # Concrete published data versions, newest-first. Empty when the worker
    # doesn't track release history. See ``CatalogDataVersionRelease`` for
    # the per-entry ordering and uniqueness contracts.
    releases: list[CatalogDataVersionRelease] = field(default_factory=list)
    # Where this worker's code lives — repo, build, docs. ``None`` when
    # the worker doesn't advertise a source location.
    source_url: str | None = None
172
+
173
+
174
@dataclass(frozen=True)
class CatalogAttachResult(ArrowSerializableDataclass):
    """Result from attaching to a catalog.

    The two ``resolved_*`` fields are keyword-only and have no default, so
    every construction must pass them explicitly (``None`` is allowed).
    """

    # The unique id for the attached catalog.
    attach_opaque_data: AttachOpaqueData
    # Indicate if the worker supports transactions or not.
    # If false, all transaction related methods will not be called and all
    # transaction_opaque_data parameters will be None.
    supports_transactions: bool
    # Indicate if tables support time travel
    supports_time_travel: bool
    # Indicate that the catalog version id is frozen and the schema
    # and object information will not change.
    catalog_version_frozen: bool
    # The initial catalog version, it increments when schemas, tables
    # or other objects change.
    catalog_version: int
    # Indicate if the attach_opaque_data must be persisted across commands.
    # True: Catalog is stateful; attach_opaque_data represents a session
    # False: Catalog is stateless; CLI can auto-attach on each command
    attach_opaque_data_required: bool = True
    # The name of the default schema for this catalog.
    default_schema: str = "main"
    # Extension options (settings) exposed by this catalog/worker.
    # Each ExtensionOption is serialized as bytes for Arrow compatibility.
    settings: list[bytes] = field(default_factory=list)
    # Secret types registered with DuckDB's SecretManager.
    # Each SecretTypeSpec is serialized as bytes for Arrow compatibility.
    secret_types: list[bytes] = field(default_factory=list)
    # Optional comment describing this catalog/database.
    comment: str | None = None
    # Optional key-value tags associated with this catalog/database.
    tags: dict[str, str] = field(default_factory=dict)
    # Whether any tables in this catalog can provide column statistics.
    # Global gate — if False, GetStatistics() returns nullptr for all tables.
    supports_column_statistics: bool = False
    # Concrete data version the worker resolved for this attach. ``None`` =
    # worker has no opinion or the request omitted data_version_spec.
    resolved_data_version: str | None = field(kw_only=True)
    # Concrete implementation version the worker resolved for this attach.
    # ``None`` = worker has no opinion or the request omitted
    # implementation_version.
    resolved_implementation_version: str | None = field(kw_only=True)
218
+
219
+
220
+ @dataclass(frozen=True)
221
+ class CatalogObject:
222
+ """All objects have the following common properties."""
223
+
224
+ # This is a generic comment about the object
225
+ comment: str | None
226
+ # These are key-value tags associated with the object
227
+ tags: dict[str, str]
228
+
229
+
230
@dataclass(frozen=True)
class CatalogSchemaObject(CatalogObject):
    """Objects that exist within a schema have the following common properties.

    Extends ``CatalogObject`` with the object's own name and the name of the
    schema that contains it.
    """

    # The name of the object
    name: str
    # The name of the schema containing the object
    schema_name: str
238
+
239
+
240
@dataclass(frozen=True)
class SchemaInfo(CatalogObject, ArrowSerializableDataclass):
    """Information about a schema in a catalog."""

    attach_opaque_data: AttachOpaqueData
    name: str
    # Approximate population per object kind, keyed by the same names the C++
    # extension uses for its set-cache instrumentation: ``"table"``, ``"view"``,
    # ``"scalar_function"``, ``"aggregate_function"``, ``"table_function"``,
    # ``"macro"``, ``"index"``. Used by the client to pick between bulk
    # ``LoadEntries`` and per-name single-entry RPCs. Workers may omit the
    # field entirely or any individual key — the client treats absent counts
    # as 1, so unspecified populations bias toward eager bulk-load.
    #
    # **The value 0 is a hard guarantee, not an estimate.** When a count is
    # exactly 0 the client skips the corresponding ``catalog_schema_contents_*``
    # bulk RPC entirely and short-circuits per-name lookups
    # (``catalog_table_get`` / ``catalog_view_get`` / ``catalog_index_get``).
    # If a worker reports 0 for a kind that actually has entries,
    # ``SELECT … FROM s.x`` silently returns "not found" — only declare 0 for
    # kinds the worker knows are empty in its current view of the schema.
    # Cross-session DDL on the same catalog (another connection creating a
    # view in a schema this connection has cached as zero-views) is handled
    # the same way as any other stale catalog cache: ``vgi_clear_cache()`` or
    # re-attach. Time-travel AT-clause queries do not honor the bypass — they
    # always issue the per-name RPC because a historical version may have had
    # entries the current view does not.
    estimated_object_count: dict[str, int] | None = None
268
+
269
+
270
@dataclass(frozen=True)
class TableInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Information about a table in a schema."""

    # The columns of the table as a PyArrow schema
    # that is serialized as bytes.
    columns: SerializedSchema

    # Use ArrowType to specify int32 instead of default int64
    not_null_constraints: Annotated[list[int], ArrowType(pa.list_(pa.int32()))]
    unique_constraints: Annotated[list[list[int]], ArrowType(pa.list_(pa.list_(pa.int32())))]
    check_constraints: list[str]
    primary_key_constraints: Annotated[list[list[int]], ArrowType(pa.list_(pa.list_(pa.int32())))] = field(
        default_factory=list
    )
    foreign_key_constraints: Annotated[list[bytes], ArrowType(pa.list_(pa.binary()))] = field(default_factory=list)

    # Write support flags — indicate which DML operations the table supports.
    supports_insert: bool = False
    supports_update: bool = False
    supports_delete: bool = False
    # When False (the default), the C++ extension rejects INSERT/UPDATE/DELETE
    # ... RETURNING at plan time with a BinderException. Workers that can emit
    # the affected rows from their write functions must opt in by setting this
    # to True.
    supports_returning: bool = False

    # Statistics capability flag — indicates this table can provide column statistics.
    supports_column_statistics: bool = False

    # Optional inlined function-discovery results. When populated, the C++
    # extension uses the cached value and skips the corresponding
    # ``catalog_table_{scan,insert,update,delete}_function_get`` RPC. Bytes are
    # the IPC payload from ``ScanFunctionResult.serialize()``.
    #
    # Populating these fields freezes the function args for the lifetime of the
    # catalog cache (until ``catalog_version`` bumps). Workers whose function
    # args change more frequently than ``catalog_version`` (rotating
    # credentials, presigned URLs, per-transaction snapshots) MUST leave these
    # null so the per-bind RPC continues to fire.
    scan_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    insert_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    update_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    delete_function: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Optional inlined cardinality. When populated, the C++ extension uses
    # these values directly and skips the ``table_function_cardinality`` RPC
    # — saving one round-trip per bind. Use for read-only or slow-changing
    # tables where cardinality is statically known.
    #
    # Populating these fields freezes the cardinality for the lifetime of
    # the catalog cache (until ``catalog_version`` bumps). Workers whose
    # cardinality changes faster (e.g. live counters) MUST leave them null
    # so the per-bind RPC continues to fire.
    cardinality_estimate: Annotated[int | None, ArrowType(pa.int64())] = None
    cardinality_max: Annotated[int | None, ArrowType(pa.int64())] = None

    # Optional inlined column statistics. When populated, the C++ extension
    # uses the cached value and skips the per-bind / per-table
    # ``catalog_table_column_statistics_get`` RPC and the per-scan
    # ``table_function_statistics`` RPC. Bytes are the IPC payload from
    # ``serialize_column_statistics(stats, cache_max_age_seconds)``.
    #
    # Populating this field freezes the resolved stats for the lifetime of
    # the catalog cache (until ``catalog_version`` bumps). Workers whose
    # statistics change faster than ``catalog_version`` (e.g. live counters,
    # rapidly-mutating dimensions) MUST leave this null so the on-demand
    # RPC continues to fire.
    column_statistics: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Optional inlined bind result. Bytes are the IPC payload of
    # ``BindResponse.serialize_to_bytes()``. When populated, the C++
    # extension uses these bytes verbatim and skips the per-scan ``bind``
    # RPC, threading the deserialized BindResult straight into bind_data.
    #
    # The catalog framework only populates this for tables marked
    # ``Table(inline_bind=True)`` whose function class is
    # ``@bind_fixed_schema``-decorated — the decorator's contract (output is
    # exactly ``cls.FIXED_SCHEMA``, no per-call inputs, no opaque_data)
    # matches what's safe to freeze for the catalog cache lifetime.
    # Functions with custom ``on_bind`` are not eligible via the framework
    # path; workers can still inline manually inside their own
    # ``schema_contents`` override when the bind output is independently
    # known to be stable.
    bind_result: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Dotted-path column references that the VGI extension's optimizer pass
    # must verify appear in any scan's WHERE expression (top-level column
    # names like ``"country"`` or struct subfields like ``"bbox.xmin"``,
    # ``"nested.outer.inner"``). Empty (default) means no enforcement — the
    # zero-cost fast path for every existing table.
    #
    # Satisfaction is prefix-based: a present filter on a shorter dotted path
    # satisfies any required path it's a prefix of. A whole-struct filter on
    # ``bbox`` therefore satisfies every required ``"bbox.*"`` path. The C++
    # extension throws ``BinderException`` listing any unsatisfied paths.
    required_field_filter_paths: list[str] = field(default_factory=list)
367
+
368
+
369
@dataclass(frozen=True)
class ViewInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Information about a view in a schema."""

    # The definition of the view which is a SQL query string.
    definition: str

    # Per-column comments, keyed by the view's output column name. Unlike tables
    # (whose column comments ride along as Arrow field metadata on the serialized
    # ``columns`` schema), a view ships only its SQL ``definition`` — DuckDB binds
    # that query to derive the columns — so view column comments need their own
    # channel. The C++ extension aligns these by name against the bound output
    # columns and feeds them into ``CreateViewInfo.column_comments_map``; names
    # that don't match a bound column are ignored.
    column_comments: dict[str, str] = field(default_factory=dict)
384
+
385
+
386
@dataclass(frozen=True)
class MacroInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Information about a macro in a schema.

    Attributes:
        macro_type: Whether this is a scalar or table macro.
        parameters: Ordered list of parameter names.
        parameter_default_values: One-row RecordBatch where column names are parameter
            names and values are typed defaults. None if no defaults.
            Serialized as IPC bytes over the wire.
        definition: The SQL expression (scalar) or query (table).

    """

    # String annotation: ``MacroType`` is defined further down in this module.
    macro_type: "MacroType"
    parameters: list[str]
    parameter_default_values: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None
    definition: str = ""
404
+
405
+
406
class FunctionType(Enum):
    """The type of function in a schema."""

    SCALAR = "scalar"
    TABLE = "table"
    TABLE_BUFFERING = "table_buffering"
    AGGREGATE = "aggregate"
413
+
414
+
415
class MacroType(Enum):
    """The type of macro in a schema."""

    SCALAR = "scalar"
    TABLE = "table"
420
+
421
+
422
class IndexConstraintType(Enum):
    """The constraint type of an index.

    NONE: Regular index (no constraint enforcement).
    UNIQUE: Index enforces a UNIQUE constraint.
    PRIMARY: Index enforces a PRIMARY KEY constraint.
    """

    NONE = "none"
    UNIQUE = "unique"
    PRIMARY = "primary"
433
+
434
+
435
@dataclass(frozen=True)
class IndexInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Information about an index in a schema.

    Attributes:
        table_name: The name of the table this index is on.
        index_type: The index type string (e.g., "ART", or empty for default).
        constraint_type: The constraint enforcement type (NONE, UNIQUE, PRIMARY).
        expressions: SQL expression strings defining the indexed expressions.
            For column-based indexes, these are column references (e.g., "col_a").
            For expression indexes, these are arbitrary SQL (e.g., "lower(col_a)").
        options: Key-value index options (WITH clause).

    """

    table_name: str
    index_type: str = ""
    constraint_type: IndexConstraintType = IndexConstraintType.NONE
    expressions: list[str] = field(default_factory=list)
    options: dict[str, str] = field(default_factory=dict)
455
+
456
+
457
class SchemaObjectType(Enum):
    """The type of object that can exist within a schema.

    Used to filter results from schema_contents().
    """

    TABLE = "table"
    VIEW = "view"
    SCALAR_FUNCTION = "scalar_function"
    TABLE_FUNCTION = "table_function"
    AGGREGATE_FUNCTION = "aggregate_function"
    SCALAR_MACRO = "scalar_macro"
    TABLE_MACRO = "table_macro"
    INDEX = "index"
471
+
472
+
473
class OnConflict(Enum):
    """Behavior when a conflict occurs during creation of an object.

    ERROR: Raise an error if the object already exists.
    IGNORE: Do nothing if the object already exists.
    REPLACE: Replace the existing object if it already exists.
    """

    ERROR = "error"
    IGNORE = "ignore"
    REPLACE = "replace"
484
+
485
+
486
@dataclass(frozen=True)
class FunctionInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Information about a function in a schema."""

    # the type of function from VGI
    function_type: FunctionType

    # The arguments as a serialized Apache arrow schema using
    # schema.serialize().to_pybytes()
    arguments: SerializedSchema

    # The output schema as a serialized Apache arrow schema using
    # schema.serialize().to_pybytes()
    output_schema: SerializedSchema

    # Scalar function behavior fields (None for non-scalar functions)
    stability: FunctionStability | None = None
    null_handling: NullHandling | None = None

    # Documentation fields
    # description: intrinsic documentation from function metadata (Meta.description)
    # comment: user-settable comment (via COMMENT ON FUNCTION, inherited from base)
    description: str = ""
    examples: list[CatalogExample] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)

    # Table function capabilities (None for scalar functions)
    projection_pushdown: bool | None = None
    filter_pushdown: bool | None = None
    sampling_pushdown: bool | None = None
    # True if the table participates in DuckDB's late-materialization optimizer
    # (Meta.late_materialization). The DuckDB extension only honours this when
    # the table also exposes a rowid virtual column plus filter/projection
    # pushdown — see GetScanFunctionImpl in the C++ vgi_table_entry.cpp.
    late_materialization: bool | None = None
    supported_expression_filters: list[str] = field(default_factory=list)
    order_preservation: OrderPreservation | None = None
    # Use ArrowType to specify int32 instead of default int64
    max_workers: Annotated[int | None, ArrowType(pa.int32())] = None
    # True if the function opts in to per-batch ``vgi_batch_index`` tagging:
    # the worker emits an integer partition id in each Arrow batch's
    # KeyValueMetadata; the DuckDB extension threads it through
    # ``TableFunction::get_partition_data`` so ordered sinks (BatchCollector,
    # BatchInsert, BatchCopyToFile, Limit) can reassemble parallel output in
    # partition-id order. Opting in also skips the FIXED_ORDER MaxThreads=1
    # clamp; the source stays parallel and the sink does the ordering.
    supports_batch_index: bool = False
    # Partition shape declared by the function over its
    # ``vgi.partition_column``-annotated bind-schema fields. When non-
    # ``NOT_PARTITIONED``, the DuckDB extension installs
    # ``TableFunction::get_partition_info`` returning the corresponding
    # ``TablePartitionInfo`` value so the planner can pick
    # ``PhysicalPartitionedAggregate`` for ``GROUP BY`` queries (today,
    # only ``SINGLE_VALUE_PARTITIONS`` materially changes planner
    # behavior). Per-column annotation lives in the bind schema's
    # field-level metadata — see ``vgi.schema_utils.partition_field``.
    partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED

    # Aggregate function fields (future)
    order_dependent: OrderDependence = OrderDependence.NOT_ORDER_DEPENDENT
    distinct_dependent: DistinctDependence = DistinctDependence.NOT_DISTINCT_DEPENDENT
    # True if the aggregate implements the window() callback
    supports_window: bool = False
    # True if the aggregate opts into the streaming-partitioned protocol —
    # ``aggregate_streaming_open`` / ``_chunk`` / ``_close``. The DuckDB
    # extension's optimizer rule may rewrite eligible LogicalWindow nodes to
    # use this path.
    streaming_partitioned: bool = False

    # True if a table-in-out function declares a finalize/finish stage.
    # The C++ extension uses this to conditionally register
    # ``in_out_function_final``; DuckDB rejects LATERAL with correlated input
    # on functions that register a finalize callback.
    has_finalize: bool = False

    # Only meaningful when ``function_type == FunctionType.TABLE_BUFFERING``
    # (i.e. the function is registered through the Sink+Source path). When
    # true, the source phase is single-threaded and ``finalize_state_id``s
    # drain in combine-returned order. Default false enables parallel
    # finalize.
    source_order_dependent: bool = False

    # Only meaningful when ``function_type == FunctionType.TABLE_BUFFERING``.
    # When true, the SINK phase runs single-threaded — every process() call
    # arrives in source order on one worker. Mutually exclusive with
    # requires_input_batch_index.
    sink_order_dependent: bool = False

    # Only meaningful when ``function_type == FunctionType.TABLE_BUFFERING``.
    # When true, the C++ Sink operator declares
    # RequiredPartitionInfo()=BatchIndex(); each process() RPC carries a
    # globally-unique monotonic batch_index from DuckDB's source. Workers
    # can sort by it in combine() to reconstruct source order under parallel
    # ingest. Mutually exclusive with sink_order_dependent.
    requires_input_batch_index: bool = False

    # Settings required by the function
    required_settings: list[str] = field(default_factory=list)

    # Secrets required by the function (each entry has secret_type, optional secret_name, optional scope)
    required_secrets: list[SecretLookupEntry] = field(default_factory=list)
587
+
588
+
589
@dataclass(frozen=True)
class ScanFunctionResult:
    """Result from getting a table scan function.

    This result tells the VGI DuckDB extension which DuckDB function to call
    to obtain the data for a table. This enables catalogs to delegate scanning
    to any DuckDB function (e.g., read_parquet, iceberg_scan, or a custom VGI
    table function) with appropriate arguments.

    On the wire the arguments travel as one nested Arrow batch: positional
    arguments occupy columns named ``arg_0``, ``arg_1``, ... and named
    arguments occupy columns that carry the argument's own name. Column names
    of the exact form ``arg_<digits>`` are therefore read back as positional
    arguments; avoid that spelling for named arguments.

    Attributes:
        function_name: The DuckDB function to call (e.g., "read_parquet").
        positional_arguments: Positional arguments as PyArrow scalars.
        named_arguments: Named arguments as PyArrow scalars.
        required_extensions: DuckDB extensions to load before calling.

    """

    # The name of the DuckDB function to call to obtain the data
    # in the table.
    function_name: str

    # The positional arguments to include in the function call.
    positional_arguments: list[pa.Scalar]  # type: ignore[type-arg]

    # The named arguments to include in the function call.
    named_arguments: dict[str, pa.Scalar]  # type: ignore[type-arg]

    # A list of extensions to require to be loaded.
    required_extensions: list[str] = field(default_factory=list)

    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("function_name", pa.string(), nullable=False),
            pa.field("arguments", pa.binary(), nullable=False),
            pa.field("required_extensions", pa.list_(pa.string()), nullable=False),
        ]  # type: ignore[arg-type]
    )

    @staticmethod
    def _is_positional_column(name: str) -> bool:
        """Return True only for column names of the exact form ``arg_<digits>``.

        A bare prefix test would also capture named arguments such as
        ``arg_max`` and wrongly file them as positional.
        """
        suffix = name.removeprefix("arg_")
        return suffix != name and suffix.isascii() and suffix.isdigit()

    def to_row_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for batch construction.

        The arguments field is serialized as nested Arrow IPC bytes.

        Raises:
            ValueError: If a named argument reuses the column name of one of
                the positional arguments (``arg_<index>``); the two values
                could not be told apart in the nested batch.

        """
        # Build arguments as nested batch
        argument_values: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        argument_schema = []
        for index, arg in enumerate(self.positional_arguments):
            argument_schema.append(pa.field(f"arg_{index}", arg.type))
            argument_values[f"arg_{index}"] = arg
        for name, value in self.named_arguments.items():
            if name in argument_values:
                # Without this check the named value would silently replace
                # the positional one and the schema would hold a duplicate column.
                raise ValueError(
                    f"{type(self).__name__}: named argument {name!r} collides with a positional argument column"
                )
            argument_schema.append(pa.field(name, value.type))
            argument_values[name] = value

        argument_batch = pa.RecordBatch.from_pylist(
            [argument_values],
            schema=pa.schema(argument_schema),
        )

        return {
            "function_name": self.function_name,
            "arguments": serialize_record_batch_bytes(argument_batch),
            # The wire field is non-nullable, so never emit None here.
            "required_extensions": list(self.required_extensions or []),
        }

    def serialize(self) -> bytes:
        """Serialize to Arrow IPC bytes."""
        batch = pa.RecordBatch.from_pylist(
            [self.to_row_dict()],
            schema=self.ARROW_SCHEMA,
        )
        return serialize_record_batch_bytes(batch)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Deserialize from Arrow RecordBatch.

        Columns of the nested arguments batch named ``arg_<digits>`` become
        positional arguments (in schema order); every other column becomes a
        named argument keyed by its column name.
        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["function_name", "arguments"],
        )

        # Deserialize the nested arguments batch.
        # row["arguments"] is already bytes (_validate_single_row_batch returns
        # Python values, not PyArrow scalars).
        arguments_bytes = cast(bytes, row["arguments"])
        arguments_batch, _ = deserialize_record_batch(arguments_bytes)

        # Extract positional and named arguments from the batch
        positional_arguments: list[pa.Scalar] = []  # type: ignore[type-arg]
        named_arguments: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]

        for index, arg_field in enumerate(arguments_batch.schema):
            value = arguments_batch.column(index)[0]
            if cls._is_positional_column(arg_field.name):
                positional_arguments.append(value)
            else:
                named_arguments[arg_field.name] = value

        return cls(
            function_name=cast(str, row["function_name"]),
            positional_arguments=positional_arguments,
            named_arguments=named_arguments,
            required_extensions=list(cast("list[str]", row.get("required_extensions") or [])),
        )
695
+
696
+
697
# Write function discovery uses the same wire format as scan function discovery,
# so the write result type is an alias of ``ScanFunctionResult`` rather than a
# separate class.
WriteFunctionResult = ScanFunctionResult
699
+
700
+
701
+ # ============================================================================
702
+ # Multi-branch scan (catalog_table_scan_branches_get)
703
+ # ============================================================================
704
+ #
705
+ # A table whose data spans multiple physical sources (canonical example:
706
+ # hot rows in Kafka + historical rows in Iceberg/Delta/parquet) declares
707
+ # one ``ScanBranch`` per source. The VGI DuckDB extension's optimizer-
708
+ # extension rewrites a placeholder ``LogicalGet`` into a
709
+ # ``LogicalSetOperation(UNION_ALL, ...)`` with one arm per branch, each
710
+ # binding its own ``TableFunction`` (a VGI function, or a native reader
711
+ # like ``iceberg_scan`` / ``read_parquet``).
712
+ #
713
+ # This is **wire-compat with single-branch workers**: the new RPC
714
+ # ``catalog_table_scan_branches_get`` is additive; old workers that don't
715
+ # implement it cause the C++ side to fall back to
716
+ # ``catalog_table_scan_function_get`` and synthesise a one-branch result.
717
+ #
718
+ # The rewriter semantics, ``branch_filter`` model, and current scope
719
+ # decisions (INSERT-only on writable arm, UPDATE/DELETE/MERGE refused,
720
+ # AT-clause refused, fail-fast error semantics) are documented with the
721
+ # relevant methods below.
722
+
723
+
724
@dataclass(frozen=True)
class ScanBranch:
    """One physical source backing a multi-branch scan.

    On the wire the arguments travel as one nested Arrow batch: positional
    arguments occupy columns named ``arg_0``, ``arg_1``, ... and named
    arguments occupy columns that carry the argument's own name. Column names
    of the exact form ``arg_<digits>`` are therefore read back as positional
    arguments; avoid that spelling for named arguments.

    Attributes:
        function_name: The DuckDB function to call for this branch
            (e.g., ``"read_parquet"``, ``"iceberg_scan"``, or a VGI
            table function). The C++ rewriter resolves this name against
            DuckDB's function catalog and binds it at optimize time.
        positional_arguments: Positional arguments as PyArrow scalars,
            passed through to the function's ``bind``.
        named_arguments: Named arguments as PyArrow scalars.
        branch_filter: Optional SQL expression text (parsed by DuckDB's
            parser, bound against the branch's bound column list). The
            rewriter AND's this into every scan of this branch BEFORE
            filter pushdown, so the branch only ever sees rows in its
            declared scope. Used to make overlapping physical sources
            (Kafka 7d retention + Iceberg nightly batches with ~24h
            overlap) non-overlapping at scan time, without changing the
            worker code. ``None`` means unconstrained.
        writable: Declares this branch as the INSERT target for the
            multi-branch table. At most one branch per table may set
            this true (enforced at catalog-load by the C++ extension —
            multiple writable arms would violate DuckDB's single-
            writable-catalog-per-transaction rule). When no branch is
            writable, the table is read-only. UPDATE/DELETE/MERGE
            remain refused on multi-branch tables regardless of this
            flag; the contract is INSERT-only until cross-arm
            semantics have customer-driven evidence.

    """

    function_name: str
    positional_arguments: list[pa.Scalar]  # type: ignore[type-arg]
    named_arguments: dict[str, pa.Scalar]  # type: ignore[type-arg]
    branch_filter: str | None = None
    writable: bool = False

    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("function_name", pa.string(), nullable=False),
            pa.field("arguments", pa.binary(), nullable=False),
            pa.field("branch_filter", pa.string(), nullable=True),
            pa.field("writable", pa.bool_(), nullable=False),
        ]  # type: ignore[arg-type]
    )

    @staticmethod
    def _is_positional_column(name: str) -> bool:
        """Return True only for column names of the exact form ``arg_<digits>``.

        A bare prefix test would also capture named arguments such as
        ``arg_max`` and wrongly file them as positional.
        """
        suffix = name.removeprefix("arg_")
        return suffix != name and suffix.isascii() and suffix.isdigit()

    def to_row_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for batch construction.

        Arguments are serialized as nested Arrow IPC bytes: one column per
        argument, positional ones named ``arg_<index>``.

        Raises:
            ValueError: If a named argument reuses the column name of one of
                the positional arguments; the two values could not be told
                apart in the nested batch.

        """
        argument_values: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        argument_schema: list[pa.Field] = []  # type: ignore[type-arg]
        for index, arg in enumerate(self.positional_arguments):
            argument_schema.append(pa.field(f"arg_{index}", arg.type))
            argument_values[f"arg_{index}"] = arg
        for name, value in self.named_arguments.items():
            if name in argument_values:
                # Without this check the named value would silently replace
                # the positional one and the schema would hold a duplicate column.
                raise ValueError(
                    f"{type(self).__name__}: named argument {name!r} collides with a positional argument column"
                )
            argument_schema.append(pa.field(name, value.type))
            argument_values[name] = value
        argument_batch = pa.RecordBatch.from_pylist(
            [argument_values],
            schema=pa.schema(argument_schema),
        )
        return {
            "function_name": self.function_name,
            "arguments": serialize_record_batch_bytes(argument_batch),
            "branch_filter": self.branch_filter,
            "writable": self.writable,
        }

    def serialize(self) -> bytes:
        """Serialize to Arrow IPC bytes (1-row batch using ARROW_SCHEMA)."""
        batch = pa.RecordBatch.from_pylist(
            [self.to_row_dict()],
            schema=self.ARROW_SCHEMA,
        )
        return serialize_record_batch_bytes(batch)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Deserialize from a 1-row Arrow RecordBatch.

        Columns of the nested arguments batch named ``arg_<digits>`` become
        positional arguments (in schema order); every other column becomes a
        named argument keyed by its column name.
        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["function_name", "arguments"],
        )

        arguments_bytes = cast(bytes, row["arguments"])
        arguments_batch, _ = deserialize_record_batch(arguments_bytes)

        positional_arguments: list[pa.Scalar] = []  # type: ignore[type-arg]
        named_arguments: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        for index, arg_field in enumerate(arguments_batch.schema):
            value = arguments_batch.column(index)[0]
            if cls._is_positional_column(arg_field.name):
                positional_arguments.append(value)
            else:
                named_arguments[arg_field.name] = value

        return cls(
            function_name=cast(str, row["function_name"]),
            positional_arguments=positional_arguments,
            named_arguments=named_arguments,
            # A missing or null branch_filter both mean "unconstrained".
            branch_filter=cast("str | None", row.get("branch_filter")),
            # writable is non-nullable on the wire — trust the schema.
            writable=bool(row["writable"]),
        )
836
+
837
+
838
@dataclass(frozen=True)
class ScanBranchesResult:
    """The set of scan branches backing a multi-branch table.

    Tells the VGI DuckDB extension which DuckDB function(s) to call to read
    the table. Every branch is bound on its own and the rewriter unions the
    branch outputs.

    Attributes:
        branches: One ``ScanBranch`` per physical source. The order matters
            for stable diagnostic output (``vgi_table_branches()``) but not
            for query semantics (UNION ALL is unordered).
        required_extensions: Union of all DuckDB extensions needed across all
            branches (e.g., ``["iceberg", "httpfs"]``). The C++ side auto-loads
            unloaded entries before running the rewrite; missing extensions
            surface the existing extension-load diagnostic. Kept at the top
            level so workers don't repeat ``"iceberg"`` on every branch that
            uses it.

    """

    branches: list[ScanBranch]
    required_extensions: list[str] = field(default_factory=list)

    # Wire layout: each branch is its own IPC stream (bytes) inside a
    # list<binary> column. The C++ side parses each entry via
    # ScanBranch::deserialize — the same nested-IPC trick used for the
    # arguments field on ScanFunctionResult/ScanBranch themselves.
    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("branches", pa.list_(pa.binary()), nullable=False),
            pa.field("required_extensions", pa.list_(pa.string()), nullable=False),
        ]  # type: ignore[arg-type]
    )

    def to_row_dict(self) -> dict[str, Any]:
        """Build the single-row dictionary matching ``ARROW_SCHEMA``."""
        encoded_branches = [branch.serialize() for branch in self.branches]
        extensions = list(self.required_extensions)
        return {"branches": encoded_branches, "required_extensions": extensions}

    def serialize(self) -> bytes:
        """Encode this result as Arrow IPC bytes holding one ``ARROW_SCHEMA`` row."""
        single_row = pa.RecordBatch.from_pylist([self.to_row_dict()], schema=self.ARROW_SCHEMA)
        return serialize_record_batch_bytes(single_row)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Rebuild a result from a 1-row Arrow RecordBatch.

        An empty branches list is rejected — workers must return at least one
        branch. (See the design memo's "loud at attach" rule.)

        Raises:
            ValueError: If the ``branches`` value is empty.

        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(batch, cls.__name__, required_fields=["branches"])

        encoded_branches = cast("list[bytes]", row["branches"])
        if not encoded_branches:
            raise ValueError(f"{cls.__name__}: branches list must not be empty")

        # Each blob is a standalone IPC stream carrying one ScanBranch row.
        decoded = [ScanBranch.deserialize(deserialize_record_batch(blob)[0]) for blob in encoded_branches]
        extensions = cast("list[str]", row.get("required_extensions") or [])
        return cls(branches=decoded, required_extensions=list(extensions))
916
+
917
+
918
+ # ============================================================================
919
+ # Column Statistics
920
+ # ============================================================================
921
+
922
+
923
@dataclass(frozen=True)
class ColumnStatistics:
    """Optimizer statistics describing one column of a table.

    Workers supply these so DuckDB's optimizer can make cost-based decisions
    (filter elimination, join reordering, etc.).

    Attributes:
        column_name: The column the statistics belong to.
        min: Smallest value, given as a typed PyArrow scalar
            (e.g., ``pa.scalar(0, pa.int64())``); ``None`` when unknown.
        max: Largest value, given as a typed PyArrow scalar; ``None`` when
            unknown. Must share its Arrow type with ``min``.
        has_null: True when the column contains any null values.
        has_not_null: True when the column contains any non-null values.
        distinct_count: Approximate number of distinct values; ``None`` when
            unknown.
        contains_unicode: For string/binary columns, whether any value holds
            non-ASCII characters. ``None`` for non-string columns.
        max_string_length: For string/binary columns, the maximum byte length
            of a value. ``None`` for non-string columns.

    """

    column_name: str
    # Value bounds; both default to "unknown".
    min: pa.Scalar | None = None  # type: ignore[type-arg]
    max: pa.Scalar | None = None  # type: ignore[type-arg]
    # Both null flags default to True.
    has_null: bool = True
    has_not_null: bool = True
    distinct_count: int | None = None
    # String/binary-only details.
    contains_unicode: bool | None = None
    max_string_length: int | None = None
954
+
955
+
956
@dataclass(frozen=True)
class TableColumnStatisticsResult:
    """Return value of ``table_column_statistics_get``, with optional cache control.

    Attributes:
        statistics: The per-column statistics of the table.
        cache_max_age_seconds: Number of seconds the client may keep these
            statistics cached. ``None`` means cache indefinitely (static
            data); ``0`` means do not cache (live/volatile data).

    """

    statistics: list[ColumnStatistics]
    # Default: no TTL, i.e. cache indefinitely.
    cache_max_age_seconds: int | None = None
970
+
971
+
972
def _infer_stat_type(stat: ColumnStatistics) -> pa.DataType:
    """Pick the Arrow type of a ColumnStatistics entry from its bounds.

    ``min`` is consulted first, then ``max``; the first bound that is present
    and valid (non-null) decides the type. When neither qualifies the Arrow
    null type is returned.
    """
    for bound in (stat.min, stat.max):
        if bound is not None and bound.is_valid:
            return bound.type  # type: ignore[no-any-return]
    return pa.null()
979
+
980
+
981
def serialize_column_statistics(
    stats: list[ColumnStatistics],
    cache_max_age_seconds: int | None = None,
) -> bytes:
    """Serialize column statistics into a single RecordBatch with sparse union min/max.

    The ``min`` and ``max`` columns use an Arrow sparse union whose child types
    are the distinct column types present in *stats*. This keeps everything in
    a single IPC stream regardless of how many column types the table has.

    An empty *stats* list yields a zero-row batch whose union has a single
    null-typed child named ``"0"``. The cache TTL is attached in that case
    too, so an empty result can still carry cache control.

    Args:
        stats: Per-column statistics to serialize.
        cache_max_age_seconds: Optional cache TTL, attached as IPC batch
            custom metadata under the ``cache_max_age_seconds`` key (not as
            schema metadata). ``None`` attaches no metadata.

    Returns:
        IPC-serialized bytes of the statistics RecordBatch.

    """
    # 1. Collect distinct Arrow types, assigning type codes in first-seen order.
    type_map: dict[pa.DataType, int] = {}
    row_type_codes: list[int] = []
    for stat in stats:
        arrow_type = _infer_stat_type(stat)
        row_type_codes.append(type_map.setdefault(arrow_type, len(type_map)))

    if not type_map:
        # No rows: a union still needs at least one child, so fall back to a
        # single null-typed child. The union arrays are built from children
        # below because pa.array([], type=sparse_union) is not supported.
        type_map[pa.null()] = 0

    # 2. Build sparse union child arrays (each child is length N). Rows that
    #    belong to a different type code are filled with null in this child.
    union_fields: list[pa.Field[Any]] = []
    field_names: list[str] = []
    type_codes: list[int] = []
    min_children: list[pa.Array[Any]] = []
    max_children: list[pa.Array[Any]] = []
    # dict preserves insertion order, which is ascending type-code order here.
    for arrow_type, code in type_map.items():
        union_fields.append(pa.field(str(code), arrow_type))
        field_names.append(str(code))
        type_codes.append(code)
        min_vals = [s.min if row_code == code else None for s, row_code in zip(stats, row_type_codes)]
        max_vals = [s.max if row_code == code else None for s, row_code in zip(stats, row_type_codes)]
        min_children.append(pa.array(min_vals, type=arrow_type))
        max_children.append(pa.array(max_vals, type=arrow_type))

    # 3. Build sparse union arrays
    codes_arr = pa.array(row_type_codes, type=pa.int8())
    min_union = pa.UnionArray.from_sparse(
        codes_arr,
        min_children,
        field_names=field_names,
        type_codes=type_codes,  # type: ignore[arg-type]
    )
    max_union = pa.UnionArray.from_sparse(
        codes_arr,
        max_children,
        field_names=field_names,
        type_codes=type_codes,  # type: ignore[arg-type]
    )

    # 4. Build schema and batch
    union_type = pa.sparse_union(union_fields)
    schema = pa.schema(
        [
            pa.field("column_name", pa.utf8()),
            pa.field("min", union_type),
            pa.field("max", union_type),
            pa.field("has_null", pa.bool_()),
            pa.field("has_not_null", pa.bool_()),
            pa.field("distinct_count", pa.int64()),
            pa.field("contains_unicode", pa.bool_()),
            pa.field("max_string_length", pa.uint64()),
        ],
    )

    batch = pa.record_batch(
        [
            pa.array([s.column_name for s in stats], type=pa.utf8()),
            min_union,
            max_union,
            pa.array([s.has_null for s in stats], type=pa.bool_()),
            pa.array([s.has_not_null for s in stats], type=pa.bool_()),
            pa.array([s.distinct_count for s in stats], type=pa.int64()),
            pa.array([s.contains_unicode for s in stats], type=pa.bool_()),
            pa.array([s.max_string_length for s in stats], type=pa.uint64()),
        ],
        schema=schema,
    )

    # 5. Serialize with cache TTL as IPC batch custom_metadata (not schema metadata)
    custom_metadata = None
    if cache_max_age_seconds is not None:
        custom_metadata = pa.KeyValueMetadata({b"cache_max_age_seconds": str(cache_max_age_seconds).encode()})
    return serialize_record_batch_bytes(batch, custom_metadata=custom_metadata)
1111
+
1112
+
1113
+ class CatalogInterface(ABC):
1114
+ """Provides an interface to manage catalogs, schemas, tables, and views for VGI.
1115
+
1116
+ This interface defines methods for creating, dropping, and managing catalogs,
1117
+ schemas, tables, and views. It also supports transactions and provides methods
1118
+ for discovering catalog contents.
1119
+
1120
+ Implementors of this interface should provide concrete implementations for
1121
+ all abstract methods and properties.
1122
+
1123
+ API limitations:
1124
+ - Functions are not able to be created or dropped.
1125
+ - Tags are not able to be updated on catalog objects.
1126
+ - Comments and tags are not updatable on schemas (SchemaInfo).
1127
+ - Constraints cannot be added/dropped (except NOT NULL).
1128
+
1129
+ A VGI worker will offer a single implementation of this interface to clients
1130
+ to manage their catalogs.
1131
+ """
1132
+
1133
+ @property
1134
+ def interface_feature_flags(self) -> set[str]:
1135
+ """Get the feature flags supported by this CatalogInterface.
1136
+
1137
+ Feature flags indicate optional capabilities of the implementation.
1138
+ The default implementation returns an empty set.
1139
+ """
1140
+ return set()
1141
+
1142
+ def loggable_attach_options(self, options: Mapping[str, Any]) -> Mapping[str, Any]:
1143
+ """Return a redacted view of attach/create options safe for logs and Sentry breadcrumbs.
1144
+
1145
+ Called by the worker when emitting catalog lifecycle events
1146
+ (``catalog.attach``, ``catalog.create``). Override to opt in to
1147
+ logging the option fields you know are safe — host names, regions,
1148
+ bucket names, etc. Never return credentials such as passwords,
1149
+ tokens, or connection strings containing secrets.
1150
+
1151
+ Default returns an empty mapping, so by default **nothing** from the
1152
+ ``options`` dict is logged. This fail-closed behaviour avoids
1153
+ leaking credentials when an implementer has not explicitly chosen
1154
+ which fields are safe to emit.
1155
+
1156
+ Args:
1157
+ options: The raw options dict the client passed to ATTACH /
1158
+ CREATE (the same ``dict`` handed to :meth:`catalog_attach`
1159
+ or :meth:`catalog_create`).
1160
+
1161
+ Returns:
1162
+ A mapping of safe-to-log key/value pairs. Returning an empty
1163
+ mapping (the default) suppresses the ``options`` field from
1164
+ lifecycle events entirely.
1165
+
1166
+ """
1167
+ del options
1168
+ return {}
1169
+
1170
+ @abstractmethod
1171
+ def catalogs(self) -> list[CatalogInfo]:
1172
+ """Get a list of catalog discovery records provided by the VGI worker.
1173
+
1174
+ Each record carries the catalog name and — if the worker has opinions —
1175
+ its implementation_version and data_version_spec, so clients can
1176
+ prevalidate ATTACH requests.
1177
+
1178
+ This is a discovery only method.
1179
+ """
1180
+
1181
+ def catalog_create(self, *, name: str, on_conflict: OnConflict, options: dict[str, Any]) -> None:
1182
+ """Create a new catalog with the given name.
1183
+
1184
+ If on_conflict is IGNORE and the catalog already exists, do nothing.
1185
+ If on_conflict is REPLACE and the catalog already exists, replace it.
1186
+ If on_conflict is ERROR and the catalog already exists, raise an error.
1187
+
1188
+ """
1189
+ raise NotImplementedError("Catalog create not implemented.")
1190
+
1191
+ # Drop a catalog
1192
+ def catalog_drop(self, *, name: str) -> None:
1193
+ """Drop the catalog with the given name."""
1194
+ raise NotImplementedError("Catalog drop not implemented.")
1195
+
1196
+ # Transactions are initiated and driven by DuckDB. It is rare for CatalogInterface
1197
+ # implementors to implement them, but they are supported.
1198
+ #
1199
+ # Transaction Guarantees
1200
+ # - Transactions MAY span multiple worker processes
1201
+ # - Workers MUST treat transaction_opaque_data as opaque
1202
+ # - Workers MUST ensure idempotency of commit/rollback
1203
+
1204
+ def catalog_transaction_begin(self, *, attach_opaque_data: AttachOpaqueData) -> TransactionOpaqueData | None:
1205
+ """Begin a new transaction for the given attach_opaque_data.
1206
+
1207
+ If the implementation does not support transactions, it can return None.
1208
+ """
1209
+ raise NotImplementedError("Catalog transactions not implemented.")
1210
+
1211
+ def catalog_transaction_commit(
1212
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
1213
+ ) -> None:
1214
+ """Commit the transaction for the given attachment.
1215
+
1216
+ If the transaction cannot be committed, an exception should be raised.
1217
+ """
1218
+ raise NotImplementedError("Catalog transactions not implemented.")
1219
+
1220
+ def catalog_transaction_rollback(
1221
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData
1222
+ ) -> None:
1223
+ """Rollback the transaction for the given attachment.
1224
+
1225
+ If the transaction cannot be rolled back, an exception should be raised.
1226
+ """
1227
+ raise NotImplementedError("Catalog transactions not implemented.")
1228
+
1229
+ @abstractmethod
1230
+ def catalog_attach(
1231
+ self,
1232
+ *,
1233
+ name: str,
1234
+ options: dict[str, Any],
1235
+ data_version_spec: str | None,
1236
+ implementation_version: str | None,
1237
+ ctx: "CallContext | None" = None,
1238
+ ) -> CatalogAttachResult:
1239
+ """Attach to a catalog with the given name and options.
1240
+
1241
+ ``data_version_spec`` and ``implementation_version`` carry the
1242
+ semver constraints the client requested at ATTACH time. Pass-through
1243
+ strings — subclasses interpret and validate them. ``None`` means
1244
+ the client did not constrain that dimension. Implementations that
1245
+ cannot satisfy a requested version MUST raise an exception with a
1246
+ human-readable message; the error surfaces on the client as the
1247
+ ATTACH failure.
1248
+
1249
+ ``ctx`` is injected by the RPC dispatcher when available. Over HTTP it
1250
+ enables setting a per-session routing cookie via ``ctx.set_cookie()``;
1251
+ over subprocess it may be ``None`` or have empty cookie support.
1252
+
1253
+ Returns a CatalogAttachResult containing the attach ID, other catalog
1254
+ metadata, and the resolved concrete versions chosen by the worker.
1255
+ """
1256
+
1257
+ def catalog_detach(self, *, attach_opaque_data: AttachOpaqueData) -> None:
1258
+ """Detach from the catalog with the given attach_opaque_data.
1259
+
1260
+ Any open transactions should be rolled back.
1261
+ The default implementation does nothing.
1262
+ """
1263
+ return # Default no-op
1264
+
1265
+ def catalog_version(
1266
+ self,
1267
+ *,
1268
+ attach_opaque_data: AttachOpaqueData,
1269
+ transaction_opaque_data: TransactionOpaqueData | None,
1270
+ ctx: "CallContext | None" = None,
1271
+ ) -> int:
1272
+ """Get the current catalog version for the given attach_opaque_data and transaction_opaque_data.
1273
+
1274
+ Returns an integer representing the current catalog version.
1275
+
1276
+ Changes to schemas, tables, and objects increment this version. It is used to
1277
+ expire cached catalog/schema/object information inside a VGI client or process.
1278
+
1279
+ ``ctx`` is injected by the RPC dispatcher when available. Subclasses that use
1280
+ HTTP-session cookies can consult ``ctx.cookies`` to verify routing
1281
+ stickiness.
1282
+
1283
+ The default implementation returns 0.
1284
+ """
1285
+ del ctx
1286
+ return 0
1287
+
1288
+ def schemas(
1289
+ self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
1290
+ ) -> list[SchemaInfo]:
1291
+ """Get a list of schemas for the given attach_opaque_data and transaction_opaque_data.
1292
+
1293
+ The default returns a schema called "main" with no comment or tags.
1294
+ """
1295
+ return [
1296
+ SchemaInfo(
1297
+ attach_opaque_data=attach_opaque_data,
1298
+ name="main",
1299
+ comment=None,
1300
+ tags={},
1301
+ )
1302
+ ]
1303
+
1304
+ def schema_create(
1305
+ self,
1306
+ *,
1307
+ attach_opaque_data: AttachOpaqueData,
1308
+ transaction_opaque_data: TransactionOpaqueData | None,
1309
+ name: str,
1310
+ on_conflict: OnConflict = OnConflict.ERROR,
1311
+ comment: str | None,
1312
+ tags: dict[str, str],
1313
+ ) -> None:
1314
+ """Create a new schema with the given name, comment, and tags."""
1315
+ raise NotImplementedError("Schema create not implemented.")
1316
+
1317
+ def schema_drop(
1318
+ self,
1319
+ *,
1320
+ attach_opaque_data: AttachOpaqueData,
1321
+ transaction_opaque_data: TransactionOpaqueData | None,
1322
+ name: str,
1323
+ ignore_not_found: bool,
1324
+ cascade: bool,
1325
+ ) -> None:
1326
+ """Drop the schema with the given name."""
1327
+ raise NotImplementedError("Schema drop not implemented.")
1328
+
1329
# The overloads below narrow the return element type according to the
# ``SchemaObjectType`` literal passed as ``type``.
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.TABLE],
) -> Sequence[TableInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.VIEW],
) -> Sequence[ViewInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[
        SchemaObjectType.SCALAR_FUNCTION,
        SchemaObjectType.TABLE_FUNCTION,
        SchemaObjectType.AGGREGATE_FUNCTION,
    ],
) -> Sequence[FunctionInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO],
) -> Sequence[MacroInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.INDEX],
) -> Sequence[IndexInfo]: ...

def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: SchemaObjectType,
) -> Sequence[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo]:
    """Get the contents of the schema with the given name.

    Schemas can contain tables, views, functions, macros, and indexes.

    Args:
        attach_opaque_data: The attachment identifier.
        transaction_opaque_data: The transaction identifier, if any.
        name: The name of the schema.
        type: The type of objects to return. Must be a SchemaObjectType enum:
            - SchemaObjectType.TABLE: Return only tables
            - SchemaObjectType.VIEW: Return only views
            - SchemaObjectType.SCALAR_FUNCTION: Scalar functions
            - SchemaObjectType.TABLE_FUNCTION: Table functions
            - SchemaObjectType.AGGREGATE_FUNCTION: Aggregate functions
            - SchemaObjectType.SCALAR_MACRO: Scalar macros
            - SchemaObjectType.TABLE_MACRO: Table macros
            - SchemaObjectType.INDEX: Indexes

    Returns:
        A sequence of TableInfo, ViewInfo, FunctionInfo, MacroInfo, or
        IndexInfo objects depending on the type parameter (see the
        overloads for the exact pairing).

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Schema contents not implemented.")
1414
+
1415
@abstractmethod
def schema_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
) -> SchemaInfo | None:
    """Get information about the schema with the given name.

    Declared abstract: this declaration has no body beyond its docstring,
    so implementations supply the lookup. All arguments are keyword-only.

    Returns a SchemaInfo object if the schema exists, or None if it does not.
    """
1427
+
1428
@abstractmethod
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Get information about the table with the given name in the specified schema.

    Declared abstract: this declaration has no body beyond its docstring.
    All arguments are keyword-only; ``at_unit`` and ``at_value`` default to
    ``None``.

    When ``at_unit`` / ``at_value`` are provided the implementation should
    return the table schema for the requested point in time (time travel).

    Returns a TableInfo object if the table exists, or None if it does not.
    """
1446
+
1447
def table_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # The table's columns, given as a serialized PyArrow schema
    # (``schema.serialize().to_pybytes()``). The nullability of each
    # field is ignored.
    columns: SerializedSchema,
    on_conflict: OnConflict,
    # Constraints below are expressed as field indexes.
    not_null_constraints: list[int],  # [] = no NOT NULL constraints
    unique_constraints: list[list[int]],  # [] = no unique constraints
    # General check constraints, specified as SQL expressions.
    check_constraints: list[str],  # [] = no check constraints
    # Primary key constraints as column index groups.
    primary_key_constraints: list[list[int]] | None = None,
    # Foreign key constraints as IPC-serialized bytes (same format as TableInfo).
    foreign_key_constraints: list[bytes] | None = None,
) -> None:
    """Create a new table with the given name and schema.

    Comments and tags are not supported on table creation.

    All arguments are keyword-only. This base implementation ignores them
    and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table create not implemented.")
1474
+
1475
def table_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the table with the given name.

    All arguments are keyword-only; ``cascade`` defaults to ``False``.
    This base implementation ignores them and only reports the operation
    as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table drop not implemented.")
1487
+
1488
def table_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment for the table with the given name.

    All arguments are keyword-only; ``comment`` may be ``None``. This base
    implementation ignores them and only reports the operation as
    unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table comment set not implemented.")
1500
+
1501
def table_column_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment for a column in the table.

    All arguments are keyword-only; ``comment`` may be ``None``. This base
    implementation ignores them and only reports the operation as
    unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column comment set not implemented.")
1514
+
1515
def table_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename the table with the given name to the new name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table rename not implemented.")
1527
+
1528
def table_column_add(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Arrow schema holding a single field: the column to add.
    # Serialized via ``schema.serialize().to_pybytes()``.
    column_definition: SerializedSchema,
    ignore_not_found: bool,
    if_column_not_exists: bool,
) -> None:
    """Add a column to the table with the given name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column add not implemented.")
1543
+
1544
def table_column_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
    if_column_exists: bool,
    cascade: bool,
) -> None:
    """Drop the column from the table with the given name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column drop not implemented.")
1558
+
1559
def table_column_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    new_column_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename the column in the table with the given name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column rename not implemented.")
1572
+
1573
def table_column_default_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    expression: SqlExpression,
    ignore_not_found: bool,
) -> None:
    """Set the default expression for the column.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column default set not implemented.")
1586
+
1587
def table_column_default_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Drop the default expression for the column.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column default drop not implemented.")
1599
+
1600
def table_column_type_change(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Arrow schema holding a single field that carries the new column type.
    # Serialized via ``schema.serialize().to_pybytes()``.
    column_definition: SerializedSchema,
    expression: SqlExpression | None,
    ignore_not_found: bool,
) -> None:
    """Change the type of the column in the table with the given name.

    The name of the column to change is taken from the field in the provided schema.

    All arguments are keyword-only; ``expression`` may be ``None``. This
    base implementation ignores them and only reports the operation as
    unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column type change not implemented.")
1618
+
1619
def table_not_null_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Drop the NOT NULL constraint from the column.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table NOT NULL drop not implemented.")
1631
+
1632
def table_not_null_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Set the NOT NULL constraint on the column.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table NOT NULL set not implemented.")
1644
+
1645
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Time travel fields (iceberg style)
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Get the ScanFunctionResult for scanning the table.

    Returns information about the VGI table function to call when scanning
    this table. The at_unit and at_value support time travel queries.

    All arguments are keyword-only and have no defaults, so callers pass
    ``at_unit=None`` / ``at_value=None`` explicitly when not time travelling.
    This base implementation ignores them and only reports the operation
    as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table scan function get not implemented.")
1662
+
1663
def table_scan_branches_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanBranchesResult:
    """Return the scan branches that make up a (possibly multi-source) table.

    A multi-branch table is one logical scan assembled from N physical
    sources (canonical case: Kafka hot tier + Iceberg cold tier). The VGI
    DuckDB extension's optimizer-extension rewrites the placeholder
    ``LogicalGet`` into ``LogicalSetOperation(UNION_ALL, ...)`` with one
    arm per branch.

    This default forwards every argument to :meth:`table_scan_function_get`
    and repackages the single ``ScanFunctionResult`` it gets back as a
    one-element branch list with no branch filter. Existing single-source
    workers therefore work with the branches-aware C++ side unchanged,
    and only workers that really are multi-source need to override.

    Workers that override this method should NOT make
    :meth:`table_scan_function_get` raise: older C++ extensions that do not
    probe for the branches RPC still call the legacy method. The usual
    arrangement is to implement both, with
    :meth:`table_scan_function_get` returning ``branches[0]`` (the primary
    branch) and this method returning the full list.

    Args:
        attach_opaque_data: Per-attach session token.
        transaction_opaque_data: Optional transaction token.
        schema_name: Schema containing the table.
        name: Table name.
        at_unit: Optional time-travel unit (e.g., ``"VERSION"`` /
            ``"TIMESTAMP"``). The VGI C++ side rejects ``AT(...)`` on
            tables with more than one branch at bind time, so a worker
            returning several branches should expect ``at_unit`` /
            ``at_value`` to be ``None``; single-branch returns still
            honour them.
        at_value: Optional time-travel value matching ``at_unit``.

    Returns:
        A :class:`ScanBranchesResult` carrying one or more
        :class:`ScanBranch` entries plus the union of required
        extensions across all branches.

    """
    single = self.table_scan_function_get(
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        schema_name=schema_name,
        name=name,
        at_unit=at_unit,
        at_value=at_value,
    )
    # Copy the argument containers so the branch never aliases the legacy result.
    only_branch = ScanBranch(
        function_name=single.function_name,
        positional_arguments=list(single.positional_arguments),
        named_arguments=dict(single.named_arguments),
        branch_filter=None,
    )
    return ScanBranchesResult(
        branches=[only_branch],
        required_extensions=list(single.required_extensions),
    )
1732
+
1733
def table_column_statistics_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> TableColumnStatisticsResult | None:
    """Get column statistics for all columns in a table.

    Returns a :class:`TableColumnStatisticsResult` containing per-column
    statistics and an optional cache TTL, or ``None`` if statistics are not
    available for this table.

    The default implementation ignores its (keyword-only) arguments and
    returns ``None`` (no statistics). Workers that provide statistics
    should override this method.
    """
    return None
1751
+
1752
def table_insert_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    writable_branch_function_name: str | None = None,
) -> ScanFunctionResult:
    """Get the write function for INSERT operations on the table.

    Returns a ScanFunctionResult identifying the TableInOutGenerator function
    to call for inserting rows into this table.

    ``writable_branch_function_name`` is set by the C++ extension when the
    table is multi-branch and a branch declared ``writable=True``: the value
    is the writable arm's ``ScanBranch.function_name``. Workers serving
    multi-branch tables can use this to dispatch the INSERT to the correct
    underlying storage without re-resolving the writable arm internally.
    For single-branch tables this is ``None`` (or unset for legacy
    overrides).

    Raises:
        NotImplementedError: Always, in this base implementation, which
            ignores its arguments.

    """
    raise NotImplementedError("Table insert not supported.")
1775
+
1776
def table_update_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Get the write function for UPDATE operations on the table.

    Returns a ScanFunctionResult identifying the TableInOutGenerator function
    to call for updating rows in this table. Input batches will include a
    rowid column plus the columns being updated.

    Raises:
        NotImplementedError: Always, in this base implementation, which
            ignores its (keyword-only) arguments.

    """
    raise NotImplementedError("Table update not supported.")
1791
+
1792
def table_delete_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Get the write function for DELETE operations on the table.

    Returns a ScanFunctionResult identifying the TableInOutGenerator function
    to call for deleting rows from this table. Input batches will contain
    a rowid column identifying the rows to delete.

    Raises:
        NotImplementedError: Always, in this base implementation, which
            ignores its (keyword-only) arguments.

    """
    raise NotImplementedError("Table delete not supported.")
1807
+
1808
def view_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    definition: str,
    on_conflict: OnConflict,
) -> None:
    """Create a new view with the given definition.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("View create not implemented.")
1820
+
1821
def view_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the view with the given name.

    All arguments are keyword-only; ``cascade`` defaults to ``False``.
    This base implementation ignores them and only reports the operation
    as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("View drop not implemented.")
1833
+
1834
def view_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename the view to the new name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("View rename not implemented.")
1846
+
1847
@abstractmethod
def view_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ViewInfo | None:
    """Get information about the view with the given name.

    Declared abstract: this declaration has no body beyond its docstring,
    so implementations supply the lookup. All arguments are keyword-only.

    Returns a ViewInfo object if the view exists, or None if it does not.
    """
1860
+
1861
def view_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment for the view with the given name.

    All arguments are keyword-only; ``comment`` may be ``None``. This base
    implementation ignores them and only reports the operation as
    unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("View comment set not implemented.")
1873
+
1874
+ # ---- Macros ----
1875
+
1876
@abstractmethod
def macro_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> MacroInfo | None:
    """Get information about the macro with the given name.

    Declared abstract: this declaration has no body beyond its docstring,
    so implementations supply the lookup. All arguments are keyword-only.

    Returns a MacroInfo object if the macro exists, or None if it does not.
    """
1889
+
1890
def macro_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    macro_type: "MacroType",
    parameters: list[str],
    definition: str,
    on_conflict: OnConflict,
    parameter_default_values: pa.RecordBatch | None = None,
) -> None:
    """Create a new macro with the given definition.

    All arguments are keyword-only; ``parameter_default_values`` defaults
    to ``None``. This base implementation ignores them and only reports
    the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Macro create not implemented.")
1905
+
1906
def macro_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
) -> None:
    """Drop the macro with the given name.

    All arguments are keyword-only and required. This base implementation
    ignores them and only reports the operation as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Macro drop not implemented.")
1917
+
1918
+ # ---- Indexes ----
1919
+
1920
def index_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> IndexInfo | None:
    """Get information about the index with the given name.

    Returns an IndexInfo object if the index exists, or None if it does not.
    The default implementation ignores its (keyword-only) arguments and
    returns None (no indexes). Unlike the table, view and macro getters,
    this one is not declared abstract.
    """
    return None
1934
+
1935
def index_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    table_name: str,
    index_type: str,
    constraint_type: IndexConstraintType,
    expressions: list[str],
    on_conflict: OnConflict,
    options: dict[str, str] | None = None,
) -> None:
    """Create a new index on the specified table.

    All arguments are keyword-only; ``options`` defaults to ``None``. This
    base implementation ignores them and only reports the operation as
    unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Index create not implemented.")
1951
+
1952
def index_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the index with the given name.

    All arguments are keyword-only; ``cascade`` defaults to ``False``.
    This base implementation ignores them and only reports the operation
    as unsupported.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Index drop not implemented.")
1964
+
1965
+
1966
def _read_only(operation: str) -> Any:
    """Build a stand-in method that refuses ``operation`` on a read-only catalog.

    Args:
        operation: Human-readable description of the refused operation; it
            is interpolated into the error message.

    Returns:
        A function taking ``self`` plus arbitrary keyword arguments that
        raises ``CatalogReadOnlyError`` whenever it is called.

    """

    def method(self: Any, **kwargs: Any) -> Any:
        """Not supported — raises CatalogReadOnlyError."""
        message = f"Cannot {operation}: catalog is read-only"
        raise CatalogReadOnlyError(message)

    return method
1974
+
1975
+
1976
+ def _inline_bind_result_for(func_cls: type) -> bytes | None:
1977
+ """Pre-built ``bind_result`` bytes for a ``@bind_fixed_schema`` function.
1978
+
1979
+ Returns the IPC-serialized ``BindResponse(output_schema=cls.FIXED_SCHEMA)``
1980
+ that the worker would have produced from a regular bind RPC. Cached on a
1981
+ private class attribute so subsequent ``schema_contents`` calls (per
1982
+ attach, per cache invalidation) reuse the bytes instead of re-serializing.
1983
+
1984
+ Returns ``None`` if the class isn't safely pre-bind-able — either it
1985
+ isn't ``@bind_fixed_schema``-decorated (no ``_inline_bind_safe`` marker),
1986
+ or a subclass has overridden ``on_bind`` (escaping the decorator's
1987
+ contract — see the eligibility comment on ``bind_fixed_schema``).
1988
+ """
1989
+ if not getattr(func_cls, "_inline_bind_safe", False):
1990
+ return None
1991
+ # If the class has its own on_bind in __dict__, it's either the decorator's
1992
+ # injection (marked) or a subclass override (unmarked). Reject overrides.
1993
+ on_bind_attr = func_cls.__dict__.get("on_bind")
1994
+ if on_bind_attr is not None:
1995
+ underlying = getattr(on_bind_attr, "__func__", on_bind_attr)
1996
+ if not getattr(underlying, "_is_bind_fixed_schema", False):
1997
+ return None
1998
+ cached = func_cls.__dict__.get("_cached_inline_bind_result")
1999
+ if cached is not None:
2000
+ return cached # type: ignore[no-any-return]
2001
+ from vgi.invocation import BindResponse
2002
+
2003
+ response = BindResponse(output_schema=func_cls.FIXED_SCHEMA, opaque_data=None) # type: ignore[attr-defined]
2004
+ blob = response.serialize_to_bytes()
2005
+ # Set on the class itself so subclasses don't pollute their parents'
2006
+ # cache with each other's serialized blobs (FIXED_SCHEMA may differ).
2007
+ func_cls._cached_inline_bind_result = blob # type: ignore[attr-defined]
2008
+ return blob
2009
+
2010
+
2011
class ReadOnlyCatalogInterface(CatalogInterface):
    """A read-only catalog interface that does not support DDL operations.

    This is a convenience base class for catalogs that only support reading
    metadata and data, not creating or modifying objects.

    There are two ways to use this class:

    1. Subclass and implement abstract methods:
       - catalogs() - List available catalogs
       - catalog_attach() - Attach to a catalog
       - schema_get() - Get schema information
       - table_get() - Get table information (return None for function-only catalogs)
       - view_get() - Get view information (return None for function-only catalogs)

    2. Use with functions list (simpler for function-only catalogs):
       Set the `functions` class attribute to expose VGI functions:
       - catalog_name - Name of the catalog (default: "functions")
       - functions - List of function classes to expose in the "main" schema

       This provides automatic implementations of catalogs(), catalog_attach(),
       schema_get(), table_get(), view_get(), and schema_contents().

    Optional methods that can be overridden:
    - catalog_detach() - Custom detach logic
    - schemas() - Custom schema listing (default returns 'main')
    - schema_contents() - List schema contents
    - table_scan_function_get() - Get scan function for tables

    All DDL operations (create, drop, rename, modify) will raise
    CatalogReadOnlyError.

    """

    supports_transactions = False
    catalog_version_frozen = True

    # Class attributes for function-based catalogs.
    # NOTE: the list defaults below are single objects shared by every
    # subclass that does not override them. Subclasses should assign their
    # own list rather than mutate these in place.
    catalog_name: str = "functions"
    functions: list[type] = []
    settings: list["SettingSpec"] = []
    secret_types: list["SecretTypeSpec"] = []
    attach_option_specs: list["AttachOptionSpec"] = []

    # Optional Catalog object for declarative definition. When set, it takes
    # precedence over ``catalog_name`` / ``functions`` for the catalog name,
    # default schema and registry contents.
    catalog: "Catalog | None" = None

    # Fixed attach_opaque_data for read-only catalogs (no need for unique IDs)
    _FIXED_ATTACH_ID: AttachOpaqueData = AttachOpaqueData(b"readonly-catalog-")

    # Registry caches, built lazily. These class-level ``None`` values are
    # only defaults: ``_build_registries_locked`` assigns the real dicts on
    # the instance.
    # Keys are LOWERCASE for case-insensitive lookup
    _schema_registry: "dict[str, Schema] | None" = None
    _table_registry: "dict[tuple[str, str], Table] | None" = None
    _view_registry: "dict[tuple[str, str], View] | None" = None
    _function_registry: "dict[tuple[str, str], list[type]] | None" = None
    _macro_registry: "dict[tuple[str, str], Macro] | None" = None
    _index_registry: "dict[tuple[str, str], Index] | None" = None
    # Lazy registry build is one-time but the fixture HTTP server is
    # multi-threaded and shares one catalog instance, so concurrent
    # first-requests can race the build. Serialize it under a lock and flip
    # ``_registries_built`` only AFTER population so readers never observe a
    # half-built (mutating) registry. (The lock is created once here, at
    # class-definition time, and is therefore shared across instances. That
    # is fine: the build is one-time and infrequent.)
    _build_lock = threading.Lock()
    _registries_built: bool = False
2077
+
2078
def _build_registries(self) -> None:
    """Ensure the lookup registries exist, building them at most once.

    Uses double-checked locking: ``_registries_built`` is first read
    without the lock, and only when it is still false is ``_build_lock``
    taken and the flag re-checked before delegating to
    ``_build_registries_locked``. A caller that loses the race blocks on
    the lock until the winner has finished, then sees the flag set and
    skips the build, so nobody iterates a registry that is still being
    filled in.
    """
    if not self._registries_built:
        with self._build_lock:
            # Re-check: another thread may have completed the build while
            # this one was waiting for the lock.
            if not self._registries_built:
                self._build_registries_locked()
2093
+
2094
def _build_registries_locked(self) -> None:
    """Populate the registries. Caller must hold ``_build_lock``.

    All registry keys are lowercase for case-insensitive lookups: schemas
    are keyed by name, every other object by ``(schema_name, object_name)``.
    Functions may share a name (overloads), so the function registry maps
    each key to a list of classes; tables, views, macros and indexes must
    be unique within their schema.

    The registries are assembled in local dicts and assigned to the
    instance only once everything has been registered, with
    ``_registries_built`` flipped last. If a duplicate is detected, nothing
    is published and the instance is left exactly as it was before the
    call.

    Raises:
        ValueError: If two tables, views, macros or indexes in the same
            schema have names that are equal ignoring case.

    """
    # Import here to avoid circular imports
    from vgi.catalog.descriptors import Schema

    schema_map: "dict[str, Schema]" = {}
    tables: "dict[tuple[str, str], Table]" = {}
    views: "dict[tuple[str, str], View]" = {}
    functions: "dict[tuple[str, str], list[type]]" = {}
    macros: "dict[tuple[str, str], Macro]" = {}
    indexes: "dict[tuple[str, str], Index]" = {}

    def _register_unique(registry: "dict[tuple[str, str], Any]", kind: str, schema_key: str, obj: Any) -> None:
        """Add ``obj`` to ``registry``, rejecting a second object with the same key."""
        key = (schema_key, obj.name.lower())
        if key in registry:
            raise ValueError(f"Duplicate {kind} '{obj.name}' in schema '{schema_key}'")
        registry[key] = obj

    def _register_function(schema_key: str, func_cls: type) -> None:
        """Append ``func_cls`` to the overload list for its metadata name."""
        meta = func_cls.get_metadata()  # type: ignore[attr-defined]
        functions.setdefault((schema_key, meta.name.lower()), []).append(func_cls)

    if self.catalog is not None:
        # Build from Catalog object
        # NOTE(review): schemas whose names differ only by case overwrite
        # each other here instead of raising like duplicate tables do.
        # Confirm whether that is intended.
        for schema in self.catalog.schemas:
            schema_key = schema.name.lower()
            schema_map[schema_key] = schema

            for table in schema.tables:
                _register_unique(tables, "table", schema_key, table)
            for view in schema.views:
                _register_unique(views, "view", schema_key, view)
            for func_cls in schema.functions:
                _register_function(schema_key, func_cls)
            for macro in schema.macros:
                _register_unique(macros, "macro", schema_key, macro)
            for index in schema.indexes:
                _register_unique(indexes, "index", schema_key, index)
    else:
        # Backward compat: create "main" schema from legacy `functions` list
        schema_map["main"] = Schema(name="main", tables=(), views=(), functions=())

        for func_cls in self.functions:
            _register_function("main", func_cls)

    # Publish the finished registries, then the flag. Only after the flag
    # is set may a concurrent reader skip the build and iterate these
    # dicts, which are fully populated and no longer mutated.
    self._schema_registry = schema_map
    self._table_registry = tables
    self._view_registry = views
    self._function_registry = functions
    self._macro_registry = macros
    self._index_registry = indexes
    self._registries_built = True
2169
+
2170
@property
def _effective_catalog_name(self) -> str:
    """Catalog name: the ``Catalog`` object's name if one is set, else ``catalog_name``."""
    return self.catalog_name if self.catalog is None else self.catalog.name
2176
+
2177
@property
def _default_schema_name(self) -> str:
    """Default schema: the ``Catalog`` object's ``default_schema`` if set, else ``"main"``."""
    return "main" if self.catalog is None else self.catalog.default_schema
2183
+
2184
def catalogs(self) -> list[CatalogInfo]:
    """List the catalogs this interface exposes (always exactly one).

    The discovery record carries the effective catalog name and the
    serialized attach option specs; both version fields are ``None``.
    Subclasses that want to advertise version metadata should override.
    """
    catalog_name = self._effective_catalog_name
    serialized_specs = [spec.serialize() for spec in self.attach_option_specs]
    record = CatalogInfo(
        name=catalog_name,
        implementation_version=None,
        data_version_spec=None,
        attach_option_specs=serialized_specs,
    )
    return [record]
2198
+
2199
def catalog_attach(
    self,
    *,
    name: str,
    options: dict[str, Any],
    data_version_spec: str | None,
    implementation_version: str | None,
    ctx: "CallContext | None" = None,
) -> CatalogAttachResult:
    """Attach to the catalog. Version constraints are ignored by default.

    ``name`` must equal the effective catalog name exactly.
    ``data_version_spec``, ``implementation_version`` and ``ctx`` are
    discarded, and ``options`` is not read. The time-travel and
    column-statistics capability flags are derived from the registered
    tables, so the registries are built here if they have not been yet.

    Raises:
        ValueError: If ``name`` is not this catalog's name.

    """
    del data_version_spec, implementation_version, ctx

    expected = self._effective_catalog_name
    if name != expected:
        raise ValueError(f"Unknown catalog: {name!r}. Available: {expected}")

    # Serialize settings and secret types for the attach result
    settings_payload = [setting.serialize() for setting in self.settings]
    secret_types_payload = [secret_type.serialize() for secret_type in self.secret_types]

    # Auto-derive supports_time_travel and supports_column_statistics from tables
    self._build_registries()
    assert self._table_registry is not None
    time_travel = any(table.supports_time_travel for table in self._table_registry.values())
    column_statistics = any(bool(table.statistics) for table in self._table_registry.values())

    transactions = getattr(self, "supports_transactions", False)
    default_schema = self._default_schema_name
    comment = self.catalog.comment if self.catalog is not None else None
    tags = dict(self.catalog.tags) if self.catalog is not None else {}

    return CatalogAttachResult(
        attach_opaque_data=self._FIXED_ATTACH_ID,
        supports_transactions=transactions,
        supports_time_travel=time_travel,
        # NOTE(review): hard-coded rather than read from the class attribute
        # ``catalog_version_frozen``. Confirm this is intentional.
        catalog_version_frozen=True,
        catalog_version=1,
        attach_opaque_data_required=False,
        default_schema=default_schema,
        settings=settings_payload,
        secret_types=secret_types_payload,
        comment=comment,
        tags=tags,
        supports_column_statistics=column_statistics,
        resolved_data_version=None,
        resolved_implementation_version=None,
    )
2240
+
2241
def schemas(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
) -> list[SchemaInfo]:
    """List every registered schema as a ``SchemaInfo``.

    ``attach_opaque_data`` is forwarded to each schema's
    ``to_schema_info``; ``transaction_opaque_data`` is not used.
    """
    self._build_registries()
    assert self._schema_registry is not None
    infos: list[SchemaInfo] = []
    for registered in self._schema_registry.values():
        infos.append(registered.to_schema_info(attach_opaque_data))
    return infos
2248
+
2249
def schema_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
) -> SchemaInfo | None:
    """Get information about a schema (case-insensitive lookup).

    Args:
        attach_opaque_data: Forwarded to the schema's ``to_schema_info``.
        transaction_opaque_data: Not used by this implementation.
        name: Schema name; lower-cased before the registry lookup.

    Returns:
        The schema's ``SchemaInfo``, or ``None`` when no schema is
        registered under that name.

    """
    self._build_registries()
    assert self._schema_registry is not None
    schema = self._schema_registry.get(name.lower())
    # Compare against None explicitly (as table_get/index_get do) so that a
    # registered schema object that happens to evaluate falsy is still found.
    if schema is None:
        return None
    return schema.to_schema_info(attach_opaque_data)
2261
+
2262
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Get information about a table (case-insensitive lookup).

    ``at_unit`` / ``at_value`` are validated first. When they are given,
    this default implementation still returns the same table info (no
    schema evolution); override to return version-specific schemas for
    time-travel queries.

    Returns:
        The table's ``TableInfo``, or ``None`` if the table is not registered.

    Raises:
        ValueError: If ``at_unit`` is given for a table whose
            ``supports_time_travel`` is false.

    """
    _validate_at_params(at_unit, at_value)

    self._build_registries()
    assert self._table_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    table = self._table_registry.get((schema_key, name.lower()))
    if table is None:
        return None

    # An AT clause on a table without time-travel support is an error.
    if at_unit and not table.supports_time_travel:
        raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")

    # Report the schema under its registered spelling when it is known.
    owner = self._schema_registry.get(schema_key)
    reported_schema = owner.name if owner else schema_name
    return table.to_table_info(reported_schema)
2293
+
2294
def view_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ViewInfo | None:
    """Get information about a view (case-insensitive lookup).

    Args:
        attach_opaque_data: Not used by this implementation.
        transaction_opaque_data: Not used by this implementation.
        schema_name: Schema name; lower-cased for the lookup.
        name: View name; lower-cased for the lookup.

    Returns:
        The view's ``ViewInfo`` (reported under the schema's registered
        name when the schema is known, else under ``schema_name`` as
        passed), or ``None`` when the view is not registered.

    """
    self._build_registries()
    assert self._view_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    view = self._view_registry.get((schema_key, name.lower()))
    # Explicit None checks (matching index_get) so a registered descriptor
    # that evaluates falsy is not mistaken for a missing one.
    if view is None:
        return None

    schema = self._schema_registry.get(schema_key)
    return view.to_view_info(schema.name if schema is not None else schema_name)
2311
+
2312
def macro_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> MacroInfo | None:
    """Get information about a macro (case-insensitive lookup).

    Args:
        attach_opaque_data: Not used by this implementation.
        transaction_opaque_data: Not used by this implementation.
        schema_name: Schema name; lower-cased for the lookup.
        name: Macro name; lower-cased for the lookup.

    Returns:
        The macro's ``MacroInfo`` (reported under the schema's registered
        name when the schema is known, else under ``schema_name`` as
        passed), or ``None`` when the macro is not registered.

    """
    self._build_registries()
    assert self._macro_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    macro = self._macro_registry.get((schema_key, name.lower()))
    # Explicit None checks (matching index_get) so a registered descriptor
    # that evaluates falsy is not mistaken for a missing one.
    if macro is None:
        return None

    schema = self._schema_registry.get(schema_key)
    return macro.to_macro_info(schema.name if schema is not None else schema_name)
2329
+
2330
def index_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> IndexInfo | None:
    """Get information about an index (case-insensitive lookup).

    Returns the index's ``IndexInfo``, reported under the schema's
    registered name when the schema is known, or ``None`` when no index
    is registered under ``(schema_name, name)``.
    """
    self._build_registries()
    assert self._index_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    found = self._index_registry.get((schema_key, name.lower()))
    if found is None:
        return None

    owner = self._schema_registry.get(schema_key)
    reported_schema = owner.name if owner else schema_name
    return found.to_index_info(reported_schema)
2347
+
2348
def table_column_statistics_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> TableColumnStatisticsResult | None:
    """Get column statistics for a table (case-insensitive lookup).

    Delegates to the registered table descriptor's
    ``resolve_column_statistics()``. Override this method for dynamic or
    computed statistics.

    Returns:
        The descriptor's resolved statistics, or ``None`` when the table
        is not registered.

    """
    self._build_registries()
    assert self._table_registry is not None
    lookup_key = (schema_name.lower(), name.lower())
    descriptor = self._table_registry.get(lookup_key)
    return None if descriptor is None else descriptor.resolve_column_statistics()
2368
+
2369
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Get the scan function for a table.

    A function-backed table (``Table.function`` set) is answered
    automatically with a ``ScanFunctionResult`` naming the linked function
    and carrying no arguments. For any other table, override this method
    in your Worker.

    Raises:
        ValueError: If ``at_unit`` is given for a registered table whose
            ``supports_time_travel`` is false.
        NotImplementedError: If the table is unknown or is not
            function-backed; the message lists the tables declared on the
            registered schemas.

    """
    _validate_at_params(at_unit, at_value)

    self._build_registries()
    assert self._table_registry is not None
    assert self._schema_registry is not None

    table = self._table_registry.get((schema_name.lower(), name.lower()))
    if table is not None:
        # Reject an AT clause on a table without time-travel support.
        if at_unit and not table.supports_time_travel:
            raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")

        # Function-backed tables are served by their linked function.
        if table.function is not None:
            linked_meta = table.function.get_metadata()
            return ScanFunctionResult(
                function_name=linked_meta.name,
                positional_arguments=[],
                named_arguments={},
                required_extensions=[],
            )

    # Nothing automatic applies: build an error that lists what is available.
    qualified_names = []
    for registered_schema in self._schema_registry.values():
        for declared_table in registered_schema.tables:
            qualified_names.append(
                f"{self._effective_catalog_name}.{registered_schema.name}.{declared_table.name}"
            )
    available_str = ", ".join(sorted(qualified_names)) if qualified_names else "(none)"

    raise NotImplementedError(
        f"table_scan_function_get not implemented for table "
        f"'{self._effective_catalog_name}.{schema_name}.{name}'. "
        f"Available tables: {available_str}. "
        f"Either use Table(function=...) for automatic scanning, "
        f"or override table_scan_function_get in your Worker."
    )
2424
+
2425
def _write_function_get(
    self,
    *,
    schema_name: str,
    name: str,
    operation: str,
    attr_name: str,
) -> ScanFunctionResult:
    """Resolve the write function of a table for one DML operation.

    Shared implementation behind table_{insert,update,delete}_function_get.

    Args:
        schema_name: Schema name; lower-cased for the lookup.
        name: Table name; lower-cased for the lookup.
        operation: Operation label used in the read-only error message.
        attr_name: Attribute on the table descriptor holding the function.

    Raises:
        NotImplementedError: If the table is not registered.
        CatalogReadOnlyError: If the table has no function under
            ``attr_name`` (attribute missing or ``None``).

    """
    self._build_registries()
    assert self._table_registry is not None

    descriptor = self._table_registry.get((schema_name.lower(), name.lower()))
    if descriptor is None:
        raise NotImplementedError(f"Table '{schema_name}.{name}' not found in catalog.")

    handler = getattr(descriptor, attr_name, None)
    if handler is None:
        raise CatalogReadOnlyError(f"Table '{schema_name}.{name}' does not support {operation}.")

    handler_name = handler.get_metadata().name
    return ScanFunctionResult(
        function_name=handler_name,
        positional_arguments=[],
        named_arguments={},
        required_extensions=[],
    )
2452
+
2453
def table_insert_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    writable_branch_function_name: str | None = None,
) -> ScanFunctionResult:
    """Get the insert function for a table.

    Delegates to ``_write_function_get`` with the ``insert_function``
    attribute. ``writable_branch_function_name`` is discarded.
    """
    # ReadOnlyCatalogInterface tables are single-branch, so the writable-arm
    # hint has nothing to disambiguate here.
    del writable_branch_function_name
    resolve = self._write_function_get
    return resolve(
        schema_name=schema_name,
        name=name,
        operation="INSERT",
        attr_name="insert_function",
    )
2472
+
2473
def table_update_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Get the update function for a table.

    Delegates to ``_write_function_get`` with the ``update_function``
    attribute; the opaque-data arguments are not used.
    """
    resolve = self._write_function_get
    return resolve(
        schema_name=schema_name,
        name=name,
        operation="UPDATE",
        attr_name="update_function",
    )
2488
+
2489
def table_delete_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Get the delete function for a table.

    Delegates to ``_write_function_get`` with the ``delete_function``
    attribute; the opaque-data arguments are not used.
    """
    resolve = self._write_function_get
    return resolve(
        schema_name=schema_name,
        name=name,
        operation="DELETE",
        attr_name="delete_function",
    )
2504
+
2505
# Typed overloads: the ``type`` literal determines the element type of the
# returned sequence.
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.TABLE],
) -> Sequence[TableInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.VIEW],
) -> Sequence[ViewInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[
        SchemaObjectType.SCALAR_FUNCTION,
        SchemaObjectType.TABLE_FUNCTION,
        SchemaObjectType.AGGREGATE_FUNCTION,
    ],
) -> Sequence[FunctionInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO],
) -> Sequence[MacroInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.INDEX],
) -> Sequence[IndexInfo]: ...

def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: SchemaObjectType,
) -> Sequence[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo]:
    """List the objects of one kind that live in a schema.

    The schema is looked up case-insensitively; an unknown schema yields
    an empty list.

    Args:
        attach_opaque_data: The attachment identifier (not used here).
        transaction_opaque_data: The transaction identifier, if any (not used here).
        name: The name of the schema.
        type: The kind of objects to return. A value that is not already a
            ``SchemaObjectType`` is converted with ``SchemaObjectType(type)``.

    Returns:
        A list of TableInfo, ViewInfo, FunctionInfo, MacroInfo, or IndexInfo objects.

    """
    self._build_registries()
    assert self._schema_registry is not None
    assert self._table_registry is not None
    assert self._view_registry is not None
    assert self._function_registry is not None
    assert self._macro_registry is not None
    assert self._index_registry is not None

    # Registries are keyed by lower-cased names.
    schema_key = name.lower()
    schema = self._schema_registry.get(schema_key)
    if schema is None:
        return []

    # Objects are reported under the schema's registered spelling.
    display_name = schema.name

    # The wire protocol may hand over the raw value instead of the enum.
    kind = type if isinstance(type, SchemaObjectType) else SchemaObjectType(type)

    if kind == SchemaObjectType.TABLE:
        table_infos: list[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo] = []
        for (owner, _), table in self._table_registry.items():
            if owner != schema_key:
                continue
            info = table.to_table_info(display_name)
            # Inline-bind post-pass: descriptors with inline_bind=True backed
            # by @bind_fixed_schema-decorated functions get a pre-built
            # BindResponse inlined onto TableInfo.bind_result. The C++
            # extension uses these bytes verbatim and skips the per-scan
            # bind RPC.
            if table.inline_bind and table.function is not None:
                bind_bytes = _inline_bind_result_for(table.function)
                if bind_bytes is not None:
                    info = dataclasses.replace(info, bind_result=bind_bytes)
            table_infos.append(info)
        return table_infos

    if kind == SchemaObjectType.VIEW:
        return [
            view.to_view_info(display_name)
            for (owner, _), view in self._view_registry.items()
            if owner == schema_key
        ]

    if kind == SchemaObjectType.INDEX:
        return [
            index.to_index_info(display_name)
            for (owner, _), index in self._index_registry.items()
            if owner == schema_key
        ]

    if kind in (SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO):
        wanted_macro_type = MacroType.SCALAR if kind == SchemaObjectType.SCALAR_MACRO else MacroType.TABLE
        return [
            macro.to_macro_info(display_name)
            for (owner, _), macro in self._macro_registry.items()
            if owner == schema_key and macro.macro_type == wanted_macro_type
        ]

    # Every remaining kind is handled as a function listing:
    # SCALAR_FUNCTION, TABLE_FUNCTION or AGGREGATE_FUNCTION. Each of those
    # keeps only the matching function types; any other kind applies no
    # filter at all.
    function_infos: list[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo] = []
    for (owner, _), func_classes in self._function_registry.items():
        if owner != schema_key:
            continue
        for func_cls in func_classes:
            func_info = self._function_to_info(func_cls, display_name)
            found_type = func_info.function_type
            if kind == SchemaObjectType.SCALAR_FUNCTION:
                keep = not (found_type != FunctionType.SCALAR)
            elif kind == SchemaObjectType.TABLE_FUNCTION:
                keep = found_type in (FunctionType.TABLE, FunctionType.TABLE_BUFFERING)
            elif kind == SchemaObjectType.AGGREGATE_FUNCTION:
                keep = not (found_type != FunctionType.AGGREGATE)
            else:
                keep = True
            if keep:
                function_infos.append(func_info)
    return function_infos
2653
+
2654
def _function_to_info(self, func_cls: type, schema_name: str) -> FunctionInfo:
    """Build the catalog ``FunctionInfo`` record for a function class.

    Args:
        func_cls: The function class to describe.
        schema_name: Schema name to report the function under.

    Returns:
        A ``FunctionInfo`` populated from the class's resolved metadata,
        its argument specs, and (for scalar/aggregate functions that
        provide ``catalog_output_schema``) its output schema.

    """
    # Imported lazily to avoid circular imports.
    from vgi.argument_spec import (
        argument_specs_to_schema,
        extract_argument_specs,
    )
    from vgi.metadata import CatalogFunctionType as MetadataFunctionType
    from vgi.metadata import resolve_metadata

    meta = resolve_metadata(func_cls)

    # Translate the metadata function type into the catalog's enum; any
    # type without an entry is reported as a plain table function.
    catalog_type_for = {
        MetadataFunctionType.SCALAR: FunctionType.SCALAR,
        MetadataFunctionType.TABLE: FunctionType.TABLE,
        MetadataFunctionType.TABLE_BUFFERING: FunctionType.TABLE_BUFFERING,
        MetadataFunctionType.AGGREGATE: FunctionType.AGGREGATE,
    }
    func_type = catalog_type_for.get(meta.function_type, FunctionType.TABLE)
    is_scalar = func_type == FunctionType.SCALAR
    is_aggregate = func_type == FunctionType.AGGREGATE

    # Arguments: extracted specs -> Arrow schema -> serialized bytes.
    args_schema = argument_specs_to_schema(extract_argument_specs(func_cls))
    args_bytes = SerializedSchema(args_schema.serialize().to_pybytes())

    # Output schema: only scalar/aggregate classes exposing the
    # catalog_output_schema() classmethod contribute one; everything else
    # reports an empty schema.
    if (is_scalar or is_aggregate) and hasattr(func_cls, "catalog_output_schema"):
        output_schema: pa.Schema = func_cls.catalog_output_schema()  # type: ignore[attr-defined]
    else:
        output_schema = pa.schema([])
    output_bytes = SerializedSchema(output_schema.serialize().to_pybytes())

    catalog_examples = [
        CatalogExample(
            sql=example.sql,
            description=example.description,
            expected_output=example.expected_output,
        )
        for example in meta.examples
    ]

    # Capability fields: scalar functions get fixed placeholder values,
    # every other function type reports what its metadata declares.
    if is_scalar:
        capabilities = {
            "projection_pushdown": None,
            "filter_pushdown": None,
            "sampling_pushdown": None,
            "late_materialization": None,
            "supported_expression_filters": [],
            "order_preservation": None,
            "max_workers": None,
            "supports_batch_index": False,
            "partition_kind": PartitionKind.NOT_PARTITIONED,
        }
    else:
        capabilities = {
            "projection_pushdown": meta.projection_pushdown,
            "filter_pushdown": meta.filter_pushdown,
            "sampling_pushdown": meta.sampling_pushdown,
            "late_materialization": meta.late_materialization,
            "supported_expression_filters": meta.supported_expression_filters,
            "order_preservation": meta.preserves_order,
            "max_workers": meta.max_workers,
            "supports_batch_index": meta.supports_batch_index,
            "partition_kind": meta.partition_kind,
        }

    return FunctionInfo(
        name=meta.name,
        schema_name=schema_name,
        function_type=func_type,
        arguments=args_bytes,
        output_schema=output_bytes,
        comment=None,  # Functions don't use comment; use description instead
        tags=meta.tags,
        # Behavior fields: stability is scalar-only, null handling applies
        # to scalar and aggregate functions.
        stability=meta.stability if is_scalar else None,
        null_handling=meta.null_handling if (is_scalar or is_aggregate) else None,
        # Documentation fields
        description=meta.description or "",
        examples=catalog_examples,
        categories=meta.categories,
        **capabilities,
        # Aggregate function fields (passed through for every function type)
        order_dependent=meta.order_dependent,
        distinct_dependent=meta.distinct_dependent,
        supports_window=meta.supports_window,
        streaming_partitioned=meta.streaming_partitioned,
        has_finalize=meta.has_finalize,
        source_order_dependent=meta.source_order_dependent,
        sink_order_dependent=meta.sink_order_dependent,
        requires_input_batch_index=meta.requires_input_batch_index,
        # Settings
        required_settings=meta.required_settings,
        # Secrets (copied into a fresh list)
        required_secrets=list(meta.required_secrets),
    )
2737
+
2738
# ========== DDL operations (not supported — read-only catalog) ==========
#
# Each catalog-mutating entry point below is bound to the object returned by
# ``_read_only(<operation label>)``. ``_read_only`` is defined elsewhere in
# this file; presumably it builds a stub that rejects the call and mentions
# the label in its error — TODO confirm against its definition.

# Catalog lifecycle and transactions
catalog_create = _read_only("create catalog")
catalog_drop = _read_only("drop catalog")
catalog_transaction_begin = _read_only("begin transaction")
catalog_transaction_commit = _read_only("commit transaction")
catalog_transaction_rollback = _read_only("rollback transaction")
# Schemas
schema_create = _read_only("create schema")
schema_drop = _read_only("drop schema")
# Tables and their columns/constraints
table_create = _read_only("create table")
table_drop = _read_only("drop table")
table_comment_set = _read_only("set table comment")
table_column_comment_set = _read_only("set column comment")
table_rename = _read_only("rename table")
table_column_add = _read_only("add column")
table_column_drop = _read_only("drop column")
table_column_rename = _read_only("rename column")
table_column_default_set = _read_only("set column default")
table_column_default_drop = _read_only("drop column default")
table_column_type_change = _read_only("change column type")
table_not_null_drop = _read_only("drop NOT NULL constraint")
table_not_null_set = _read_only("set NOT NULL constraint")
# Views
view_create = _read_only("create view")
view_drop = _read_only("drop view")
view_rename = _read_only("rename view")
view_comment_set = _read_only("set view comment")
# Macros
macro_create = _read_only("create macro")
macro_drop = _read_only("drop macro")
# Indexes
index_create = _read_only("create index")
index_drop = _read_only("drop index")