vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,2767 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""VGI Catalog Interface for exposing catalogs, schemas, tables, and views.
|
|
4
|
+
|
|
5
|
+
This module provides the abstract base class and data types for implementing
|
|
6
|
+
catalog interfaces in VGI workers, enabling DuckDB ATTACH support.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import dataclasses
|
|
10
|
+
import threading
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from collections.abc import Mapping, Sequence
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from typing import (
|
|
17
|
+
TYPE_CHECKING,
|
|
18
|
+
Annotated,
|
|
19
|
+
Any,
|
|
20
|
+
ClassVar,
|
|
21
|
+
Literal,
|
|
22
|
+
NewType,
|
|
23
|
+
Self,
|
|
24
|
+
cast,
|
|
25
|
+
overload,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from vgi_rpc.rpc import CallContext
|
|
30
|
+
|
|
31
|
+
from vgi.catalog.attach_option import AttachOptionSpec
|
|
32
|
+
from vgi.catalog.descriptors import Catalog, Index, Macro, Schema, Table, View
|
|
33
|
+
from vgi.catalog.secret_type import SecretTypeSpec
|
|
34
|
+
from vgi.catalog.setting import SettingSpec
|
|
35
|
+
|
|
36
|
+
import pyarrow as pa
|
|
37
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
38
|
+
from vgi_rpc.utils import deserialize_record_batch, serialize_record_batch_bytes
|
|
39
|
+
|
|
40
|
+
from vgi.arguments import SecretLookupEntry
|
|
41
|
+
from vgi.exceptions import CatalogReadOnlyError
|
|
42
|
+
from vgi.metadata import (
|
|
43
|
+
DistinctDependence,
|
|
44
|
+
FunctionStability,
|
|
45
|
+
NullHandling,
|
|
46
|
+
OrderDependence,
|
|
47
|
+
OrderPreservation,
|
|
48
|
+
PartitionKind,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Names exported by ``from vgi.catalog.catalog_interface import *``.
__all__ = [
    # --- re-exports: enums that live in vgi.metadata ---
    "DistinctDependence",
    "FunctionStability",
    "NullHandling",
    "OrderDependence",
    "OrderPreservation",
    "PartitionKind",
    # --- catalog-specific names ---
    "CatalogDataVersionRelease",
    "CatalogExample",
    "CatalogInfo",
    "ColumnStatistics",
    "IndexConstraintType",
    "IndexInfo",
    "SecretLookupEntry",  # imported into this module from vgi.arguments
    "MacroType",
    "SchemaObjectType",
    "TableColumnStatisticsResult",
    "WriteFunctionResult",
]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _validate_at_params(at_unit: str | None, at_value: str | None) -> None:
|
|
75
|
+
"""Validate that at_unit and at_value are both provided or both absent."""
|
|
76
|
+
if bool(at_unit) != bool(at_value):
|
|
77
|
+
raise ValueError("at_unit and at_value must both be provided or both be None")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class CatalogExample(ArrowSerializableDataclass):
    """A sample invocation of a function, in the form used for catalog serialization."""

    # SQL query that demonstrates the function.
    sql: str
    # What the example is meant to show.
    description: str = ""
    # Optional description of the result the query is expected to produce.
    expected_output: str | None = None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Distinct names for otherwise-plain ``bytes`` / ``str`` values, so signatures
# read clearly and a type checker can tell them apart. At runtime each one is
# equivalent to its underlying type.
AttachOpaqueData = NewType("AttachOpaqueData", bytes)
TransactionOpaqueData = NewType("TransactionOpaqueData", bytes)
SerializedSchema = NewType("SerializedSchema", bytes)
SqlExpression = NewType("SqlExpression", str)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True)
class CatalogDataVersionRelease(ArrowSerializableDataclass):
    """A single data version that a catalog has actually published.

    Where ``data_version_spec`` describes a *compatibility range*, a record
    of this type describes *a concrete release*. The two together allow a
    client (the describe page, Cupola, programmatic consumers) to show a
    discoverable release timeline without scraping the worker's repo.

    Rules for the ``CatalogInfo.releases`` list that holds these records:

    * **Ordering** — the newest entry MUST come first. Leaving the order
      open would make consumers sort on the ``version`` string, and the
      protocol defines no comparator for it (semver, calver, date stamps
      and RC tags are all valid).
    * **Uniqueness** — a given ``version`` MUST appear at most once, the
      same invariant that applies to ``name`` in ``attach_option_specs``.
      Arrow cannot enforce key uniqueness at the wire level, so consumers
      defend against duplicates by logging and skipping the later entries.

    Long-form release notes are out of scope for this record; use
    ``notes_url`` to link to a CHANGELOG anchor, GitHub release page, PR,
    or migration guide.
    """

    # A concrete version such as "1.0.0" or "2.4.1" — never a spec. Semver
    # carries the breaking-change signal by itself: a major bump is breaking,
    # minor and patch bumps are not.
    version: str

    # UTC release date; ``None`` if the worker does not track dates.
    released_at: Annotated[datetime | None, ArrowType(pa.timestamp("us", tz="UTC"))] = None

    # Single-line human-readable summary; empty string when unknown.
    summary: str = ""

    # Optional link to the detailed notes for *this* release. Not the same as
    # ``CatalogInfo.source_url``, which points at the repo as a whole.
    notes_url: str | None = None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@dataclass(frozen=True)
class CatalogInfo(ArrowSerializableDataclass):
    """Discovery record describing one catalog that a worker exposes.

    catalog_catalogs() returns these so that a client can look at each
    catalog's version metadata before it attaches.
    """

    # Name of the catalog; hand it to catalog_attach() to open the catalog.
    name: str

    # Version of the worker software (one per worker). ``None`` means the
    # worker declares no implementation version.
    implementation_version: str | None

    # Semver range served by the catalog, e.g. ">=1.0.0,<2.0.0". ``None``
    # means the worker declares no data-version opinion.
    data_version_spec: str | None

    # Options accepted at attach time (not the same thing as session
    # settings). Every AttachOptionSpec is stored as serialized bytes for
    # Arrow compatibility. Makes pre-attach discovery possible through the
    # catalogs() RPC.
    attach_option_specs: list[bytes] = field(default_factory=list)

    # Concrete data versions that have been published, newest first. Left
    # empty when the worker keeps no release history. The per-entry ordering
    # and uniqueness contracts are described on ``CatalogDataVersionRelease``.
    releases: list[CatalogDataVersionRelease] = field(default_factory=list)

    # Location of the worker's code — repo, build, docs. ``None`` when the
    # worker advertises no source location.
    source_url: str | None = None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@dataclass(frozen=True)
class CatalogAttachResult(ArrowSerializableDataclass):
    """What a worker returns when a client attaches to one of its catalogs."""

    # Unique id of the attached catalog.
    attach_opaque_data: AttachOpaqueData

    # Whether the worker supports transactions. When false, none of the
    # transaction-related methods are called and every
    # transaction_opaque_data parameter is None.
    supports_transactions: bool

    # Whether tables support time travel.
    supports_time_travel: bool

    # True when the catalog version id is frozen, i.e. the schema and object
    # information will not change.
    catalog_version_frozen: bool

    # Initial catalog version; it increments whenever schemas, tables or
    # other objects change.
    catalog_version: int

    # Whether attach_opaque_data has to be persisted across commands.
    #   True:  the catalog is stateful; attach_opaque_data represents a session.
    #   False: the catalog is stateless; the CLI can auto-attach on each command.
    attach_opaque_data_required: bool = True

    # Name of this catalog's default schema.
    default_schema: str = "main"

    # Extension options (settings) exposed by this catalog/worker. Every
    # ExtensionOption is stored as serialized bytes for Arrow compatibility.
    settings: list[bytes] = field(default_factory=list)

    # Secret types registered with DuckDB's SecretManager. Every
    # SecretTypeSpec is stored as serialized bytes for Arrow compatibility.
    secret_types: list[bytes] = field(default_factory=list)

    # Optional comment describing this catalog/database.
    comment: str | None = None

    # Optional key-value tags attached to this catalog/database.
    tags: dict[str, str] = field(default_factory=dict)

    # Whether any table in this catalog can provide column statistics. This
    # is a global gate: when False, GetStatistics() returns nullptr for
    # every table.
    supports_column_statistics: bool = False

    # Concrete data version the worker resolved for this attach. ``None``
    # means the worker has no opinion or the request left out
    # data_version_spec. Keyword-only, with no default.
    resolved_data_version: str | None = field(kw_only=True)

    # Concrete implementation version the worker resolved for this attach.
    # ``None`` means the worker has no opinion or the request left out
    # implementation_version. Keyword-only, with no default.
    resolved_implementation_version: str | None = field(kw_only=True)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass(frozen=True)
|
|
221
|
+
class CatalogObject:
|
|
222
|
+
"""All objects have the following common properties."""
|
|
223
|
+
|
|
224
|
+
# This is a generic comment about the object
|
|
225
|
+
comment: str | None
|
|
226
|
+
# These are key-value tags associated with the object
|
|
227
|
+
tags: dict[str, str]
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
@dataclass(frozen=True)
class CatalogSchemaObject(CatalogObject):
    """Properties shared by every object that lives inside a schema."""

    # Name of the object itself.
    name: str

    # Name of the schema that contains the object.
    schema_name: str
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@dataclass(frozen=True)
class SchemaInfo(CatalogObject, ArrowSerializableDataclass):
    """Describes one schema inside a catalog."""

    attach_opaque_data: AttachOpaqueData
    name: str

    # Approximate number of objects of each kind, keyed with the names the C++
    # extension uses for its set-cache instrumentation: ``"table"``,
    # ``"view"``, ``"scalar_function"``, ``"aggregate_function"``,
    # ``"table_function"``, ``"macro"``, ``"index"``. The client consults it
    # when choosing between bulk ``LoadEntries`` and per-name single-entry
    # RPCs. A worker may leave out the whole field or any individual key; the
    # client treats a missing count as 1, so unspecified populations bias
    # toward eager bulk-load.
    #
    # **A count of 0 is a hard guarantee, not an estimate.** For a kind whose
    # count is exactly 0 the client skips the matching
    # ``catalog_schema_contents_*`` bulk RPC entirely and short-circuits
    # per-name lookups (``catalog_table_get`` / ``catalog_view_get`` /
    # ``catalog_index_get``). If a worker reports 0 for a kind that does have
    # entries, ``SELECT … FROM s.x`` silently returns "not found" — so declare
    # 0 only for kinds the worker knows are empty in its current view of the
    # schema. Cross-session DDL on the same catalog (another connection
    # creating a view in a schema this connection has cached as zero-views)
    # is dealt with like any other stale catalog cache: ``vgi_clear_cache()``
    # or re-attach. Time-travel AT-clause queries do not honor the bypass;
    # they always issue the per-name RPC, because a historical version may
    # have had entries the current view does not.
    estimated_object_count: dict[str, int] | None = None
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
@dataclass(frozen=True)
class TableInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Describes one table inside a schema."""

    # Table columns: a PyArrow schema, serialized to bytes.
    columns: SerializedSchema

    # ArrowType pins these to int32 rather than the default int64.
    not_null_constraints: Annotated[list[int], ArrowType(pa.list_(pa.int32()))]
    unique_constraints: Annotated[list[list[int]], ArrowType(pa.list_(pa.list_(pa.int32())))]
    check_constraints: list[str]
    primary_key_constraints: Annotated[
        list[list[int]],
        ArrowType(pa.list_(pa.list_(pa.int32()))),
    ] = field(default_factory=list)
    foreign_key_constraints: Annotated[
        list[bytes],
        ArrowType(pa.list_(pa.binary())),
    ] = field(default_factory=list)

    # Write-support flags: which DML operations the table supports.
    supports_insert: bool = False
    supports_update: bool = False
    supports_delete: bool = False

    # While this is False (the default) the C++ extension rejects
    # INSERT/UPDATE/DELETE ... RETURNING at plan time with a BinderException.
    # A worker whose write functions can emit the affected rows must opt in
    # by setting it to True.
    supports_returning: bool = False

    # Statistics capability flag: this table can provide column statistics.
    supports_column_statistics: bool = False

    # Optional inlined function-discovery results. When one is populated the
    # C++ extension uses the cached value and skips the corresponding
    # ``catalog_table_{scan,insert,update,delete}_function_get`` RPC. The
    # bytes are the IPC payload from ``ScanFunctionResult.serialize()``.
    #
    # Populating these fields freezes the function args for the lifetime of
    # the catalog cache (until ``catalog_version`` bumps). Workers whose
    # function args change more often than ``catalog_version`` (rotating
    # credentials, presigned URLs, per-transaction snapshots) MUST leave
    # them null so the per-bind RPC keeps firing.
    scan_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    insert_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    update_function: Annotated[bytes | None, ArrowType(pa.binary())] = None
    delete_function: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Optional inlined cardinality. When populated the C++ extension uses
    # these values directly and skips the ``table_function_cardinality`` RPC,
    # saving one round-trip per bind. Intended for read-only or slow-changing
    # tables whose cardinality is statically known.
    #
    # Populating these fields freezes the cardinality for the lifetime of the
    # catalog cache (until ``catalog_version`` bumps). Workers whose
    # cardinality changes faster (e.g. live counters) MUST leave them null so
    # the per-bind RPC keeps firing.
    cardinality_estimate: Annotated[int | None, ArrowType(pa.int64())] = None
    cardinality_max: Annotated[int | None, ArrowType(pa.int64())] = None

    # Optional inlined column statistics. When populated the C++ extension
    # uses the cached value and skips both the per-bind / per-table
    # ``catalog_table_column_statistics_get`` RPC and the per-scan
    # ``table_function_statistics`` RPC. The bytes are the IPC payload from
    # ``serialize_column_statistics(stats, cache_max_age_seconds)``.
    #
    # Populating this field freezes the resolved stats for the lifetime of
    # the catalog cache (until ``catalog_version`` bumps). Workers whose
    # statistics change faster than ``catalog_version`` (e.g. live counters,
    # rapidly-mutating dimensions) MUST leave it null so the on-demand RPC
    # keeps firing.
    column_statistics: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Optional inlined bind result; the bytes are the IPC payload of
    # ``BindResponse.serialize_to_bytes()``. When populated the C++ extension
    # uses the bytes verbatim and skips the per-scan ``bind`` RPC, threading
    # the deserialized BindResult straight into bind_data.
    #
    # The catalog framework fills this in only for tables marked
    # ``Table(inline_bind=True)`` whose function class is decorated with
    # ``@bind_fixed_schema``. That decorator's contract (output is exactly
    # ``cls.FIXED_SCHEMA``, no per-call inputs, no opaque_data) matches what
    # is safe to freeze for the catalog cache lifetime. Functions with a
    # custom ``on_bind`` are not eligible through the framework path; a
    # worker can still inline manually in its own ``schema_contents``
    # override when the bind output is independently known to be stable.
    bind_result: Annotated[bytes | None, ArrowType(pa.binary())] = None

    # Dotted-path column references that the VGI extension's optimizer pass
    # must verify appear in any scan's WHERE expression: top-level column
    # names such as ``"country"`` or struct subfields such as
    # ``"bbox.xmin"`` and ``"nested.outer.inner"``. An empty list (the
    # default) means no enforcement — the zero-cost fast path for every
    # existing table.
    #
    # Satisfaction is prefix-based: a filter present on a shorter dotted path
    # satisfies any required path it is a prefix of, so a whole-struct filter
    # on ``bbox`` satisfies every required ``"bbox.*"`` path. The C++
    # extension throws ``BinderException`` listing any unsatisfied paths.
    required_field_filter_paths: list[str] = field(default_factory=list)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
@dataclass(frozen=True)
class ViewInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Describes one view inside a schema."""

    # The view's definition: a SQL query string.
    definition: str

    # Comments for individual columns, keyed by the view's output column
    # name. A table carries its column comments as Arrow field metadata on
    # the serialized ``columns`` schema, but a view ships only its SQL
    # ``definition`` (DuckDB binds that query to derive the columns), so view
    # column comments need a channel of their own. The C++ extension aligns
    # the entries by name against the bound output columns and feeds them
    # into ``CreateViewInfo.column_comments_map``; names that match no bound
    # column are ignored.
    column_comments: dict[str, str] = field(default_factory=dict)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
@dataclass(frozen=True)
class MacroInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Describes one macro inside a schema."""

    # Scalar macro or table macro. The annotation is a string because
    # ``MacroType`` is declared further down in this module.
    macro_type: "MacroType"

    # Parameter names, in order.
    parameters: list[str]

    # Typed defaults as a one-row RecordBatch whose column names are the
    # parameter names; ``None`` when there are no defaults. Serialized as IPC
    # bytes over the wire.
    parameter_default_values: Annotated[pa.RecordBatch | None, ArrowType(pa.binary())] = None

    # The SQL expression (scalar macro) or query (table macro).
    definition: str = ""
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
class FunctionType(Enum):
    """Kinds of function that a schema can contain."""

    SCALAR = "scalar"
    TABLE = "table"
    TABLE_BUFFERING = "table_buffering"
    AGGREGATE = "aggregate"
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class MacroType(Enum):
    """Kinds of macro that a schema can contain."""

    SCALAR = "scalar"
    TABLE = "table"
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
class IndexConstraintType(Enum):
    """Which constraint, if any, an index enforces."""

    # Regular index; no constraint enforcement.
    NONE = "none"
    # The index enforces a UNIQUE constraint.
    UNIQUE = "unique"
    # The index enforces a PRIMARY KEY constraint.
    PRIMARY = "primary"
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
@dataclass(frozen=True)
class IndexInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Describes one index inside a schema."""

    # Name of the table the index is on.
    table_name: str

    # Index type string (e.g., "ART"); empty means the default.
    index_type: str = ""

    # Constraint enforcement type (NONE, UNIQUE, PRIMARY).
    constraint_type: IndexConstraintType = IndexConstraintType.NONE

    # SQL expression strings defining the indexed expressions. For a
    # column-based index these are column references (e.g., "col_a"); for an
    # expression index they are arbitrary SQL (e.g., "lower(col_a)").
    expressions: list[str] = field(default_factory=list)

    # Key-value index options (the WITH clause).
    options: dict[str, str] = field(default_factory=dict)
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
class SchemaObjectType(Enum):
    """Kinds of object that can live inside a schema.

    Serves as the filter for the results of schema_contents().
    """

    TABLE = "table"
    VIEW = "view"
    SCALAR_FUNCTION = "scalar_function"
    TABLE_FUNCTION = "table_function"
    AGGREGATE_FUNCTION = "aggregate_function"
    SCALAR_MACRO = "scalar_macro"
    TABLE_MACRO = "table_macro"
    INDEX = "index"
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
class OnConflict(Enum):
    """What to do when the object being created already exists.

    ERROR: Raise an error if the object already exists.
    IGNORE: Do nothing if the object already exists.
    REPLACE: Replace the existing object if it already exists.
    """

    ERROR = "error"
    IGNORE = "ignore"
    REPLACE = "replace"
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
@dataclass(frozen=True)
class FunctionInfo(CatalogSchemaObject, ArrowSerializableDataclass):
    """Catalog record describing a single function that lives in a schema."""

    # ------------------------------------------------------------------
    # Identity and signature
    # ------------------------------------------------------------------

    # Which kind of VGI function this record describes.
    function_type: FunctionType

    # Argument list, carried as a serialized Apache Arrow schema
    # (``schema.serialize().to_pybytes()``).
    arguments: SerializedSchema

    # Output schema, carried as a serialized Apache Arrow schema
    # (``schema.serialize().to_pybytes()``).
    output_schema: SerializedSchema

    # ------------------------------------------------------------------
    # Scalar function behavior (None for non-scalar functions)
    # ------------------------------------------------------------------
    stability: FunctionStability | None = None
    null_handling: NullHandling | None = None

    # ------------------------------------------------------------------
    # Documentation
    #   description: intrinsic documentation taken from the function
    #                metadata (Meta.description).
    #   comment:     user-settable comment (COMMENT ON FUNCTION); that field
    #                is inherited from the base class, not declared here.
    # ------------------------------------------------------------------
    description: str = ""
    examples: list[CatalogExample] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)

    # ------------------------------------------------------------------
    # Table function capabilities (None for scalar functions)
    # ------------------------------------------------------------------
    projection_pushdown: bool | None = None
    filter_pushdown: bool | None = None
    sampling_pushdown: bool | None = None
    # Set when the table takes part in DuckDB's late-materialization
    # optimizer (Meta.late_materialization). The DuckDB extension honours the
    # flag only if the table also exposes a rowid virtual column together
    # with filter/projection pushdown — see GetScanFunctionImpl in the C++
    # vgi_table_entry.cpp.
    late_materialization: bool | None = None
    supported_expression_filters: list[str] = field(default_factory=list)
    order_preservation: OrderPreservation | None = None
    # ArrowType pins the wire type to int32 rather than the default int64.
    max_workers: Annotated[int | None, ArrowType(pa.int32())] = None
    # Opt-in to per-batch ``vgi_batch_index`` tagging. The worker writes an
    # integer partition id into each Arrow batch's KeyValueMetadata and the
    # DuckDB extension forwards it via ``TableFunction::get_partition_data``,
    # letting ordered sinks (BatchCollector, BatchInsert, BatchCopyToFile,
    # Limit) put parallel output back into partition-id order. Opting in also
    # bypasses the FIXED_ORDER MaxThreads=1 clamp: the source keeps running
    # in parallel and ordering becomes the sink's job.
    supports_batch_index: bool = False
    # Partition shape the function declares over its
    # ``vgi.partition_column``-annotated bind-schema fields. For any value
    # other than ``NOT_PARTITIONED`` the DuckDB extension installs
    # ``TableFunction::get_partition_info`` returning the matching
    # ``TablePartitionInfo``, which allows the planner to choose
    # ``PhysicalPartitionedAggregate`` for ``GROUP BY`` queries (today only
    # ``SINGLE_VALUE_PARTITIONS`` materially changes planner behavior). The
    # per-column annotation is stored in the bind schema's field-level
    # metadata — see ``vgi.schema_utils.partition_field``.
    partition_kind: PartitionKind = PartitionKind.NOT_PARTITIONED

    # ------------------------------------------------------------------
    # Aggregate function fields (future)
    # ------------------------------------------------------------------
    order_dependent: OrderDependence = OrderDependence.NOT_ORDER_DEPENDENT
    distinct_dependent: DistinctDependence = DistinctDependence.NOT_DISTINCT_DEPENDENT
    # Set when the aggregate implements the window() callback.
    supports_window: bool = False
    # Set when the aggregate opts into the streaming-partitioned protocol
    # (``aggregate_streaming_open`` / ``_chunk`` / ``_close``). The DuckDB
    # extension's optimizer rule may rewrite eligible LogicalWindow nodes to
    # take this path.
    streaming_partitioned: bool = False

    # Set when a table-in-out function declares a finalize/finish stage. The
    # C++ extension registers ``in_out_function_final`` only in that case,
    # because DuckDB rejects LATERAL with correlated input on functions that
    # register a finalize callback.
    has_finalize: bool = False

    # ------------------------------------------------------------------
    # Flags that only matter when
    # ``function_type == FunctionType.TABLE_BUFFERING`` (the function is
    # registered through the Sink+Source path).
    # ------------------------------------------------------------------

    # When true, the source phase is single-threaded and
    # ``finalize_state_id``s drain in combine-returned order. The default
    # (false) enables parallel finalize.
    source_order_dependent: bool = False

    # When true, the SINK phase runs single-threaded: every process() call
    # arrives in source order on one worker. Mutually exclusive with
    # requires_input_batch_index.
    sink_order_dependent: bool = False

    # When true, the C++ Sink operator declares
    # RequiredPartitionInfo()=BatchIndex() and each process() RPC carries a
    # globally-unique monotonic batch_index from DuckDB's source. Workers can
    # sort by it in combine() to rebuild source order under parallel ingest.
    # Mutually exclusive with sink_order_dependent.
    requires_input_batch_index: bool = False

    # ------------------------------------------------------------------
    # Runtime requirements
    # ------------------------------------------------------------------

    # Settings the function requires.
    required_settings: list[str] = field(default_factory=list)

    # Secrets the function requires (each entry has secret_type, optional
    # secret_name, optional scope).
    required_secrets: list[SecretLookupEntry] = field(default_factory=list)
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
@dataclass(frozen=True)
class ScanFunctionResult:
    """Result from getting a table scan function.

    This result tells the VGI DuckDB extension which DuckDB function to call
    to obtain the data for a table. This enables catalogs to delegate scanning
    to any DuckDB function (e.g., read_parquet, iceberg_scan, or a custom VGI
    table function) with appropriate arguments.

    On the wire, positional and named arguments share a single nested Arrow
    IPC batch: positional argument ``i`` is stored in a column named
    ``arg_<i>`` and every named argument in a column carrying its own name.

    Attributes:
        function_name: The DuckDB function to call (e.g., "read_parquet").
        positional_arguments: Positional arguments as PyArrow scalars.
        named_arguments: Named arguments as PyArrow scalars.
        required_extensions: DuckDB extensions to load before calling.

    """

    # The name of the DuckDB function to call to obtain the data
    # in the table.
    function_name: str

    # The positional arguments to include in the function call.
    positional_arguments: list[pa.Scalar]  # type: ignore[type-arg]

    # The named arguments to include in the function call.
    named_arguments: dict[str, pa.Scalar]  # type: ignore[type-arg]

    # A list of extensions to require to be loaded.
    required_extensions: list[str] = field(default_factory=list)

    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("function_name", pa.string(), nullable=False),
            pa.field("arguments", pa.binary(), nullable=False),
            pa.field("required_extensions", pa.list_(pa.string()), nullable=False),
        ]  # type: ignore[arg-type]
    )

    def to_row_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for batch construction.

        The arguments field is serialized as nested Arrow IPC bytes.

        Returns:
            A dict keyed by the ``ARROW_SCHEMA`` field names.

        Raises:
            ValueError: If a named argument reuses the column name generated
                for one of the positional arguments (``arg_<index>``).

        """
        # Build arguments as nested batch: one column per argument, one row.
        argument_values: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        argument_schema = []
        for index, arg in enumerate(self.positional_arguments):
            argument_schema.append(pa.field(f"arg_{index}", arg.type))
            argument_values[f"arg_{index}"] = arg
        for name, value in self.named_arguments.items():
            # Without this check the named value would silently overwrite the
            # positional one and the nested schema would carry a duplicate field.
            if name in argument_values:
                raise ValueError(
                    f"{type(self).__name__}: named argument {name!r} collides with a positional argument column"
                )
            argument_schema.append(pa.field(name, value.type))
            argument_values[name] = value

        argument_batch = pa.RecordBatch.from_pylist(
            [argument_values],
            schema=pa.schema(argument_schema),
        )

        return {
            "function_name": self.function_name,
            "arguments": serialize_record_batch_bytes(argument_batch),
            # The column is declared non-nullable, so a missing list is sent as
            # an empty list rather than as null.
            "required_extensions": list(self.required_extensions or []),
        }

    def serialize(self) -> bytes:
        """Serialize to Arrow IPC bytes (1-row batch using ARROW_SCHEMA)."""
        batch = pa.RecordBatch.from_pylist(
            [self.to_row_dict()],
            schema=self.ARROW_SCHEMA,
        )
        return serialize_record_batch_bytes(batch)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Deserialize from Arrow RecordBatch.

        Columns of the nested arguments batch named ``arg_<digits>`` become
        positional arguments, in schema order; every other column becomes a
        named argument keyed by its column name.

        Args:
            batch: Batch laid out according to ``ARROW_SCHEMA``; it is checked
                by ``_validate_single_row_batch`` before use.

        Returns:
            The reconstructed instance.

        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["function_name", "arguments"],
        )

        # Deserialize the nested arguments batch.
        # row["arguments"] is already bytes (_validate_single_row_batch returns
        # Python values, not PyArrow scalars).
        arguments_bytes = cast(bytes, row["arguments"])
        arguments_batch, _ = deserialize_record_batch(arguments_bytes)

        # Extract positional and named arguments from the batch
        positional_arguments: list[pa.Scalar] = []  # type: ignore[type-arg]
        named_arguments: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]

        for position, arg_field in enumerate(arguments_batch.schema):
            # Look the column up by position so repeated names cannot confuse it.
            value = arguments_batch.column(position)[0]
            # Only the exact generated form ``arg_<digits>`` is positional; a
            # named argument such as ``arg_types`` must stay a named argument.
            suffix = arg_field.name[len("arg_") :]
            if arg_field.name.startswith("arg_") and suffix.isascii() and suffix.isdigit():
                positional_arguments.append(value)
            else:
                named_arguments[arg_field.name] = value

        return cls(
            function_name=cast(str, row["function_name"]),
            positional_arguments=positional_arguments,
            named_arguments=named_arguments,
            required_extensions=list(cast("list[str]", row.get("required_extensions") or [])),
        )
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
# Discovery of a table's write function shares the scan-function wire format,
# so the result type is simply an alias of ScanFunctionResult.
WriteFunctionResult = ScanFunctionResult
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
# ============================================================================
|
|
702
|
+
# Multi-branch scan (catalog_table_scan_branches_get)
|
|
703
|
+
# ============================================================================
|
|
704
|
+
#
|
|
705
|
+
# A table whose data spans multiple physical sources (canonical example:
|
|
706
|
+
# hot rows in Kafka + historical rows in Iceberg/Delta/parquet) declares
|
|
707
|
+
# one ``ScanBranch`` per source. The VGI DuckDB extension's optimizer-
|
|
708
|
+
# extension rewrites a placeholder ``LogicalGet`` into a
|
|
709
|
+
# ``LogicalSetOperation(UNION_ALL, ...)`` with one arm per branch, each
|
|
710
|
+
# binding its own ``TableFunction`` (a VGI function, or a native reader
|
|
711
|
+
# like ``iceberg_scan`` / ``read_parquet``).
|
|
712
|
+
#
|
|
713
|
+
# This is **wire-compatible with single-branch workers**: the new RPC
|
|
714
|
+
# ``catalog_table_scan_branches_get`` is additive; old workers that don't
|
|
715
|
+
# implement it cause the C++ side to fall back to
|
|
716
|
+
# ``catalog_table_scan_function_get`` and synthesise a one-branch result.
|
|
717
|
+
#
|
|
718
|
+
# The rewriter semantics, ``branch_filter`` model, and current scope
|
|
719
|
+
# decisions (INSERT-only on writable arm, UPDATE/DELETE/MERGE refused,
|
|
720
|
+
# AT-clause refused, fail-fast error semantics) are documented with the
|
|
721
|
+
# relevant methods below.
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
@dataclass(frozen=True)
class ScanBranch:
    """One physical source backing a multi-branch scan.

    On the wire, positional and named arguments share a single nested Arrow
    IPC batch: positional argument ``i`` is stored in a column named
    ``arg_<i>`` and every named argument in a column carrying its own name.

    Attributes:
        function_name: The DuckDB function to call for this branch
            (e.g., ``"read_parquet"``, ``"iceberg_scan"``, or a VGI
            table function). The C++ rewriter resolves this name against
            DuckDB's function catalog and binds it at optimize time.
        positional_arguments: Positional arguments as PyArrow scalars,
            passed through to the function's ``bind``.
        named_arguments: Named arguments as PyArrow scalars.
        branch_filter: Optional SQL expression text (parsed by DuckDB's
            parser, bound against the branch's bound column list). The
            rewriter ANDs this into every scan of this branch BEFORE
            filter pushdown, so the branch only ever sees rows in its
            declared scope. Used to make overlapping physical sources
            (Kafka 7d retention + Iceberg nightly batches with ~24h
            overlap) non-overlapping at scan time, without changing the
            worker code. ``None`` means unconstrained.
        writable: Declares this branch as the INSERT target for the
            multi-branch table. At most one branch per table may set
            this true (enforced at catalog-load by the C++ extension —
            multiple writable arms would violate DuckDB's single-
            writable-catalog-per-transaction rule). When no branch is
            writable, the table is read-only. UPDATE/DELETE/MERGE
            remain refused on multi-branch tables regardless of this
            flag; the contract is INSERT-only until cross-arm
            semantics have customer-driven evidence.

    """

    function_name: str
    positional_arguments: list[pa.Scalar]  # type: ignore[type-arg]
    named_arguments: dict[str, pa.Scalar]  # type: ignore[type-arg]
    branch_filter: str | None = None
    writable: bool = False

    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("function_name", pa.string(), nullable=False),
            pa.field("arguments", pa.binary(), nullable=False),
            pa.field("branch_filter", pa.string(), nullable=True),
            pa.field("writable", pa.bool_(), nullable=False),
        ]  # type: ignore[arg-type]
    )

    def to_row_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for batch construction.

        Arguments are serialized as nested Arrow IPC bytes (same trick as
        :class:`ScanFunctionResult`).

        Returns:
            A dict keyed by the ``ARROW_SCHEMA`` field names.

        Raises:
            ValueError: If a named argument reuses the column name generated
                for one of the positional arguments (``arg_<index>``).

        """
        argument_values: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        argument_schema: list[pa.Field] = []  # type: ignore[type-arg]
        for index, arg in enumerate(self.positional_arguments):
            argument_schema.append(pa.field(f"arg_{index}", arg.type))
            argument_values[f"arg_{index}"] = arg
        for name, value in self.named_arguments.items():
            # Without this check the named value would silently overwrite the
            # positional one and the nested schema would carry a duplicate field.
            if name in argument_values:
                raise ValueError(
                    f"{type(self).__name__}: named argument {name!r} collides with a positional argument column"
                )
            argument_schema.append(pa.field(name, value.type))
            argument_values[name] = value
        argument_batch = pa.RecordBatch.from_pylist(
            [argument_values],
            schema=pa.schema(argument_schema),
        )
        return {
            "function_name": self.function_name,
            "arguments": serialize_record_batch_bytes(argument_batch),
            "branch_filter": self.branch_filter,
            "writable": self.writable,
        }

    def serialize(self) -> bytes:
        """Serialize to Arrow IPC bytes (1-row batch using ARROW_SCHEMA)."""
        batch = pa.RecordBatch.from_pylist(
            [self.to_row_dict()],
            schema=self.ARROW_SCHEMA,
        )
        return serialize_record_batch_bytes(batch)

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Deserialize from a 1-row Arrow RecordBatch.

        Columns of the nested arguments batch named ``arg_<digits>`` become
        positional arguments, in schema order; every other column becomes a
        named argument keyed by its column name.

        Args:
            batch: Batch laid out according to ``ARROW_SCHEMA``; it is checked
                by ``_validate_single_row_batch`` before use.

        Returns:
            The reconstructed branch.

        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["function_name", "arguments"],
        )

        arguments_bytes = cast(bytes, row["arguments"])
        arguments_batch, _ = deserialize_record_batch(arguments_bytes)

        positional_arguments: list[pa.Scalar] = []  # type: ignore[type-arg]
        named_arguments: dict[str, pa.Scalar] = {}  # type: ignore[type-arg]
        for position, arg_field in enumerate(arguments_batch.schema):
            # Look the column up by position so repeated names cannot confuse it.
            value = arguments_batch.column(position)[0]
            # Only the exact generated form ``arg_<digits>`` is positional; a
            # named argument such as ``arg_types`` must stay a named argument.
            suffix = arg_field.name[len("arg_") :]
            if arg_field.name.startswith("arg_") and suffix.isascii() and suffix.isdigit():
                positional_arguments.append(value)
            else:
                named_arguments[arg_field.name] = value

        return cls(
            function_name=cast(str, row["function_name"]),
            positional_arguments=positional_arguments,
            named_arguments=named_arguments,
            # A missing or null branch_filter both map to None (unconstrained).
            branch_filter=cast("str | None", row.get("branch_filter")),
            # writable is non-nullable on the wire — trust the schema.
            writable=bool(row["writable"]),
        )
|
|
836
|
+
|
|
837
|
+
|
|
838
|
+
@dataclass(frozen=True)
class ScanBranchesResult:
    """The set of scan branches that together back a multi-branch table.

    The VGI DuckDB extension reads this result to learn which DuckDB
    function(s) to call for the table's data. Every branch is bound on its
    own and the rewriter unions the branches' output.

    Attributes:
        branches: One ``ScanBranch`` per physical source. The order matters
            for stable diagnostic output (``vgi_table_branches()``) but not
            for query semantics (UNION ALL is unordered).
        required_extensions: Union of all DuckDB extensions needed across all
            branches (e.g., ``["iceberg", "httpfs"]``). The C++ side auto-loads
            unloaded entries before running the rewrite; missing extensions
            surface the existing extension-load diagnostic. The list is hoisted
            to the top level so workers need not repeat ``"iceberg"`` on every
            branch that uses it.

    """

    branches: list[ScanBranch]
    required_extensions: list[str] = field(default_factory=list)

    # Wire layout: every branch is its own IPC stream (bytes) inside a
    # list<binary> column. The C++ side parses each entry via
    # ScanBranch::deserialize, mirroring the nested-IPC trick used for the
    # arguments field on ScanFunctionResult/ScanBranch themselves.
    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("branches", pa.list_(pa.binary()), nullable=False),
            pa.field("required_extensions", pa.list_(pa.string()), nullable=False),
        ]  # type: ignore[arg-type]
    )

    def to_row_dict(self) -> dict[str, Any]:
        """Build the single-row dict that matches ``ARROW_SCHEMA``."""
        serialized_branches = [branch.serialize() for branch in self.branches]
        extensions = list(self.required_extensions)
        return {"branches": serialized_branches, "required_extensions": extensions}

    def serialize(self) -> bytes:
        """Encode this result as Arrow IPC bytes holding one ARROW_SCHEMA row."""
        rows = [self.to_row_dict()]
        return serialize_record_batch_bytes(pa.RecordBatch.from_pylist(rows, schema=self.ARROW_SCHEMA))

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> Self:
        """Rebuild a result from a 1-row Arrow RecordBatch.

        Raises:
            ValueError: If the branches list is empty — workers must return at
                least one branch. (See the design memo's "loud at attach" rule.)

        """
        from vgi_rpc.utils import _validate_single_row_batch

        row = _validate_single_row_batch(batch, cls.__name__, required_fields=["branches"])

        branch_blobs = cast("list[bytes]", row["branches"])
        if not branch_blobs:
            raise ValueError(f"{cls.__name__}: branches list must not be empty")

        # Each blob is an independent IPC stream wrapping one ScanBranch row.
        branches: list[ScanBranch] = [
            ScanBranch.deserialize(deserialize_record_batch(blob)[0]) for blob in branch_blobs
        ]
        extensions = list(cast("list[str]", row.get("required_extensions") or []))
        return cls(branches=branches, required_extensions=extensions)
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
# ============================================================================
|
|
919
|
+
# Column Statistics
|
|
920
|
+
# ============================================================================
|
|
921
|
+
|
|
922
|
+
|
|
923
|
+
@dataclass(frozen=True)
class ColumnStatistics:
    """Optimizer statistics describing one column of a table.

    Workers supply these so DuckDB's optimizer can make cost-based decisions
    (filter elimination, join reordering, etc.).

    Attributes:
        column_name: Name of the column these statistics describe.
        min: Smallest value as a typed PyArrow scalar
            (e.g., ``pa.scalar(0, pa.int64())``), or ``None`` if unknown.
        max: Largest value as a typed PyArrow scalar, or ``None`` if unknown.
            Must have the same Arrow type as ``min``.
        has_null: Whether the column contains any null values.
        has_not_null: Whether the column contains any non-null values.
        distinct_count: Approximate count of distinct values, or ``None`` if unknown.
        contains_unicode: String/binary columns only — whether values contain
            non-ASCII characters. ``None`` for non-string columns.
        max_string_length: String/binary columns only — maximum byte length of
            values. ``None`` for non-string columns.

    """

    column_name: str

    # Value bounds.
    min: pa.Scalar | None = None  # type: ignore[type-arg]
    max: pa.Scalar | None = None  # type: ignore[type-arg]

    # Null presence; both default to True.
    has_null: bool = True
    has_not_null: bool = True

    distinct_count: int | None = None

    # String/binary-only statistics.
    contains_unicode: bool | None = None
    max_string_length: int | None = None
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
@dataclass(frozen=True)
class TableColumnStatisticsResult:
    """Payload returned by ``table_column_statistics_get``, with optional cache control.

    Attributes:
        statistics: Per-column statistics for the table.
        cache_max_age_seconds: Number of seconds the client may cache these
            statistics. ``None`` means cache indefinitely (static data);
            ``0`` means do not cache (live/volatile data).

    """

    statistics: list[ColumnStatistics]
    cache_max_age_seconds: int | None = None
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
def _infer_stat_type(stat: ColumnStatistics) -> pa.DataType:
    """Pick the Arrow type of a ColumnStatistics entry from its min/max scalars.

    ``min`` is consulted first, then ``max``; the first one that is present and
    valid supplies the type. When neither qualifies the result is ``pa.null()``.
    """
    for bound in (stat.min, stat.max):
        if bound is not None and bound.is_valid:
            return bound.type  # type: ignore[no-any-return]
    return pa.null()
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def serialize_column_statistics(
    stats: list[ColumnStatistics],
    cache_max_age_seconds: int | None = None,
) -> bytes:
    """Serialize column statistics into a single RecordBatch with sparse union min/max.

    The ``min`` and ``max`` columns use an Arrow sparse union whose child types
    are the distinct column types present in *stats*. This keeps everything in
    a single IPC stream regardless of how many column types the table has.
    For empty *stats* the union has a single ``null`` child and the batch has
    zero rows.

    Args:
        stats: Per-column statistics to serialize. May be empty.
        cache_max_age_seconds: Optional cache TTL. When not ``None`` it is
            attached as IPC batch custom_metadata under the
            ``cache_max_age_seconds`` key (not as schema metadata), for empty
            and non-empty *stats* alike.

    Returns:
        IPC-serialized bytes of the statistics RecordBatch.

    """
    union_fields: list[pa.Field[Any]]
    if not stats:
        # Empty union arrays must be constructed manually since
        # pa.array([], type=sparse_union) is not supported.
        union_fields = [pa.field("0", pa.null())]
        min_union = max_union = pa.UnionArray.from_sparse(
            pa.array([], type=pa.int8()),
            [pa.array([], type=pa.null())],
            field_names=["0"],
            type_codes=[0],  # type: ignore[arg-type]
        )
    else:
        # 1. Collect distinct Arrow types, assign type codes in first-seen order
        type_map: dict[pa.DataType, int] = {}
        row_type_codes: list[int] = []
        for s in stats:
            arrow_type = _infer_stat_type(s)
            row_type_codes.append(type_map.setdefault(arrow_type, len(type_map)))

        # 2. Build sparse union child arrays (each child is length N). A row
        #    contributes its value only to the child matching its own type
        #    code; every other child holds null at that position.
        union_fields = []
        field_names: list[str] = []
        type_codes: list[int] = []
        min_children: list[pa.Array[Any]] = []
        max_children: list[pa.Array[Any]] = []
        # Codes were handed out in insertion order, so iterating the dict
        # already yields them in ascending order.
        for arrow_type, code in type_map.items():
            union_fields.append(pa.field(str(code), arrow_type))
            field_names.append(str(code))
            type_codes.append(code)
            min_vals = [s.min if row_code == code else None for s, row_code in zip(stats, row_type_codes)]
            max_vals = [s.max if row_code == code else None for s, row_code in zip(stats, row_type_codes)]
            min_children.append(pa.array(min_vals, type=arrow_type))
            max_children.append(pa.array(max_vals, type=arrow_type))

        # 3. Build sparse union arrays
        codes_arr = pa.array(row_type_codes, type=pa.int8())
        min_union = pa.UnionArray.from_sparse(
            codes_arr,
            min_children,
            field_names=field_names,
            type_codes=type_codes,  # type: ignore[arg-type]
        )
        max_union = pa.UnionArray.from_sparse(
            codes_arr,
            max_children,
            field_names=field_names,
            type_codes=type_codes,  # type: ignore[arg-type]
        )

    # 4. Build schema and batch (shared by the empty and non-empty paths; for
    #    empty input every list below is empty, giving zero-length columns)
    union_type = pa.sparse_union(union_fields)
    schema = pa.schema(
        [
            pa.field("column_name", pa.utf8()),
            pa.field("min", union_type),
            pa.field("max", union_type),
            pa.field("has_null", pa.bool_()),
            pa.field("has_not_null", pa.bool_()),
            pa.field("distinct_count", pa.int64()),
            pa.field("contains_unicode", pa.bool_()),
            pa.field("max_string_length", pa.uint64()),
        ],
    )

    batch = pa.record_batch(
        [
            pa.array([s.column_name for s in stats], type=pa.utf8()),
            min_union,
            max_union,
            pa.array([s.has_null for s in stats], type=pa.bool_()),
            pa.array([s.has_not_null for s in stats], type=pa.bool_()),
            pa.array([s.distinct_count for s in stats], type=pa.int64()),
            pa.array([s.contains_unicode for s in stats], type=pa.bool_()),
            pa.array([s.max_string_length for s in stats], type=pa.uint64()),
        ],
        schema=schema,
    )

    # 5. Serialize with cache TTL as IPC batch custom_metadata (not schema
    #    metadata). This also applies to an empty result, so a worker that
    #    reports no statistics can still bound how long that answer is cached.
    custom_metadata = None
    if cache_max_age_seconds is not None:
        custom_metadata = pa.KeyValueMetadata({b"cache_max_age_seconds": str(cache_max_age_seconds).encode()})
    return serialize_record_batch_bytes(batch, custom_metadata=custom_metadata)
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
class CatalogInterface(ABC):
|
|
1114
|
+
"""Provides an interface to manage catalogs, schemas, tables, and views for VGI.
|
|
1115
|
+
|
|
1116
|
+
This interface defines methods for creating, dropping, and managing catalogs,
|
|
1117
|
+
schemas, tables, and views. It also supports transactions and provides methods
|
|
1118
|
+
for discovering catalog contents.
|
|
1119
|
+
|
|
1120
|
+
Implementors of this interface should provide concrete implementations for
|
|
1121
|
+
all abstract methods and properties.
|
|
1122
|
+
|
|
1123
|
+
API limitations:
|
|
1124
|
+
- Functions are not able to be created or dropped.
|
|
1125
|
+
- Tags are not able to be updated on catalog objects.
|
|
1126
|
+
- Comments and tags are not updatable on schemas (SchemaInfo).
|
|
1127
|
+
- Constraints cannot be added/dropped (except NOT NULL).
|
|
1128
|
+
|
|
1129
|
+
A VGI worker will offer a single implementation of this interface to clients
|
|
1130
|
+
to manage their catalogs.
|
|
1131
|
+
"""
|
|
1132
|
+
|
|
1133
|
+
@property
def interface_feature_flags(self) -> set[str]:
    """Report which optional capabilities this CatalogInterface supports.

    Each flag names an optional capability of the implementation. This
    default implementation advertises none and returns an empty set.
    """
    no_flags: set[str] = set()
    return no_flags
|
|
1141
|
+
|
|
1142
|
+
def loggable_attach_options(self, options: Mapping[str, Any]) -> Mapping[str, Any]:
    """Produce the subset of attach/create options that is safe to log.

    The worker calls this when it emits catalog lifecycle events
    (``catalog.attach``, ``catalog.create``), including Sentry breadcrumbs.
    Override it to opt in to logging fields you know to be harmless, such
    as host names, regions, or bucket names. Credentials (passwords,
    tokens, connection strings that embed secrets) must never be returned.

    The base implementation is fail-closed: it returns an empty mapping, so
    **nothing** from ``options`` is logged until an implementer explicitly
    chooses which fields are safe to emit.

    Args:
        options: The raw options dict the client supplied to ATTACH /
            CREATE (the same ``dict`` handed to :meth:`catalog_attach`
            or :meth:`catalog_create`).

    Returns:
        A mapping of safe-to-log key/value pairs. An empty mapping (the
        default) suppresses the ``options`` field from lifecycle events
        entirely.

    """
    redacted: dict[str, Any] = {}
    # The default deliberately ignores its input; drop the reference.
    del options
    return redacted
|
|
1169
|
+
|
|
1170
|
+
@abstractmethod
def catalogs(self) -> list[CatalogInfo]:
    """List the catalog discovery records this VGI worker provides.

    Every record names a catalog and, when the worker specifies them, also
    carries its implementation_version and data_version_spec so that
    clients can prevalidate ATTACH requests.

    This method is used for discovery only.
    """
|
|
1180
|
+
|
|
1181
|
+
def catalog_create(
    self,
    *,
    name: str,
    on_conflict: OnConflict,
    options: dict[str, Any],
) -> None:
    """Create a catalog called ``name``.

    ``on_conflict`` selects what happens when the catalog already exists:

    - IGNORE: do nothing.
    - REPLACE: replace the existing catalog.
    - ERROR: raise an error.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Catalog create not implemented.")
|
|
1190
|
+
|
|
1191
|
+
# Drop a catalog
|
|
1192
|
+
def catalog_drop(self, *, name: str) -> None:
    """Drop the catalog called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Catalog drop not implemented.")
|
|
1195
|
+
|
|
1196
|
+
# Transactions are initiated and driven by DuckDB. It is rare for CatalogInterface
# implementors to implement them, but they are supported.
|
|
1198
|
+
#
|
|
1199
|
+
# Transaction Guarantees
|
|
1200
|
+
# - Transactions MAY span multiple worker processes
|
|
1201
|
+
# - Workers MUST treat transaction_opaque_data as opaque
|
|
1202
|
+
# - Workers MUST ensure idempotency of commit/rollback
|
|
1203
|
+
|
|
1204
|
+
def catalog_transaction_begin(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
) -> TransactionOpaqueData | None:
    """Start a transaction on the attachment identified by ``attach_opaque_data``.

    An implementation that does not support transactions may return None.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Catalog transactions not implemented.")
|
|
1210
|
+
|
|
1211
|
+
def catalog_transaction_commit(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData,
) -> None:
    """Commit the given transaction on the given attachment.

    Implementations should raise an exception when the transaction cannot
    be committed.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Catalog transactions not implemented.")
|
|
1219
|
+
|
|
1220
|
+
def catalog_transaction_rollback(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData,
) -> None:
    """Roll back the given transaction on the given attachment.

    Implementations should raise an exception when the transaction cannot
    be rolled back.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Catalog transactions not implemented.")
|
|
1228
|
+
|
|
1229
|
+
@abstractmethod
def catalog_attach(
    self,
    *,
    name: str,
    options: dict[str, Any],
    data_version_spec: str | None,
    implementation_version: str | None,
    ctx: "CallContext | None" = None,
) -> CatalogAttachResult:
    """Attach to the catalog called ``name`` using ``options``.

    ``data_version_spec`` and ``implementation_version`` are the semver
    constraints the client requested at ATTACH time. They are passed
    through as strings; subclasses interpret and validate them. ``None``
    means the client placed no constraint on that dimension. An
    implementation that cannot satisfy a requested version MUST raise an
    exception with a human-readable message, which surfaces on the client
    as the ATTACH failure.

    ``ctx`` is injected by the RPC dispatcher when available. Over HTTP it
    allows a per-session routing cookie to be set via ``ctx.set_cookie()``;
    over subprocess it may be ``None`` or have empty cookie support.

    Returns:
        A CatalogAttachResult holding the attach ID, other catalog
        metadata, and the resolved concrete versions chosen by the worker.

    """
|
|
1256
|
+
|
|
1257
|
+
def catalog_detach(self, *, attach_opaque_data: AttachOpaqueData) -> None:
    """Detach from the catalog identified by ``attach_opaque_data``.

    Implementations should roll back any transactions that are still open.
    This base implementation is a no-op.
    """
    return None
|
|
1264
|
+
|
|
1265
|
+
def catalog_version(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    ctx: "CallContext | None" = None,
) -> int:
    """Report the current catalog version for an attachment and transaction.

    The version is an integer that is incremented by changes to schemas,
    tables, and objects. It is used to expire cached catalog/schema/object
    information inside a VGI client or process.

    ``ctx`` is injected by the RPC dispatcher when available. Subclasses
    that rely on HTTP-session cookies can inspect ``ctx.cookies`` to verify
    routing stickiness.

    Returns:
        The catalog version. This base implementation always returns 0.

    """
    base_version = 0
    # The default has no use for the call context.
    del ctx
    return base_version
|
|
1287
|
+
|
|
1288
|
+
def schemas(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
) -> list[SchemaInfo]:
    """List the schemas visible to the given attachment and transaction.

    This base implementation reports a single schema named "main" with no
    comment and no tags; ``transaction_opaque_data`` is not consulted.
    """
    default_schema = SchemaInfo(
        attach_opaque_data=attach_opaque_data,
        name="main",
        comment=None,
        tags={},
    )
    return [default_schema]
|
|
1303
|
+
|
|
1304
|
+
def schema_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    on_conflict: OnConflict = OnConflict.ERROR,
    comment: str | None,
    tags: dict[str, str],
) -> None:
    """Create a schema called ``name`` with the supplied comment and tags.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Schema create not implemented.")
|
|
1316
|
+
|
|
1317
|
+
def schema_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    ignore_not_found: bool,
    cascade: bool,
) -> None:
    """Drop the schema called ``name``.

    ``ignore_not_found`` presumably suppresses the error for a missing
    schema, and ``cascade`` presumably also drops dependent objects; this
    base implementation does not define either behaviour.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Schema drop not implemented.")
|
|
1328
|
+
|
|
1329
|
+
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.TABLE],
) -> Sequence[TableInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.VIEW],
) -> Sequence[ViewInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[
        SchemaObjectType.SCALAR_FUNCTION,
        SchemaObjectType.TABLE_FUNCTION,
        SchemaObjectType.AGGREGATE_FUNCTION,
    ],
) -> Sequence[FunctionInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO],
) -> Sequence[MacroInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.INDEX],
) -> Sequence[IndexInfo]: ...

def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: SchemaObjectType,
) -> Sequence[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo]:
    """Get the contents of the schema with the given name.

    Schemas can contain tables, views, functions, macros, and indexes. The
    overloads narrow the element type of the result according to ``type``.

    Args:
        attach_opaque_data: The attachment identifier.
        transaction_opaque_data: The transaction identifier, if any.
        name: The name of the schema.
        type: The type of objects to return. Must be a SchemaObjectType enum:
            - SchemaObjectType.TABLE: Return only tables
            - SchemaObjectType.VIEW: Return only views
            - SchemaObjectType.SCALAR_FUNCTION: Scalar functions
            - SchemaObjectType.TABLE_FUNCTION: Table functions
            - SchemaObjectType.AGGREGATE_FUNCTION: Aggregate functions
            - SchemaObjectType.SCALAR_MACRO: Scalar macros
            - SchemaObjectType.TABLE_MACRO: Table macros
            - SchemaObjectType.INDEX: Indexes

    Returns:
        A sequence of TableInfo, ViewInfo, FunctionInfo, MacroInfo, or
        IndexInfo objects depending on the type parameter.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Schema contents not implemented.")
|
|
1414
|
+
|
|
1415
|
+
@abstractmethod
def schema_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
) -> SchemaInfo | None:
    """Look up the schema called ``name``.

    Returns:
        A SchemaInfo describing the schema when it exists, otherwise None.
    """
|
|
1427
|
+
|
|
1428
|
+
@abstractmethod
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Look up the table called ``name`` inside schema ``schema_name``.

    If ``at_unit`` / ``at_value`` are supplied, the implementation should
    report the table schema as of the requested point in time (time travel).

    Returns:
        A TableInfo describing the table when it exists, otherwise None.
    """
|
|
1446
|
+
|
|
1447
|
+
def table_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Serialized PyArrow schema (schema.serialize().to_pybytes()) describing
    # the table's columns; the nullability of each field is ignored.
    columns: SerializedSchema,
    on_conflict: OnConflict,
    # The constraints below are expressed in terms of field indexes.
    not_null_constraints: list[int],  # [] = no not null constraints
    unique_constraints: list[list[int]],  # [] = no unique constraints
    # General check constraints, each written as a SQL expression.
    check_constraints: list[str],  # [] = no check constraints
    # Primary key constraints, as groups of column indexes.
    primary_key_constraints: list[list[int]] | None = None,
    # Foreign key constraints, as IPC-serialized bytes (same format as TableInfo).
    foreign_key_constraints: list[bytes] | None = None,
) -> None:
    """Create a table called ``name`` in ``schema_name`` from a serialized schema.

    Table creation does not support comments or tags.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table create not implemented.")
|
|
1474
|
+
|
|
1475
|
+
def table_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the table called ``name`` from schema ``schema_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table drop not implemented.")
|
|
1487
|
+
|
|
1488
|
+
def table_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment on the table called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table comment set not implemented.")
|
|
1500
|
+
|
|
1501
|
+
def table_column_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment on column ``column_name`` of the table called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column comment set not implemented.")
|
|
1514
|
+
|
|
1515
|
+
def table_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename the table called ``name`` to ``new_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table rename not implemented.")
|
|
1527
|
+
|
|
1528
|
+
def table_column_add(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Single-field Arrow schema describing the column to add, serialized
    # via schema.serialize().to_pybytes().
    column_definition: SerializedSchema,
    ignore_not_found: bool,
    if_column_not_exists: bool,
) -> None:
    """Add a column to the table called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column add not implemented.")
|
|
1543
|
+
|
|
1544
|
+
def table_column_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
    if_column_exists: bool,
    cascade: bool,
) -> None:
    """Drop column ``column_name`` from the table called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column drop not implemented.")
|
|
1558
|
+
|
|
1559
|
+
def table_column_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    new_column_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename column ``column_name`` of the table called ``name`` to ``new_column_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column rename not implemented.")
|
|
1572
|
+
|
|
1573
|
+
def table_column_default_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    expression: SqlExpression,
    ignore_not_found: bool,
) -> None:
    """Set ``expression`` as the default expression of column ``column_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column default set not implemented.")
|
|
1586
|
+
|
|
1587
|
+
def table_column_default_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Remove the default expression from column ``column_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table column default drop not implemented.")
|
|
1599
|
+
|
|
1600
|
+
def table_column_type_change(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Single-field Arrow schema carrying the new column type, serialized
    # via schema.serialize().to_pybytes().
    column_definition: SerializedSchema,
    expression: SqlExpression | None,
    ignore_not_found: bool,
) -> None:
    """Change the type of a column of the table called ``name``.

    There is no separate column-name parameter: the column to change is
    identified by the name of the field in ``column_definition``.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table column type change not implemented.")
|
|
1618
|
+
|
|
1619
|
+
def table_not_null_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Remove the NOT NULL constraint from column ``column_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table NOT NULL drop not implemented.")
|
|
1631
|
+
|
|
1632
|
+
def table_not_null_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    column_name: str,
    ignore_not_found: bool,
) -> None:
    """Add a NOT NULL constraint to column ``column_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Table NOT NULL set not implemented.")
|
|
1644
|
+
|
|
1645
|
+
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    # Time travel fields (iceberg style)
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Resolve the VGI table function used to scan the table called ``name``.

    The returned ScanFunctionResult describes the VGI table function to
    call when scanning this table. ``at_unit`` and ``at_value`` support
    time travel queries.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table scan function get not implemented.")
|
|
1662
|
+
|
|
1663
|
+
def table_scan_branches_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanBranchesResult:
    """Return the scan branches that make up a (possibly multi-source) table.

    A multi-branch table composes one logical scan out of N physical
    sources (canonical case: Kafka hot tier + Iceberg cold tier). The VGI
    DuckDB extension's optimizer-extension rewrites the placeholder
    ``LogicalGet`` into ``LogicalSetOperation(UNION_ALL, ...)`` with one
    arm per branch.

    This default delegates to :meth:`table_scan_function_get` and wraps its
    single ``ScanFunctionResult`` in a one-branch list. Existing
    single-source workers are therefore compatible with the branches-aware
    C++ side without changes, while workers that genuinely need multiple
    sources can override this method.

    Workers that override this method should NOT make
    :meth:`table_scan_function_get` raise: the legacy method must keep
    working for old C++ extensions that do not yet probe for the branches
    RPC. A common pattern is to implement both, with
    :meth:`table_scan_function_get` returning ``branches[0]`` (the primary
    branch) and :meth:`table_scan_branches_get` returning the full list.

    Args:
        attach_opaque_data: Per-attach session token.
        transaction_opaque_data: Optional transaction token.
        schema_name: Schema containing the table.
        name: Table name.
        at_unit: Optional time-travel unit (e.g., ``"VERSION"`` /
            ``"TIMESTAMP"``). The VGI C++ side refuses ``AT(...)`` on
            multi-branch tables (>1 branch) at bind time, so workers
            returning multiple branches should expect ``at_unit`` /
            ``at_value`` to always be ``None``; single-branch returns
            still honour them.
        at_value: Optional time-travel value matching ``at_unit``.

    Returns:
        A :class:`ScanBranchesResult` carrying one or more
        :class:`ScanBranch` entries plus the union of required
        extensions across all branches.

    """
    single = self.table_scan_function_get(
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        schema_name=schema_name,
        name=name,
        at_unit=at_unit,
        at_value=at_value,
    )
    # Copy the argument containers so the branch does not alias the legacy result.
    sole_branch = ScanBranch(
        function_name=single.function_name,
        positional_arguments=list(single.positional_arguments),
        named_arguments=dict(single.named_arguments),
        branch_filter=None,
    )
    return ScanBranchesResult(
        branches=[sole_branch],
        required_extensions=list(single.required_extensions),
    )
|
|
1732
|
+
|
|
1733
|
+
def table_column_statistics_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> TableColumnStatisticsResult | None:
    """Fetch column statistics for every column of a table.

    Workers that can supply statistics should override this method and
    return a :class:`TableColumnStatisticsResult` holding the per-column
    statistics together with an optional cache TTL.

    Returns:
        The statistics result, or ``None`` when statistics are not
        available for this table. This base implementation always
        returns ``None``.

    """
    no_statistics = None
    return no_statistics
|
|
1751
|
+
|
|
1752
|
+
def table_insert_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    writable_branch_function_name: str | None = None,
) -> ScanFunctionResult:
    """Resolve the write function used for INSERT on the table called ``name``.

    The returned ScanFunctionResult identifies the TableInOutGenerator
    function to call for inserting rows into this table.

    The C++ extension sets ``writable_branch_function_name`` when the table
    is multi-branch and one branch declared ``writable=True``; the value is
    that writable arm's ``ScanBranch.function_name``. Workers serving
    multi-branch tables can use it to route the INSERT to the right
    underlying storage without resolving the writable arm again. For
    single-branch tables it is ``None`` (or unset for legacy overrides).

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table insert not supported.")
|
|
1775
|
+
|
|
1776
|
+
def table_update_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Resolve the write function used for UPDATE on the table called ``name``.

    The returned ScanFunctionResult identifies the TableInOutGenerator
    function to call for updating rows in this table. Its input batches
    include a rowid column plus the columns being updated.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table update not supported.")
|
|
1791
|
+
|
|
1792
|
+
def table_delete_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Resolve the write function used for DELETE on the table called ``name``.

    The returned ScanFunctionResult identifies the TableInOutGenerator
    function to call for deleting rows from this table. Its input batches
    contain a rowid column identifying the rows to delete.

    Raises:
        NotImplementedError: Always, in this base implementation.

    """
    raise NotImplementedError("Table delete not supported.")
|
|
1807
|
+
|
|
1808
|
+
def view_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    definition: str,
    on_conflict: OnConflict,
) -> None:
    """Create a view called ``name`` from ``definition``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("View create not implemented.")
|
|
1820
|
+
|
|
1821
|
+
def view_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the view called ``name`` from schema ``schema_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("View drop not implemented.")
|
|
1833
|
+
|
|
1834
|
+
def view_rename(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    new_name: str,
    ignore_not_found: bool,
) -> None:
    """Rename the view called ``name`` to ``new_name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("View rename not implemented.")
|
|
1846
|
+
|
|
1847
|
+
@abstractmethod
def view_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ViewInfo | None:
    """Look up the view called ``name`` inside schema ``schema_name``.

    Returns:
        A ViewInfo describing the view when it exists, otherwise None.
    """
|
|
1860
|
+
|
|
1861
|
+
def view_comment_set(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    comment: str | None,
    ignore_not_found: bool,
) -> None:
    """Set the comment on the view called ``name``.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("View comment set not implemented.")
|
|
1873
|
+
|
|
1874
|
+
# ---- Macros ----
|
|
1875
|
+
|
|
1876
|
+
@abstractmethod
def macro_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> MacroInfo | None:
    """Look up the macro called ``name`` inside schema ``schema_name``.

    Returns:
        A MacroInfo describing the macro when it exists, otherwise None.
    """
|
|
1889
|
+
|
|
1890
|
+
def macro_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    macro_type: "MacroType",
    parameters: list[str],
    definition: str,
    on_conflict: OnConflict,
    parameter_default_values: pa.RecordBatch | None = None,
) -> None:
    """Create the macro ``name`` in ``schema_name`` from ``definition``.

    This default implementation creates nothing: it ignores every argument
    and unconditionally raises.

    Raises:
        NotImplementedError: Always.
    """
    message = "Macro create not implemented."
    raise NotImplementedError(message)
|
|
1905
|
+
|
|
1906
|
+
def macro_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
) -> None:
    """Drop the macro ``name`` from ``schema_name``.

    This default implementation drops nothing: it ignores every argument
    and unconditionally raises.

    Raises:
        NotImplementedError: Always.
    """
    message = "Macro drop not implemented."
    raise NotImplementedError(message)
|
|
1917
|
+
|
|
1918
|
+
# ---- Indexes ----
|
|
1919
|
+
|
|
1920
|
+
def index_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> IndexInfo | None:
    """Look up the index called ``name`` inside ``schema_name``.

    The default implementation exposes no indexes, so it ignores its
    arguments and reports every index as absent.

    Returns:
        An IndexInfo when the index exists, otherwise None. This default
        always returns None.
    """
    no_index = None
    return no_index
|
|
1934
|
+
|
|
1935
|
+
def index_create(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    table_name: str,
    index_type: str,
    constraint_type: IndexConstraintType,
    expressions: list[str],
    on_conflict: OnConflict,
    options: dict[str, str] | None = None,
) -> None:
    """Create the index ``name`` on ``table_name`` in ``schema_name``.

    This default implementation creates nothing: it ignores every argument
    and unconditionally raises.

    Raises:
        NotImplementedError: Always.
    """
    message = "Index create not implemented."
    raise NotImplementedError(message)
|
|
1951
|
+
|
|
1952
|
+
def index_drop(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    ignore_not_found: bool,
    cascade: bool = False,
) -> None:
    """Drop the index ``name`` from ``schema_name``.

    This default implementation drops nothing: it ignores every argument
    and unconditionally raises.

    Raises:
        NotImplementedError: Always.
    """
    message = "Index drop not implemented."
    raise NotImplementedError(message)
|
|
1964
|
+
|
|
1965
|
+
|
|
1966
|
+
def _read_only(operation: str) -> Any:
|
|
1967
|
+
"""Create a CatalogInterface method that raises CatalogReadOnlyError."""
|
|
1968
|
+
|
|
1969
|
+
def method(self: Any, **kwargs: Any) -> Any:
|
|
1970
|
+
raise CatalogReadOnlyError(f"Cannot {operation}: catalog is read-only")
|
|
1971
|
+
|
|
1972
|
+
method.__doc__ = "Not supported — raises CatalogReadOnlyError."
|
|
1973
|
+
return method
|
|
1974
|
+
|
|
1975
|
+
|
|
1976
|
+
def _inline_bind_result_for(func_cls: type) -> bytes | None:
|
|
1977
|
+
"""Pre-built ``bind_result`` bytes for a ``@bind_fixed_schema`` function.
|
|
1978
|
+
|
|
1979
|
+
Returns the IPC-serialized ``BindResponse(output_schema=cls.FIXED_SCHEMA)``
|
|
1980
|
+
that the worker would have produced from a regular bind RPC. Cached on a
|
|
1981
|
+
private class attribute so subsequent ``schema_contents`` calls (per
|
|
1982
|
+
attach, per cache invalidation) reuse the bytes instead of re-serializing.
|
|
1983
|
+
|
|
1984
|
+
Returns ``None`` if the class isn't safely pre-bind-able — either it
|
|
1985
|
+
isn't ``@bind_fixed_schema``-decorated (no ``_inline_bind_safe`` marker),
|
|
1986
|
+
or a subclass has overridden ``on_bind`` (escaping the decorator's
|
|
1987
|
+
contract — see the eligibility comment on ``bind_fixed_schema``).
|
|
1988
|
+
"""
|
|
1989
|
+
if not getattr(func_cls, "_inline_bind_safe", False):
|
|
1990
|
+
return None
|
|
1991
|
+
# If the class has its own on_bind in __dict__, it's either the decorator's
|
|
1992
|
+
# injection (marked) or a subclass override (unmarked). Reject overrides.
|
|
1993
|
+
on_bind_attr = func_cls.__dict__.get("on_bind")
|
|
1994
|
+
if on_bind_attr is not None:
|
|
1995
|
+
underlying = getattr(on_bind_attr, "__func__", on_bind_attr)
|
|
1996
|
+
if not getattr(underlying, "_is_bind_fixed_schema", False):
|
|
1997
|
+
return None
|
|
1998
|
+
cached = func_cls.__dict__.get("_cached_inline_bind_result")
|
|
1999
|
+
if cached is not None:
|
|
2000
|
+
return cached # type: ignore[no-any-return]
|
|
2001
|
+
from vgi.invocation import BindResponse
|
|
2002
|
+
|
|
2003
|
+
response = BindResponse(output_schema=func_cls.FIXED_SCHEMA, opaque_data=None) # type: ignore[attr-defined]
|
|
2004
|
+
blob = response.serialize_to_bytes()
|
|
2005
|
+
# Set on the class itself so subclasses don't pollute their parents'
|
|
2006
|
+
# cache with each other's serialized blobs (FIXED_SCHEMA may differ).
|
|
2007
|
+
func_cls._cached_inline_bind_result = blob # type: ignore[attr-defined]
|
|
2008
|
+
return blob
|
|
2009
|
+
|
|
2010
|
+
|
|
2011
|
+
class ReadOnlyCatalogInterface(CatalogInterface):
|
|
2012
|
+
"""A read-only catalog interface that does not support DDL operations.
|
|
2013
|
+
|
|
2014
|
+
This is a convenience base class for catalogs that only support reading
|
|
2015
|
+
metadata and data, not creating or modifying objects.
|
|
2016
|
+
|
|
2017
|
+
There are two ways to use this class:
|
|
2018
|
+
|
|
2019
|
+
1. Subclass and implement abstract methods:
|
|
2020
|
+
- catalogs() - List available catalogs
|
|
2021
|
+
- catalog_attach() - Attach to a catalog
|
|
2022
|
+
- schema_get() - Get schema information
|
|
2023
|
+
- table_get() - Get table information (return None for function-only catalogs)
|
|
2024
|
+
- view_get() - Get view information (return None for function-only catalogs)
|
|
2025
|
+
|
|
2026
|
+
2. Use with functions list (simpler for function-only catalogs):
|
|
2027
|
+
Set the `functions` class attribute to expose VGI functions:
|
|
2028
|
+
- catalog_name - Name of the catalog (default: "functions")
|
|
2029
|
+
- functions - List of function classes to expose in the "main" schema
|
|
2030
|
+
|
|
2031
|
+
This provides automatic implementations of catalogs(), catalog_attach(),
|
|
2032
|
+
schema_get(), table_get(), view_get(), and schema_contents().
|
|
2033
|
+
|
|
2034
|
+
Optional methods that can be overridden:
|
|
2035
|
+
- catalog_detach() - Custom detach logic
|
|
2036
|
+
- schemas() - Custom schema listing (default returns 'main')
|
|
2037
|
+
- schema_contents() - List schema contents
|
|
2038
|
+
- table_scan_function_get() - Get scan function for tables
|
|
2039
|
+
|
|
2040
|
+
All DDL operations (create, drop, rename, modify) will raise
|
|
2041
|
+
CatalogReadOnlyError.
|
|
2042
|
+
|
|
2043
|
+
"""
|
|
2044
|
+
|
|
2045
|
+
supports_transactions = False
|
|
2046
|
+
catalog_version_frozen = True
|
|
2047
|
+
|
|
2048
|
+
# Class attributes for function-based catalogs
|
|
2049
|
+
catalog_name: str = "functions"
|
|
2050
|
+
functions: list[type] = []
|
|
2051
|
+
settings: list["SettingSpec"] = []
|
|
2052
|
+
secret_types: list["SecretTypeSpec"] = []
|
|
2053
|
+
attach_option_specs: list["AttachOptionSpec"] = []
|
|
2054
|
+
|
|
2055
|
+
# Optional Catalog object for declarative definition; when set, the registries are built from it instead of the legacy `functions` list
|
|
2056
|
+
catalog: "Catalog | None" = None
|
|
2057
|
+
|
|
2058
|
+
# Fixed attach_opaque_data for read-only catalogs (no need for unique IDs)
|
|
2059
|
+
_FIXED_ATTACH_ID: AttachOpaqueData = AttachOpaqueData(b"readonly-catalog-")
|
|
2060
|
+
|
|
2061
|
+
# Instance-level registry caches (built lazily)
|
|
2062
|
+
# Keys are LOWERCASE for case-insensitive lookup
|
|
2063
|
+
_schema_registry: "dict[str, Schema] | None" = None
|
|
2064
|
+
_table_registry: "dict[tuple[str, str], Table] | None" = None
|
|
2065
|
+
_view_registry: "dict[tuple[str, str], View] | None" = None
|
|
2066
|
+
_function_registry: "dict[tuple[str, str], list[type]] | None" = None
|
|
2067
|
+
_macro_registry: "dict[tuple[str, str], Macro] | None" = None
|
|
2068
|
+
_index_registry: "dict[tuple[str, str], Index] | None" = None
|
|
2069
|
+
# Lazy registry build is one-time but the fixture HTTP server is
|
|
2070
|
+
# multi-threaded and shares one catalog instance, so concurrent
|
|
2071
|
+
# first-requests can race the build. Serialize it under a lock and flip
|
|
2072
|
+
# ``_registries_built`` only AFTER population so readers never observe a
|
|
2073
|
+
# half-built (mutating) registry. (Shared across instances — fine; the
|
|
2074
|
+
# build is one-time and infrequent.)
|
|
2075
|
+
_build_lock = threading.Lock()
|
|
2076
|
+
_registries_built: bool = False
|
|
2077
|
+
|
|
2078
|
+
def _build_registries(self) -> None:
|
|
2079
|
+
"""Build the lookup registries lazily, once, and thread-safely.
|
|
2080
|
+
|
|
2081
|
+
Double-checked locking: the fast path is a lock-free flag read; the
|
|
2082
|
+
actual build runs under ``_build_lock`` and sets ``_registries_built``
|
|
2083
|
+
only after population completes. A concurrent reader either builds
|
|
2084
|
+
(under the lock) or waits for the builder, so it never iterates a
|
|
2085
|
+
registry that another thread is still mutating.
|
|
2086
|
+
"""
|
|
2087
|
+
if self._registries_built:
|
|
2088
|
+
return
|
|
2089
|
+
with self._build_lock:
|
|
2090
|
+
if self._registries_built:
|
|
2091
|
+
return
|
|
2092
|
+
self._build_registries_locked()
|
|
2093
|
+
|
|
2094
|
+
def _build_registries_locked(self) -> None:
    """Fill every lookup registry. The caller must hold ``_build_lock``.

    Registries are built from ``self.catalog`` when it is set; otherwise a
    single ``"main"`` schema is created and the legacy ``functions`` list is
    registered under it. Every key is lowercased so lookups are
    case-insensitive. Functions sharing a name are grouped in a list rather
    than rejected.

    Raises:
        ValueError: If two tables, views, macros or indexes in the same
            schema share a (case-insensitive) name.
    """
    # Deferred import to avoid a circular import at module load time.
    from vgi.catalog.descriptors import Schema

    schemas: "dict[str, Schema]" = {}
    tables: "dict[tuple[str, str], Table]" = {}
    views: "dict[tuple[str, str], View]" = {}
    functions: "dict[tuple[str, str], list[type]]" = {}
    macros: "dict[tuple[str, str], Macro]" = {}
    indexes: "dict[tuple[str, str], Index]" = {}
    self._schema_registry = schemas
    self._table_registry = tables
    self._view_registry = views
    self._function_registry = functions
    self._macro_registry = macros
    self._index_registry = indexes

    def _add_unique(registry: "dict[tuple[str, str], Any]", kind: str, schema_key: str, obj: Any) -> None:
        # Shared duplicate check for every object kind that must be unique.
        key = (schema_key, obj.name.lower())
        if key in registry:
            raise ValueError(f"Duplicate {kind} '{obj.name}' in schema '{schema_key}'")
        registry[key] = obj

    def _add_function(schema_key: str, func_cls: type) -> None:
        # Same-named functions accumulate under one key instead of clashing.
        meta = func_cls.get_metadata()  # type: ignore[attr-defined]
        functions.setdefault((schema_key, meta.name.lower()), []).append(func_cls)

    declared = self.catalog
    if declared is None:
        # Backward compat: create "main" schema from legacy `functions` list
        schemas["main"] = Schema(name="main", tables=(), views=(), functions=())
        for func_cls in self.functions:
            _add_function("main", func_cls)
    else:
        for schema in declared.schemas:
            schema_key = schema.name.lower()
            schemas[schema_key] = schema

            for table in schema.tables:
                _add_unique(tables, "table", schema_key, table)
            for view in schema.views:
                _add_unique(views, "view", schema_key, view)
            for func_cls in schema.functions:
                _add_function(schema_key, func_cls)
            for macro in schema.macros:
                _add_unique(macros, "macro", schema_key, macro)
            for index in schema.indexes:
                _add_unique(indexes, "index", schema_key, index)

    # Publish last: the flag flips only after all registries are fully
    # populated, so a reader that skips the build never sees them mid-build.
    self._registries_built = True
|
|
2169
|
+
|
|
2170
|
+
@property
|
|
2171
|
+
def _effective_catalog_name(self) -> str:
|
|
2172
|
+
"""Get catalog name from Catalog object or class attribute."""
|
|
2173
|
+
if self.catalog is not None:
|
|
2174
|
+
return self.catalog.name
|
|
2175
|
+
return self.catalog_name
|
|
2176
|
+
|
|
2177
|
+
@property
|
|
2178
|
+
def _default_schema_name(self) -> str:
|
|
2179
|
+
"""Get default schema name."""
|
|
2180
|
+
if self.catalog is not None:
|
|
2181
|
+
return self.catalog.default_schema
|
|
2182
|
+
return "main"
|
|
2183
|
+
|
|
2184
|
+
def catalogs(self) -> list[CatalogInfo]:
    """List the catalogs this interface exposes (exactly one).

    The discovery record carries only the effective catalog name and the
    serialized attach option specs; both version fields are left as None.
    Subclasses that want to advertise version metadata should override.
    """
    serialized_specs = [spec.serialize() for spec in self.attach_option_specs]
    record = CatalogInfo(
        name=self._effective_catalog_name,
        implementation_version=None,
        data_version_spec=None,
        attach_option_specs=serialized_specs,
    )
    return [record]
|
|
2198
|
+
|
|
2199
|
+
def catalog_attach(
    self,
    *,
    name: str,
    options: dict[str, Any],
    data_version_spec: str | None,
    implementation_version: str | None,
    ctx: "CallContext | None" = None,
) -> CatalogAttachResult:
    """Attach to the single catalog exposed by this interface.

    Version constraints and the call context are discarded, and ``options``
    is not read by this implementation.

    Args:
        name: Catalog to attach; must equal the effective catalog name.
        options: Attach options (unused here).
        data_version_spec: Ignored.
        implementation_version: Ignored.
        ctx: Ignored.

    Returns:
        A CatalogAttachResult using the fixed attach id, with time-travel
        and column-statistics support derived from the registered tables.

    Raises:
        ValueError: If ``name`` is not the effective catalog name.
    """
    del data_version_spec, implementation_version, ctx
    expected = self._effective_catalog_name
    if name != expected:
        raise ValueError(f"Unknown catalog: {name!r}. Available: {expected}")

    # Settings and secret types travel in the attach result in serialized form.
    settings_payload = [setting.serialize() for setting in self.settings]
    secret_types_payload = [secret_type.serialize() for secret_type in self.secret_types]

    # Capability flags are derived from the registered tables.
    self._build_registries()
    assert self._table_registry is not None
    registered_tables = self._table_registry.values()
    time_travel = any(t.supports_time_travel for t in registered_tables)
    column_stats = any(bool(t.statistics) for t in registered_tables)

    declared = self.catalog
    if declared is None:
        comment, tags = None, {}
    else:
        comment, tags = declared.comment, dict(declared.tags)

    # NOTE(review): ``catalog_version_frozen`` and ``catalog_version`` are
    # hard-coded here rather than read from the class attribute of the same
    # name — confirm that is intended.
    return CatalogAttachResult(
        attach_opaque_data=self._FIXED_ATTACH_ID,
        supports_transactions=getattr(self, "supports_transactions", False),
        supports_time_travel=time_travel,
        catalog_version_frozen=True,
        catalog_version=1,
        attach_opaque_data_required=False,
        default_schema=self._default_schema_name,
        settings=settings_payload,
        secret_types=secret_types_payload,
        comment=comment,
        tags=tags,
        supports_column_statistics=column_stats,
        resolved_data_version=None,
        resolved_implementation_version=None,
    )
|
|
2240
|
+
|
|
2241
|
+
def schemas(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
) -> list[SchemaInfo]:
    """Return a SchemaInfo for every registered schema.

    ``attach_opaque_data`` is forwarded to each schema's ``to_schema_info``;
    ``transaction_opaque_data`` is not used.
    """
    self._build_registries()
    registry = self._schema_registry
    assert registry is not None
    infos = []
    for schema in registry.values():
        infos.append(schema.to_schema_info(attach_opaque_data))
    return infos
|
|
2248
|
+
|
|
2249
|
+
def schema_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
) -> SchemaInfo | None:
    """Look up a schema by name, ignoring case.

    Returns:
        The schema's SchemaInfo (built with ``attach_opaque_data``), or
        None when no schema is registered under that name.
    """
    self._build_registries()
    registry = self._schema_registry
    assert registry is not None
    found = registry.get(name.lower())
    if not found:
        return None
    return found.to_schema_info(attach_opaque_data)
|
|
2261
|
+
|
|
2262
|
+
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Look up a table by schema and name, ignoring case.

    ``at_unit`` / ``at_value`` are validated, but this implementation
    returns the same table info regardless of them (no schema evolution).
    Override to return version-specific schemas for time-travel queries.

    Returns:
        The table's TableInfo, or None when the table is not registered.

    Raises:
        ValueError: If ``at_unit`` is given for a table that does not
            support time travel.
    """
    _validate_at_params(at_unit, at_value)

    self._build_registries()
    assert self._table_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    table = self._table_registry.get((schema_key, name.lower()))
    if table is None:
        return None

    # An AT clause is only meaningful for tables that support time travel.
    if at_unit and not table.supports_time_travel:
        raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")

    # Prefer the schema's declared spelling over the caller's.
    owner = self._schema_registry.get(schema_key)
    canonical_schema = owner.name if owner else schema_name
    return table.to_table_info(canonical_schema)
|
|
2293
|
+
|
|
2294
|
+
def view_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ViewInfo | None:
    """Look up a view by schema and name, ignoring case.

    Returns:
        The view's ViewInfo, or None when the view is not registered.
    """
    self._build_registries()
    assert self._view_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    view = self._view_registry.get((schema_key, name.lower()))
    if not view:
        return None
    # Prefer the schema's declared spelling over the caller's.
    owner = self._schema_registry.get(schema_key)
    return view.to_view_info(owner.name if owner else schema_name)
|
|
2311
|
+
|
|
2312
|
+
def macro_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> MacroInfo | None:
    """Look up a macro by schema and name, ignoring case.

    Returns:
        The macro's MacroInfo, or None when the macro is not registered.
    """
    self._build_registries()
    assert self._macro_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    macro = self._macro_registry.get((schema_key, name.lower()))
    if not macro:
        return None
    # Prefer the schema's declared spelling over the caller's.
    owner = self._schema_registry.get(schema_key)
    return macro.to_macro_info(owner.name if owner else schema_name)
|
|
2329
|
+
|
|
2330
|
+
def index_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> IndexInfo | None:
    """Look up an index by schema and name, ignoring case.

    Returns:
        The index's IndexInfo, or None when the index is not registered.
    """
    self._build_registries()
    assert self._index_registry is not None
    assert self._schema_registry is not None

    schema_key = schema_name.lower()
    index = self._index_registry.get((schema_key, name.lower()))
    if index is None:
        return None
    # Prefer the schema's declared spelling over the caller's.
    owner = self._schema_registry.get(schema_key)
    return index.to_index_info(owner.name if owner else schema_name)
|
|
2347
|
+
|
|
2348
|
+
def table_column_statistics_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> TableColumnStatisticsResult | None:
    """Return column statistics for a registered table (case-insensitive).

    The work is delegated to the table descriptor's
    ``resolve_column_statistics()``. Override this method for dynamic or
    computed statistics.

    Returns:
        Whatever ``resolve_column_statistics()`` yields, or None when the
        table is not registered.
    """
    self._build_registries()
    registry = self._table_registry
    assert registry is not None
    lookup_key = (schema_name.lower(), name.lower())
    table = registry.get(lookup_key)
    return None if table is None else table.resolve_column_statistics()
|
|
2368
|
+
|
|
2369
|
+
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Resolve the scan function for a table.

    Function-backed tables (``Table.function`` set) are handled
    automatically: the result names the linked function with no arguments.
    Any other table, including an unknown one, must be served by an
    override in the Worker.

    Raises:
        ValueError: If ``at_unit`` is given for a table that does not
            support time travel.
        NotImplementedError: If the table is not function-backed; the
            message lists the tables declared on the registered schemas.
    """
    _validate_at_params(at_unit, at_value)

    self._build_registries()
    assert self._table_registry is not None
    assert self._schema_registry is not None

    table = self._table_registry.get((schema_name.lower(), name.lower()))
    if table is not None:
        # An AT clause is only meaningful for tables that support time travel.
        if at_unit and not table.supports_time_travel:
            raise ValueError(f"Table '{schema_name}.{name}' does not support time travel queries")

        backing_function = table.function
        if backing_function is not None:
            return ScanFunctionResult(
                function_name=backing_function.get_metadata().name,
                positional_arguments=[],
                named_arguments={},
                required_extensions=[],
            )

    # Nothing automatic applies: fail with a message listing what exists.
    catalog_name = self._effective_catalog_name
    known_tables = sorted(
        f"{catalog_name}.{schema.name}.{declared.name}"
        for schema in self._schema_registry.values()
        for declared in schema.tables
    )
    available_str = ", ".join(known_tables) if known_tables else "(none)"

    raise NotImplementedError(
        f"table_scan_function_get not implemented for table "
        f"'{catalog_name}.{schema_name}.{name}'. "
        f"Available tables: {available_str}. "
        f"Either use Table(function=...) for automatic scanning, "
        f"or override table_scan_function_get in your Worker."
    )
|
|
2424
|
+
|
|
2425
|
+
def _write_function_get(
|
|
2426
|
+
self,
|
|
2427
|
+
*,
|
|
2428
|
+
schema_name: str,
|
|
2429
|
+
name: str,
|
|
2430
|
+
operation: str,
|
|
2431
|
+
attr_name: str,
|
|
2432
|
+
) -> ScanFunctionResult:
|
|
2433
|
+
"""Shared implementation for table_{insert,update,delete}_function_get."""
|
|
2434
|
+
self._build_registries()
|
|
2435
|
+
assert self._table_registry is not None
|
|
2436
|
+
|
|
2437
|
+
table = self._table_registry.get((schema_name.lower(), name.lower()))
|
|
2438
|
+
if table is None:
|
|
2439
|
+
raise NotImplementedError(f"Table '{schema_name}.{name}' not found in catalog.")
|
|
2440
|
+
|
|
2441
|
+
write_func = getattr(table, attr_name, None)
|
|
2442
|
+
if write_func is None:
|
|
2443
|
+
raise CatalogReadOnlyError(f"Table '{schema_name}.{name}' does not support {operation}.")
|
|
2444
|
+
|
|
2445
|
+
func_meta = write_func.get_metadata()
|
|
2446
|
+
return ScanFunctionResult(
|
|
2447
|
+
function_name=func_meta.name,
|
|
2448
|
+
positional_arguments=[],
|
|
2449
|
+
named_arguments={},
|
|
2450
|
+
required_extensions=[],
|
|
2451
|
+
)
|
|
2452
|
+
|
|
2453
|
+
def table_insert_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    writable_branch_function_name: str | None = None,
) -> ScanFunctionResult:
    """Resolve the INSERT function of a table via ``_write_function_get``."""
    # Tables here are single-branch, so the writable-branch hint carries no
    # information and is dropped.
    del writable_branch_function_name
    lookup = {
        "schema_name": schema_name,
        "name": name,
        "operation": "INSERT",
        "attr_name": "insert_function",
    }
    return self._write_function_get(**lookup)
|
|
2472
|
+
|
|
2473
|
+
def table_update_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Resolve the UPDATE function of a table via ``_write_function_get``."""
    lookup = {
        "schema_name": schema_name,
        "name": name,
        "operation": "UPDATE",
        "attr_name": "update_function",
    }
    return self._write_function_get(**lookup)
|
|
2488
|
+
|
|
2489
|
+
def table_delete_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Resolve the DELETE function of a table via ``_write_function_get``."""
    lookup = {
        "schema_name": schema_name,
        "name": name,
        "operation": "DELETE",
        "attr_name": "delete_function",
    }
    return self._write_function_get(**lookup)
|
|
2504
|
+
|
|
2505
|
+
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.TABLE],
) -> Sequence[TableInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.VIEW],
) -> Sequence[ViewInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[
        SchemaObjectType.SCALAR_FUNCTION,
        SchemaObjectType.TABLE_FUNCTION,
        SchemaObjectType.AGGREGATE_FUNCTION,
    ],
) -> Sequence[FunctionInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO],
) -> Sequence[MacroInfo]: ...

@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.INDEX],
) -> Sequence[IndexInfo]: ...

def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: SchemaObjectType,
) -> Sequence[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo]:
    """Enumerate the objects of one kind that live in a schema.

    The schema is located by its lower-cased name, so the lookup is
    case-insensitive. Every returned info object is built with the
    schema's registered name rather than the spelling the caller passed.

    Args:
        attach_opaque_data: The attachment identifier. Not consulted by
            this implementation.
        transaction_opaque_data: The transaction identifier, if any. Not
            consulted by this implementation.
        name: The name of the schema to list.
        type: Which kind of object to list. Values that are not already a
            SchemaObjectType are coerced with ``SchemaObjectType(type)``.

    Returns:
        A list of TableInfo, ViewInfo, IndexInfo, MacroInfo, or
        FunctionInfo objects matching ``type``; an empty list when the
        schema is not registered.

    """
    self._build_registries()
    assert self._schema_registry is not None
    assert self._table_registry is not None
    assert self._view_registry is not None
    assert self._function_registry is not None
    assert self._macro_registry is not None
    assert self._index_registry is not None

    # Registry keys carry the lower-cased schema name as their first element.
    schema_key = name.lower()
    schema_entry = self._schema_registry.get(schema_key)
    if schema_entry is None:
        return []
    canonical_name = schema_entry.name

    # The wire protocol may hand over the raw enum value instead of the enum.
    kind = type if isinstance(type, SchemaObjectType) else SchemaObjectType(type)

    if kind == SchemaObjectType.TABLE:
        tables: list[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo] = []
        for (owner, _), table in self._table_registry.items():
            if owner != schema_key:
                continue
            info = table.to_table_info(canonical_name)
            # Inline-bind post-pass: a descriptor flagged inline_bind that is
            # backed by a function may yield a pre-built bind result, which
            # is attached to TableInfo.bind_result so the C++ extension can
            # use the bytes verbatim and skip the per-scan bind RPC.
            wants_inline = table.inline_bind and table.function is not None
            inline_bytes = _inline_bind_result_for(table.function) if wants_inline else None
            if inline_bytes is not None:
                info = dataclasses.replace(info, bind_result=inline_bytes)
            tables.append(info)
        return tables

    if kind == SchemaObjectType.VIEW:
        return [
            view.to_view_info(canonical_name)
            for (owner, _), view in self._view_registry.items()
            if owner == schema_key
        ]

    if kind == SchemaObjectType.INDEX:
        return [
            index.to_index_info(canonical_name)
            for (owner, _), index in self._index_registry.items()
            if owner == schema_key
        ]

    if kind in (SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO):
        wanted_macro = MacroType.SCALAR if kind == SchemaObjectType.SCALAR_MACRO else MacroType.TABLE
        return [
            macro.to_macro_info(canonical_name)
            for (owner, _), macro in self._macro_registry.items()
            if owner == schema_key and macro.macro_type == wanted_macro
        ]

    # Everything else is answered from the function registry:
    # SCALAR_FUNCTION, TABLE_FUNCTION and AGGREGATE_FUNCTION each keep only
    # their own function types. Any other kind has no entry in the table
    # below, so no filter applies and every function in the schema is kept.
    accepted_types = {
        SchemaObjectType.SCALAR_FUNCTION: (FunctionType.SCALAR,),
        SchemaObjectType.TABLE_FUNCTION: (FunctionType.TABLE, FunctionType.TABLE_BUFFERING),
        SchemaObjectType.AGGREGATE_FUNCTION: (FunctionType.AGGREGATE,),
    }.get(kind)

    functions: list[TableInfo | ViewInfo | FunctionInfo | MacroInfo | IndexInfo] = []
    for (owner, _), func_classes in self._function_registry.items():
        if owner != schema_key:
            continue
        for func_cls in func_classes:
            # The info object is built first because the catalog function
            # type used for filtering is only known once it exists.
            func_info = self._function_to_info(func_cls, canonical_name)
            if accepted_types is None or func_info.function_type in accepted_types:
                functions.append(func_info)
    return functions
|
|
2653
|
+
|
|
2654
|
+
def _function_to_info(self, func_cls: type, schema_name: str) -> FunctionInfo:
    """Build the catalog FunctionInfo record describing ``func_cls``.

    Args:
        func_cls: The function class to describe. Its metadata is obtained
            with ``resolve_metadata`` and its arguments with
            ``extract_argument_specs``.
        schema_name: The schema name to record on the resulting info.

    Returns:
        A FunctionInfo carrying the serialized argument and output schemas
        together with the behaviour, documentation, capability, settings
        and secrets fields taken from the class metadata.

    """
    # Deferred to call time to avoid circular imports.
    from vgi.argument_spec import (
        argument_specs_to_schema,
        extract_argument_specs,
    )
    from vgi.metadata import CatalogFunctionType as MetadataFunctionType
    from vgi.metadata import resolve_metadata

    meta = resolve_metadata(func_cls)

    # Translate the metadata-level function type into the catalog-level one;
    # a metadata type missing from the table is reported as a table function.
    catalog_type_for = {
        MetadataFunctionType.SCALAR: FunctionType.SCALAR,
        MetadataFunctionType.TABLE: FunctionType.TABLE,
        MetadataFunctionType.TABLE_BUFFERING: FunctionType.TABLE_BUFFERING,
        MetadataFunctionType.AGGREGATE: FunctionType.AGGREGATE,
    }
    func_type = catalog_type_for.get(meta.function_type, FunctionType.TABLE)
    is_scalar = func_type == FunctionType.SCALAR
    is_aggregate = func_type == FunctionType.AGGREGATE
    not_scalar = not is_scalar

    # Arguments are shipped as a serialized Arrow schema.
    args_schema = argument_specs_to_schema(extract_argument_specs(func_cls))
    args_bytes = SerializedSchema(args_schema.serialize().to_pybytes())

    # Only scalar and aggregate functions are asked for an output schema
    # here, via their catalog_output_schema() classmethod when they have
    # one. Every other case is reported with an empty schema.
    declares_output = hasattr(func_cls, "catalog_output_schema")
    if (is_scalar or is_aggregate) and declares_output:
        output_schema: pa.Schema = func_cls.catalog_output_schema()  # type: ignore[attr-defined]
    else:
        output_schema = pa.schema([])
    output_bytes = SerializedSchema(output_schema.serialize().to_pybytes())

    catalog_examples = [
        CatalogExample(
            sql=example.sql,
            description=example.description,
            expected_output=example.expected_output,
        )
        for example in meta.examples
    ]

    return FunctionInfo(
        name=meta.name,
        schema_name=schema_name,
        function_type=func_type,
        arguments=args_bytes,
        output_schema=output_bytes,
        comment=None,  # Functions don't use comment; use description instead
        tags=meta.tags,
        # Behaviour fields: stability is reported for scalar functions only,
        # null handling for scalar and aggregate functions.
        stability=meta.stability if is_scalar else None,
        null_handling=meta.null_handling if (is_scalar or is_aggregate) else None,
        # Documentation fields
        description=meta.description or "",  # Intrinsic from Meta.description
        examples=catalog_examples,
        categories=meta.categories,
        # Capability fields: neutral values for scalar functions, metadata
        # values for every other function type.
        projection_pushdown=meta.projection_pushdown if not_scalar else None,
        filter_pushdown=meta.filter_pushdown if not_scalar else None,
        sampling_pushdown=meta.sampling_pushdown if not_scalar else None,
        late_materialization=meta.late_materialization if not_scalar else None,
        supported_expression_filters=meta.supported_expression_filters if not_scalar else [],
        order_preservation=meta.preserves_order if not_scalar else None,
        max_workers=meta.max_workers if not_scalar else None,
        supports_batch_index=meta.supports_batch_index if not_scalar else False,
        partition_kind=meta.partition_kind if not_scalar else PartitionKind.NOT_PARTITIONED,
        # Aggregate function fields (copied from metadata for every type)
        order_dependent=meta.order_dependent,
        distinct_dependent=meta.distinct_dependent,
        supports_window=meta.supports_window,
        streaming_partitioned=meta.streaming_partitioned,
        has_finalize=meta.has_finalize,
        source_order_dependent=meta.source_order_dependent,
        sink_order_dependent=meta.sink_order_dependent,
        requires_input_batch_index=meta.requires_input_batch_index,
        # Settings
        required_settings=meta.required_settings,
        # Secrets
        required_secrets=list(meta.required_secrets),
    )
|
|
2737
|
+
|
|
2738
|
+
# ========== DDL operations (not supported — read-only catalog) ==========
|
|
2739
|
+
|
|
2740
|
+
# Each DDL entry point below is bound to the result of `_read_only(<label>)`.
# `_read_only` is defined elsewhere in this file; presumably it builds a
# handler that rejects the operation and uses the label in its error
# (NOTE(review): confirm against its definition).

# Catalog lifecycle and transactions
catalog_create = _read_only("create catalog")
catalog_drop = _read_only("drop catalog")
catalog_transaction_begin = _read_only("begin transaction")
catalog_transaction_commit = _read_only("commit transaction")
catalog_transaction_rollback = _read_only("rollback transaction")

# Schemas
schema_create = _read_only("create schema")
schema_drop = _read_only("drop schema")

# Tables and their columns
table_create = _read_only("create table")
table_drop = _read_only("drop table")
table_comment_set = _read_only("set table comment")
table_column_comment_set = _read_only("set column comment")
table_rename = _read_only("rename table")
table_column_add = _read_only("add column")
table_column_drop = _read_only("drop column")
table_column_rename = _read_only("rename column")
table_column_default_set = _read_only("set column default")
table_column_default_drop = _read_only("drop column default")
table_column_type_change = _read_only("change column type")
table_not_null_drop = _read_only("drop NOT NULL constraint")
table_not_null_set = _read_only("set NOT NULL constraint")

# Views
view_create = _read_only("create view")
view_drop = _read_only("drop view")
view_rename = _read_only("rename view")
view_comment_set = _read_only("set view comment")

# Macros
macro_create = _read_only("create macro")
macro_drop = _read_only("drop macro")

# Indexes
index_create = _read_only("create index")
index_drop = _read_only("drop index")
|