vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Declarative descriptor classes for catalog definition.
|
|
4
|
+
|
|
5
|
+
This module provides classes for declaratively defining catalog structure:
|
|
6
|
+
- Catalog: Top-level container for schemas
|
|
7
|
+
- Schema: Groups tables, views, and functions
|
|
8
|
+
- Table: Table definition with columns and constraints
|
|
9
|
+
- View: View definition with SQL
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Sequence
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import TYPE_CHECKING, Any, Union
|
|
18
|
+
|
|
19
|
+
import pyarrow as pa
|
|
20
|
+
|
|
21
|
+
from vgi.arguments import Arguments
|
|
22
|
+
from vgi.catalog.catalog_interface import (
|
|
23
|
+
AttachOpaqueData,
|
|
24
|
+
ColumnStatistics,
|
|
25
|
+
IndexConstraintType,
|
|
26
|
+
IndexInfo,
|
|
27
|
+
MacroInfo,
|
|
28
|
+
MacroType,
|
|
29
|
+
ScanFunctionResult,
|
|
30
|
+
SchemaInfo,
|
|
31
|
+
SerializedSchema,
|
|
32
|
+
TableColumnStatisticsResult,
|
|
33
|
+
TableInfo,
|
|
34
|
+
ViewInfo,
|
|
35
|
+
serialize_column_statistics,
|
|
36
|
+
)
|
|
37
|
+
from vgi.invocation import BindResponse, FunctionType
|
|
38
|
+
from vgi.metadata import CatalogFunctionType
|
|
39
|
+
|
|
40
|
+
if TYPE_CHECKING:
|
|
41
|
+
from vgi.function import Function
|
|
42
|
+
from vgi.table_function import TableFunctionGenerator
|
|
43
|
+
from vgi.table_in_out_function import TableInOutGenerator
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Sql(str):
    """Marker type for a default value that is already a SQL expression.

    An ``Sql`` instance is passed through verbatim instead of being quoted
    as a string literal.  Wrap the expression whenever the default should be
    evaluated as SQL rather than stored as text::

        defaults={"created_at": Sql("current_timestamp")}
    """
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# A default value is either a Python literal (str, int, float, bool, None) or
# an ``Sql`` instance carrying a raw SQL expression.  ``Sql`` subclasses
# ``str``, so it is already covered by the ``str`` member of this union.
# Plain ``str`` values are treated as string literals and automatically quoted.
DefaultValue = str | int | float | bool | None

# A stat value is either a plain Python value (auto-converted using the column's
# Arrow type) or an explicit PyArrow scalar (used as-is).
StatValue = Union[None, bool, int, float, str, bytes, "pa.Scalar"]  # type: ignore[type-arg]

# Names exported by ``from vgi.catalog.descriptors import *``.
__all__ = [
    "Catalog",
    "ColumnStatisticsInput",
    "DefaultValue",
    "ForeignKeyDef",
    "Index",
    "Macro",
    "Schema",
    "Sql",
    "Table",
    "View",
]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _to_scalar(
    value: StatValue,
    arrow_type: pa.DataType,
) -> pa.Scalar | None:  # type: ignore[type-arg]
    """Coerce a statistics value into a PyArrow scalar of the column's type.

    Args:
        value: ``None``, a plain Python value, or an existing ``pa.Scalar``.
        arrow_type: Arrow type of the column the statistic belongs to.

    Returns:
        ``None`` for a ``None`` input, the input itself when it is already a
        ``pa.Scalar``, otherwise a new scalar built with ``pa.scalar``.
    """
    # Nothing to convert: a missing value stays missing and an explicit
    # scalar is trusted as-is.
    if value is None or isinstance(value, pa.Scalar):
        return value

    # Dictionary-encoded columns report stats in terms of the value type so
    # min/max serialize as the actual value rather than the dictionary index.
    target_type = arrow_type.value_type if pa.types.is_dictionary(arrow_type) else arrow_type
    return pa.scalar(value, type=target_type)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass(frozen=True, slots=True)
|
|
95
|
+
class ColumnStatisticsInput:
|
|
96
|
+
"""Column statistics specified on a Table descriptor.
|
|
97
|
+
|
|
98
|
+
Values for ``min`` and ``max`` can be plain Python literals (int, float, str, etc.)
|
|
99
|
+
which are auto-converted to PyArrow scalars using the column's Arrow type from
|
|
100
|
+
the table schema, or explicit ``pa.scalar(...)`` values used as-is.
|
|
101
|
+
|
|
102
|
+
Example::
|
|
103
|
+
|
|
104
|
+
# Plain Python values — types inferred from schema
|
|
105
|
+
ColumnStatisticsInput(min=1, max=100, has_null=False, distinct_count=100)
|
|
106
|
+
|
|
107
|
+
# Explicit PyArrow scalars
|
|
108
|
+
ColumnStatisticsInput(min=pa.scalar(1, pa.int32()), max=pa.scalar(100, pa.int32()))
|
|
109
|
+
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
min: StatValue = None
|
|
113
|
+
max: StatValue = None
|
|
114
|
+
has_null: bool = True
|
|
115
|
+
has_not_null: bool = True
|
|
116
|
+
distinct_count: int | None = None
|
|
117
|
+
contains_unicode: bool | None = None
|
|
118
|
+
max_string_length: int | None = None
|
|
119
|
+
|
|
120
|
+
def resolve(self, column_name: str, arrow_type: pa.DataType) -> ColumnStatistics:
|
|
121
|
+
"""Convert to a :class:`ColumnStatistics` with properly typed PyArrow scalars."""
|
|
122
|
+
return ColumnStatistics(
|
|
123
|
+
column_name=column_name,
|
|
124
|
+
min=_to_scalar(self.min, arrow_type),
|
|
125
|
+
max=_to_scalar(self.max, arrow_type),
|
|
126
|
+
has_null=self.has_null,
|
|
127
|
+
has_not_null=self.has_not_null,
|
|
128
|
+
distinct_count=self.distinct_count,
|
|
129
|
+
contains_unicode=self.contains_unicode,
|
|
130
|
+
max_string_length=self.max_string_length,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _default_to_sql(value: DefaultValue) -> str:
    """Convert a Python default value to a SQL expression string.

    - ``None``: ``NULL``
    - ``bool``: ``true`` / ``false``
    - ``int``: unquoted numeric literal
    - ``float``: unquoted numeric literal; non-finite values are emitted as
      ``CAST('Infinity' AS DOUBLE)``, ``CAST('-Infinity' AS DOUBLE)`` or
      ``CAST('NaN' AS DOUBLE)`` because bare ``inf`` / ``nan`` tokens are not
      SQL literals
    - ``Sql``: passed through verbatim (raw SQL)
    - ``str``: quoted as a SQL string literal (``'hello'``), with embedded
      single quotes doubled

    Args:
        value: The default value declared on the table descriptor.

    Returns:
        The SQL text for the default.

    Raises:
        TypeError: If ``value`` is not one of the supported types.
    """
    import math

    if value is None:
        return "NULL"
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, int):
        return str(value)
    if isinstance(value, float):
        # repr() would yield 'nan' / 'inf' / '-inf', which SQL parses as
        # identifiers rather than numbers; spell them as quoted casts instead.
        if math.isnan(value):
            return "CAST('NaN' AS DOUBLE)"
        if math.isinf(value):
            return "CAST('Infinity' AS DOUBLE)" if value > 0 else "CAST('-Infinity' AS DOUBLE)"
        return repr(value)
    # Sql is a str subclass, so it has to be recognized before the plain-str
    # branch quotes it.
    if isinstance(value, Sql):
        return str(value)
    if not isinstance(value, str):
        raise TypeError(
            f"Unsupported default value type {type(value).__name__}; "
            f"expected str, int, float, bool, None or Sql"
        )
    # str — quote as SQL string literal, escaping single quotes
    escaped = value.replace("'", "''")
    return f"'{escaped}'"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _inline_function_result(
    func: type[Function] | None,
) -> bytes | None:
    """Serialize a ``ScanFunctionResult`` describing ``func`` for inlining.

    The result names the function (taken from ``func.get_metadata().name``)
    and carries empty positional arguments, empty named arguments and no
    required extensions, mirroring the
    ``ReadOnlyCatalogInterface._write_function_get`` /
    ``table_scan_function_get`` auto-implementation.  The C++ extension uses
    these bytes verbatim and skips the corresponding
    ``catalog_table_*_function_get`` RPC.

    Args:
        func: Function class backing the operation, or ``None`` when the
            table is not function-backed for that operation.

    Returns:
        The serialized result, or ``None`` when ``func`` is ``None``.
    """
    if func is None:
        return None

    metadata = func.get_metadata()
    descriptor = ScanFunctionResult(
        function_name=metadata.name,
        positional_arguments=[],
        named_arguments={},
        required_extensions=[],
    )
    return descriptor.serialize()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@dataclass(frozen=True, slots=True)
|
|
181
|
+
class ForeignKeyDef:
|
|
182
|
+
"""A foreign key constraint definition.
|
|
183
|
+
|
|
184
|
+
Attributes:
|
|
185
|
+
columns: Column names in THIS table that form the FK.
|
|
186
|
+
referenced_table: Name of the referenced table.
|
|
187
|
+
referenced_columns: Column names in the referenced table.
|
|
188
|
+
referenced_schema: Schema of the referenced table.
|
|
189
|
+
Defaults to None meaning same schema as this table.
|
|
190
|
+
|
|
191
|
+
"""
|
|
192
|
+
|
|
193
|
+
columns: tuple[str, ...]
|
|
194
|
+
referenced_table: str
|
|
195
|
+
referenced_columns: tuple[str, ...]
|
|
196
|
+
referenced_schema: str | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
|
|
200
|
+
class Table:
|
|
201
|
+
"""Declarative table definition.
|
|
202
|
+
|
|
203
|
+
Immutable. Can be defined in two ways:
|
|
204
|
+
|
|
205
|
+
1. **Explicit columns**: Provide ``columns`` schema directly.
|
|
206
|
+
2. **Function-backed**: Provide ``function`` reference — the schema is
|
|
207
|
+
derived by calling ``bind()`` on the function class. If the function
|
|
208
|
+
requires arguments, supply them via ``arguments``.
|
|
209
|
+
|
|
210
|
+
Attributes:
|
|
211
|
+
name: Table name.
|
|
212
|
+
columns: Explicit PyArrow schema (mutually exclusive with function).
|
|
213
|
+
function: TableFunctionGenerator class to derive schema from
|
|
214
|
+
(mutually exclusive with columns).
|
|
215
|
+
arguments: Arguments to pass when calling ``bind()`` on a
|
|
216
|
+
function-backed table. Required when the function has
|
|
217
|
+
mandatory parameters.
|
|
218
|
+
not_null: Tuple of column names with NOT NULL constraints.
|
|
219
|
+
unique: Tuple of column name tuples for UNIQUE constraints.
|
|
220
|
+
check: Tuple of SQL expressions for CHECK constraints.
|
|
221
|
+
defaults: Dict mapping column names to default values. Accepts
|
|
222
|
+
Python literals (str, int, float, bool, None) which are
|
|
223
|
+
auto-converted, or ``Sql`` for raw SQL.
|
|
224
|
+
generated_columns: Dict mapping column names to SQL expressions
|
|
225
|
+
for generated (virtual) columns. Generated columns are
|
|
226
|
+
computed on read by DuckDB and are mutually exclusive with
|
|
227
|
+
defaults.
|
|
228
|
+
column_comments: Dict mapping column names to comment strings.
|
|
229
|
+
Comments are transported as Arrow field metadata and visible
|
|
230
|
+
via ``duckdb_columns()`` in DuckDB.
|
|
231
|
+
required_field_filter_paths: Dotted-path column references that MUST
|
|
232
|
+
appear in a WHERE expression for any scan of this table. Top-level
|
|
233
|
+
names (``"country"``) or struct subfields (``"bbox.xmin"``,
|
|
234
|
+
``"nested.outer.inner"``). Empty (default) means no enforcement.
|
|
235
|
+
Satisfaction is prefix-based: a present filter on a shorter path
|
|
236
|
+
satisfies any required path it is a prefix of. So a whole-struct
|
|
237
|
+
filter on ``bbox`` satisfies all of ``bbox.xmin`` / ``.xmax`` /
|
|
238
|
+
``.ymin`` / ``.ymax`` — the wider filter is at least as
|
|
239
|
+
constraining as the four individual ones. The VGI DuckDB
|
|
240
|
+
extension's optimizer pass consults this list at bind time and
|
|
241
|
+
throws ``BinderException`` listing any unsatisfied paths.
|
|
242
|
+
comment: Optional table comment.
|
|
243
|
+
tags: Optional metadata tags.
|
|
244
|
+
|
|
245
|
+
"""
|
|
246
|
+
|
|
247
|
+
name: str
|
|
248
|
+
columns: pa.Schema | None = None
|
|
249
|
+
function: type[TableFunctionGenerator[Any, Any]] | None = None
|
|
250
|
+
arguments: Arguments | None = None
|
|
251
|
+
supports_time_travel: bool = False
|
|
252
|
+
insert_function: type[TableInOutGenerator[Any, Any]] | None = None
|
|
253
|
+
update_function: type[TableInOutGenerator[Any, Any]] | None = None
|
|
254
|
+
delete_function: type[TableInOutGenerator[Any, Any]] | None = None
|
|
255
|
+
not_null: tuple[str, ...] = ()
|
|
256
|
+
unique: tuple[tuple[str, ...], ...] = ()
|
|
257
|
+
check: tuple[str, ...] = ()
|
|
258
|
+
primary_key: tuple[tuple[str, ...], ...] = ()
|
|
259
|
+
foreign_key: tuple[ForeignKeyDef, ...] = ()
|
|
260
|
+
defaults: dict[str, DefaultValue] = field(default_factory=dict)
|
|
261
|
+
generated_columns: dict[str, str] = field(default_factory=dict)
|
|
262
|
+
column_comments: dict[str, str] = field(default_factory=dict)
|
|
263
|
+
required_field_filter_paths: tuple[str, ...] = ()
|
|
264
|
+
statistics: dict[str, ColumnStatisticsInput] = field(default_factory=dict)
|
|
265
|
+
statistics_cache_max_age_seconds: int | None = None
|
|
266
|
+
# Optional inlined cardinality. When set, the C++ extension uses these
|
|
267
|
+
# values directly and skips the per-bind ``table_function_cardinality``
|
|
268
|
+
# RPC. Use for read-only or slow-changing tables. Leave both as ``None``
|
|
269
|
+
# to keep the existing per-bind RPC behavior.
|
|
270
|
+
cardinality_estimate: int | None = None
|
|
271
|
+
cardinality_max: int | None = None
|
|
272
|
+
# Opt into pre-binding the function during ``schema_contents`` and
|
|
273
|
+
# inlining the result on ``TableInfo.bind_result``. The C++ extension
|
|
274
|
+
# then skips the per-scan ``bind`` RPC.
|
|
275
|
+
#
|
|
276
|
+
# Only valid when ``function`` is a ``@bind_fixed_schema``-decorated
|
|
277
|
+
# ``TableFunctionGenerator`` subclass — the decorator's contract (output
|
|
278
|
+
# is exactly ``cls.FIXED_SCHEMA``, no per-call inputs) matches what's
|
|
279
|
+
# safe to freeze for the catalog cache lifetime. Setting this on a
|
|
280
|
+
# descriptor whose function is not decorated raises at descriptor build.
|
|
281
|
+
inline_bind: bool = False
|
|
282
|
+
comment: str | None = None
|
|
283
|
+
tags: dict[str, str] = field(default_factory=dict)
|
|
284
|
+
|
|
285
|
+
def __post_init__(self) -> None:
    """Validate the descriptor's configuration right after construction.

    Checks run in this order and stop at the first failure:

    1. Exactly one of ``columns`` / ``function`` is set.
    2. ``inline_bind=True`` is only accepted for a ``function`` carrying the
       ``_inline_bind_safe`` marker whose effective ``on_bind`` carries the
       ``_is_bind_fixed_schema`` marker.
    3. Every column named by ``not_null``, ``unique``, ``primary_key``,
       ``foreign_key``, ``defaults``, ``generated_columns``,
       ``column_comments`` and ``statistics`` exists in the resolved schema;
       at most one primary key is declared; foreign keys have as many
       referencing as referenced columns; no column has both a default and
       a generated expression; each ``required_field_filter_paths`` entry
       is non-empty and starts with a real column name.
    4. ``update_function`` / ``delete_function`` require ``function``.

    Raises:
        ValueError: If any check fails, or if the schema cannot be derived
            from ``function`` (raised by ``_get_resolved_columns``).
    """
    # Validate mutually exclusive options
    if self.columns is None and self.function is None:
        raise ValueError(f"Table '{self.name}': must specify either 'columns' or 'function'")
    if self.columns is not None and self.function is not None:
        raise ValueError(f"Table '{self.name}': cannot specify both 'columns' and 'function'")

    # Validate inline_bind contract: only @bind_fixed_schema-decorated
    # functions qualify for the catalog framework's pre-bind path. The
    # decorator marks both the class (_inline_bind_safe=True) and the
    # installed on_bind function (_is_bind_fixed_schema=True). The
    # function-level marker lets us reject subclasses that overrode
    # on_bind even though they inherit _inline_bind_safe via MRO.
    if self.inline_bind:
        if self.function is None:
            raise ValueError(f"Table '{self.name}': inline_bind=True requires function= to be set")
        if not getattr(self.function, "_inline_bind_safe", False):
            raise ValueError(
                f"Table '{self.name}': inline_bind=True requires the function class "
                f"to be decorated with @bind_fixed_schema. Got {self.function.__name__}, "
                f"which has a custom on_bind. Either decorate it (deleting the manual "
                f"on_bind) or leave inline_bind=False."
            )
        # Look at the on_bind that attribute lookup will actually pick: the
        # first definition along the MRO. Inspecting only the class's own
        # __dict__ would miss an override made by an intermediate base class
        # that this class merely inherits.
        owner = next(
            (klass for klass in self.function.__mro__ if "on_bind" in vars(klass)),
            None,
        )
        if owner is not None:
            on_bind_attr = vars(owner)["on_bind"]
            # Unwrap classmethod/staticmethod to reach the plain function
            # that carries the marker.
            underlying = getattr(on_bind_attr, "__func__", on_bind_attr)
            if not getattr(underlying, "_is_bind_fixed_schema", False):
                if owner is self.function:
                    culprit = "it overrides on_bind"
                else:
                    culprit = f"its base class {owner.__name__} overrides on_bind"
                raise ValueError(
                    f"Table '{self.name}': inline_bind=True is not safe for "
                    f"{self.function.__name__} because {culprit}, "
                    f"escaping @bind_fixed_schema's contract. Either remove the "
                    f"override or leave inline_bind=False."
                )

    # Resolve columns to validate constraints
    resolved = self._get_resolved_columns()
    column_names = {f.name for f in resolved}

    def require_known(kind: str, col: str) -> None:
        """Raise ValueError when ``col`` is not a column of this table."""
        if col not in column_names:
            raise ValueError(
                f"Table '{self.name}': {kind} column '{col}' not found "
                f"in schema. Available columns: {sorted(column_names)}"
            )

    # Validate not_null column names
    for col in self.not_null:
        require_known("not_null", col)

    # Validate unique column names
    for group in self.unique:
        for col in group:
            require_known("unique", col)

    # Validate primary_key column names
    for group in self.primary_key:
        for col in group:
            require_known("primary_key", col)

    # Validate foreign_key column names (only FK side, not referenced table)
    for fk in self.foreign_key:
        for col in fk.columns:
            require_known("foreign_key", col)

    # Validate at most one primary key
    if len(self.primary_key) > 1:
        raise ValueError(
            f"Table '{self.name}': at most one primary_key constraint allowed, got {len(self.primary_key)}"
        )

    # Validate foreign_key column count parity
    for fk in self.foreign_key:
        if len(fk.columns) != len(fk.referenced_columns):
            raise ValueError(
                f"Table '{self.name}': foreign_key referencing '{fk.referenced_table}' "
                f"has {len(fk.columns)} FK columns but {len(fk.referenced_columns)} "
                f"referenced columns — counts must match"
            )

    # Validate defaults column names
    for col in self.defaults:
        require_known("defaults", col)

    # Validate generated_columns column names and no overlap with defaults
    for col in self.generated_columns:
        require_known("generated_columns", col)
        if col in self.defaults:
            raise ValueError(
                f"Table '{self.name}': column '{col}' cannot have both a default value and a generated expression"
            )

    # Validate column_comments column names
    for col in self.column_comments:
        require_known("column_comments", col)

    # Validate required_field_filter_paths: the leading dotted segment of
    # each path must be a real column on this table. Struct subfield
    # validity is not checked here — DuckDB's binder catches typos at
    # scan time, and the descriptor doesn't unpack STRUCT subfields.
    for path in self.required_field_filter_paths:
        if not path:
            raise ValueError(f"Table '{self.name}': required_field_filter_paths must not contain empty strings")
        head = path.split(".", 1)[0]
        if head not in column_names:
            raise ValueError(
                f"Table '{self.name}': required_field_filter_paths path '{path}' references "
                f"unknown column '{head}'. Available columns: {sorted(column_names)}"
            )

    # Validate statistics column names
    for col in self.statistics:
        require_known("statistics", col)

    # Validate write functions: UPDATE/DELETE require a scan function for row IDs
    if (self.update_function is not None or self.delete_function is not None) and self.function is None:
        raise ValueError(
            f"Table '{self.name}': update_function and delete_function require "
            f"a scan function (set 'function') to provide row IDs"
        )
|
|
434
|
+
|
|
435
|
+
def _get_resolved_columns(self) -> pa.Schema:
    """Return the table's column schema, deriving it from the function if needed.

    An explicit ``columns`` schema is returned directly.  Otherwise a
    ``BindRequest`` is built for ``function`` (using ``arguments``, or an
    empty ``Arguments()`` when none were supplied) and the function's
    ``bind()`` is called; the ``output_schema`` of its response is returned.

    Raises:
        ValueError: If ``bind()`` raises, or returns something other than a
            ``BindResponse``.
    """
    explicit = self.columns
    if explicit is not None:
        return explicit

    assert self.function is not None
    bind_arguments = Arguments() if self.arguments is None else self.arguments
    from vgi.protocol import BindRequest

    request = BindRequest(
        function_name=self.function.Meta.name,  # type: ignore[attr-defined]
        arguments=bind_arguments,
        function_type=FunctionType.TABLE,
    )
    try:
        response = self.function.bind(request)
        if isinstance(response, BindResponse):
            return response.output_schema
        # Raised inside the try on purpose: the handler below wraps it with
        # the same "failed to derive schema" context as any other failure.
        raise ValueError(
            f"Table '{self.name}': function '{self.function.__name__}' returned "
            f"unexpected bind result type: {type(response).__name__}"
        )
    except Exception as exc:
        raise ValueError(
            f"Table '{self.name}': failed to derive schema from function "
            f"'{self.function.__name__}' via bind(). If the function requires "
            f"arguments, pass them via arguments=Arguments(...). Error: {exc}"
        ) from exc
|
|
468
|
+
|
|
469
|
+
@property
def resolved_columns(self) -> pa.Schema:
    """Column schema of the table, explicit or derived from the function.

    Delegates to ``_get_resolved_columns`` on every access.
    """
    schema = self._get_resolved_columns()
    return schema
|
|
473
|
+
|
|
474
|
+
def _resolve_not_null_indices(self) -> list[int]:
    """Return the schema position of every column named in ``not_null``."""
    index_of = self.resolved_columns.get_field_index
    return [index_of(name) for name in self.not_null]
|
|
478
|
+
|
|
479
|
+
def _resolve_unique_indices(self) -> list[list[int]]:
    """Return, per ``unique`` constraint, the schema positions of its columns."""
    index_of = self.resolved_columns.get_field_index
    return [[index_of(name) for name in constraint] for constraint in self.unique]
|
|
483
|
+
|
|
484
|
+
def _resolve_primary_key_indices(self) -> list[list[int]]:
    """Return, per ``primary_key`` constraint, the schema positions of its columns."""
    index_of = self.resolved_columns.get_field_index
    return [[index_of(name) for name in constraint] for constraint in self.primary_key]
|
|
488
|
+
|
|
489
|
+
def _serialize_foreign_keys(self, schema_name: str) -> list[bytes]:
    """Serialize each foreign key constraint to record-batch IPC bytes.

    Every constraint becomes a one-row record batch with the columns
    ``fk_columns``, ``pk_columns``, ``referenced_table`` and
    ``referenced_schema``.

    Args:
        schema_name: Schema of this table; used as the referenced schema
            when a constraint does not name one.

    Returns:
        One serialized batch per entry in ``foreign_key``, in order.
    """
    from vgi_rpc.utils import serialize_record_batch_bytes

    payloads = []
    for constraint in self.foreign_key:
        row = {
            "fk_columns": [list(constraint.columns)],
            "pk_columns": [list(constraint.referenced_columns)],
            "referenced_table": [constraint.referenced_table],
            # A falsy referenced_schema falls back to this table's schema.
            "referenced_schema": [constraint.referenced_schema or schema_name],
        }
        layout = pa.schema(
            [
                ("fk_columns", pa.list_(pa.utf8())),
                ("pk_columns", pa.list_(pa.utf8())),
                ("referenced_table", pa.utf8()),
                ("referenced_schema", pa.utf8()),
            ]
        )
        batch = pa.RecordBatch.from_pydict(row, schema=layout)
        payloads.append(serialize_record_batch_bytes(batch))
    return payloads
|
|
513
|
+
|
|
514
|
+
def _apply_defaults_to_schema(self, schema: pa.Schema) -> pa.Schema:
    """Attach each column default to its field as ``default`` metadata.

    The default is rendered to SQL text with ``_default_to_sql`` and stored
    UTF-8 encoded under the ``b"default"`` key, alongside any metadata the
    field already has.

    Args:
        schema: Schema to annotate.

    Returns:
        The annotated schema; the input itself when there are no defaults.
    """
    annotated = schema
    for column, default in self.defaults.items():
        rendered = _default_to_sql(default)
        position = annotated.get_field_index(column)
        column_field = annotated.field(position)
        metadata = dict(column_field.metadata or {})
        metadata[b"default"] = rendered.encode("utf-8")
        annotated = annotated.set(position, column_field.with_metadata(metadata))  # type: ignore[arg-type]
    return annotated
|
|
526
|
+
|
|
527
|
+
def _apply_generated_columns_to_schema(self, schema: pa.Schema) -> pa.Schema:
    """Attach each generated-column expression to its field as metadata.

    The expression is stored UTF-8 encoded under the
    ``b"generated_expression"`` key, alongside any metadata the field
    already has.

    Args:
        schema: Schema to annotate.

    Returns:
        The annotated schema; the input itself when there are no generated
        columns.
    """
    annotated = schema
    for column, expression in self.generated_columns.items():
        position = annotated.get_field_index(column)
        column_field = annotated.field(position)
        metadata = dict(column_field.metadata or {})
        metadata[b"generated_expression"] = expression.encode("utf-8")
        annotated = annotated.set(position, column_field.with_metadata(metadata))  # type: ignore[arg-type]
    return annotated
|
|
538
|
+
|
|
539
|
+
def _apply_column_comments_to_schema(self, schema: pa.Schema) -> pa.Schema:
    """Attach each non-empty column comment to its field as metadata.

    The comment is stored UTF-8 encoded under the ``b"comment"`` key,
    alongside any metadata the field already has.  Empty comments are
    skipped, leaving the field untouched.

    Args:
        schema: Schema to annotate.

    Returns:
        The annotated schema; the input itself when nothing was applied.
    """
    annotated = schema
    for column, comment in self.column_comments.items():
        if comment:
            position = annotated.get_field_index(column)
            column_field = annotated.field(position)
            metadata = dict(column_field.metadata or {})
            metadata[b"comment"] = comment.encode("utf-8")
            annotated = annotated.set(position, column_field.with_metadata(metadata))  # type: ignore[arg-type]
    return annotated
|
|
552
|
+
|
|
553
|
+
def to_table_info(self, schema_name: str) -> TableInfo:
    """Build the ``TableInfo`` catalog-response record for this table.

    The resolved column schema is decorated, in order, with default-value,
    generated-expression and column-comment metadata before being
    serialized into the response.

    Args:
        schema_name: Name of the schema the table is reported under.

    Returns:
        A ``TableInfo`` describing columns, constraints, supported DML
        operations, inlined functions, cardinality hints and (when
        declared) serialized column statistics.
    """
    decorated = self._apply_column_comments_to_schema(
        self._apply_generated_columns_to_schema(
            self._apply_defaults_to_schema(self.resolved_columns),
        ),
    )

    # The resolved stats blob is inlined so the C++ extension can skip both
    # the per-scan ``table_function_statistics`` RPC and the per-table
    # ``catalog_table_column_statistics_get`` RPC. The consequence is that
    # the stats are frozen for the lifetime of the catalog cache: workers
    # whose stats change faster than catalog_version must override
    # ``to_table_info`` and leave column_statistics null.
    stats_result = self.resolve_column_statistics()
    if stats_result is None:
        stats_blob = None
    else:
        stats_blob = serialize_column_statistics(
            stats_result.statistics,
            stats_result.cache_max_age_seconds,
        )

    return TableInfo(
        name=self.name,
        schema_name=schema_name,
        columns=SerializedSchema(decorated.serialize().to_pybytes()),
        not_null_constraints=self._resolve_not_null_indices(),
        unique_constraints=self._resolve_unique_indices(),
        check_constraints=list(self.check),
        primary_key_constraints=self._resolve_primary_key_indices(),
        foreign_key_constraints=self._serialize_foreign_keys(schema_name),
        # A DML operation is advertised exactly when its function is declared.
        supports_insert=self.insert_function is not None,
        supports_update=self.update_function is not None,
        supports_delete=self.delete_function is not None,
        supports_column_statistics=bool(self.statistics),
        comment=self.comment,
        tags=dict(self.tags),
        scan_function=_inline_function_result(self.function),
        insert_function=_inline_function_result(self.insert_function),
        update_function=_inline_function_result(self.update_function),
        delete_function=_inline_function_result(self.delete_function),
        cardinality_estimate=self.cardinality_estimate,
        cardinality_max=self.cardinality_max,
        column_statistics=stats_blob,
        required_field_filter_paths=list(self.required_field_filter_paths),
    )
|
|
597
|
+
|
|
598
|
+
def resolve_column_statistics(self) -> TableColumnStatisticsResult | None:
    """Turn the ``statistics`` mapping into a :class:`TableColumnStatisticsResult`.

    Every entry is resolved against the type of the same-named field in
    the table's resolved column schema, so each entry's ``resolve`` call
    receives both the column name and that column's Arrow type.

    Returns:
        ``None`` when no statistics are declared; otherwise a result
        holding one resolved entry per declared column together with
        ``statistics_cache_max_age_seconds``.
    """
    if not self.statistics:
        return None

    table_schema = self.resolved_columns
    per_column = []
    for column, spec in self.statistics.items():
        # Look the field up by name first so the spec is resolved against
        # the column's actual type.
        column_type = table_schema.field(column).type
        per_column.append(spec.resolve(column, column_type))

    return TableColumnStatisticsResult(
        statistics=per_column,
        cache_max_age_seconds=self.statistics_cache_max_age_seconds,
    )
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
@dataclass(frozen=True)
class View:
    """Declarative description of a catalog view.

    Instances are frozen.

    Attributes:
        name: View name.
        definition: SQL text defining the view.
        comment: Optional comment for the view as a whole.
        column_comments: Optional mapping from view output column name to
            comment. The extension matches these by name against the
            columns DuckDB binds from the view's query, so entries are only
            needed for names that actually appear in the result; names
            that do not match are ignored.
        tags: Optional metadata tags.

    """

    name: str
    definition: str
    comment: str | None = None
    column_comments: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_view_info(self, schema_name: str) -> ViewInfo:
        """Build the ``ViewInfo`` catalog-response record for this view.

        Args:
            schema_name: Name of the schema the view is reported under.

        Returns:
            A ``ViewInfo`` carrying this view's fields; the mapping fields
            are passed as fresh ``dict`` copies.
        """
        comments_copy = dict(self.column_comments)
        tags_copy = dict(self.tags)
        return ViewInfo(
            name=self.name,
            schema_name=schema_name,
            definition=self.definition,
            comment=self.comment,
            column_comments=comments_copy,
            tags=tags_copy,
        )
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
@dataclass(frozen=True)
class Macro:
    """Declarative description of a catalog macro.

    Instances are frozen and validated on construction.

    Attributes:
        name: Macro name.
        macro_type: Whether this is a scalar or a table macro.
        parameters: Parameter names, in order.
        parameter_default_values: One-row RecordBatch whose column names are
            parameter names and whose values are the typed defaults, or
            ``None`` when there are no defaults.
            Example: pa.RecordBatch.from_pydict({"b": [5]}) for b := 5.
        definition: SQL expression (scalar macro) or query (table macro).
        comment: Optional macro comment.
        tags: Optional metadata tags.

    """

    name: str
    macro_type: MacroType
    parameters: list[str] = field(default_factory=list)
    parameter_default_values: pa.RecordBatch | None = None
    definition: str = ""
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Check that the default-value batch is consistent with ``parameters``.

        Raises:
            ValueError: If ``parameter_default_values`` does not have exactly
                one row, or has a column that is not a declared parameter.
        """
        defaults = self.parameter_default_values
        if defaults is None:
            # Nothing to validate when the macro declares no defaults.
            return

        if defaults.num_rows != 1:
            raise ValueError(
                f"Macro '{self.name}': parameter_default_values must have exactly 1 row, "
                f"got {defaults.num_rows}"
            )

        # Every column of the defaults batch has to name a declared parameter.
        declared = set(self.parameters)
        for col_name in defaults.schema.names:
            if col_name in declared:
                continue
            raise ValueError(
                f"Macro '{self.name}': default parameter '{col_name}' not found "
                f"in parameters list {self.parameters}"
            )

    def to_macro_info(self, schema_name: str) -> MacroInfo:
        """Build the ``MacroInfo`` catalog-response record for this macro.

        Args:
            schema_name: Name of the schema the macro is reported under.

        Returns:
            A ``MacroInfo`` carrying this macro's fields; ``parameters`` and
            ``tags`` are passed as fresh copies.
        """
        parameters_copy = list(self.parameters)
        tags_copy = dict(self.tags)
        return MacroInfo(
            name=self.name,
            schema_name=schema_name,
            macro_type=self.macro_type,
            parameters=parameters_copy,
            parameter_default_values=self.parameter_default_values,
            definition=self.definition,
            comment=self.comment,
            tags=tags_copy,
        )
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
class Index:
    """Declarative description of a catalog index.

    Instances are frozen, use slots, and must be constructed with keyword
    arguments.

    Attributes:
        name: Index name.
        table_name: Name of the table the index belongs to.
        expressions: Column names or SQL expression strings that define the
            index.
            For column-based indexes: ("col_a", "col_b")
            For expression indexes: ("lower(col_a)", "col_b + 1")
        index_type: The index type (e.g., "" for default).
        constraint_type: NONE for regular indexes, UNIQUE for unique indexes.
        options: Key-value index options.
        comment: Optional index comment.
        tags: Optional metadata tags.

    """

    name: str
    table_name: str
    expressions: tuple[str, ...] = ()
    index_type: str = ""
    constraint_type: IndexConstraintType = IndexConstraintType.NONE
    options: dict[str, str] = field(default_factory=dict)
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject indexes that lack expressions or a table name.

        Raises:
            ValueError: If ``expressions`` is empty or ``table_name`` is
                empty. The expressions check runs first.
        """
        has_expressions = bool(self.expressions)
        if not has_expressions:
            raise ValueError(f"Index '{self.name}': must specify at least one expression")

        has_table = bool(self.table_name)
        if not has_table:
            raise ValueError(f"Index '{self.name}': must specify a table_name")

    def to_index_info(self, schema_name: str) -> IndexInfo:
        """Build the ``IndexInfo`` catalog-response record for this index.

        Args:
            schema_name: Name of the schema the index is reported under.

        Returns:
            An ``IndexInfo`` carrying this index's fields; ``expressions``
            is passed as a list and the mapping fields as fresh ``dict``
            copies.
        """
        return IndexInfo(
            name=self.name,
            schema_name=schema_name,
            table_name=self.table_name,
            index_type=self.index_type,
            constraint_type=self.constraint_type,
            expressions=[*self.expressions],
            options=dict(self.options),
            comment=self.comment,
            tags=dict(self.tags),
        )
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
@dataclass
class Schema:
    """Declarative schema grouping tables, views, functions, macros, and indexes.

    Attributes:
        name: Schema name.
        comment: Optional schema comment.
        tags: Optional metadata tags.
        tables: Sequence of Table definitions.
        views: Sequence of View definitions.
        functions: Sequence of Function classes (scalar, table, or aggregate).
        macros: Sequence of Macro definitions.
        indexes: Sequence of Index definitions.

    """

    name: str
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)
    tables: Sequence[Table] = ()
    views: Sequence[View] = ()
    functions: Sequence[type[Function]] = ()
    macros: Sequence[Macro] = ()
    indexes: Sequence[Index] = ()

    def to_schema_info(self, attach_opaque_data: AttachOpaqueData) -> SchemaInfo:
        """Build the ``SchemaInfo`` catalog-response record for this schema.

        ``estimated_object_count`` is filled from the declared collections so
        the C++ extension's eager-load gate can pick between bulk
        ``LoadEntries`` and per-name single-entry RPCs without another round
        trip. Functions are bucketed by ``get_metadata().function_type`` and
        reported under three keys (``scalar_function``,
        ``aggregate_function``, ``table_function``), which lets DuckDB's
        per-type catalog probes (a name lookup walks scalar → aggregate →
        table) skip the bulk RPC for any category this schema leaves empty.

        **Zero counts are load-bearing.** An empty collection
        (e.g. ``views=()``) is reported as ``0``; the C++ client treats that
        as a hard guarantee and skips the matching bulk and per-name RPCs
        altogether. Do not "optimize" this by dropping empty keys — a
        missing key reads as count=1 (unknown) and disables that bypass.

        Args:
            attach_opaque_data: Opaque attach data forwarded unchanged into
                the response.

        Returns:
            A ``SchemaInfo`` with this schema's name, comment, a copy of its
            tags, and the per-kind object counts.
        """
        tally = dict.fromkeys(
            (
                CatalogFunctionType.SCALAR,
                CatalogFunctionType.AGGREGATE,
                CatalogFunctionType.TABLE,
                CatalogFunctionType.TABLE_BUFFERING,
            ),
            0,
        )
        for func in self.functions:
            kind = func.get_metadata().function_type
            tally[kind] += 1

        # Plain and buffering table functions are reported under one key.
        table_function_total = tally[CatalogFunctionType.TABLE] + tally[CatalogFunctionType.TABLE_BUFFERING]

        return SchemaInfo(
            attach_opaque_data=attach_opaque_data,
            name=self.name,
            comment=self.comment,
            tags=dict(self.tags),
            estimated_object_count={
                "table": len(self.tables),
                "view": len(self.views),
                "scalar_function": tally[CatalogFunctionType.SCALAR],
                "aggregate_function": tally[CatalogFunctionType.AGGREGATE],
                "table_function": table_function_total,
                "macro": len(self.macros),
                "index": len(self.indexes),
            },
        )
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
@dataclass
|
|
832
|
+
class Catalog:
|
|
833
|
+
"""Declarative catalog definition containing schemas.
|
|
834
|
+
|
|
835
|
+
The single entry point for defining all catalog metadata on a Worker.
|
|
836
|
+
|
|
837
|
+
Attributes:
|
|
838
|
+
name: The catalog name (used in SQL as the database name).
|
|
839
|
+
default_schema: Schema to use for unqualified table/view/function names.
|
|
840
|
+
schemas: Sequence of Schema objects defining the catalog contents.
|
|
841
|
+
comment: Optional comment describing the catalog.
|
|
842
|
+
tags: Optional key-value tags associated with the catalog.
|
|
843
|
+
|
|
844
|
+
"""
|
|
845
|
+
|
|
846
|
+
name: str
|
|
847
|
+
default_schema: str = "main"
|
|
848
|
+
schemas: Sequence[Schema] = ()
|
|
849
|
+
comment: str | None = None
|
|
850
|
+
tags: dict[str, str] = field(default_factory=dict)
|
|
851
|
+
|
|
852
|
+
def __post_init__(self) -> None:
|
|
853
|
+
"""Validate catalog configuration."""
|
|
854
|
+
schema_names = {s.name.lower() for s in self.schemas}
|
|
855
|
+
|
|
856
|
+
# Validate default_schema exists
|
|
857
|
+
if self.default_schema.lower() not in schema_names:
|
|
858
|
+
available = sorted(s.name for s in self.schemas) or ["(none)"]
|
|
859
|
+
raise ValueError(
|
|
860
|
+
f"Catalog '{self.name}': default_schema '{self.default_schema}' "
|
|
861
|
+
f"not found in schemas. Available schemas: {available}"
|
|
862
|
+
)
|
|
863
|
+
|
|
864
|
+
# Check for duplicate schema names (case-insensitive)
|
|
865
|
+
seen: set[str] = set()
|
|
866
|
+
for schema in self.schemas:
|
|
867
|
+
key = schema.name.lower()
|
|
868
|
+
if key in seen:
|
|
869
|
+
raise ValueError(f"Catalog '{self.name}': duplicate schema name '{schema.name}'")
|
|
870
|
+
seen.add(key)
|