vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,870 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Declarative descriptor classes for catalog definition.
4
+
5
+ This module provides classes for declaratively defining catalog structure:
6
+ - Catalog: Top-level container for schemas
7
+ - Schema: Groups tables, views, and functions
8
+ - Table: Table definition with columns and constraints
9
+ - View: View definition with SQL
10
+
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from collections.abc import Sequence
16
+ from dataclasses import dataclass, field
17
+ from typing import TYPE_CHECKING, Any, Union
18
+
19
+ import pyarrow as pa
20
+
21
+ from vgi.arguments import Arguments
22
+ from vgi.catalog.catalog_interface import (
23
+ AttachOpaqueData,
24
+ ColumnStatistics,
25
+ IndexConstraintType,
26
+ IndexInfo,
27
+ MacroInfo,
28
+ MacroType,
29
+ ScanFunctionResult,
30
+ SchemaInfo,
31
+ SerializedSchema,
32
+ TableColumnStatisticsResult,
33
+ TableInfo,
34
+ ViewInfo,
35
+ serialize_column_statistics,
36
+ )
37
+ from vgi.invocation import BindResponse, FunctionType
38
+ from vgi.metadata import CatalogFunctionType
39
+
40
+ if TYPE_CHECKING:
41
+ from vgi.function import Function
42
+ from vgi.table_function import TableFunctionGenerator
43
+ from vgi.table_in_out_function import TableInOutGenerator
44
+
45
+
46
class Sql(str):
    """Marker type for a raw SQL expression used as a column default.

    Wrapping a string in ``Sql`` signals that it is a SQL expression, not a
    Python string literal, and must be emitted verbatim instead of being
    quoted::

        defaults={"created_at": Sql("current_timestamp")}
    """
53
+
54
+
55
# Type alias for values accepted in ``Table.defaults``. Python literals
# (str, int, float, bool, None) are rendered as SQL literals; plain ``str``
# values are treated as string literals and automatically quoted. ``Sql``
# subclasses ``str``, so raw SQL expressions wrapped in ``Sql(...)`` also
# satisfy this alias.
DefaultValue = str | int | float | bool | None

# Type alias for ``min`` / ``max`` statistics. A stat value is either a plain
# Python value (auto-converted using the column's Arrow type) or an explicit
# PyArrow scalar (used as-is).
StatValue = Union[None, bool, int, float, str, bytes, "pa.Scalar"]  # type: ignore[type-arg]

# Public names exported by this module.
__all__ = [
    "Catalog",
    "ColumnStatisticsInput",
    "DefaultValue",
    "ForeignKeyDef",
    "Index",
    "Macro",
    "Schema",
    "Sql",
    "Table",
    "View",
]
76
+
77
+
78
+ def _to_scalar(
79
+ value: StatValue,
80
+ arrow_type: pa.DataType,
81
+ ) -> pa.Scalar | None: # type: ignore[type-arg]
82
+ """Convert a stat value to a PyArrow scalar, inferring type from the column schema."""
83
+ if value is None:
84
+ return None
85
+ if isinstance(value, pa.Scalar):
86
+ return value # Already a scalar — use as-is
87
+ # Unwrap dictionary type — stats should use the value type so min/max
88
+ # serialize as the actual value, not the dictionary index.
89
+ if pa.types.is_dictionary(arrow_type):
90
+ arrow_type = arrow_type.value_type
91
+ return pa.scalar(value, type=arrow_type)
92
+
93
+
94
+ @dataclass(frozen=True, slots=True)
95
+ class ColumnStatisticsInput:
96
+ """Column statistics specified on a Table descriptor.
97
+
98
+ Values for ``min`` and ``max`` can be plain Python literals (int, float, str, etc.)
99
+ which are auto-converted to PyArrow scalars using the column's Arrow type from
100
+ the table schema, or explicit ``pa.scalar(...)`` values used as-is.
101
+
102
+ Example::
103
+
104
+ # Plain Python values — types inferred from schema
105
+ ColumnStatisticsInput(min=1, max=100, has_null=False, distinct_count=100)
106
+
107
+ # Explicit PyArrow scalars
108
+ ColumnStatisticsInput(min=pa.scalar(1, pa.int32()), max=pa.scalar(100, pa.int32()))
109
+
110
+ """
111
+
112
+ min: StatValue = None
113
+ max: StatValue = None
114
+ has_null: bool = True
115
+ has_not_null: bool = True
116
+ distinct_count: int | None = None
117
+ contains_unicode: bool | None = None
118
+ max_string_length: int | None = None
119
+
120
+ def resolve(self, column_name: str, arrow_type: pa.DataType) -> ColumnStatistics:
121
+ """Convert to a :class:`ColumnStatistics` with properly typed PyArrow scalars."""
122
+ return ColumnStatistics(
123
+ column_name=column_name,
124
+ min=_to_scalar(self.min, arrow_type),
125
+ max=_to_scalar(self.max, arrow_type),
126
+ has_null=self.has_null,
127
+ has_not_null=self.has_not_null,
128
+ distinct_count=self.distinct_count,
129
+ contains_unicode=self.contains_unicode,
130
+ max_string_length=self.max_string_length,
131
+ )
132
+
133
+
134
+ def _default_to_sql(value: DefaultValue) -> str:
135
+ """Convert a Python default value to a SQL expression string.
136
+
137
+ - ``Sql``: passed through verbatim (raw SQL)
138
+ - ``str``: quoted as a SQL string literal (``'hello'``)
139
+ - ``int`` / ``float``: unquoted numeric literal
140
+ - ``bool``: ``true`` / ``false``
141
+ - ``None``: ``NULL``
142
+ """
143
+ if isinstance(value, Sql):
144
+ return str(value)
145
+ if isinstance(value, bool):
146
+ return "true" if value else "false"
147
+ if isinstance(value, int):
148
+ return str(value)
149
+ if isinstance(value, float):
150
+ return repr(value)
151
+ if value is None:
152
+ return "NULL"
153
+ # str — quote as SQL string literal, escaping single quotes
154
+ escaped = value.replace("'", "''")
155
+ return f"'{escaped}'"
156
+
157
+
158
+ def _inline_function_result(
159
+ func: type[Function] | None,
160
+ ) -> bytes | None:
161
+ """Build inlined ``ScanFunctionResult`` IPC bytes for a function-backed table.
162
+
163
+ Returns ``None`` when the table is not function-backed for that operation.
164
+ Mirrors ``ReadOnlyCatalogInterface._write_function_get`` /
165
+ ``table_scan_function_get`` auto-impl: empty positional/named arguments,
166
+ no required extensions. The C++ extension uses these bytes verbatim and
167
+ skips the corresponding ``catalog_table_*_function_get`` RPC.
168
+ """
169
+ if func is None:
170
+ return None
171
+ func_meta = func.get_metadata()
172
+ return ScanFunctionResult(
173
+ function_name=func_meta.name,
174
+ positional_arguments=[],
175
+ named_arguments={},
176
+ required_extensions=[],
177
+ ).serialize()
178
+
179
+
180
+ @dataclass(frozen=True, slots=True)
181
+ class ForeignKeyDef:
182
+ """A foreign key constraint definition.
183
+
184
+ Attributes:
185
+ columns: Column names in THIS table that form the FK.
186
+ referenced_table: Name of the referenced table.
187
+ referenced_columns: Column names in the referenced table.
188
+ referenced_schema: Schema of the referenced table.
189
+ Defaults to None meaning same schema as this table.
190
+
191
+ """
192
+
193
+ columns: tuple[str, ...]
194
+ referenced_table: str
195
+ referenced_columns: tuple[str, ...]
196
+ referenced_schema: str | None = None
197
+
198
+
199
+ @dataclass(frozen=True, slots=True, kw_only=True)
200
+ class Table:
201
+ """Declarative table definition.
202
+
203
+ Immutable. Can be defined in two ways:
204
+
205
+ 1. **Explicit columns**: Provide ``columns`` schema directly.
206
+ 2. **Function-backed**: Provide ``function`` reference — the schema is
207
+ derived by calling ``bind()`` on the function class. If the function
208
+ requires arguments, supply them via ``arguments``.
209
+
210
+ Attributes:
211
+ name: Table name.
212
+ columns: Explicit PyArrow schema (mutually exclusive with function).
213
+ function: TableFunctionGenerator class to derive schema from
214
+ (mutually exclusive with columns).
215
+ arguments: Arguments to pass when calling ``bind()`` on a
216
+ function-backed table. Required when the function has
217
+ mandatory parameters.
218
+ not_null: Tuple of column names with NOT NULL constraints.
219
+ unique: Tuple of column name tuples for UNIQUE constraints.
220
+ check: Tuple of SQL expressions for CHECK constraints.
221
+ defaults: Dict mapping column names to default values. Accepts
222
+ Python literals (str, int, float, bool, None) which are
223
+ auto-converted, or SqlExpression for raw SQL.
224
+ generated_columns: Dict mapping column names to SQL expressions
225
+ for generated (virtual) columns. Generated columns are
226
+ computed on read by DuckDB and are mutually exclusive with
227
+ defaults.
228
+ column_comments: Dict mapping column names to comment strings.
229
+ Comments are transported as Arrow field metadata and visible
230
+ via ``duckdb_columns()`` in DuckDB.
231
+ required_field_filter_paths: Dotted-path column references that MUST
232
+ appear in a WHERE expression for any scan of this table. Top-level
233
+ names (``"country"``) or struct subfields (``"bbox.xmin"``,
234
+ ``"nested.outer.inner"``). Empty (default) means no enforcement.
235
+ Satisfaction is prefix-based: a present filter on a shorter path
236
+ satisfies any required path it is a prefix of. So a whole-struct
237
+ filter on ``bbox`` satisfies all of ``bbox.xmin`` / ``.xmax`` /
238
+ ``.ymin`` / ``.ymax`` — the wider filter is at least as
239
+ constraining as the four individual ones. The VGI DuckDB
240
+ extension's optimizer pass consults this list at bind time and
241
+ throws ``BinderException`` listing any unsatisfied paths.
242
+ comment: Optional table comment.
243
+ tags: Optional metadata tags.
244
+
245
+ """
246
+
247
+ name: str
248
+ columns: pa.Schema | None = None
249
+ function: type[TableFunctionGenerator[Any, Any]] | None = None
250
+ arguments: Arguments | None = None
251
+ supports_time_travel: bool = False
252
+ insert_function: type[TableInOutGenerator[Any, Any]] | None = None
253
+ update_function: type[TableInOutGenerator[Any, Any]] | None = None
254
+ delete_function: type[TableInOutGenerator[Any, Any]] | None = None
255
+ not_null: tuple[str, ...] = ()
256
+ unique: tuple[tuple[str, ...], ...] = ()
257
+ check: tuple[str, ...] = ()
258
+ primary_key: tuple[tuple[str, ...], ...] = ()
259
+ foreign_key: tuple[ForeignKeyDef, ...] = ()
260
+ defaults: dict[str, DefaultValue] = field(default_factory=dict)
261
+ generated_columns: dict[str, str] = field(default_factory=dict)
262
+ column_comments: dict[str, str] = field(default_factory=dict)
263
+ required_field_filter_paths: tuple[str, ...] = ()
264
+ statistics: dict[str, ColumnStatisticsInput] = field(default_factory=dict)
265
+ statistics_cache_max_age_seconds: int | None = None
266
+ # Optional inlined cardinality. When set, the C++ extension uses these
267
+ # values directly and skips the per-bind ``table_function_cardinality``
268
+ # RPC. Use for read-only or slow-changing tables. Leave both as ``None``
269
+ # to keep the existing per-bind RPC behavior.
270
+ cardinality_estimate: int | None = None
271
+ cardinality_max: int | None = None
272
+ # Opt into pre-binding the function during ``schema_contents`` and
273
+ # inlining the result on ``TableInfo.bind_result``. The C++ extension
274
+ # then skips the per-scan ``bind`` RPC.
275
+ #
276
+ # Only valid when ``function`` is a ``@bind_fixed_schema``-decorated
277
+ # ``TableFunctionGenerator`` subclass — the decorator's contract (output
278
+ # is exactly ``cls.FIXED_SCHEMA``, no per-call inputs) matches what's
279
+ # safe to freeze for the catalog cache lifetime. Setting this on a
280
+ # descriptor whose function is not decorated raises at descriptor build.
281
+ inline_bind: bool = False
282
+ comment: str | None = None
283
+ tags: dict[str, str] = field(default_factory=dict)
284
+
285
+ def __post_init__(self) -> None:
286
+ """Validate configuration and constraint column names."""
287
+ # Validate mutually exclusive options
288
+ if self.columns is None and self.function is None:
289
+ raise ValueError(f"Table '{self.name}': must specify either 'columns' or 'function'")
290
+ if self.columns is not None and self.function is not None:
291
+ raise ValueError(f"Table '{self.name}': cannot specify both 'columns' and 'function'")
292
+
293
+ # Validate inline_bind contract: only @bind_fixed_schema-decorated
294
+ # functions qualify for the catalog framework's pre-bind path. The
295
+ # decorator marks both the class (_inline_bind_safe=True) and the
296
+ # installed on_bind function (_is_bind_fixed_schema=True). The
297
+ # function-level marker lets us reject subclasses that overrode
298
+ # on_bind even though they inherit _inline_bind_safe via MRO.
299
+ if self.inline_bind:
300
+ if self.function is None:
301
+ raise ValueError(f"Table '{self.name}': inline_bind=True requires function= to be set")
302
+ if not getattr(self.function, "_inline_bind_safe", False):
303
+ raise ValueError(
304
+ f"Table '{self.name}': inline_bind=True requires the function class "
305
+ f"to be decorated with @bind_fixed_schema. Got {self.function.__name__}, "
306
+ f"which has a custom on_bind. Either decorate it (deleting the manual "
307
+ f"on_bind) or leave inline_bind=False."
308
+ )
309
+ on_bind_attr = self.function.__dict__.get("on_bind")
310
+ if on_bind_attr is not None:
311
+ # The class has its own on_bind in __dict__. Either the
312
+ # decorator installed it (good — has _is_bind_fixed_schema
313
+ # marker on the underlying function) or a subclass overrode
314
+ # it (bad — escapes the decorator's contract).
315
+ underlying = getattr(on_bind_attr, "__func__", on_bind_attr)
316
+ if not getattr(underlying, "_is_bind_fixed_schema", False):
317
+ raise ValueError(
318
+ f"Table '{self.name}': inline_bind=True is not safe for "
319
+ f"{self.function.__name__} because it overrides on_bind, "
320
+ f"escaping @bind_fixed_schema's contract. Either remove the "
321
+ f"override or leave inline_bind=False."
322
+ )
323
+
324
+ # Resolve columns to validate constraints
325
+ resolved = self._get_resolved_columns()
326
+ column_names = {f.name for f in resolved}
327
+
328
+ # Validate not_null column names
329
+ for col in self.not_null:
330
+ if col not in column_names:
331
+ raise ValueError(
332
+ f"Table '{self.name}': not_null column '{col}' not found "
333
+ f"in schema. Available columns: {sorted(column_names)}"
334
+ )
335
+
336
+ # Validate unique column names
337
+ for group in self.unique:
338
+ for col in group:
339
+ if col not in column_names:
340
+ raise ValueError(
341
+ f"Table '{self.name}': unique column '{col}' not found "
342
+ f"in schema. Available columns: {sorted(column_names)}"
343
+ )
344
+
345
+ # Validate primary_key column names
346
+ for group in self.primary_key:
347
+ for col in group:
348
+ if col not in column_names:
349
+ raise ValueError(
350
+ f"Table '{self.name}': primary_key column '{col}' not found "
351
+ f"in schema. Available columns: {sorted(column_names)}"
352
+ )
353
+
354
+ # Validate foreign_key column names (only FK side, not referenced table)
355
+ for fk in self.foreign_key:
356
+ for col in fk.columns:
357
+ if col not in column_names:
358
+ raise ValueError(
359
+ f"Table '{self.name}': foreign_key column '{col}' not found "
360
+ f"in schema. Available columns: {sorted(column_names)}"
361
+ )
362
+
363
+ # Validate at most one primary key
364
+ if len(self.primary_key) > 1:
365
+ raise ValueError(
366
+ f"Table '{self.name}': at most one primary_key constraint allowed, got {len(self.primary_key)}"
367
+ )
368
+
369
+ # Validate foreign_key column count parity
370
+ for fk in self.foreign_key:
371
+ if len(fk.columns) != len(fk.referenced_columns):
372
+ raise ValueError(
373
+ f"Table '{self.name}': foreign_key referencing '{fk.referenced_table}' "
374
+ f"has {len(fk.columns)} FK columns but {len(fk.referenced_columns)} "
375
+ f"referenced columns — counts must match"
376
+ )
377
+
378
+ # Validate defaults column names
379
+ for col in self.defaults:
380
+ if col not in column_names:
381
+ raise ValueError(
382
+ f"Table '{self.name}': defaults column '{col}' not found "
383
+ f"in schema. Available columns: {sorted(column_names)}"
384
+ )
385
+
386
+ # Validate generated_columns column names and no overlap with defaults
387
+ for col in self.generated_columns:
388
+ if col not in column_names:
389
+ raise ValueError(
390
+ f"Table '{self.name}': generated_columns column '{col}' not found "
391
+ f"in schema. Available columns: {sorted(column_names)}"
392
+ )
393
+ if col in self.defaults:
394
+ raise ValueError(
395
+ f"Table '{self.name}': column '{col}' cannot have both a default value and a generated expression"
396
+ )
397
+
398
+ # Validate column_comments column names
399
+ for col in self.column_comments:
400
+ if col not in column_names:
401
+ raise ValueError(
402
+ f"Table '{self.name}': column_comments column '{col}' not found "
403
+ f"in schema. Available columns: {sorted(column_names)}"
404
+ )
405
+
406
+ # Validate required_field_filter_paths: the leading dotted segment of
407
+ # each path must be a real column on this table. Struct subfield
408
+ # validity is not checked here — DuckDB's binder catches typos at
409
+ # scan time, and the descriptor doesn't unpack STRUCT subfields.
410
+ for path in self.required_field_filter_paths:
411
+ if not path:
412
+ raise ValueError(f"Table '{self.name}': required_field_filter_paths must not contain empty strings")
413
+ head = path.split(".", 1)[0]
414
+ if head not in column_names:
415
+ raise ValueError(
416
+ f"Table '{self.name}': required_field_filter_paths path '{path}' references "
417
+ f"unknown column '{head}'. Available columns: {sorted(column_names)}"
418
+ )
419
+
420
+ # Validate statistics column names
421
+ for col in self.statistics:
422
+ if col not in column_names:
423
+ raise ValueError(
424
+ f"Table '{self.name}': statistics column '{col}' not found "
425
+ f"in schema. Available columns: {sorted(column_names)}"
426
+ )
427
+
428
+ # Validate write functions: UPDATE/DELETE require a scan function for row IDs
429
+ if (self.update_function is not None or self.delete_function is not None) and self.function is None:
430
+ raise ValueError(
431
+ f"Table '{self.name}': update_function and delete_function require "
432
+ f"a scan function (set 'function') to provide row IDs"
433
+ )
434
+
435
+ def _get_resolved_columns(self) -> pa.Schema:
436
+ """Get the resolved columns schema (explicit or derived from function).
437
+
438
+ For function-backed tables, calls ``bind()`` on the function class
439
+ to obtain the output schema. If the function requires arguments,
440
+ they must be supplied via the ``arguments`` field.
441
+ """
442
+ if self.columns is not None:
443
+ return self.columns
444
+
445
+ assert self.function is not None
446
+ arguments = self.arguments if self.arguments is not None else Arguments()
447
+ from vgi.protocol import BindRequest
448
+
449
+ bind_call = BindRequest(
450
+ function_name=self.function.Meta.name, # type: ignore[attr-defined]
451
+ arguments=arguments,
452
+ function_type=FunctionType.TABLE,
453
+ )
454
+ try:
455
+ result = self.function.bind(bind_call)
456
+ if not isinstance(result, BindResponse):
457
+ raise ValueError(
458
+ f"Table '{self.name}': function '{self.function.__name__}' returned "
459
+ f"unexpected bind result type: {type(result).__name__}"
460
+ )
461
+ return result.output_schema
462
+ except Exception as e:
463
+ raise ValueError(
464
+ f"Table '{self.name}': failed to derive schema from function "
465
+ f"'{self.function.__name__}' via bind(). If the function requires "
466
+ f"arguments, pass them via arguments=Arguments(...). Error: {e}"
467
+ ) from e
468
+
469
+ @property
470
+ def resolved_columns(self) -> pa.Schema:
471
+ """The resolved column schema (explicit or derived from function)."""
472
+ return self._get_resolved_columns()
473
+
474
+ def _resolve_not_null_indices(self) -> list[int]:
475
+ """Convert column names to indices for not_null constraints."""
476
+ cols = self.resolved_columns
477
+ return [cols.get_field_index(col) for col in self.not_null]
478
+
479
+ def _resolve_unique_indices(self) -> list[list[int]]:
480
+ """Convert column names to indices for unique constraints."""
481
+ cols = self.resolved_columns
482
+ return [[cols.get_field_index(col) for col in group] for group in self.unique]
483
+
484
+ def _resolve_primary_key_indices(self) -> list[list[int]]:
485
+ """Convert column names to indices for primary_key constraints."""
486
+ cols = self.resolved_columns
487
+ return [[cols.get_field_index(col) for col in group] for group in self.primary_key]
488
+
489
+ def _serialize_foreign_keys(self, schema_name: str) -> list[bytes]:
490
+ """Serialize foreign key constraints as IPC bytes."""
491
+ from vgi_rpc.utils import serialize_record_batch_bytes
492
+
493
+ result = []
494
+ for fk in self.foreign_key:
495
+ batch = pa.RecordBatch.from_pydict(
496
+ {
497
+ "fk_columns": [list(fk.columns)],
498
+ "pk_columns": [list(fk.referenced_columns)],
499
+ "referenced_table": [fk.referenced_table],
500
+ "referenced_schema": [fk.referenced_schema or schema_name],
501
+ },
502
+ schema=pa.schema(
503
+ [
504
+ ("fk_columns", pa.list_(pa.utf8())),
505
+ ("pk_columns", pa.list_(pa.utf8())),
506
+ ("referenced_table", pa.utf8()),
507
+ ("referenced_schema", pa.utf8()),
508
+ ]
509
+ ),
510
+ )
511
+ result.append(serialize_record_batch_bytes(batch))
512
+ return result
513
+
514
+ def _apply_defaults_to_schema(self, schema: pa.Schema) -> pa.Schema:
515
+ """Return schema with default value metadata applied to fields."""
516
+ if not self.defaults:
517
+ return schema
518
+ for col_name, value in self.defaults.items():
519
+ sql_expr = _default_to_sql(value)
520
+ idx = schema.get_field_index(col_name)
521
+ f = schema.field(idx)
522
+ existing = dict(f.metadata) if f.metadata else {}
523
+ existing[b"default"] = sql_expr.encode("utf-8")
524
+ schema = schema.set(idx, f.with_metadata(existing)) # type: ignore[arg-type]
525
+ return schema
526
+
527
+ def _apply_generated_columns_to_schema(self, schema: pa.Schema) -> pa.Schema:
528
+ """Return schema with generated expression metadata applied to fields."""
529
+ if not self.generated_columns:
530
+ return schema
531
+ for col_name, expression in self.generated_columns.items():
532
+ idx = schema.get_field_index(col_name)
533
+ f = schema.field(idx)
534
+ existing = dict(f.metadata) if f.metadata else {}
535
+ existing[b"generated_expression"] = expression.encode("utf-8")
536
+ schema = schema.set(idx, f.with_metadata(existing)) # type: ignore[arg-type]
537
+ return schema
538
+
539
+ def _apply_column_comments_to_schema(self, schema: pa.Schema) -> pa.Schema:
540
+ """Return schema with column comment metadata applied to fields."""
541
+ if not self.column_comments:
542
+ return schema
543
+ for col_name, comment in self.column_comments.items():
544
+ if not comment:
545
+ continue
546
+ idx = schema.get_field_index(col_name)
547
+ f = schema.field(idx)
548
+ existing = dict(f.metadata) if f.metadata else {}
549
+ existing[b"comment"] = comment.encode("utf-8")
550
+ schema = schema.set(idx, f.with_metadata(existing)) # type: ignore[arg-type]
551
+ return schema
552
+
553
+ def to_table_info(self, schema_name: str) -> TableInfo:
554
+ """Convert to TableInfo for catalog response."""
555
+ cols = self._apply_defaults_to_schema(self.resolved_columns)
556
+ cols = self._apply_generated_columns_to_schema(cols)
557
+ cols = self._apply_column_comments_to_schema(cols)
558
+ # Inline the resolved stats blob so the C++ extension can short-circuit
559
+ # the per-scan ``table_function_statistics`` and per-table
560
+ # ``catalog_table_column_statistics_get`` RPCs entirely. This freezes
561
+ # the resolved stats for the lifetime of the catalog cache; workers
562
+ # whose stats change faster than catalog_version must override
563
+ # ``to_table_info`` and leave column_statistics null.
564
+ resolved_stats = self.resolve_column_statistics()
565
+ column_statistics_blob = (
566
+ serialize_column_statistics(
567
+ resolved_stats.statistics,
568
+ resolved_stats.cache_max_age_seconds,
569
+ )
570
+ if resolved_stats is not None
571
+ else None
572
+ )
573
+ return TableInfo(
574
+ name=self.name,
575
+ schema_name=schema_name,
576
+ columns=SerializedSchema(cols.serialize().to_pybytes()),
577
+ not_null_constraints=self._resolve_not_null_indices(),
578
+ unique_constraints=self._resolve_unique_indices(),
579
+ check_constraints=list(self.check),
580
+ primary_key_constraints=self._resolve_primary_key_indices(),
581
+ foreign_key_constraints=self._serialize_foreign_keys(schema_name),
582
+ supports_insert=self.insert_function is not None,
583
+ supports_update=self.update_function is not None,
584
+ supports_delete=self.delete_function is not None,
585
+ supports_column_statistics=bool(self.statistics),
586
+ comment=self.comment,
587
+ tags=dict(self.tags),
588
+ scan_function=_inline_function_result(self.function),
589
+ insert_function=_inline_function_result(self.insert_function),
590
+ update_function=_inline_function_result(self.update_function),
591
+ delete_function=_inline_function_result(self.delete_function),
592
+ cardinality_estimate=self.cardinality_estimate,
593
+ cardinality_max=self.cardinality_max,
594
+ column_statistics=column_statistics_blob,
595
+ required_field_filter_paths=list(self.required_field_filter_paths),
596
+ )
597
+
598
+ def resolve_column_statistics(self) -> TableColumnStatisticsResult | None:
599
+ """Resolve the ``statistics`` dict into a :class:`TableColumnStatisticsResult`.
600
+
601
+ Returns ``None`` if no statistics are defined. Otherwise, converts
602
+ each entry to a :class:`ColumnStatistics` with properly typed PyArrow
603
+ scalars inferred from the table's column schema.
604
+ """
605
+ if not self.statistics:
606
+ return None
607
+ resolved_cols = self.resolved_columns
608
+ stats = []
609
+ for col_name, stat_input in self.statistics.items():
610
+ col_field = resolved_cols.field(col_name)
611
+ stats.append(stat_input.resolve(col_name, col_field.type))
612
+ return TableColumnStatisticsResult(
613
+ statistics=stats,
614
+ cache_max_age_seconds=self.statistics_cache_max_age_seconds,
615
+ )
616
+
617
+
618
@dataclass(frozen=True)
class View:
    """Declarative, immutable view definition.

    Attributes:
        name: View name.
        definition: SQL text that defines the view.
        comment: Optional view comment.
        column_comments: Optional mapping of view output column name to comment.
            The extension aligns these by name against the columns DuckDB binds
            from the view's query, so only the names that actually appear in the
            result need entries; unmatched names are ignored.
        tags: Optional metadata tags.

    """

    name: str
    definition: str
    comment: str | None = None
    column_comments: dict[str, str] = field(default_factory=dict)
    tags: dict[str, str] = field(default_factory=dict)

    def to_view_info(self, schema_name: str) -> ViewInfo:
        """Build the ``ViewInfo`` for this view within ``schema_name``.

        The comment and tag mappings are copied, so the returned object does
        not share dict instances with this descriptor.
        """
        comments_copy = dict(self.column_comments)
        tags_copy = dict(self.tags)
        return ViewInfo(
            name=self.name,
            schema_name=schema_name,
            definition=self.definition,
            comment=self.comment,
            column_comments=comments_copy,
            tags=tags_copy,
        )
652
+
653
+
654
@dataclass(frozen=True)
class Macro:
    """A SQL macro declared as part of a catalog schema.

    Attributes:
        name: Macro name.
        macro_type: Whether this is a scalar or table macro.
        parameters: Ordered list of parameter names.
        parameter_default_values: One-row RecordBatch whose column names are
            parameter names and whose values are the typed defaults, or
            ``None`` when the macro has no defaults.
            Example: pa.RecordBatch.from_pydict({"b": [5]}) for b := 5.
        definition: SQL expression (scalar) or query (table).
        comment: Optional macro comment.
        tags: Optional metadata tags.

    """

    name: str
    macro_type: MacroType
    parameters: list[str] = field(default_factory=list)
    parameter_default_values: pa.RecordBatch | None = None
    definition: str = ""
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Check that the default-values batch is consistent with ``parameters``.

        Raises:
            ValueError: If ``parameter_default_values`` does not have exactly
                one row, or if one of its columns is not named in
                ``parameters``.
        """
        defaults = self.parameter_default_values
        if defaults is None:
            # No defaults declared, so there is nothing to validate.
            return

        row_count = defaults.num_rows
        if row_count != 1:
            raise ValueError(
                f"Macro '{self.name}': parameter_default_values must have exactly 1 row, "
                f"got {row_count}"
            )

        # Every defaulted column must name a declared parameter; the first
        # offender, in batch column order, is reported.
        declared = frozenset(self.parameters)
        for column in defaults.schema.names:
            if column in declared:
                continue
            raise ValueError(
                f"Macro '{self.name}': default parameter '{column}' not found "
                f"in parameters list {self.parameters}"
            )

    def to_macro_info(self, schema_name: str) -> MacroInfo:
        """Produce the :class:`MacroInfo` catalog record for this macro.

        Args:
            schema_name: Name of the schema the macro is reported under.

        Returns:
            A ``MacroInfo`` populated from this definition. ``parameters`` and
            ``tags`` are passed as fresh copies; the default-values batch is
            passed through unchanged.
        """
        parameter_names = list(self.parameters)
        tag_map = dict(self.tags)
        return MacroInfo(
            name=self.name,
            schema_name=schema_name,
            macro_type=self.macro_type,
            parameters=parameter_names,
            parameter_default_values=self.parameter_default_values,
            definition=self.definition,
            comment=self.comment,
            tags=tag_map,
        )
708
+
709
+
710
@dataclass(frozen=True, slots=True, kw_only=True)
class Index:
    """Declarative index definition.

    Immutable.

    Attributes:
        name: Index name.
        table_name: Name of the table this index is on.
        expressions: SQL expression strings or column names defining the index.
            For column-based indexes: ("col_a", "col_b")
            For expression indexes: ("lower(col_a)", "col_b + 1")
            A single non-empty string is accepted as shorthand for a
            one-element tuple.
        index_type: The index type (e.g., "" for default).
        constraint_type: NONE for regular, UNIQUE for unique indexes.
        options: Key-value index options.
        comment: Optional index comment.
        tags: Optional metadata tags.

    """

    name: str
    table_name: str
    expressions: tuple[str, ...] = ()
    index_type: str = ""
    constraint_type: IndexConstraintType = IndexConstraintType.NONE
    options: dict[str, str] = field(default_factory=dict)
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalize and validate the index configuration.

        Raises:
            ValueError: If ``expressions`` is empty or ``table_name`` is empty.
        """
        raw_expressions = self.expressions
        if isinstance(raw_expressions, str) and raw_expressions:
            # A bare string is itself iterable, so it would pass the emptiness
            # check and later be split into one "expression" per character by
            # ``list(self.expressions)``. Treat it as a single expression.
            # ``object.__setattr__`` is required because the dataclass is frozen.
            object.__setattr__(self, "expressions", (raw_expressions,))
        if not self.expressions:
            raise ValueError(f"Index '{self.name}': must specify at least one expression")
        if not self.table_name:
            raise ValueError(f"Index '{self.name}': must specify a table_name")

    def to_index_info(self, schema_name: str) -> IndexInfo:
        """Convert to IndexInfo for catalog response.

        Args:
            schema_name: Name of the schema the index is reported under.

        Returns:
            An ``IndexInfo`` populated from this definition, with
            ``expressions`` as a list and ``options``/``tags`` as fresh dict
            copies.
        """
        return IndexInfo(
            name=self.name,
            schema_name=schema_name,
            table_name=self.table_name,
            index_type=self.index_type,
            constraint_type=self.constraint_type,
            expressions=list(self.expressions),
            options=dict(self.options),
            comment=self.comment,
            tags=dict(self.tags),
        )
759
+
760
+
761
@dataclass
class Schema:
    """A named schema bundling its tables, views, functions, macros, and indexes.

    Attributes:
        name: Schema name.
        comment: Optional schema comment.
        tags: Optional metadata tags.
        tables: Sequence of Table definitions.
        views: Sequence of View definitions.
        functions: Sequence of Function classes (scalar, table, or aggregate).
        macros: Sequence of Macro definitions.
        indexes: Sequence of Index definitions.

    """

    name: str
    comment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)
    tables: Sequence[Table] = ()
    views: Sequence[View] = ()
    functions: Sequence[type[Function]] = ()
    macros: Sequence[Macro] = ()
    indexes: Sequence[Index] = ()

    def to_schema_info(self, attach_opaque_data: AttachOpaqueData) -> SchemaInfo:
        """Produce the :class:`SchemaInfo` catalog record for this schema.

        ``estimated_object_count`` is filled in from the declared population
        so the C++ extension's eager-load gate can pick between bulk
        ``LoadEntries`` and per-name single-entry RPCs without another round
        trip. Functions are bucketed by ``get_metadata().function_type`` into
        three keys (``scalar_function``, ``aggregate_function``,
        ``table_function``), letting DuckDB's per-type catalog probes (a name
        lookup walks scalar → aggregate → table) skip the bulk RPC for any
        category this schema does not populate. Buffering table functions are
        counted under ``table_function``.

        **Zero counts are load-bearing.** An empty declarative collection
        (e.g. ``views=()``) is reported as ``0``, which the C++ client treats
        as a hard guarantee and uses to skip the corresponding bulk + per-name
        RPCs entirely. Do not "optimize" this into omitting empty keys —
        absence reads as count=1 (unknown), suppressing the RPC bypass.

        Args:
            attach_opaque_data: Passed through unchanged to the ``SchemaInfo``.

        Returns:
            A ``SchemaInfo`` with this schema's name, comment, a copy of its
            tags, and the per-kind object counts.

        Raises:
            KeyError: If a function reports a ``function_type`` other than the
                four counted here.
        """
        # Seed every counted function type with 0 so empty categories are
        # still reported explicitly.
        per_type = dict.fromkeys(
            (
                CatalogFunctionType.SCALAR,
                CatalogFunctionType.AGGREGATE,
                CatalogFunctionType.TABLE,
                CatalogFunctionType.TABLE_BUFFERING,
            ),
            0,
        )
        for function_cls in self.functions:
            per_type[function_cls.get_metadata().function_type] += 1

        table_function_total = (
            per_type[CatalogFunctionType.TABLE] + per_type[CatalogFunctionType.TABLE_BUFFERING]
        )
        tag_map = dict(self.tags)
        object_counts = {
            "table": len(self.tables),
            "view": len(self.views),
            "scalar_function": per_type[CatalogFunctionType.SCALAR],
            "aggregate_function": per_type[CatalogFunctionType.AGGREGATE],
            "table_function": table_function_total,
            "macro": len(self.macros),
            "index": len(self.indexes),
        }
        return SchemaInfo(
            attach_opaque_data=attach_opaque_data,
            name=self.name,
            comment=self.comment,
            tags=tag_map,
            estimated_object_count=object_counts,
        )
829
+
830
+
831
+ @dataclass
832
+ class Catalog:
833
+ """Declarative catalog definition containing schemas.
834
+
835
+ The single entry point for defining all catalog metadata on a Worker.
836
+
837
+ Attributes:
838
+ name: The catalog name (used in SQL as the database name).
839
+ default_schema: Schema to use for unqualified table/view/function names.
840
+ schemas: Sequence of Schema objects defining the catalog contents.
841
+ comment: Optional comment describing the catalog.
842
+ tags: Optional key-value tags associated with the catalog.
843
+
844
+ """
845
+
846
+ name: str
847
+ default_schema: str = "main"
848
+ schemas: Sequence[Schema] = ()
849
+ comment: str | None = None
850
+ tags: dict[str, str] = field(default_factory=dict)
851
+
852
+ def __post_init__(self) -> None:
853
+ """Validate catalog configuration."""
854
+ schema_names = {s.name.lower() for s in self.schemas}
855
+
856
+ # Validate default_schema exists
857
+ if self.default_schema.lower() not in schema_names:
858
+ available = sorted(s.name for s in self.schemas) or ["(none)"]
859
+ raise ValueError(
860
+ f"Catalog '{self.name}': default_schema '{self.default_schema}' "
861
+ f"not found in schemas. Available schemas: {available}"
862
+ )
863
+
864
+ # Check for duplicate schema names (case-insensitive)
865
+ seen: set[str] = set()
866
+ for schema in self.schemas:
867
+ key = schema.name.lower()
868
+ if key in seen:
869
+ raise ValueError(f"Catalog '{self.name}': duplicate schema name '{schema.name}'")
870
+ seen.add(key)