vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,653 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Schema-reconcile fixture worker.
|
|
4
|
+
|
|
5
|
+
Hosted inside the consolidated ``vgi-fixture-worker`` (entry point in
|
|
6
|
+
pyproject.toml) alongside the other reproducer catalogs. Used by the
|
|
7
|
+
``test/sql/integration/schema_reconcile.test`` regression test in
|
|
8
|
+
``~/Development/vgi`` to exercise the C++ ``ReconcileBatchToSchema`` helper
|
|
9
|
+
across INSERT, UPDATE, DELETE, and SELECT batch flows.
|
|
10
|
+
|
|
11
|
+
Three writable tables, each with a different rowid type — covering every
|
|
12
|
+
rowid shape that exercises a separate ReconcileBatchToSchema code path:
|
|
13
|
+
|
|
14
|
+
- ``demo`` : rowid int64 NOT NULL — primitive integer rowid.
|
|
15
|
+
- ``ts_demo`` : rowid timestamp[ms, tz=UTC] NOT NULL — TZ-aware
|
|
16
|
+
timestamp as the rowid; exercises the value cast on
|
|
17
|
+
the rowid itself (DuckDB collapses TIMESTAMP_TZ to
|
|
18
|
+
timestamp[us, tz=session]).
|
|
19
|
+
- ``struct_demo`` : rowid struct{a int64 NOT NULL, b string nullable} NOT NULL
|
|
20
|
+
— struct rowid with mixed nullability inside;
|
|
21
|
+
exercises recursive nullability reshape on a rowid.
|
|
22
|
+
|
|
23
|
+
User columns (id/ts/nested/tags) are identical across tables.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import os
|
|
29
|
+
import pickle
|
|
30
|
+
import sqlite3
|
|
31
|
+
import sys
|
|
32
|
+
import threading
|
|
33
|
+
from dataclasses import dataclass
|
|
34
|
+
from typing import Any
|
|
35
|
+
|
|
36
|
+
import pyarrow as pa
|
|
37
|
+
from vgi_rpc.rpc import OutputCollector
|
|
38
|
+
|
|
39
|
+
from vgi import Worker
|
|
40
|
+
from vgi.catalog import Catalog, Schema
|
|
41
|
+
from vgi.catalog.catalog_interface import (
|
|
42
|
+
AttachOpaqueData,
|
|
43
|
+
ReadOnlyCatalogInterface,
|
|
44
|
+
ScanFunctionResult,
|
|
45
|
+
SchemaInfo,
|
|
46
|
+
SchemaObjectType,
|
|
47
|
+
SerializedSchema,
|
|
48
|
+
TableInfo,
|
|
49
|
+
TransactionOpaqueData,
|
|
50
|
+
)
|
|
51
|
+
from vgi.invocation import BindResponse, GlobalInitResponse
|
|
52
|
+
from vgi.table_function import (
|
|
53
|
+
BindParams,
|
|
54
|
+
InitParams,
|
|
55
|
+
ProcessParams,
|
|
56
|
+
TableFunctionGenerator,
|
|
57
|
+
)
|
|
58
|
+
from vgi.table_in_out_function import TableInOutGenerator
|
|
59
|
+
|
|
60
|
+
# Catalog name shared by the Catalog definition, the catalog interface and the worker.
CATALOG_NAME = "schema_reconcile"
|
|
61
|
+
# Name of the only schema declared in this catalog; it is also the default schema.
_SCHEMA_NAME = "main"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Declared user-facing columns — identical across all three tables.
|
|
66
|
+
# Every facet (NOT NULL primitive, TZ-aware ms timestamp, NOT NULL leaf
|
|
67
|
+
# inside a struct, NOT NULL item inside list-of-struct) is something
|
|
68
|
+
# DuckDB's Arrow round-trip cannot preserve.
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
USER_FIELDS: list[pa.Field[Any]] = [
    # NOT NULL primitive.
    pa.field("id", pa.int64(), nullable=False),
    # NOT NULL, TZ-aware, millisecond-resolution timestamp.
    pa.field("ts", pa.timestamp("ms", tz="UTC"), nullable=False),
    # NOT NULL struct mixing a NOT NULL leaf with nullable leaves.
    pa.field(
        "nested",
        pa.struct(
            [
                pa.field("a", pa.int32(), nullable=False),
                pa.field("b", pa.string(), nullable=True),
                pa.field("ts2", pa.timestamp("ms", tz="UTC"), nullable=True),
            ]
        ),
        nullable=False,
    ),
    # NOT NULL list whose items are NOT NULL structs.
    pa.field(
        "tags",
        pa.list_(
            pa.field(
                "item",
                pa.struct(
                    [
                        pa.field("k", pa.string(), nullable=False),
                        pa.field("v", pa.binary(), nullable=True),
                    ]
                ),
                nullable=False,
            )
        ),
        nullable=False,
    ),
]
# Schema made of the user columns only (no rowid).
USER_SCHEMA: pa.Schema = pa.schema(USER_FIELDS)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# Per-table specs — each table gets its own rowid type.
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _rowid_field(arrow_type: pa.DataType) -> pa.Field[Any]:
    """Return a NOT NULL field named ``rowid`` of the given Arrow type.

    The field carries the ``is_row_id`` metadata key (with an empty value)
    that the C++ side keys on. It is always declared NOT NULL so that the
    rowid reshape path in ReconcileBatchToSchema is exercised.
    """
    row_id_marker = {b"is_row_id": b""}
    return pa.field("rowid", arrow_type, nullable=False, metadata=row_id_marker)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Primitive integer rowid.
_INT64_ROWID = _rowid_field(pa.int64())

# TZ-aware millisecond timestamp rowid.
_TS_ROWID = _rowid_field(pa.timestamp("ms", tz="UTC"))

# Struct rowid with mixed nullability: NOT NULL ``a``, nullable ``b``.
_STRUCT_ROWID = _rowid_field(
    pa.struct(
        [
            pa.field("a", pa.int64(), nullable=False),
            pa.field("b", pa.string(), nullable=True),
        ]
    )
)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass(frozen=True)
class TableSpec:
    """Immutable description of one fixture table.

    Pairs the logical table name with its rowid field and the SQLite table
    that stores its rows; the Arrow schemas are derived from these.
    """

    name: str  # Logical table name (also the key in ``TABLES``).
    rowid_field: pa.Field[Any]  # The table's ``rowid`` field.
    storage_table: str  # Underlying SQLite table name.

    @property
    def table_schema(self) -> pa.Schema:
        """Declared table schema: the shared user fields followed by the rowid."""
        return pa.schema([*USER_FIELDS, self.rowid_field])

    @property
    def delete_input_schema(self) -> pa.Schema:
        """Schema expected of a DELETE input batch: the rowid field only."""
        return pa.schema([self.rowid_field])
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Registry of the fixture tables, keyed by logical table name.
TABLES: dict[str, TableSpec] = {
    "demo": TableSpec("demo", _INT64_ROWID, "demo_rows"),
    "ts_demo": TableSpec("ts_demo", _TS_ROWID, "ts_demo_rows"),
    "struct_demo": TableSpec("struct_demo", _STRUCT_ROWID, "struct_demo_rows"),
}
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Output schema of the INSERT/UPDATE/DELETE handlers: one NOT NULL int64 ``count`` column.
_COUNT_SCHEMA: pa.Schema = pa.schema([pa.field("count", pa.int64(), nullable=False)])
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# Storage — SQLite. Each logical table gets its own row-store SQLite table.
|
|
163
|
+
# Rowid is opaque (pickled tuple), so this works for int, timestamp, and
|
|
164
|
+
# struct rowids alike. The Arrow schema (the thing under test) is
|
|
165
|
+
# reconstructed from TABLES on read.
|
|
166
|
+
#
|
|
167
|
+
# The C++ extension's worker pool freely spawns multiple worker processes
|
|
168
|
+
# for one ATTACH (POOL_MAX caps idle pool size, not concurrency), so the
|
|
169
|
+
# fixture needs cross-process state; SQLite is the cheapest such store.
|
|
170
|
+
# To avoid leftover rows from a previous test run poisoning the next,
|
|
171
|
+
# the DB filename is keyed on the worker's process-group ID (parent PID on Windows) — every test
|
|
172
|
+
# session gets its own file, and all worker processes spawned from the
|
|
173
|
+
# same DuckDB process share it without any cross-process synchronization.
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
# Held around every SQLite access below. A threading.Lock only serializes threads
# inside this process; it does not coordinate separate worker processes.
_lock = threading.Lock()
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _db_path() -> str:
|
|
180
|
+
override = os.environ.get("VGI_SCHEMA_RECONCILE_DB")
|
|
181
|
+
if override:
|
|
182
|
+
return override
|
|
183
|
+
# Use the worker's process-group ID so every worker subprocess spawned
|
|
184
|
+
# by the same DuckDB process shares one SQLite file, while distinct
|
|
185
|
+
# test invocations land on different files. PPID alone is unstable
|
|
186
|
+
# because ``uv run`` inserts an intermediate process per worker; PGID
|
|
187
|
+
# propagates across fork/exec by default and stays stable for the
|
|
188
|
+
# life of one test session.
|
|
189
|
+
if sys.platform == "win32": # pragma: no cover - PGID is POSIX; PPID is the
|
|
190
|
+
# closest stable-per-session stand-in.
|
|
191
|
+
import tempfile
|
|
192
|
+
|
|
193
|
+
return os.path.join(tempfile.gettempdir(), f"vgi_schema_reconcile.{os.getppid()}.sqlite")
|
|
194
|
+
return f"/tmp/vgi_schema_reconcile.{os.getpgrp()}.sqlite"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _connect() -> sqlite3.Connection:
    """Open a connection to the fixture database and ensure its tables exist.

    The connection uses a 30 second busy timeout and WAL journaling, and one
    storage table per entry in ``TABLES`` is created if missing.

    Returns:
        An open connection. The caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or initialised. The
            partially set-up connection is closed before the error propagates.
    """
    conn = sqlite3.connect(_db_path(), timeout=30.0)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        for spec in TABLES.values():
            conn.execute(
                f"CREATE TABLE IF NOT EXISTS {spec.storage_table} ( rid_blob BLOB PRIMARY KEY, payload BLOB NOT NULL)"
            )
    except BaseException:
        # Do not leak the handle when setup fails part-way through.
        conn.close()
        raise
    return conn
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _rid_key(rid: Any) -> bytes:
|
|
208
|
+
return pickle.dumps(rid)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _all_rows(spec: TableSpec) -> list[tuple[Any, dict[str, Any]]]:
    """Return every stored row of ``spec`` as ``(rowid, payload)`` pairs.

    Both the rowid key and the payload are unpickled before being returned.
    """
    with _lock:
        conn = _connect()
        try:
            # ``with conn`` only commits or rolls back; it never closes the
            # connection, hence the explicit close in ``finally``.
            with conn:
                cursor = conn.execute(f"SELECT rid_blob, payload FROM {spec.storage_table}")
                # NOTE(security): pickle.loads runs on the contents of a file
                # whose default path is predictable; tolerable for a test
                # fixture only, never for untrusted data.
                return [(pickle.loads(rid_blob), pickle.loads(payload)) for rid_blob, payload in cursor]
        finally:
            conn.close()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _insert_row(spec: TableSpec, rid: Any, payload: dict[str, Any]) -> None:
    """Store ``payload`` under ``rid``, replacing any row with the same key.

    Args:
        spec: Table whose storage table receives the row.
        rid: Rowid value; its pickled form is the primary key.
        payload: Column values, stored pickled.
    """
    key = _rid_key(rid)
    blob = pickle.dumps(payload)
    with _lock:
        conn = _connect()
        try:
            # ``with conn`` commits on success and rolls back on error, but
            # does not close the connection; ``finally`` does that.
            with conn:
                conn.execute(
                    f"INSERT OR REPLACE INTO {spec.storage_table} (rid_blob, payload) VALUES (?, ?)",
                    (key, blob),
                )
        finally:
            conn.close()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _update_row(spec: TableSpec, rid: Any, updates: dict[str, Any]) -> bool:
    """Merge ``updates`` into the stored payload of the row keyed by ``rid``.

    Args:
        spec: Table whose storage table holds the row.
        rid: Rowid value identifying the row.
        updates: Column values to overwrite in the stored payload.

    Returns:
        True if the row existed and was rewritten, False if no row has that key.
    """
    key = _rid_key(rid)
    with _lock:
        conn = _connect()
        try:
            # ``with conn`` commits on success and rolls back on error, but
            # does not close the connection; ``finally`` does that.
            with conn:
                row = conn.execute(
                    f"SELECT payload FROM {spec.storage_table} WHERE rid_blob = ?",
                    (key,),
                ).fetchone()
                if row is None:
                    return False
                payload = pickle.loads(row[0])
                payload.update(updates)
                conn.execute(
                    f"UPDATE {spec.storage_table} SET payload = ? WHERE rid_blob = ?",
                    (pickle.dumps(payload), key),
                )
                return True
        finally:
            conn.close()
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _delete_row(spec: TableSpec, rid: Any) -> bool:
    """Delete the row keyed by ``rid``.

    Returns:
        True if a row was deleted, False if no row has that key.
    """
    key = _rid_key(rid)
    with _lock:
        conn = _connect()
        try:
            # ``with conn`` commits on success and rolls back on error, but
            # does not close the connection; ``finally`` does that.
            with conn:
                cur = conn.execute(
                    f"DELETE FROM {spec.storage_table} WHERE rid_blob = ?",
                    (key,),
                )
                return cur.rowcount > 0
        finally:
            conn.close()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _next_int_rowid(spec: TableSpec) -> int:
    """For the int64-rowid table, autoincrement-ish.

    Returns one more than the largest ``__rid__`` found in the stored
    payloads, or 1 when the table is empty. Reading the maximum and the
    later insert are separate steps, so this does not reserve the value.
    """
    with _lock:
        conn = _connect()
        try:
            # ``with conn`` does not close the connection; ``finally`` does.
            with conn:
                rows = conn.execute(f"SELECT payload FROM {spec.storage_table}").fetchall()
        finally:
            conn.close()
    # The int rowid is stored in the payload as ``__rid__`` for convenience
    # of monotonic generation; payloads without it count as 0.
    highest = max((pickle.loads(row[0]).get("__rid__", 0) for row in rows), default=0)
    return highest + 1 if rows else 1
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# ---------------------------------------------------------------------------
|
|
264
|
+
# Strict schema verifier
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _strict_assert_schema(label: str, actual: pa.Schema, expected: pa.Schema) -> None:
|
|
269
|
+
"""Hard-fail if ``actual`` doesn't bit-for-bit equal ``expected``.
|
|
270
|
+
|
|
271
|
+
The vgi C++ ``ReconcileBatchToSchema`` helper is what makes these
|
|
272
|
+
schemas equal — DuckDB on its own emits batches with all-nullable
|
|
273
|
+
fields, ``timestamp[us, tz=session]`` for TZ timestamps, and so on.
|
|
274
|
+
A mismatch here means reconciliation regressed.
|
|
275
|
+
"""
|
|
276
|
+
if actual.equals(expected, check_metadata=False):
|
|
277
|
+
return
|
|
278
|
+
|
|
279
|
+
detail = []
|
|
280
|
+
if len(actual) != len(expected):
|
|
281
|
+
detail.append(f"field count: actual={len(actual)} expected={len(expected)}")
|
|
282
|
+
for i in range(min(len(actual), len(expected))):
|
|
283
|
+
af = actual.field(i)
|
|
284
|
+
ef = expected.field(i)
|
|
285
|
+
if af.name != ef.name:
|
|
286
|
+
detail.append(f"field[{i}].name: actual={af.name!r} expected={ef.name!r}")
|
|
287
|
+
if af.nullable != ef.nullable:
|
|
288
|
+
detail.append(f"field[{i}={af.name!r}].nullable: actual={af.nullable} expected={ef.nullable}")
|
|
289
|
+
if not af.type.equals(ef.type):
|
|
290
|
+
detail.append(f"field[{i}={af.name!r}].type: actual={af.type} expected={ef.type}")
|
|
291
|
+
raise ValueError(
|
|
292
|
+
f"[schema_reconcile] {label} batch schema mismatch (reconciliation regression?):\n"
|
|
293
|
+
+ "\n".join(f" - {d}" for d in detail)
|
|
294
|
+
+ f"\n actual: {actual}\n expected: {expected}"
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# ---------------------------------------------------------------------------
|
|
299
|
+
# Handler functions
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _emit_count(out: OutputCollector, n: int) -> None:
    """Emit one single-row batch whose ``count`` column holds ``n``."""
    count_batch = pa.RecordBatch.from_pydict({"count": [n]}, schema=_COUNT_SCHEMA)
    out.emit(count_batch)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _spec_from_args(positional: tuple[Any, ...]) -> TableSpec:
|
|
308
|
+
if not positional or positional[0] is None:
|
|
309
|
+
raise ValueError("schema_reconcile handler: missing table_name positional[0]")
|
|
310
|
+
name = str(positional[0].as_py())
|
|
311
|
+
spec = TABLES.get(name)
|
|
312
|
+
if spec is None:
|
|
313
|
+
raise ValueError(f"schema_reconcile handler: unknown table {name!r}")
|
|
314
|
+
return spec
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _row_to_dict(batch: pa.RecordBatch, i: int, fields: list[str]) -> dict[str, Any]:
|
|
318
|
+
return {name: batch.column(name)[i].as_py() for name in fields}
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _generate_rowid(spec: TableSpec, payload: dict[str, Any]) -> Any:
    """Synthesize a rowid for INSERT (no rowid column on input).

    The result depends on the table's rowid type:

    - int64: the next integer rowid, which is also written into ``payload``
      under ``__rid__`` (the payload is mutated).
    - timestamp: the row's ``ts`` value.
    - struct: ``{"a": <id>, "b": <nested.b>}``.

    Raises:
        ValueError: For any other rowid type.
    """
    rowid_type = spec.rowid_field.type
    if rowid_type.equals(pa.int64()):
        next_rid = _next_int_rowid(spec)
        payload["__rid__"] = next_rid
        return next_rid
    if isinstance(rowid_type, pa.TimestampType):
        # Use the row's `ts` column as a rowid — guaranteed unique enough
        # for tests since tests insert distinct timestamps. Stored as the
        # Python ``datetime`` value the user inserted.
        return payload["ts"]
    if isinstance(rowid_type, pa.StructType):
        # Project ``id`` -> a (NOT NULL int64) and ``nested.b`` -> b (nullable string).
        nested_b = payload["nested"].get("b")
        return {"a": payload["id"], "b": nested_b}
    raise ValueError(f"unhandled rowid type: {spec.rowid_field.type}")
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class SchemaReconcileInsert(TableInOutGenerator[None, None]):
    """INSERT handler — asserts the input batch matches USER_SCHEMA exactly.

    Each input row is stored under a synthesized rowid; one count batch
    carrying the number of rows in the input batch is then emitted.
    """

    class Meta:
        name = "schema_reconcile_insert"
        description = "INSERT handler for the schema_reconcile fixture"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Declare the single ``count`` output column."""
        return BindResponse(output_schema=_COUNT_SCHEMA)

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Validate ``batch`` against USER_SCHEMA, store its rows, emit the row count."""
        assert params.init_call is not None
        bind_args = params.init_call.bind_call.arguments.positional
        spec = _spec_from_args(bind_args)
        _strict_assert_schema(f"INSERT[{spec.name}]", batch.schema, USER_SCHEMA)
        user_columns = USER_SCHEMA.names
        row_count = batch.num_rows
        for row_index in range(row_count):
            row = _row_to_dict(batch, row_index, user_columns)
            # May add ``__rid__`` to ``row`` before it is stored.
            rowid = _generate_rowid(spec, row)
            _insert_row(spec, rowid, row)
        _emit_count(out, row_count)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
class SchemaReconcileUpdate(TableInOutGenerator[None, None]):
    """UPDATE handler — assert rowid + selected user columns are present.

    Asserts batch is rowid + selected user columns, every field with the
    worker-declared flags/types intact. Rows whose rowid is found in storage
    are updated, and the number of updated rows is emitted as a count batch.
    """

    class Meta:
        name = "schema_reconcile_update"
        description = "UPDATE handler for the schema_reconcile fixture"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Declare the single ``count`` output column."""
        return BindResponse(output_schema=_COUNT_SCHEMA)

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Validate the batch schema, apply the updates, and emit the update count.

        Raises:
            ValueError: If the batch has no ``rowid`` column, contains a column
                that is not part of the table schema, or a field's type or
                nullability differs from the declared one.
        """
        assert params.init_call is not None
        spec = _spec_from_args(params.init_call.bind_call.arguments.positional)
        cols = batch.schema.names
        if "rowid" not in cols:
            raise ValueError(f"[schema_reconcile] UPDATE[{spec.name}] missing rowid column; got: {cols}")
        full = spec.table_schema
        for f in batch.schema:
            idx = full.get_field_index(f.name)
            if idx < 0:
                # get_field_index reports a missing name as -1, and a negative
                # position would select a field counted from the end (the
                # rowid), so unknown columns must be rejected explicitly.
                raise ValueError(
                    f"[schema_reconcile] UPDATE[{spec.name}] unknown column {f.name!r}; "
                    f"table columns: {full.names}"
                )
            expected = full.field(idx)
            if f.nullable != expected.nullable or not f.type.equals(expected.type):
                raise ValueError(
                    f"[schema_reconcile] UPDATE[{spec.name}] field {f.name!r} mismatch "
                    f"(reconciliation regression?): "
                    f"actual=({f.type}, nullable={f.nullable}) "
                    f"expected=({expected.type}, nullable={expected.nullable})"
                )

        update_cols = [c for c in cols if c != "rowid"]
        n = 0
        for i in range(batch.num_rows):
            rid = batch.column("rowid")[i].as_py()
            updates = {c: batch.column(c)[i].as_py() for c in update_cols}
            # Only rows that exist in storage count towards the result.
            if _update_row(spec, rid, updates):
                n += 1
        _emit_count(out, n)
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
class SchemaReconcileDelete(TableInOutGenerator[None, None]):
    """DELETE handler — asserts batch is rowid-only with declared flag/type.

    Emits one count batch holding the number of rows actually removed.
    """

    class Meta:
        name = "schema_reconcile_delete"
        description = "DELETE handler for the schema_reconcile fixture"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Declare the single ``count`` output column."""
        return BindResponse(output_schema=_COUNT_SCHEMA)

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Validate the rowid-only batch, delete the rows, emit the delete count."""
        assert params.init_call is not None
        bind_args = params.init_call.bind_call.arguments.positional
        spec = _spec_from_args(bind_args)
        _strict_assert_schema(f"DELETE[{spec.name}]", batch.schema, spec.delete_input_schema)
        deleted = 0
        row_index = 0
        while row_index < batch.num_rows:
            rowid = batch.column("rowid")[row_index].as_py()
            # Rowids with no stored row do not count.
            if _delete_row(spec, rowid):
                deleted += 1
            row_index += 1
        _emit_count(out, deleted)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
class SchemaReconcileScan(TableFunctionGenerator[None, None]):
    """SELECT handler — emits the table's stored rows in its declared schema."""

    class Meta:
        name = "schema_reconcile_scan"
        description = "SCAN handler for the schema_reconcile fixture"
        projection_pushdown = True

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Declare the full table schema (user columns plus rowid) as the output."""
        table_spec = _spec_from_args(params.bind_call.arguments.positional)
        return BindResponse(output_schema=table_spec.table_schema)

    @classmethod
    def on_init(cls, params: InitParams[None]) -> GlobalInitResponse:
        """Restrict the scan to a single worker."""
        # One worker emits the full table; with parallel workers each
        # would duplicate every row.
        return GlobalInitResponse(max_workers=1)

    @classmethod
    def process(cls, params: ProcessParams[None], state: None, out: OutputCollector) -> None:
        """Emit every stored row, restricted to the output columns, then finish."""
        assert params.init_call is not None
        spec = _spec_from_args(params.init_call.bind_call.arguments.positional)
        out_schema = params.output_schema
        wanted = out_schema.names
        records: list[dict[str, Any]] = []
        for rid, payload in _all_rows(spec):
            row = dict(payload)
            row["rowid"] = rid
            # Don't emit the bookkeeping ``__rid__`` column.
            row.pop("__rid__", None)
            records.append({column: row[column] for column in wanted})
        out.emit(pa.RecordBatch.from_pylist(records, schema=out_schema))
        out.finish()
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# ---------------------------------------------------------------------------
|
|
482
|
+
# Catalog
|
|
483
|
+
# ---------------------------------------------------------------------------
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def _serialize_schema(schema: pa.Schema) -> bytes:
    """Return ``schema`` serialized as an Arrow IPC stream that holds no batches."""
    buffer_sink = pa.BufferOutputStream()
    # Opening and immediately closing the writer produces a stream with no batches.
    stream_writer = pa.ipc.new_stream(buffer_sink, schema)
    stream_writer.close()
    return buffer_sink.getvalue().to_pybytes()
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
# Handler classes registered both on the catalog schema and on the worker.
_FUNCTIONS = [
    SchemaReconcileInsert,
    SchemaReconcileUpdate,
    SchemaReconcileDelete,
    SchemaReconcileScan,
]
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
# Declarative catalog: one schema that registers the handler functions and
# declares no tables.
_CATALOG = Catalog(
    name=CATALOG_NAME,
    default_schema=_SCHEMA_NAME,
    schemas=[
        Schema(
            name=_SCHEMA_NAME,
            comment="Schema-reconcile fixture catalog",
            functions=list(_FUNCTIONS),
            tables=[],
        ),
    ],
)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
class SchemaReconcileCatalog(ReadOnlyCatalogInterface):
    """Catalog exposing the three writable schema-reconcile tables.

    Tables are published through the ``schema_contents`` / ``table_get``
    overrides rather than the declarative ``Schema(tables=...)`` field, and
    every table operation is routed to one of the ``schema_reconcile_*``
    functions with the table name as its only positional argument.
    """

    catalog = _CATALOG
    catalog_name = CATALOG_NAME

    def _table_info(self, spec: TableSpec) -> TableInfo:
        """Build the ``TableInfo`` advertised for ``spec``."""
        return TableInfo(
            comment=f"Schema-reconcile {spec.name} (rowid type {spec.rowid_field.type})",
            tags={},
            name=spec.name,
            schema_name=_SCHEMA_NAME,
            columns=SerializedSchema(_serialize_schema(spec.table_schema)),
            not_null_constraints=[],
            unique_constraints=[],
            check_constraints=[],
            supports_insert=True,
            supports_update=True,
            supports_delete=True,
        )

    def schemas(
        self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
    ) -> list[SchemaInfo]:
        """Return the inherited schema list with the real table count filled in."""
        # The declarative ``Schema(tables=[])`` would auto-populate
        # ``estimated_object_count[table] = 0``, which the C++ client treats
        # as a hard guarantee and uses to skip the bulk RPC. But this catalog
        # publishes tables via the ``schema_contents`` override below, not
        # via the declarative ``tables=`` field — so the count is wrong.
        # Override at the catalog level to report the real population.
        infos = super().schemas(attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data)
        for i, info in enumerate(infos):
            if info.name == _SCHEMA_NAME:
                infos[i] = SchemaInfo(
                    attach_opaque_data=info.attach_opaque_data,
                    name=info.name,
                    comment=info.comment,
                    tags=info.tags,
                    estimated_object_count={
                        **(info.estimated_object_count or {}),
                        "table": len(TABLES),
                    },
                )
        return infos

    def schema_contents(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        name: str,
        type: Any,
    ) -> Any:
        """List the fixture tables for a TABLE request on the fixture schema.

        Any other schema or object type is delegated to the base class.
        """
        if name.lower() == _SCHEMA_NAME and type == SchemaObjectType.TABLE:
            return [self._table_info(spec) for spec in TABLES.values()]
        return super().schema_contents(
            attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data, name=name, type=type
        )

    def table_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None = None,
        at_value: str | None = None,
    ) -> TableInfo | None:
        """Look up one table case-insensitively; return ``None`` when unknown."""
        if schema_name.lower() != _SCHEMA_NAME:
            return None
        spec = TABLES.get(name.lower())
        return self._table_info(spec) if spec else None

    def _route(self, fn_name: str, schema_name: str, name: str) -> ScanFunctionResult:
        """Route a table operation to ``fn_name`` with the table name as its argument.

        ``schema_name`` is accepted for signature symmetry but not used. The
        table name is lower-cased so that the handlers, which look names up
        case-sensitively in ``TABLES``, accept the same spellings as
        ``table_get``.
        """
        return ScanFunctionResult(
            function_name=fn_name,
            positional_arguments=[pa.scalar(name.lower(), type=pa.string())],
            named_arguments={},
            required_extensions=[],
        )

    def table_scan_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None,
        at_value: str | None,
    ) -> ScanFunctionResult:
        """Route SELECT to ``schema_reconcile_scan``."""
        return self._route("schema_reconcile_scan", schema_name, name)

    def table_insert_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        writable_branch_function_name: str | None = None,
    ) -> ScanFunctionResult:
        """Route INSERT to ``schema_reconcile_insert``."""
        del writable_branch_function_name  # Accepted but unused by this fixture.
        return self._route("schema_reconcile_insert", schema_name, name)

    def table_update_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
    ) -> ScanFunctionResult:
        """Route UPDATE to ``schema_reconcile_update``."""
        return self._route("schema_reconcile_update", schema_name, name)

    def table_delete_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
    ) -> ScanFunctionResult:
        """Route DELETE to ``schema_reconcile_delete``."""
        return self._route("schema_reconcile_delete", schema_name, name)
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
# ---------------------------------------------------------------------------
|
|
643
|
+
# Worker
|
|
644
|
+
# ---------------------------------------------------------------------------
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
class SchemaReconcileWorker(Worker):
    """Worker exposing the schema-reconcile fixture catalog."""

    # Catalog wiring: interface class, catalog name and declarative catalog.
    catalog_interface = SchemaReconcileCatalog
    catalog_name = CATALOG_NAME
    catalog = _CATALOG
    # Own copy of the handler list, separate from the module-level list.
    functions = list(_FUNCTIONS)
|