vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,793 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Minimal in-memory writable worker — no transactor, no subcursor.
4
+
5
+ Skips proper transactional semantics: each write is committed to the backing
6
+ store at once and becomes visible to all observers immediately. ``BEGIN`` is a no-op,
7
+ ``COMMIT`` is a no-op, ``ROLLBACK`` does NOT undo earlier writes. The fixture
8
+ exists only to drive the C++ extension's INSERT/UPDATE/DELETE wire path
9
+ without depending on the production writable fixture's reliance on the VGI
10
+ fork of duckdb-python (subcursor / enable_suspended_queries).
11
+
12
+ Three pre-defined tables are exposed under the ``main`` schema:
13
+
14
+ * ``items`` — supports INSERT/UPDATE/DELETE with RETURNING.
15
+ * ``items_no_returning`` — supports INSERT/UPDATE/DELETE *without* RETURNING.
16
+ Used to exercise the supports_returning=False rejection path.
17
+ * ``items_insert_only`` — supports INSERT only (no UPDATE/DELETE/RETURNING).
18
+
19
+ State is persisted in a SQLite file per attach under the system temp
20
+ directory, keyed by the hex of ``attach_opaque_data``. Per the "pooled
21
+ workers don't share per-attach state" gotcha, module-global state would lose
22
+ rows whenever the default pool (max=256, idle=5s) routed a query to a fresh
23
+ subprocess; the file keeps them for the lifetime of the attach. Writes are
24
+ still committed one batch at a time with no isolation. Don't rely on this fixture
25
+ for correctness tests, only wire-protocol tests.
26
+
27
+ Registered as the ``vgi-fixture-simple-writable-worker`` entry point.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import contextlib
33
+ import os
34
+ import sqlite3
35
+ import tempfile
36
+ import threading
37
+ import uuid
38
+ from collections.abc import Iterator, Sequence
39
+ from dataclasses import dataclass
40
+ from typing import TYPE_CHECKING, Annotated, Any, Literal, overload
41
+
42
+ import pyarrow as pa
43
+ from vgi_rpc import ArrowSerializableDataclass, Transient
44
+ from vgi_rpc.rpc import OutputCollector
45
+
46
+ from vgi.catalog import (
47
+ AttachOpaqueData,
48
+ CatalogAttachResult,
49
+ ReadOnlyCatalogInterface,
50
+ ScanFunctionResult,
51
+ SchemaInfo,
52
+ SchemaObjectType,
53
+ SerializedSchema,
54
+ TableInfo,
55
+ TransactionOpaqueData,
56
+ )
57
+ from vgi.catalog.descriptors import Catalog, Schema
58
+ from vgi.invocation import BindResponse, GlobalInitResponse
59
+ from vgi.schema_utils import schema as build_schema
60
+ from vgi.table_function import BindParams, InitParams, ProcessParams, TableFunctionGenerator
61
+ from vgi.table_in_out_function import TableInOutGenerator
62
+ from vgi.worker import Worker
63
+
64
+ if TYPE_CHECKING:
65
+ from vgi.catalog.catalog_interface import (
66
+ FunctionInfo,
67
+ IndexInfo,
68
+ MacroInfo,
69
+ ViewInfo,
70
+ )
71
+
72
+ __all__ = [
73
+ "SimpleWritableCatalog",
74
+ "SimpleWritableWorker",
75
+ "main",
76
+ ]
77
+
78
+
79
# The only catalog name ``catalog_attach`` accepts for this fixture.
CATALOG_NAME = "simple_writable"

# DuckDB rowid pseudocolumn — the extension reads the ``is_row_id`` metadata
# key to identify it.
_ROWID_FIELD = pa.field(
    "rowid",
    pa.int64(),
    metadata={b"is_row_id": b""},
)

# Single ``count`` column schema used by write functions that report affected
# row counts instead of returning the rows themselves.
_COUNT_SCHEMA = build_schema(count=pa.int64())
86
+
87
+
88
+ # ============================================================================
89
+ # Storage — SQLite file per attach_opaque_data under TMPDIR.
90
+ #
91
+ # Pooled-worker subprocesses don't share Python state (see CLAUDE.md gotcha),
92
+ # so module-globals would lose rows whenever the pool routed a query to a
93
+ # fresh process. We persist into a SQLite file keyed by attach_opaque_data hex so the
94
+ # data survives subprocess churn for the lifetime of an ATTACH.
95
+ # ============================================================================
96
+
97
+
98
# SQLite column type name used for each Arrow type the fixture tables declare.
_SQL_TYPE_MAP: dict[pa.DataType, str] = {
    pa.int64(): "INTEGER",
    pa.int32(): "INTEGER",
    pa.string(): "TEXT",
    pa.float64(): "REAL",
    pa.bool_(): "INTEGER",
}


def _sql_type(arrow_type: pa.DataType) -> str:
    """Return the SQLite column type name for *arrow_type*.

    Raises:
        ValueError: If *arrow_type* has no entry in ``_SQL_TYPE_MAP``.
    """
    sql_name = _SQL_TYPE_MAP.get(arrow_type)
    if sql_name is None:
        raise ValueError(f"simple_writable: unsupported Arrow type {arrow_type!r}")
    return sql_name
111
+
112
+
113
def _table_specs() -> dict[str, pa.Schema]:
    """Build the user-visible schema (no rowid) of every pre-defined table.

    Returns:
        A newly built dict mapping bare table name to its Arrow schema.
    """
    specs: dict[str, pa.Schema] = {}
    specs["items"] = build_schema(id=pa.int64(), name=pa.string(), qty=pa.int64())
    specs["items_no_returning"] = build_schema(id=pa.int64(), name=pa.string(), qty=pa.int64())
    specs["items_insert_only"] = build_schema(id=pa.int64(), name=pa.string())
    # Lies: catalog advertises supports_returning=True but the insert
    # function always emits a (count BIGINT) batch. Used by tests to verify
    # the C++ extension rejects the mismatched batch with a clean IOException
    # instead of crashing inside ArrowToDuckDB.
    specs["items_broken_returning"] = build_schema(id=pa.int64(), name=pa.string())
    return specs
125
+
126
+
127
+ def _table_supports_returning(name: str) -> bool:
128
+ return name != "items_no_returning"
129
+
130
+
131
+ def _table_supports_update_delete(name: str) -> bool:
132
+ # items_insert_only and items_broken_returning don't expose UPDATE/DELETE.
133
+ return name not in {"items_insert_only", "items_broken_returning"}
134
+
135
+
136
+ _DB_DIR = os.path.join(tempfile.gettempdir(), "vgi-simple-writable")
137
+ _INIT_LOCK = threading.Lock()
138
+ _INITIALIZED: set[bytes] = set()
139
+
140
+
141
+ def _db_path(attach_opaque_data: bytes) -> str:
142
+ return os.path.join(_DB_DIR, f"{attach_opaque_data.hex()}.sqlite")
143
+
144
+
145
def _ensure_init(attach_opaque_data: bytes) -> None:
    """Create the per-attach SQLite file and its tables once per process.

    The first call for a given ``attach_opaque_data`` creates ``_DB_DIR`` if
    needed, switches the database to WAL journaling and issues
    ``CREATE TABLE IF NOT EXISTS`` for every table in ``_table_specs()``.
    Later calls in the same process return immediately. All work happens
    while holding ``_INIT_LOCK``.
    """
    with _INIT_LOCK:
        if attach_opaque_data not in _INITIALIZED:
            os.makedirs(_DB_DIR, exist_ok=True)
            db = sqlite3.connect(_db_path(attach_opaque_data), isolation_level=None)
            with contextlib.closing(db):
                db.execute("PRAGMA journal_mode=WAL;")
                for table_name, table_schema in _table_specs().items():
                    column_defs = [f'"{field.name}" {_sql_type(field.type)}' for field in table_schema]
                    columns_sql = ", ".join(column_defs)
                    db.execute(f'CREATE TABLE IF NOT EXISTS "{table_name}" ({columns_sql})')
            # Only mark as done after the schema was created successfully.
            _INITIALIZED.add(attach_opaque_data)
163
+
164
+
165
@contextlib.contextmanager
def _connect(attach_opaque_data: bytes) -> Iterator[sqlite3.Connection]:
    """Yield an autocommit SQLite connection to the per-attach database.

    Makes sure the store is initialized first, sets WAL journaling on the new
    connection, and closes the connection when the ``with`` body exits,
    whether normally or by exception.
    """
    _ensure_init(attach_opaque_data)
    # isolation_level=None: callers issue BEGIN/COMMIT/ROLLBACK explicitly.
    connection = sqlite3.connect(_db_path(attach_opaque_data), isolation_level=None)
    with contextlib.closing(connection):
        connection.execute("PRAGMA journal_mode=WAL;")
        yield connection
174
+
175
+
176
def _init_db(attach_opaque_data: bytes) -> None:
    """Initialize the per-attach store up front; ``catalog_attach`` calls this.

    Thin wrapper over ``_ensure_init`` so attach-time setup has its own name.
    """
    return _ensure_init(attach_opaque_data)
179
+
180
+
181
+ def _bare_name(qualified: str) -> str:
182
+ return qualified.split(".", 1)[1] if "." in qualified else qualified
183
+
184
+
185
def _get_user_schema(qualified: str) -> pa.Schema:
    """Return the user-visible schema (no rowid) of a pre-defined table.

    Args:
        qualified: Table name, optionally prefixed with ``<schema>.``.

    Raises:
        ValueError: If the bare table name is not one of the pre-defined
            tables; the message lists the available names.
    """
    # Build the spec dict once: every _table_specs() call reconstructs all
    # schemas, and the lookup, the membership test and the error message can
    # all share a single copy.
    specs = _table_specs()
    table_schema = specs.get(_bare_name(qualified))
    if table_schema is None:
        raise ValueError(f"Unknown table {qualified!r}; available: {sorted(specs)}")
    return table_schema
190
+
191
+
192
+ # ============================================================================
193
+ # Helpers shared by write functions
194
+ # ============================================================================
195
+
196
+
197
+ def _qualified_from_bind(params: BindParams[None]) -> str:
198
+ args = params.bind_call.arguments
199
+ if not args.positional or args.positional[0] is None:
200
+ raise ValueError("table_name positional argument is required")
201
+ return str(args.positional[0].as_py())
202
+
203
+
204
+ def _qualified_from_process(params: ProcessParams[None]) -> str:
205
+ assert params.init_call is not None
206
+ args = params.init_call.bind_call.arguments
207
+ if not args.positional or args.positional[0] is None:
208
+ raise ValueError("table_name positional argument is required")
209
+ return str(args.positional[0].as_py())
210
+
211
+
212
+ def _attach_opaque_data_from_bind(params: BindParams[None]) -> bytes:
213
+ # Unwrapped plaintext attach (storage shards on the sealed form via request).
214
+ aid = params.attach_opaque_data
215
+ if aid is None:
216
+ raise ValueError("attach_opaque_data missing")
217
+ return bytes(aid)
218
+
219
+
220
+ def _attach_opaque_data_from_process(params: ProcessParams[None]) -> bytes:
221
+ aid = params.attach_opaque_data
222
+ if aid is None:
223
+ raise ValueError("attach_opaque_data missing")
224
+ return bytes(aid)
225
+
226
+
227
+ def _parse_write_options(params: BindParams[None]) -> dict[str, Any]:
228
+ """Decode the write_options batch passed in named arguments."""
229
+ defaults: dict[str, Any] = {"return_chunks": False, "on_conflict": "throw", "on_conflict_columns": []}
230
+ if not (params.bind_call.arguments and params.bind_call.arguments.named):
231
+ return defaults
232
+ val = params.bind_call.arguments.named.get("write_options")
233
+ if val is None:
234
+ return defaults
235
+ from vgi_rpc.utils import deserialize_record_batch
236
+
237
+ batch, _ = deserialize_record_batch(val.as_py())
238
+ out = dict(defaults)
239
+ if "return_chunks" in batch.schema.names:
240
+ out["return_chunks"] = batch.column("return_chunks")[0].as_py()
241
+ if "on_conflict" in batch.schema.names:
242
+ out["on_conflict"] = batch.column("on_conflict")[0].as_py()
243
+ if "on_conflict_columns" in batch.schema.names:
244
+ out["on_conflict_columns"] = batch.column("on_conflict_columns")[0].as_py()
245
+ return out
246
+
247
+
248
def _user_schema_from_bind(params: BindParams[None]) -> pa.Schema:
    """Return the user-visible schema of the table named in the bind call."""
    return _get_user_schema(_qualified_from_bind(params))
251
+
252
+
253
+ # ============================================================================
254
+ # Scan
255
+ # ============================================================================
256
+
257
+
258
class SimpleScan(TableFunctionGenerator[None, "_ScanState"]):
    """Scan one of the pre-defined tables — emits all current rows once."""

    class Meta:
        name = "simple_writable_scan"
        projection_pushdown = True
        filter_pushdown = False

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Advertise the table's user columns followed by the rowid column."""
        table_schema = _get_user_schema(_qualified_from_bind(params))
        # rowid is appended so UPDATE/DELETE can reference rows.
        return BindResponse(output_schema=pa.schema([*table_schema, _ROWID_FIELD]))

    @classmethod
    def on_init(cls, params: InitParams[None]) -> GlobalInitResponse:
        """Request a single worker for the scan."""
        return GlobalInitResponse(max_workers=1)

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> _ScanState:
        """Fetch every row of the table, ordered by rowid, into scan state."""
        qualified = _qualified_from_process(params)
        attach_key = _attach_opaque_data_from_process(params)
        table = _bare_name(qualified)
        # One SELECT entry per output_schema field, by position: DuckDB's
        # planner can request the same column twice (e.g. "id, qty, name, id"
        # for UPDATE...RETURNING), and `rowid` is selected like any other.
        quoted = [f'"{field.name}"' for field in params.output_schema]
        select_list = ", ".join(quoted) if quoted else "1"
        query = f'SELECT {select_list} FROM "{table}" ORDER BY rowid'
        with _connect(attach_key) as conn:
            fetched = conn.execute(query).fetchall()
        return _ScanState(rows=fetched, schema=params.output_schema)

    @classmethod
    def process(cls, params: ProcessParams[None], state: _ScanState, out: OutputCollector) -> None:
        """Emit all not-yet-emitted rows as one batch, or finish if none remain."""
        assert state.rows is not None and state.schema is not None
        total = len(state.rows)
        if state.cursor >= total:
            out.finish()
            return
        pending = state.rows[state.cursor :]
        width = len(state.schema)
        # Transpose by position so duplicate field names in the output schema
        # each get the SQL row's value at that position.
        columns = [[row[idx] for row in pending] for idx in range(width)]
        state.cursor = total
        arrays = [pa.array(values, type=state.schema.field(idx).type) for idx, values in enumerate(columns)]
        out.emit(pa.RecordBatch.from_arrays(arrays, schema=state.schema))


@dataclass(kw_only=True)
class _ScanState(ArrowSerializableDataclass):
    """Per-scan state: the fetched rows, their schema, and the emit position."""

    # Rows fetched by SimpleScan.initial_state. Marked Transient — presumably
    # excluded from Arrow serialization; confirm against vgi_rpc.
    rows: Annotated[list[tuple[Any, ...]] | None, Transient()] = None
    # Output schema the rows were selected for (also Transient).
    schema: Annotated[pa.Schema | None, Transient()] = None
    # Index of the first row not yet emitted.
    cursor: int = 0
316
+
317
+
318
+ # ============================================================================
319
+ # Insert / Update / Delete
320
+ # ============================================================================
321
+
322
+
323
class SimpleInsert(TableInOutGenerator[None, None]):
    """INSERT handler: append rows, optionally return the inserted rows."""

    class Meta:
        name = "simple_writable_insert"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Pick the output schema: table rows if return_chunks is set, else a count."""
        wants_rows = _parse_write_options(params)["return_chunks"]
        if not wants_rows:
            return BindResponse(output_schema=_COUNT_SCHEMA)
        return BindResponse(output_schema=_user_schema_from_bind(params))

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Insert every row of *batch* in one transaction and emit the result.

        Emits the inserted rows when the bound output schema is not the count
        schema, otherwise a single-row batch holding ``batch.num_rows``.
        """
        qualified = _qualified_from_process(params)
        attach_key = _attach_opaque_data_from_process(params)
        table = _bare_name(qualified)
        table_schema = _get_user_schema(qualified)
        emit_rows = params.output_schema != _COUNT_SCHEMA

        names = [field.name for field in table_schema]
        new_rows: list[tuple[Any, ...]] = [
            tuple(batch.column(column)[index].as_py() for column in names) for index in range(batch.num_rows)
        ]
        columns_sql = ", ".join(f'"{column}"' for column in names)
        marks = ", ".join("?" for _ in names)
        insert_sql = f'INSERT INTO "{table}" ({columns_sql}) VALUES ({marks})'

        with _connect(attach_key) as conn:
            conn.execute("BEGIN")
            conn.executemany(insert_sql, new_rows)
            conn.execute("COMMIT")

        if not emit_rows:
            out.emit(pa.RecordBatch.from_pydict({"count": [batch.num_rows]}, schema=_COUNT_SCHEMA))
            return
        # Transpose the inserted tuples back into one list per column.
        returned = {column: [row[pos] for row in new_rows] for pos, column in enumerate(names)}
        out.emit(pa.RecordBatch.from_pydict(returned, schema=table_schema))
373
+
374
+
375
class SimpleUpdate(TableInOutGenerator[None, None]):
    """UPDATE handler: input batch is (updated_cols..., rowid)."""

    class Meta:
        name = "simple_writable_update"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Pick the output schema: table rows if return_chunks is set, else a count."""
        wants_rows = _parse_write_options(params)["return_chunks"]
        if not wants_rows:
            return BindResponse(output_schema=_COUNT_SCHEMA)
        return BindResponse(output_schema=_user_schema_from_bind(params))

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Apply each input row as an UPDATE keyed by rowid, in one transaction.

        Raises:
            ValueError: If a rowid matches no row; the transaction is rolled
                back before raising.
        """
        qualified = _qualified_from_process(params)
        attach_key = _attach_opaque_data_from_process(params)
        table = _bare_name(qualified)
        table_schema = _get_user_schema(qualified)
        emit_rows = params.output_schema != _COUNT_SCHEMA

        # Every input column except rowid is a column being assigned.
        set_columns = [column for column in batch.schema.names if column != "rowid"]
        assignments = ", ".join(f'"{column}"=?' for column in set_columns)
        names = [field.name for field in table_schema]
        projection = ", ".join(f'"{column}"' for column in names)
        update_sql = f'UPDATE "{table}" SET {assignments} WHERE rowid=?'
        select_sql = f'SELECT {projection} FROM "{table}" WHERE rowid=?'

        rowids = batch.column("rowid")
        after: list[tuple[Any, ...]] = []
        with _connect(attach_key) as conn:
            conn.execute("BEGIN")
            for index in range(batch.num_rows):
                rowid = rowids[index].as_py()
                new_values = [batch.column(column)[index].as_py() for column in set_columns]
                touched = conn.execute(update_sql, (*new_values, rowid)).rowcount
                if touched == 0:
                    conn.execute("ROLLBACK")
                    raise ValueError(f"Update target rowid {rowid} not in table {qualified}")
                # Re-read the full row so RETURNING sees the post-update values.
                after.append(conn.execute(select_sql, (rowid,)).fetchone())
            conn.execute("COMMIT")

        if not emit_rows:
            out.emit(pa.RecordBatch.from_pydict({"count": [batch.num_rows]}, schema=_COUNT_SCHEMA))
            return
        returned = {column: [row[pos] for row in after] for pos, column in enumerate(names)}
        out.emit(pa.RecordBatch.from_pydict(returned, schema=table_schema))
427
+
428
+
429
class SimpleDelete(TableInOutGenerator[None, None]):
    """DELETE handler: input batch is just (rowid,)."""

    class Meta:
        name = "simple_writable_delete"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Pick the output schema: table rows if return_chunks is set, else a count."""
        wants_rows = _parse_write_options(params)["return_chunks"]
        if not wants_rows:
            return BindResponse(output_schema=_COUNT_SCHEMA)
        return BindResponse(output_schema=_user_schema_from_bind(params))

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Delete each rowid in *batch* in one transaction and emit the result.

        Raises:
            ValueError: If a rowid matches no row; the transaction is rolled
                back before raising.
        """
        qualified = _qualified_from_process(params)
        attach_key = _attach_opaque_data_from_process(params)
        table = _bare_name(qualified)
        table_schema = _get_user_schema(qualified)
        emit_rows = params.output_schema != _COUNT_SCHEMA

        names = [field.name for field in table_schema]
        projection = ", ".join(f'"{column}"' for column in names)
        select_sql = f'SELECT {projection} FROM "{table}" WHERE rowid=?'
        delete_sql = f'DELETE FROM "{table}" WHERE rowid=?'
        rowids = batch.column("rowid")

        removed: list[tuple[Any, ...]] = []
        with _connect(attach_key) as conn:
            conn.execute("BEGIN")
            for index in range(batch.num_rows):
                rowid = rowids[index].as_py()
                # Capture the row before deleting it so it can be returned.
                existing = conn.execute(select_sql, (rowid,)).fetchone()
                if existing is None:
                    conn.execute("ROLLBACK")
                    raise ValueError(f"Delete target rowid {rowid} not in table {qualified}")
                conn.execute(delete_sql, (rowid,))
                removed.append(existing)
            conn.execute("COMMIT")

        if not emit_rows:
            out.emit(pa.RecordBatch.from_pydict({"count": [batch.num_rows]}, schema=_COUNT_SCHEMA))
            return
        returned = {column: [row[pos] for row in removed] for pos, column in enumerate(names)}
        out.emit(pa.RecordBatch.from_pydict(returned, schema=table_schema))
478
+
479
+
480
class BrokenReturningInsert(TableInOutGenerator[None, None]):
    """Misbehaving INSERT handler that lies about its RETURNING support.

    Claims RETURNING support but always emits a (count BIGINT) batch —
    same shape that triggered the original SIGSEGV in the kafka worker.
    Used to verify the C++ extension's runtime schema validator throws a
    clean IOException instead of crashing inside ArrowToDuckDB.
    """

    class Meta:
        name = "simple_writable_broken_returning_insert"

    @classmethod
    def on_bind(cls, params: BindParams[None]) -> BindResponse:
        """Always bind to the count schema, ignoring ``return_chunks``."""
        # Always advertise the count surface, even when return_chunks=True.
        # The C++ side will see this at bind via the worker's output schema and
        # tries to route the responses through ArrowToDuckDB on the table-row
        # schema — that mismatch is what we want to catch at runtime.
        return BindResponse(output_schema=_COUNT_SCHEMA)

    @classmethod
    def process(
        cls,
        params: ProcessParams[None],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Insert every row of *batch* in one transaction, then emit a count batch."""
        qualified = _qualified_from_process(params)
        attach_key = _attach_opaque_data_from_process(params)
        table = _bare_name(qualified)
        names = [field.name for field in _get_user_schema(qualified)]

        new_rows: list[tuple[Any, ...]] = [
            tuple(batch.column(column)[index].as_py() for column in names) for index in range(batch.num_rows)
        ]
        columns_sql = ", ".join(f'"{column}"' for column in names)
        marks = ", ".join("?" for _ in names)
        insert_sql = f'INSERT INTO "{table}" ({columns_sql}) VALUES ({marks})'

        with _connect(attach_key) as conn:
            conn.execute("BEGIN")
            conn.executemany(insert_sql, new_rows)
            conn.execute("COMMIT")
        # Always emit count, regardless of return_chunks — that's the bug.
        count_batch = pa.RecordBatch.from_pydict({"count": [batch.num_rows]}, schema=_COUNT_SCHEMA)
        out.emit(count_batch)
529
+
530
+
531
+ # ============================================================================
532
+ # Catalog interface
533
+ # ============================================================================
534
+
535
+
536
# Static catalog descriptor: a single "main" schema that registers the worker
# functions and declares no tables.
_CATALOG = Catalog(
    name=CATALOG_NAME,
    default_schema="main",
    schemas=[
        Schema(
            name="main",
            functions=[
                SimpleScan,
                SimpleInsert,
                SimpleUpdate,
                SimpleDelete,
                BrokenReturningInsert,
            ],
            tables=[],
        ),
    ],
)
547
+
548
+
549
+ class SimpleWritableCatalog(ReadOnlyCatalogInterface):
550
+ """Function-only catalog whose pre-defined tables live in a per-attach SQLite file."""
551
+
552
+ catalog = _CATALOG
553
+ supports_transactions = False
554
+ catalog_version_frozen = True
555
+
556
def catalog_attach(
    self,
    *,
    name: str,
    options: dict[str, Any],
    data_version_spec: str | None,
    implementation_version: str | None,
    ctx: Any | None = None,
) -> CatalogAttachResult:
    """Attach the fixture catalog under a freshly generated opaque id.

    Only ``name`` is inspected; the remaining arguments are ignored.

    Raises:
        ValueError: If *name* is not ``CATALOG_NAME``.
    """
    del options, data_version_spec, implementation_version, ctx
    if name != CATALOG_NAME:
        raise ValueError(f"Unknown catalog: {name!r}")
    attach_token = AttachOpaqueData(uuid.uuid4().bytes)
    # Ensure the SQLite file and schema exist before any worker tries to
    # read/write — otherwise a SELECT before the first INSERT would 500.
    _init_db(bytes(attach_token))
    result = CatalogAttachResult(
        attach_opaque_data=attach_token,
        supports_transactions=False,
        supports_time_travel=False,
        catalog_version_frozen=True,
        catalog_version=1,
        attach_opaque_data_required=True,
        default_schema="main",
        settings=[],
        secret_types=[],
        resolved_data_version=None,
        resolved_implementation_version=None,
    )
    return result
585
+
586
+ # --------- schema / table discovery ---------
587
+
588
def schemas(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
) -> list[SchemaInfo]:
    """List the catalog's schemas: always exactly one, named ``main``."""
    del transaction_opaque_data
    main_schema = SchemaInfo(attach_opaque_data=attach_opaque_data, name="main", comment=None, tags={})
    return [main_schema]
593
+
594
def schema_get(
    self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None, name: str
) -> SchemaInfo | None:
    """Return the ``main`` schema info, or None for any other schema name.

    The name comparison is case-insensitive.
    """
    del transaction_opaque_data
    if name.lower() == "main":
        return SchemaInfo(attach_opaque_data=attach_opaque_data, name="main", comment=None, tags={})
    return None
601
+
602
def _build_table_info(self, *, name: str, schema_name: str) -> TableInfo:
    """Build the ``TableInfo`` descriptor for pre-defined table *name*.

    The serialized column schema is the user schema with the rowid field
    appended. No constraints are declared. INSERT is always supported;
    UPDATE/DELETE and RETURNING follow the per-table helper functions.
    """
    # Embed rowid at the end, with is_row_id metadata.
    columns_with_rowid = pa.schema([*_table_specs()[name], _ROWID_FIELD])
    mutable = _table_supports_update_delete(name)
    returning = _table_supports_returning(name)
    return TableInfo(
        comment=None,
        tags={},
        name=name,
        schema_name=schema_name,
        columns=SerializedSchema(columns_with_rowid.serialize().to_pybytes()),
        not_null_constraints=[],
        unique_constraints=[],
        check_constraints=[],
        primary_key_constraints=[],
        foreign_key_constraints=[],
        supports_insert=True,
        supports_update=mutable,
        supports_delete=mutable,
        supports_returning=returning,
    )
623
+
624
def table_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None = None,
    at_value: str | None = None,
) -> TableInfo | None:
    """Look up a pre-defined table by case-insensitive schema and table name.

    Returns:
        The table's ``TableInfo``, or None when the schema is not ``main`` or
        the table is not one of the pre-defined ones.
    """
    del attach_opaque_data, transaction_opaque_data, at_unit, at_value
    if schema_name.lower() != "main":
        return None
    table = name.lower()
    if table in _table_specs():
        return self._build_table_info(name=table, schema_name="main")
    return None
640
+
641
def view_get(self, **kwargs: Any) -> None:
    """Always return None, whatever lookup arguments are passed."""
    del kwargs  # accepted but never inspected
    return None
643
+
644
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.TABLE],
) -> Sequence[TableInfo]: ...
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.VIEW],
) -> Sequence[ViewInfo]: ...
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[
        SchemaObjectType.SCALAR_FUNCTION,
        SchemaObjectType.TABLE_FUNCTION,
        SchemaObjectType.AGGREGATE_FUNCTION,
    ],
) -> Sequence[FunctionInfo]: ...
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.SCALAR_MACRO, SchemaObjectType.TABLE_MACRO],
) -> Sequence[MacroInfo]: ...
@overload
def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: Literal[SchemaObjectType.INDEX],
) -> Sequence[IndexInfo]: ...

def schema_contents(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    name: str,
    type: SchemaObjectType,
) -> Sequence[Any]:
    """List the objects of one kind contained in schema ``name``.

    ``type`` may be a ``SchemaObjectType`` member or a raw value that the
    enum constructor accepts; it is coerced before anything else happens,
    so an invalid value raises from the enum constructor even when the
    schema is unknown.

    Returns:
        An empty list for any schema other than ``main`` (compared
        case-insensitively). For tables, one ``_build_table_info`` result
        per key of ``_table_specs()``, in sorted key order. For every
        other object type, whatever the base class returns.
    """
    if isinstance(type, SchemaObjectType):
        type_enum = type
    else:
        type_enum = SchemaObjectType(type)

    if name.lower() != "main":
        return []

    if type_enum == SchemaObjectType.TABLE:
        ordered_names = sorted(_table_specs())
        return [
            self._build_table_info(name=table_name, schema_name="main")
            for table_name in ordered_names
        ]

    # Functions, views, etc. — defer to the base class, which uses the static catalog.
    # The caller's original ``type`` value is forwarded, not the coerced enum.
    return super().schema_contents(  # type: ignore[call-overload, no-any-return]
        attach_opaque_data=attach_opaque_data,
        transaction_opaque_data=transaction_opaque_data,
        name=name,
        type=type,
    )
711
+
712
+ # --------- function dispatch ---------
713
+
714
def _function_get(self, kind: str, *, schema_name: str, name: str) -> ScanFunctionResult:
    """Build the dispatch result for the ``simple_writable_<kind>`` function.

    The target table is passed as the single positional argument:
    ``"<schema_name>.<name>"`` when ``schema_name`` is non-empty, otherwise
    just ``name``. No named arguments are supplied.
    """
    if schema_name:
        table_ref = f"{schema_name}.{name}"
    else:
        table_ref = name
    target_function = f"simple_writable_{kind}"
    return ScanFunctionResult(
        function_name=target_function,
        positional_arguments=[pa.scalar(table_ref)],
        named_arguments={},
    )
721
+
722
def table_scan_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    at_unit: str | None,
    at_value: str | None,
) -> ScanFunctionResult:
    """Return the scan dispatch for a table.

    Delegates to ``_function_get`` with kind ``"scan"``. The opaque data
    and the ``at_unit`` / ``at_value`` arguments are ignored.
    """
    del attach_opaque_data, transaction_opaque_data
    del at_unit, at_value
    scan_dispatch = self._function_get("scan", schema_name=schema_name, name=name)
    return scan_dispatch
734
+
735
def table_insert_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
    writable_branch_function_name: str | None = None,
) -> ScanFunctionResult:
    """Return the insert dispatch for a table.

    Every table is routed to ``simple_writable_insert`` except
    ``items_broken_returning`` (matched case-insensitively), which is
    routed to ``simple_writable_broken_returning_insert``. Both results
    are built by ``_function_get`` so the qualified-name logic lives in
    one place. The opaque data and ``writable_branch_function_name`` are
    ignored.
    """
    del attach_opaque_data, transaction_opaque_data, writable_branch_function_name
    # Route the broken table to the misbehaving insert function. Tests rely
    # on this lying about RETURNING shape so the C++ runtime validator
    # gets exercised.
    if name.lower() == "items_broken_returning":
        kind = "broken_returning_insert"
    else:
        kind = "insert"
    return self._function_get(kind, schema_name=schema_name, name=name)
756
+
757
def table_update_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Return the update dispatch for a table.

    Delegates to ``_function_get`` with kind ``"update"``; the opaque
    attach and transaction data are ignored.
    """
    del attach_opaque_data
    del transaction_opaque_data
    update_dispatch = self._function_get("update", schema_name=schema_name, name=name)
    return update_dispatch
767
+
768
def table_delete_function_get(
    self,
    *,
    attach_opaque_data: AttachOpaqueData,
    transaction_opaque_data: TransactionOpaqueData | None,
    schema_name: str,
    name: str,
) -> ScanFunctionResult:
    """Return the delete dispatch for a table.

    Delegates to ``_function_get`` with kind ``"delete"``; the opaque
    attach and transaction data are ignored.
    """
    del attach_opaque_data
    del transaction_opaque_data
    delete_dispatch = self._function_get("delete", schema_name=schema_name, name=name)
    return delete_dispatch
778
+
779
+
780
class SimpleWritableWorker(Worker):
    """Worker exposing :class:`SimpleWritableCatalog`.

    The class carries configuration only: it names the catalog
    implementation and the module-level catalog object.
    """

    # Catalog implementation class for this worker.
    catalog_interface = SimpleWritableCatalog
    # Module-level catalog object (presumably the static catalog the
    # interface falls back to for non-table objects — confirm against Worker).
    catalog = _CATALOG
785
+
786
+
787
def main() -> None:
    """Start the simple writable worker by calling ``SimpleWritableWorker.main``."""
    worker_cls = SimpleWritableWorker
    worker_cls.main()
790
+
791
+
792
# Start the worker only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()