vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,762 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """VGI worker that accumulates table rows, keyed by name, in framework storage.
4
+
5
+ Functions
6
+ ---------
7
+ - ``accumulate(name, <rows>, ttl, max_row_size, result)`` — append rows to a
8
+ named collection and optionally return its contents.
9
+ - ``accumulate_read(name)`` — read a collection's contents without modifying it.
10
+ - ``accumulate_clear(name)`` — drop a collection; returns rows removed.
11
+
12
+ Each ``accumulate`` call stamps the input rows with a single call-time
13
+ ``_timestamp`` and appends them. The ``result`` option controls what it returns:
14
+ ``'all'`` (the whole collection, default), ``'new'`` (only the rows added by
15
+ this call), or ``'none'`` (nothing — a cheap append). The input schema is
16
+ validated against whatever schema was first accumulated under that name. Two
17
+ optional named parameters bound the collection: ``ttl`` (an INTERVAL — rows
18
+ older than ``call_time - ttl`` are evicted) and ``max_row_size`` (a row cap;
19
+ oldest dropped first).
20
+
21
+ Storage, scoping & performance
22
+ ------------------------------
23
+ Data is persisted through the VGI framework's ``FunctionStorage`` (the worker's
24
+ ``cls.storage``), so the backend is pluggable via ``VGI_WORKER_SHARED_STORAGE``:
25
+ a file-backed SQLite (default; persistent across restarts), in-memory, Azure
26
+ SQL, or Cloudflare Durable Objects (the last two are durable across machines).
27
+
28
+ Each collection is scoped to a random *attach id* minted once per ``ATTACH``
29
+ (carried back on every call via the catalog's attach-opaque-data), so two
30
+ independent ATTACH sessions never share a collection. Within that scope a
31
+ collection's rows live as append-only *segments* keyed by ingest time under a
32
+ per-collection namespace, so an append is O(batch) and needs no lock: each op is
33
+ a single atomic storage statement. A TTL evicts in one ranged delete of the
34
+ time-ordered key range (whole expired segments, exactly the expired rows, since
35
+ a segment carries a single call timestamp). ``max_row_size`` keeps an atomic
36
+ int64 row counter and, when the cap is exceeded, drops the oldest segments
37
+ (trimming only the one straddling segment) — no whole-collection repack.
38
+
39
+ Usage
40
+ -----
41
+ Hosted inside the consolidated ``vgi-fixture-worker`` (and the
42
+ ``vgi-fixture-http`` server) via MetaWorker — attach by catalog name:
43
+
44
+ ATTACH 'accumulate' AS accumulate (TYPE vgi, LOCATION '${VGI_TEST_WORKER}');
45
+ SELECT * FROM accumulate.main.accumulate('events', (SELECT * FROM my_rows));
46
+ SELECT * FROM accumulate.main.accumulate('events', (VALUES (1)) t(x), result := 'new');
47
+ SELECT * FROM accumulate.main.accumulate_read('events');
48
+ SELECT * FROM accumulate.main.accumulate_clear('events');
49
+
50
+ Exercised end-to-end by ``test/sql/integration/accumulate/*.test`` in the C++
51
+ repo and mirrored by ``tests/conformance/test_accumulate.py``.
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ import dataclasses
57
+ import uuid
58
+ from datetime import UTC, datetime, timedelta
59
+ from typing import TYPE_CHECKING, Annotated, Any
60
+
61
+ import pyarrow as pa
62
+ from vgi_rpc import ArrowSerializableDataclass
63
+ from vgi_rpc.rpc import OutputCollector
64
+
65
+ from vgi import Worker
66
+ from vgi.arguments import Arg, TableInput
67
+ from vgi.catalog import Catalog, ReadOnlyCatalogInterface, Schema
68
+ from vgi.catalog.catalog_interface import AttachOpaqueData, CatalogAttachResult, CatalogInfo
69
+ from vgi.function_storage import BoundStorage, FunctionStorage
70
+ from vgi.invocation import BindResponse
71
+ from vgi.metadata import FunctionExample
72
+ from vgi.table_buffering_function import TableBufferingFunction, TableBufferingParams
73
+ from vgi.table_function import (
74
+ BindParams,
75
+ ProcessParams,
76
+ TableFunctionGenerator,
77
+ init_single_worker,
78
+ )
79
+
80
+ if TYPE_CHECKING:
81
+ from vgi_rpc.rpc import CallContext
82
+
83
# Version strings for this fixture worker. How they are consumed is not
# visible in this part of the file.
DATA_VERSION = "2.0.0"
IMPLEMENTATION_VERSION = "vgi-fixture"

# Name of the column appended to every output row holding the per-call ingest
# time. Plain (tz-naive) microsecond timestamp so it surfaces as DuckDB
# TIMESTAMP rather than TIMESTAMP WITH TIME ZONE. Underscore-prefixed so it is
# unlikely to collide with a user's own column named ``timestamp``.
TIMESTAMP_COLUMN = "_timestamp"
TIMESTAMP_TYPE = pa.timestamp("us")

# Target rows per emitted/staged batch (output is streamed in chunks of this
# size) and per stored/repacked segment.
OUT_BATCH_ROWS = 65536

# Execution-scoped BoundStorage namespaces (transient per query) for the
# buffering operator's Sink->Combine->Source handoff and for accumulate_read.
_NS_IN = b"in"  # staged input batches (Sink -> Combine)
_NS_OUT = b"out"  # staged result rows (Combine -> Source/finalize)
_NS_READ = b"read"  # staged snapshot for accumulate_read

# Persistent (attach-scoped) namespaces. A collection's segments live under a
# per-collection namespace keyed by ingest time, so the whole collection wipes
# with one namespace delete and a TTL cutoff is one ranged delete. The schema
# lives under a shared meta namespace; the row count under a per-collection
# int64 counter (the separate function_counter table) keyed by collection name
# in that same namespace.
_SEG_NS_PREFIX = b"seg:"
_META_NS = b"meta"

# Reference point for converting tz-naive datetimes to integer microseconds
# (see ``_to_us``). Naive on purpose: it is subtracted from naive datetimes.
_EPOCH = datetime(1970, 1, 1)

# Width of the big-endian ingest-time prefix on each segment key, so segment
# keys sort by time (memcmp == numeric for fixed-width unsigned big-endian).
_TS_KEY_BYTES = 8
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Time / schema helpers
121
+ # ---------------------------------------------------------------------------
122
+
123
+
124
+ def _now_naive() -> datetime:
125
+ return datetime.now(UTC).replace(tzinfo=None)
126
+
127
+
128
def _to_us(dt: datetime) -> int:
    """Return the whole microseconds between ``_EPOCH`` and ``dt`` (floored)."""
    elapsed = dt - _EPOCH
    one_microsecond = timedelta(microseconds=1)
    return elapsed // one_microsecond
130
+
131
+
132
+ def _interval_to_timedelta(interval: Any) -> timedelta:
133
+ """Convert a DuckDB INTERVAL (pa.MonthDayNano) to a timedelta.
134
+
135
+ Calendar months have no fixed length, so each month is approximated as 30
136
+ days: ``INTERVAL '1 month'`` evicts rows older than 30 days, not older than
137
+ one calendar month. Use ``INTERVAL '30 days'`` / ``'24 hours'`` etc. when an
138
+ exact span matters.
139
+ """
140
+ months = getattr(interval, "months", 0) or 0
141
+ days = getattr(interval, "days", 0) or 0
142
+ nanoseconds = getattr(interval, "nanoseconds", 0) or 0
143
+ return timedelta(days=months * 30 + days, microseconds=nanoseconds // 1000)
144
+
145
+
146
def _output_schema(input_schema: pa.Schema) -> pa.Schema:
    """Return the input fields followed by the ``_timestamp`` field."""
    fields = list(input_schema)
    timestamp_field = pa.field(TIMESTAMP_COLUMN, TIMESTAMP_TYPE)
    return pa.schema(fields + [timestamp_field])
148
+
149
+
150
def _input_schema_of(output_schema: pa.Schema) -> pa.Schema:
    """Return ``output_schema`` without any field named ``TIMESTAMP_COLUMN``."""
    user_fields = []
    for field in output_schema:
        if field.name == TIMESTAMP_COLUMN:
            continue
        user_fields.append(field)
    return pa.schema(user_fields)
152
+
153
+
154
+ def _schemas_match(expected: pa.Schema, actual: pa.Schema) -> bool:
155
+ return expected.equals(actual, check_metadata=False)
156
+
157
+
158
# Upper bound on a collection name's UTF-8 byte length. The name becomes the
# suffix of a storage namespace key (``seg:<name>``), so it is bounded to keep
# keys small; the limit is generous enough for any real-world name.
# Enforced by ``_validate_name``.
_MAX_NAME_BYTES = 255
162
+
163
+
164
+ def _validate_name(name: str) -> None:
165
+ """Reject empty/blank or oversized collection names at bind time."""
166
+ if not name or not name.strip():
167
+ raise ValueError("collection name must be a non-empty string")
168
+ if len(name.encode()) > _MAX_NAME_BYTES:
169
+ raise ValueError(f"collection name must be at most {_MAX_NAME_BYTES} bytes")
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # Arrow IPC (de)serialization
174
+ # ---------------------------------------------------------------------------
175
+
176
+
177
def _table_to_ipc(table: pa.Table) -> bytes:
    """Serialize ``table`` as an Arrow IPC stream and return it as ``bytes``."""
    out = pa.BufferOutputStream()
    with pa.ipc.new_stream(out, table.schema) as ipc_writer:
        ipc_writer.write_table(table)
    # Read the buffer only after the writer has closed the stream.
    payload = out.getvalue()
    return payload.to_pybytes()
182
+
183
+
184
def _table_from_ipc(blob: bytes) -> pa.Table:
    """Decode an Arrow IPC stream (as written by ``_table_to_ipc``) into a table."""
    source = pa.py_buffer(blob)
    with pa.ipc.open_stream(source) as stream_reader:
        table = stream_reader.read_all()
    return table
187
+
188
+
189
def _batch_to_ipc(batch: pa.RecordBatch) -> bytes:
    """Serialize one record batch as a single-batch Arrow IPC stream."""
    out = pa.BufferOutputStream()
    with pa.ipc.new_stream(out, batch.schema) as ipc_writer:
        ipc_writer.write_batch(batch)
    # Read the buffer only after the writer has closed the stream.
    payload = out.getvalue()
    return payload.to_pybytes()
194
+
195
+
196
def _batch_from_ipc(value: bytes) -> pa.RecordBatch:
    """Decode the first record batch of an Arrow IPC stream.

    Counterpart of ``_batch_to_ipc``, which writes exactly one batch per
    stream. The reader is used as a context manager (as ``_table_from_ipc``
    does) so it is closed deterministically instead of being left to garbage
    collection.

    Args:
        value: Arrow IPC stream bytes.

    Returns:
        The first record batch in the stream.
    """
    with pa.ipc.open_stream(value) as reader:
        return reader.read_next_batch()
198
+
199
+
200
def _schema_to_ipc(schema: pa.Schema) -> bytes:
    """Serialize ``schema`` by encoding a zero-row table that carries it."""
    zero_row_table = schema.empty_table()
    return _table_to_ipc(zero_row_table)
202
+
203
+
204
def _schema_from_ipc(blob: bytes) -> pa.Schema:
    """Recover the schema from bytes produced by ``_schema_to_ipc``."""
    decoded = _table_from_ipc(blob)
    return decoded.schema
206
+
207
+
208
def _stage_table(storage: BoundStorage, ns: bytes, table: pa.Table) -> None:
    """Append ``table`` to the log under ``ns`` as batches of at most ``OUT_BATCH_ROWS`` rows.

    Each batch is appended under the empty key ``b""`` as its own IPC payload.
    """
    bounded_batches = table.to_batches(max_chunksize=OUT_BATCH_ROWS)
    for record_batch in bounded_batches:
        payload = _batch_to_ipc(record_batch)
        storage.state_append(ns, b"", payload)
212
+
213
+
214
+ # ---------------------------------------------------------------------------
215
+ # Persistent, attach-scoped collection store (over FunctionStorage)
216
+ # ---------------------------------------------------------------------------
217
+ #
218
+ # `ps` below is a BoundStorage bound to the ATTACH scope (stable across queries),
219
+ # distinct from the per-query `params.storage`.
220
+
221
+
222
def _store(storage: FunctionStorage, attach_opaque_data: bytes | None) -> BoundStorage:
    """Return a ``BoundStorage`` scoped to the ATTACH session.

    The scope key is ``attach_opaque_data``, or ``b"default"`` when that is
    ``None`` or empty, so the binding persists across queries in one session.

    No ``attach_plaintext`` is supplied, so under shard-routing backends
    (``VGI_SQLITE_SHARD=1``, cloudflare-do) the data lands on the default
    shard — irrelevant for the plain sqlite backends the test suites use.
    """
    scope_key = attach_opaque_data or b"default"
    return BoundStorage(storage, scope_key)
230
+
231
+
232
def _seg_ns(name: bytes) -> bytes:
    """Return the segment namespace for a collection: ``_SEG_NS_PREFIX`` + name."""
    return b"".join((_SEG_NS_PREFIX, name))
234
+
235
+
236
def _seg_key(call_ts_us: int) -> bytes:
    """Build a segment key: fixed-width big-endian ingest time, then a random UUID.

    The time prefix makes keys sort by ingest time. The UUID suffix keeps
    segments written with the same timestamp distinct.
    """
    time_prefix = call_ts_us.to_bytes(_TS_KEY_BYTES, "big")
    unique_suffix = uuid.uuid4().bytes
    return time_prefix + unique_suffix
239
+
240
+
241
def _get_schema(ps: BoundStorage, name: bytes) -> pa.Schema | None:
    """Return the schema pinned for ``name``, or ``None`` when none is stored."""
    stored = ps.state_get(_META_NS, name)
    if stored is None:
        return None
    return _schema_from_ipc(stored)
244
+
245
+
246
def _put_schema(ps: BoundStorage, name: bytes, output_schema: pa.Schema) -> None:
    """Store ``output_schema`` as the pinned schema for ``name`` in the meta namespace."""
    encoded_schema = _schema_to_ipc(output_schema)
    ps.state_put(_META_NS, name, encoded_schema)
248
+
249
+
250
def _get_count(ps: BoundStorage, name: bytes) -> int:
    """Read the row counter kept for collection ``name`` in the meta namespace."""
    current_rows = ps.counter_get(_META_NS, name)
    return current_rows
253
+
254
+
255
def _append_segment(ps: BoundStorage, name: bytes, table: pa.Table, call_ts_us: int) -> None:
    """Write ``table`` as one new time-keyed segment, then add its rows to the counter."""
    namespace = _seg_ns(name)
    segment_key = _seg_key(call_ts_us)
    payload = _table_to_ipc(table)
    ps.state_put(namespace, segment_key, payload)
    ps.counter_add(_META_NS, name, table.num_rows)
259
+
260
+
261
def _read_collection(ps: BoundStorage, name: bytes, output_schema: pa.Schema) -> pa.Table:
    """Concatenate every stored segment of ``name`` into one table.

    Returns an empty table with ``output_schema`` when there are no segments.
    """
    segments = []
    # Segment keys start with the ingest time, so the scan is expected to
    # yield them oldest-first.
    for _segment_key, payload in ps.state_scan(_seg_ns(name)):
        segments.append(_table_from_ipc(payload))
    if not segments:
        return output_schema.empty_table()
    return pa.concat_tables(segments)
265
+
266
+
267
+ def _evict_ttl(ps: BoundStorage, name: bytes, cutoff_us: int) -> None:
268
+ """Drop segments whose ingest time is before ``cutoff_us`` (one ranged delete).
269
+
270
+ A segment carries a single call timestamp, so the time-keyed range
271
+ ``[.., cutoff)`` is exactly the expired rows. We sum their rows first (the
272
+ expired set is small and about to be deleted) to keep the counter exact.
273
+ """
274
+ if cutoff_us <= 0:
275
+ return # nothing predates the epoch
276
+ end = cutoff_us.to_bytes(_TS_KEY_BYTES, "big")
277
+ removed = sum(_table_from_ipc(value).num_rows for _key, value in ps.state_scan(_seg_ns(name), end=end))
278
+ if removed:
279
+ ps.state_delete(_seg_ns(name), end=end)
280
+ ps.counter_add(_META_NS, name, -removed)
281
+
282
+
283
+ def _evict_max_rows(ps: BoundStorage, name: bytes, total: int, max_row_size: int) -> None:
284
+ """Drop the oldest rows until at most ``max_row_size`` remain.
285
+
286
+ Walks segments oldest-first, deleting whole segments and trimming only the
287
+ one segment that straddles the cap — never a whole-collection rewrite.
288
+ """
289
+ overflow = total - max_row_size
290
+ removed = 0
291
+ delete_keys: list[bytes] = []
292
+ trim: tuple[bytes, pa.Table] | None = None
293
+ for key, value in ps.state_scan(_seg_ns(name)): # oldest-first
294
+ seg = _table_from_ipc(value)
295
+ if removed + seg.num_rows <= overflow:
296
+ removed += seg.num_rows
297
+ delete_keys.append(key)
298
+ if removed == overflow:
299
+ break
300
+ else:
301
+ # Boundary segment: keep its newest rows, drop the oldest.
302
+ trim = (key, seg.slice(overflow - removed))
303
+ removed = overflow
304
+ break
305
+ if delete_keys:
306
+ ps.state_delete(_seg_ns(name), delete_keys)
307
+ if trim is not None:
308
+ trim_key, trim_table = trim
309
+ ps.state_put(_seg_ns(name), trim_key, _table_to_ipc(trim_table))
310
+ if removed:
311
+ ps.counter_add(_META_NS, name, -removed)
312
+
313
+
314
def _clear_collection(ps: BoundStorage, name: bytes) -> int:
    """Remove a collection's segments, pinned schema and counter.

    Returns the counter value read before anything was deleted.
    """
    rows_before = ps.counter_get(_META_NS, name)
    segment_namespace = _seg_ns(name)
    ps.state_delete(segment_namespace, None)
    ps.state_delete(_META_NS, [name])
    ps.counter_delete(_META_NS, name)
    return rows_before
321
+
322
+
323
+ # ---------------------------------------------------------------------------
324
+ # accumulate(name, <rows>, ttl, max_row_size, result)
325
+ # ---------------------------------------------------------------------------
326
+
327
# Accepted values for accumulate's ``result`` option; passed as ``choices`` to
# the ``result`` Arg on AccumulateArgs.
_RESULT_CHOICES = ("all", "new", "none")
328
+
329
+
330
@dataclasses.dataclass(slots=True, frozen=True, kw_only=True)
class AccumulateArgs:
    """Arguments for the ``accumulate`` table function.

    ``name`` and ``data`` are positional (Arg indexes 0 and 1); ``ttl``,
    ``max_row_size`` and ``result`` are named options with defaults.
    """

    # Collection to append to.
    name: Annotated[str, Arg(0, doc="Name of the collection to accumulate into")]
    # The table expression whose rows are appended.
    data: Annotated[TableInput, Arg(1, doc="Rows to accumulate (any table expression)")]
    # Declared as an Arrow month_day_nano_interval. None (the default) means
    # no TTL eviction is performed.
    ttl: Annotated[
        object | None,
        Arg(
            "ttl",
            default=None,
            arrow_type=pa.month_day_nano_interval(),
            doc="Evict rows older than this INTERVAL before returning (months are treated as 30 days)",
        ),
    ] = None
    # Row cap; 0 (the default) disables the cap.
    max_row_size: Annotated[
        int,
        Arg(
            "max_row_size",
            default=0,
            ge=0,
            doc="Maximum rows retained per name; oldest dropped first (0 = unlimited)",
        ),
    ] = 0
    # One of _RESULT_CHOICES; selects which rows the call returns.
    result: Annotated[
        str,
        Arg(
            "result",
            default="all",
            choices=_RESULT_CHOICES,
            doc="What to return: 'all' accumulated rows (default), only the 'new' rows, or 'none'",
        ),
    ] = "all"
363
+
364
+
365
@dataclasses.dataclass
class AccumulateDrainState(ArrowSerializableDataclass):
    """Cursor over the staged output log, advanced one batch per finalize tick."""

    # Log id of the last batch emitted; -1 means nothing has been emitted yet.
    after_id: int = -1
370
+
371
+
372
class AccumulateFunction(TableBufferingFunction[AccumulateArgs, AccumulateDrainState]):
    """Append input rows to a named collection; optionally return the collection.

    A buffering (Sink -> Combine -> Source) operator: the input is staged across
    the parallel sink, ``combine`` runs once to stamp the rows with a single
    timestamp, append them to the persistent collection, apply ttl/max_row_size,
    and stage the rows to return, and the source streams them back.
    """

    class Meta:
        """Function metadata."""

        name = "accumulate"
        description = "Append rows to a named collection; return all/new/no rows with a _timestamp column"
        categories = ["stateful", "utility"]
        tags = {"category": "stateful", "type": "accumulator"}
        examples = [
            FunctionExample(
                sql="SELECT * FROM accumulate('events', (VALUES (1), (2)) t(x))",
                description="Accumulate two rows under 'events' and return the full collection",
            ),
            FunctionExample(
                sql="SELECT * FROM accumulate('events', (VALUES (3)) t(x), result := 'new')",
                description="Append a row and return only the newly-added rows",
            ),
            FunctionExample(
                sql="SELECT * FROM accumulate('events', (VALUES (4)) t(x), result := 'none')",
                description="Append a row and return nothing (cheap, fire-and-forget)",
            ),
            FunctionExample(
                sql=(
                    "SELECT * FROM accumulate('events', (VALUES (5)) t(x), "
                    "ttl := INTERVAL '1 hour', max_row_size := 1000)"
                ),
                description="Append with a 1-hour TTL and a 1000-row cap",
            ),
        ]

    @classmethod
    def on_bind(cls, params: BindParams[AccumulateArgs]) -> BindResponse:
        """Validate the input schema against the named collection and add timestamp.

        Pins the output schema for a name that has none yet.

        Raises:
            ValueError: if the name is invalid, there is no table input, the
                input already has a ``_timestamp`` column, or the input schema
                differs from the one pinned for this name.
        """
        _validate_name(params.args.name)
        input_schema = params.bind_call.input_schema
        if input_schema is None:
            raise ValueError("accumulate requires a table input")
        if TIMESTAMP_COLUMN in input_schema.names:
            raise ValueError(
                f"input may not contain a reserved '{TIMESTAMP_COLUMN}' column; "
                "accumulate adds this column to its output"
            )

        ps = _store(cls.storage, params.attach_opaque_data)
        name = params.args.name.encode()
        out_schema = _output_schema(input_schema)
        # Lock-free schema pin: read the pinned schema, write it if absent, or
        # reject a mismatch. The only race is two *simultaneous first* appends
        # of *incompatible* schemas to a brand-new name (pathological); the
        # worst case is a confusing validation error, never data corruption.
        existing = _get_schema(ps, name)
        if existing is None:
            _put_schema(ps, name, out_schema)
        elif not _schemas_match(_input_schema_of(existing), input_schema):
            raise ValueError(
                f"input schema for accumulate('{params.args.name}', ...) does not match the "
                f"schema already accumulated under that name.\n"
                f"  accumulated: {_input_schema_of(existing)}\n"
                f"  received:    {input_schema}"
            )
        return BindResponse(output_schema=out_schema)

    # ---- Sink: stage each input batch (parallel across DuckDB threads) ----
    @classmethod
    def process(cls, batch: pa.RecordBatch, params: TableBufferingParams[AccumulateArgs]) -> bytes:
        """Stage one input batch into the execution-scoped log.

        Returns the execution id as this batch's state id.
        """
        params.storage.state_append(_NS_IN, b"", _batch_to_ipc(batch))
        return params.execution_id

    # ---- Combine: append, evict, and stage the requested result ----
    @classmethod
    def combine(cls, state_ids: list[bytes], params: TableBufferingParams[AccumulateArgs]) -> list[bytes]:
        """Append staged input to the collection, apply eviction, stage the result.

        Order of operations: reassemble the staged input, stamp it with one
        call timestamp, append it as a single segment, evict by ``ttl``, evict
        by ``max_row_size``, then stage the rows selected by ``result``.

        Returns:
            A one-element list holding the execution id.
        """
        ps = _store(cls.storage, params.attach_opaque_data)
        name = params.args.name.encode()
        ttl = params.args.ttl
        max_row_size = params.args.max_row_size
        result_mode = params.args.result
        output_schema = params.output_schema
        input_schema = _input_schema_of(output_schema)

        # Reassemble this call's input from the execution-scoped staging log.
        staged = params.storage.state_log_scan(_NS_IN, b"", after_id=-1, limit=None)
        input_batches = [_batch_from_ipc(value) for _id, value in staged]
        new_input = (
            pa.Table.from_batches(input_batches, schema=input_schema) if input_batches else input_schema.empty_table()
        )

        call_ts = _now_naive()
        call_ts_us = _to_us(call_ts)
        if new_input.num_rows:
            # Every row of this call carries the same ingest timestamp.
            ts_col = pa.array([call_ts] * new_input.num_rows, type=TIMESTAMP_TYPE)
            new_table = new_input.append_column(pa.field(TIMESTAMP_COLUMN, TIMESTAMP_TYPE), ts_col)
        else:
            new_table = output_schema.empty_table()

        # No lock: each step below is a single atomic storage op. Append is
        # O(batch); a TTL is one ranged delete; max_row_size drops whole oldest
        # segments plus at most one trimmed boundary segment.
        if new_table.num_rows:
            _append_segment(ps, name, new_table, call_ts_us)

        if ttl is not None:
            # Compute the cutoff in integer microseconds rather than on
            # datetimes: ``call_ts - timedelta`` raises OverflowError once the
            # TTL reaches back before datetime.min, and that would fail the
            # call *after* the rows were already appended. A cutoff at or
            # before the epoch is a no-op in _evict_ttl.
            ttl_us = _interval_to_timedelta(ttl) // timedelta(microseconds=1)
            _evict_ttl(ps, name, call_ts_us - ttl_us)

        if max_row_size:
            total = _get_count(ps, name)
            if total > max_row_size:
                _evict_max_rows(ps, name, total, max_row_size)

        if result_mode == "all":
            to_emit: pa.Table | None = _read_collection(ps, name, output_schema)
        elif result_mode == "new":
            to_emit = new_table  # the rows this call added (pre-eviction)
        else:  # "none"
            to_emit = None

        if to_emit is not None and to_emit.num_rows:
            _stage_table(params.storage, _NS_OUT, to_emit)

        return [params.execution_id]

    # ---- Source: drain the staged result, one batch per tick ----
    @classmethod
    def initial_finalize_state(
        cls, finalize_state_id: bytes, params: TableBufferingParams[AccumulateArgs]
    ) -> AccumulateDrainState:
        """Start the drain cursor at the beginning of the staged output log."""
        return AccumulateDrainState(after_id=-1)

    @classmethod
    def finalize(
        cls,
        params: TableBufferingParams[AccumulateArgs],
        finalize_state_id: bytes,
        state: AccumulateDrainState,
        out: OutputCollector,
    ) -> None:
        """Emit the next staged output batch, or finish when the log is drained."""
        rows = params.storage.state_log_scan(_NS_OUT, b"", after_id=state.after_id, limit=1)
        if not rows:
            out.finish()
            return
        log_id, value = rows[0]
        out.emit(_batch_from_ipc(value))
        # Advance the cursor so the next tick reads the following log entry.
        state.after_id = log_id
526
+
527
+
528
+ # ---------------------------------------------------------------------------
529
+ # accumulate_read(name) — read a collection without modifying it
530
+ # ---------------------------------------------------------------------------
531
+
532
+
533
@dataclasses.dataclass(slots=True, frozen=True, kw_only=True)
class AccumulateReadArgs:
    """Arguments for the ``accumulate_read`` table function."""

    # Sole positional argument (Arg index 0): the collection to read.
    name: Annotated[str, Arg(0, doc="Name of the collection to read")]
538
+
539
+
540
@dataclasses.dataclass
class AccumulateReadState(ArrowSerializableDataclass):
    """Whether the snapshot has been staged, plus the drain cursor."""

    # True once the collection snapshot has been written to the staging log.
    staged: bool = False
    # Log id of the last batch emitted; -1 means nothing has been emitted yet.
    after_id: int = -1
546
+
547
+
548
@init_single_worker
class AccumulateReadFunction(TableFunctionGenerator[AccumulateReadArgs, AccumulateReadState]):
    """Stream back a named collection's rows, leaving the collection untouched.

    The output columns are those ``accumulate`` returns (the input columns plus
    ``_timestamp``). Binding fails for a name with no pinned schema in this
    session. No row order is promised; add ``ORDER BY _timestamp`` for a
    stable ordering.
    """

    class Meta:
        """Function metadata."""

        name = "accumulate_read"
        description = "Read an accumulated collection's rows without modifying it"
        categories = ["stateful", "utility"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM accumulate_read('events')",
                description="Return all rows accumulated under 'events'",
            ),
        ]

    @classmethod
    def on_bind(cls, params: BindParams[AccumulateReadArgs]) -> BindResponse:
        """Look up the schema pinned for the name and use it as the output schema.

        Raises:
            ValueError: if the name is invalid or has no pinned schema.
        """
        _validate_name(params.args.name)
        attach_store = _store(cls.storage, params.attach_opaque_data)
        encoded_name = params.args.name.encode()
        pinned_schema = _get_schema(attach_store, encoded_name)
        if pinned_schema is not None:
            return BindResponse(output_schema=pinned_schema)
        raise ValueError(f"no accumulation named '{params.args.name}' in this session")

    @classmethod
    def initial_state(cls, params: ProcessParams[AccumulateReadArgs]) -> AccumulateReadState:
        """Begin with nothing staged and the cursor before the first log entry."""
        return AccumulateReadState(after_id=-1, staged=False)

    @classmethod
    def process(
        cls,
        params: ProcessParams[AccumulateReadArgs],
        state: AccumulateReadState,
        out: OutputCollector,
    ) -> None:
        """Stage a snapshot on the first tick, then emit one staged batch per tick."""
        if not state.staged:
            # First tick: copy the collection into the execution-scoped log in
            # bounded batches so later ticks only drain that log.
            attach_store = _store(cls.storage, params.attach_opaque_data)
            encoded_name = params.args.name.encode()
            snapshot = _read_collection(attach_store, encoded_name, params.output_schema)
            _stage_table(params.storage, _NS_READ, snapshot)
            state.staged = True

        pending = params.storage.state_log_scan(_NS_READ, b"", after_id=state.after_id, limit=1)
        if pending:
            log_id, payload = pending[0]
            out.emit(_batch_from_ipc(payload))
            state.after_id = log_id
        else:
            # Log drained: signal end of output.
            out.finish()
606
+
607
+
608
+ # ---------------------------------------------------------------------------
609
+ # accumulate_clear(name)
610
+ # ---------------------------------------------------------------------------
611
+
612
# Columns of the single result row produced by ``accumulate_clear``.
_CLEAR_FIELDS: list[pa.Field[Any]] = [
    pa.field("name", pa.string()),
    pa.field("rows_cleared", pa.int64()),
]
# Fixed output schema built from the fields above.
CLEAR_SCHEMA = pa.schema(_CLEAR_FIELDS)
614
+
615
+
616
@dataclasses.dataclass(frozen=True, kw_only=True, slots=True)
class AccumulateClearArgs:
    """Bound arguments accepted by the ``accumulate_clear`` table function."""

    # Positional argument 0: the collection to remove.
    name: Annotated[str, Arg(0, doc="Name of the collection to clear")]
621
+
622
+
623
@dataclasses.dataclass()
class AccumulateClearState(ArrowSerializableDataclass):
    """Progress flag for ``accumulate_clear``: has the one result row gone out?"""

    # False until the result row has been emitted.
    done: bool = False
628
+
629
+
630
@init_single_worker
class AccumulateClearFunction(TableFunctionGenerator[AccumulateClearArgs, AccumulateClearState]):
    """Table function that drops one named collection from the ATTACH-scoped store.

    The whole collection is removed (rows + pinned schema), which frees the name
    to be accumulated again later under any schema. The output is exactly one
    row matching ``CLEAR_SCHEMA``: ``(name, rows_cleared)``.
    """

    class Meta:
        """Registration metadata for ``accumulate_clear``."""

        name = "accumulate_clear"
        description = "Remove an accumulated collection by name; returns rows cleared"
        categories = ["stateful", "utility"]
        examples = [
            FunctionExample(
                description="Clear the 'events' collection, returning how many rows were removed",
                sql="SELECT * FROM accumulate_clear('events')",
            ),
        ]

    @classmethod
    def on_bind(cls, params: BindParams[AccumulateClearArgs]) -> BindResponse:
        """Check the collection name, then bind to the fixed ``CLEAR_SCHEMA``."""
        requested = params.args.name
        _validate_name(requested)
        return BindResponse(output_schema=CLEAR_SCHEMA)

    @classmethod
    def initial_state(cls, params: ProcessParams[AccumulateClearArgs]) -> AccumulateClearState:
        """Return a state whose ``done`` flag is still ``False``."""
        not_yet_emitted = AccumulateClearState(done=False)
        return not_yet_emitted

    @classmethod
    def process(
        cls,
        params: ProcessParams[AccumulateClearArgs],
        state: AccumulateClearState,
        out: OutputCollector,
    ) -> None:
        """Clear the collection and emit its result row once; finish on later calls."""
        if not state.done:
            scoped_store = _store(cls.storage, params.attach_opaque_data)
            collection = params.args.name
            removed = _clear_collection(scoped_store, collection.encode())

            # One row, two columns, in the order declared by CLEAR_SCHEMA.
            columns = [
                pa.array([collection], pa.string()),
                pa.array([removed], pa.int64()),
            ]
            out.emit(pa.RecordBatch.from_arrays(columns, schema=CLEAR_SCHEMA))
            state.done = True
            return

        # The row already went out on an earlier call; nothing left to produce.
        out.finish()
686
+
687
+
688
+ # ---------------------------------------------------------------------------
689
+ # Catalog & worker
690
+ # ---------------------------------------------------------------------------
691
+
692
# Static catalog definition: a single "main" schema exposing the three
# accumulate table functions.
_ACCUMULATE_CATALOG = Catalog(
    name="accumulate",
    default_schema="main",
    schemas=[
        Schema(
            name="main",
            comment="Row accumulation keyed by name, persisted via FunctionStorage and scoped per ATTACH",
            functions=[AccumulateFunction, AccumulateReadFunction, AccumulateClearFunction],
        ),
    ],
)
707
+
708
+
709
class AccumulateCatalog(ReadOnlyCatalogInterface):
    """Catalog interface that tags each ATTACH with a random id and reports versions.

    The id travels as ``attach_opaque_data``; it comes back on every call and
    serves as the storage scope, so accumulations from different ATTACH sessions
    stay isolated from each other.
    """

    catalog = _ACCUMULATE_CATALOG
    catalog_name = _ACCUMULATE_CATALOG.name

    def catalogs(self) -> list[CatalogInfo]:
        """Return the one advertised catalog, including its version information."""
        advertised_name = self._effective_catalog_name
        serialized_specs = [option.serialize() for option in self.attach_option_specs]
        info = CatalogInfo(
            name=advertised_name,
            implementation_version=IMPLEMENTATION_VERSION,
            data_version_spec=DATA_VERSION,
            attach_option_specs=serialized_specs,
        )
        return [info]

    def catalog_attach(
        self,
        *,
        name: str,
        options: dict[str, Any],
        data_version_spec: str | None,
        implementation_version: str | None,
        ctx: CallContext | None = None,
    ) -> CatalogAttachResult:
        """Run the base attach, then stamp the result with a fresh storage scope id.

        The base result is copied with a new random ``attach_opaque_data``, that
        data marked as required, and the resolved versions set to this module's
        ``DATA_VERSION`` / ``IMPLEMENTATION_VERSION``.
        """
        base_result = super().catalog_attach(
            name=name,
            options=options,
            data_version_spec=data_version_spec,
            implementation_version=implementation_version,
            ctx=ctx,
        )
        # A new random id for every ATTACH. The client keeps it and sends it back
        # with each call, which is why it also survives a worker restart.
        scope_id = AttachOpaqueData(uuid.uuid4().bytes)
        return dataclasses.replace(
            base_result,
            attach_opaque_data=scope_id,
            attach_opaque_data_required=True,
            resolved_data_version=DATA_VERSION,
            resolved_implementation_version=IMPLEMENTATION_VERSION,
        )
756
+
757
+
758
class AccumulateWorker(Worker):
    """Worker that serves the accumulate catalog and its table functions."""

    # Static catalog definition served by this worker.
    catalog = _ACCUMULATE_CATALOG
    # Interface class that handles catalog listing and ATTACH.
    catalog_interface = AccumulateCatalog