vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,607 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Framework for implementing aggregate functions.
4
+
5
+ AggregateFunction provides a batch-oriented API for DuckDB aggregate functions
6
+ (e.g., ``SELECT my_agg(col) FROM t GROUP BY category``). The C++ side manages
7
+ trivial per-group state (just an int64 group_id), while Python holds the real
8
+ accumulation state in FunctionStorage.
9
+
10
+ Three phases:
11
+ - UPDATE: accumulate input rows into per-group state
12
+ - COMBINE: merge states from parallel workers
13
+ - FINALIZE: produce one result per group
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import contextlib
19
+ import inspect
20
+ from abc import abstractmethod
21
+ from dataclasses import dataclass
22
+ from typing import Any, Final, TypeVar, final, get_args, get_origin
23
+
24
+ import pyarrow as pa
25
+ from vgi_rpc import ArrowSerializableDataclass
26
+ from vgi_rpc.rpc import AuthContext
27
+
28
+ import vgi.function
29
+ from vgi.arguments import Arguments
30
+ from vgi.invocation import (
31
+ BindResponse,
32
+ )
33
+ from vgi.schema_utils import schema
34
+ from vgi.table_function import (
35
+ ProcessParams,
36
+ SecretsAccessor,
37
+ )
38
+
39
# Names re-exported by ``from vgi.aggregate_function import *``.
__all__ = [
    "AggregateBindParams",
    "AggregateFunction",
    "GROUP_COLUMN_NAME",
    "WindowPartition",
]
45
+
46
+
47
@dataclass(frozen=True, slots=True, kw_only=True)
class AggregateBindParams:
    """Immutable, keyword-only bundle of inputs given to ``AggregateFunction.on_bind()``.

    Attributes:
        args: Bound call arguments, or ``None``.
        input_schema: Arrow schema of the input, or ``None``.
        settings: Setting values keyed by name.
        secrets: Accessor for secrets.
        auth_context: Caller's auth context; defaults to ``AuthContext.anonymous()``.

    """

    args: Arguments | None
    input_schema: pa.Schema | None
    settings: dict[str, Any]
    secrets: SecretsAccessor
    # The default is evaluated once, when the class body runs, and shared by
    # every instance that does not pass ``auth_context`` explicitly.
    auth_context: AuthContext = AuthContext.anonymous()
56
+
57
+
58
+ @dataclass(slots=True, frozen=True)
59
+ class WindowPartition:
60
+ """Full partition data passed to a windowed aggregate callback.
61
+
62
+ Constructed by the worker from the ``aggregate_window_init`` RPC payload
63
+ and re-hydrated on every ``aggregate_window`` call via storage.
64
+
65
+ Attributes:
66
+ inputs: The partition's input RecordBatch (all input columns, all rows).
67
+ row_count: Total number of rows in the partition.
68
+ filter_mask: Boolean mask from an optional ``FILTER (WHERE ...)`` clause.
69
+ Length equals ``row_count``.
70
+ frame_stats: ``((begin_delta, end_delta), (begin_delta, end_delta))`` —
71
+ DuckDB's per-partition frame statistics for planning.
72
+ all_valid: Per-input-column validity flag (True if no nulls in column).
73
+
74
+ """
75
+
76
+ inputs: pa.RecordBatch
77
+ row_count: int
78
+ filter_mask: pa.BooleanArray
79
+ frame_stats: tuple[tuple[int, int], tuple[int, int]]
80
+ all_valid: list[bool]
81
+
82
+ def filter(self, start: int, end: int) -> pa.RecordBatch:
83
+ """Slice the partition inputs for rows ``[start, end)``."""
84
+ return self.inputs.slice(start, end - start)
85
+
86
+
87
GROUP_COLUMN_NAME: Final[str] = "__vgi_group_id"
"""Reserved column name prepended by C++ to UPDATE exchange batches."""

# Module-level TypeVar for per-group state, bounded by ArrowSerializableDataclass.
# ``AggregateFunction`` declares its own type parameter with the same name and bound.
TState = TypeVar("TState", bound=ArrowSerializableDataclass)
91
+
92
+
93
+ class AggregateFunction[TState: ArrowSerializableDataclass](vgi.function.Function):
94
+ """Base class for aggregate functions.
95
+
96
+ Aggregate functions accumulate input rows into per-group state during
97
+ UPDATE, merge parallel worker states during COMBINE, and produce one
98
+ result row per group during FINALIZE.
99
+
100
+ Input columns are declared via ``Param`` annotations on ``update()``,
101
+ and the output type via ``Returns`` annotation — the same pattern as
102
+ ``ScalarFunction.compute()``.
103
+
104
+ Type Parameters:
105
+ TState: ``ArrowSerializableDataclass`` for per-group accumulation state.
106
+
107
+ Example::
108
+
109
+ class SumFunction(AggregateFunction[SumState]):
110
+ class Meta:
111
+ name = "vgi_sum"
112
+
113
+ @classmethod
114
+ def initial_state(cls, params):
115
+ return SumState()
116
+
117
+ @classmethod
118
+ def update(
119
+ cls,
120
+ states: dict[int, SumState],
121
+ group_ids: pa.Int64Array,
122
+ value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
123
+ ) -> None:
124
+ ...
125
+
126
+ @classmethod
127
+ def combine(cls, source, target, params):
128
+ return SumState(total=source.total + target.total)
129
+
130
+ @classmethod
131
+ def finalize(
132
+ cls,
133
+ group_ids: pa.Int64Array,
134
+ states: dict[int, SumState],
135
+ params: ProcessParams,
136
+ ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
137
+ ...
138
+
139
+ """
140
+
141
+ state_class: type[TState] | None = None
142
+ _compute_params: dict[str, Any] = {} # noqa: RUF012
143
+ _const_params: dict[str, Any] = {} # noqa: RUF012
144
+ _setting_params: dict[str, str] = {} # noqa: RUF012
145
+ _secret_params: dict[str, Any] = {} # noqa: RUF012
146
+ _const_param_phases: dict[str, str] = {} # noqa: RUF012
147
+ _returns_output_type: pa.DataType | None = None
148
+
149
+ def __init_subclass__(cls, **kwargs: object) -> None:
150
+ """Extract state_class, Param annotations, and Returns type."""
151
+ super().__init_subclass__(**kwargs)
152
+
153
+ from typing import cast, get_type_hints
154
+
155
+ from vgi.arguments import ARRAY_CLASS_TO_DATATYPE, Arg, ConstParam, Param, Returns
156
+ from vgi.scalar_function import _const_param_to_arg, _param_to_arg
157
+
158
+ # Skip abstract classes
159
+ if inspect.isabstract(cls):
160
+ return
161
+
162
+ # Extract TState from generic type parameters
163
+ orig_bases = getattr(cls, "__orig_bases__", ())
164
+ for base in orig_bases:
165
+ origin = get_origin(base)
166
+ if origin is None:
167
+ continue
168
+ if not (isinstance(origin, type) and issubclass(origin, AggregateFunction)):
169
+ continue
170
+ type_args = get_args(base)
171
+ if type_args:
172
+ state_type = type_args[0]
173
+ if not isinstance(state_type, TypeVar):
174
+ cls.state_class = state_type
175
+ break
176
+
177
+ # Parse Param and ConstParam annotations from update() method.
178
+ # Single interleaved loop to get correct overall_position values.
179
+ update_method = getattr(cls, "update", None)
180
+ if update_method is None:
181
+ return
182
+
183
+ hints: dict[str, Any] = {}
184
+ try:
185
+ hints = get_type_hints(update_method, include_extras=True)
186
+ except Exception as exc:
187
+ import warnings
188
+
189
+ warnings.warn(
190
+ f"{cls.__name__}.update() type hints could not be resolved: {exc!r}. "
191
+ "Param/ConstParam annotations will be ignored, leaving the function "
192
+ "registered with no input columns.",
193
+ stacklevel=2,
194
+ )
195
+
196
+ compute_params: dict[str, Arg[Any]] = {}
197
+ const_params: dict[str, Arg[Any]] = {}
198
+ const_param_phases: dict[str, str] = {}
199
+ overall_position = 0
200
+ column_index = 0
201
+ const_index = 0
202
+
203
+ sig = inspect.signature(update_method)
204
+ skip_params = {"self", "cls", "states", "group_ids", "params"}
205
+
206
+ for name in sig.parameters:
207
+ if name in skip_params:
208
+ continue
209
+
210
+ hint = hints.get(name)
211
+ if hint is None:
212
+ continue
213
+
214
+ if hasattr(hint, "__metadata__"):
215
+ for meta in hint.__metadata__:
216
+ if isinstance(meta, Param):
217
+ hint_args = get_args(hint)
218
+ base_type = hint_args[0] if hint_args else pa.Array
219
+ arg = _param_to_arg(meta, base_type, overall_position)
220
+ arg._name = name
221
+ arg._resolution_index = column_index
222
+ compute_params[name] = arg
223
+ overall_position += 1
224
+ column_index += 1
225
+ break
226
+ if isinstance(meta, ConstParam):
227
+ hint_args = get_args(hint)
228
+ base_type = cast(type, hint_args[0] if hint_args else Any)
229
+ arg = _const_param_to_arg(meta, base_type, overall_position)
230
+ arg._name = name
231
+ arg._resolution_index = const_index
232
+ const_params[name] = arg
233
+ const_param_phases[name] = getattr(meta, "phase", "all")
234
+ overall_position += 1
235
+ const_index += 1
236
+ break
237
+
238
+ cls._compute_params = compute_params
239
+ cls._const_params = const_params
240
+ cls._const_param_phases = const_param_phases
241
+
242
+ # Parse Returns annotation from finalize() return type
243
+ finalize_method = getattr(cls, "finalize", None)
244
+ returns_output_type: pa.DataType | None = None
245
+ if finalize_method is not None:
246
+ finalize_hints: dict[str, Any] = {}
247
+ with contextlib.suppress(Exception):
248
+ finalize_hints = get_type_hints(finalize_method, include_extras=True)
249
+ return_hint = finalize_hints.get("return")
250
+ if return_hint is not None and hasattr(return_hint, "__metadata__"):
251
+ for meta in return_hint.__metadata__:
252
+ if isinstance(meta, Returns):
253
+ if meta.arrow_type is not None:
254
+ returns_output_type = meta.arrow_type
255
+ else:
256
+ ret_args = get_args(return_hint)
257
+ if ret_args and ret_args[0] in ARRAY_CLASS_TO_DATATYPE:
258
+ returns_output_type = ARRAY_CLASS_TO_DATATYPE[ret_args[0]]
259
+ break
260
+
261
+ cls._returns_output_type = returns_output_type
262
+
263
+ # Parse on_bind() signature for Setting/Secret annotations
264
+ from vgi.table_function import _extract_setting_secret_params
265
+
266
+ on_bind_method = getattr(cls, "on_bind", None)
267
+ if on_bind_method is not None and "on_bind" in cls.__dict__:
268
+ cls._setting_params, cls._secret_params = _extract_setting_secret_params(on_bind_method)
269
+ else:
270
+ cls._setting_params = getattr(cls, "_setting_params", {})
271
+ cls._secret_params = getattr(cls, "_secret_params", {})
272
+
273
+ @classmethod
274
+ def on_bind(cls, params: AggregateBindParams, **kwargs: Any) -> BindResponse:
275
+ """Override to provide output schema and optional bind-time logic.
276
+
277
+ Must return a ``BindResponse`` with an ``output_schema`` containing
278
+ exactly one field (the aggregate result column).
279
+ """
280
+ # Default: use Returns annotation if available
281
+ if cls._returns_output_type is not None:
282
+ return BindResponse(output_schema=schema(result=cls._returns_output_type))
283
+ raise NotImplementedError(
284
+ f"{cls.__name__} must either implement on_bind() or annotate finalize() with Returns(arrow_type=...)"
285
+ )
286
+
287
+ @final
288
+ @classmethod
289
+ def catalog_output_schema(cls) -> pa.Schema:
290
+ """Return output schema for catalog introspection."""
291
+ if cls._returns_output_type is not None:
292
+ return schema(result=cls._returns_output_type)
293
+ # Dynamic type (Returns() with no arrow_type) — mark as "any" for C++
294
+ field = pa.field("result", pa.null(), metadata={b"vgi:any": b"true"})
295
+ return pa.schema([field])
296
+
297
+ @classmethod
298
+ @abstractmethod
299
+ def initial_state(cls, params: ProcessParams[Any]) -> TState:
300
+ """Create the initial state for a new group.
301
+
302
+ Called when a group_id is first encountered during UPDATE.
303
+ Must return a valid ``TState`` instance representing the identity
304
+ element (e.g., 0 for SUM, empty list for LISTAGG).
305
+ """
306
+ ...
307
+
308
+ @classmethod
309
+ @abstractmethod
310
+ def update(cls, *args: Any, **kwargs: Any) -> None:
311
+ """Accumulate input rows into per-group state.
312
+
313
+ Declare input columns as ``Param``-annotated parameters::
314
+
315
+ @classmethod
316
+ def update(
317
+ cls,
318
+ states: dict[int, MyState],
319
+ group_ids: pa.Int64Array,
320
+ value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
321
+ ) -> None:
322
+ ...
323
+
324
+ The ``states`` dict is pre-populated with ``initial_state()`` for
325
+ all new group_ids. ``group_ids`` is parallel to each column array.
326
+
327
+ """
328
+ ...
329
+
330
+ @classmethod
331
+ @abstractmethod
332
+ def combine(
333
+ cls,
334
+ source: TState,
335
+ target: TState,
336
+ params: ProcessParams[Any],
337
+ ) -> TState:
338
+ """Merge two partial states from parallel workers.
339
+
340
+ Returns the merged ``TState``. Framework replaces target and removes source.
341
+
342
+ """
343
+ ...
344
+
345
+ @classmethod
346
+ @abstractmethod
347
+ def finalize(cls, *args: Any, **kwargs: Any) -> Any:
348
+ """Produce results for the requested group_ids.
349
+
350
+ Annotate the return type with ``Returns``::
351
+
352
+ @classmethod
353
+ def finalize(
354
+ cls,
355
+ group_ids: pa.Int64Array,
356
+ states: dict[int, MyState],
357
+ params: ProcessParams,
358
+ ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
359
+ ...
360
+
361
+ Must return a RecordBatch with one row per ``group_id``.
362
+
363
+ """
364
+ ...
365
+
366
+ @classmethod
367
+ def ensure_state(
368
+ cls,
369
+ states: dict[int, TState],
370
+ group_id: int,
371
+ params: ProcessParams[Any],
372
+ ) -> TState:
373
+ """Get or create state for a group_id.
374
+
375
+ The framework pre-populates the states dict before calling ``update()``
376
+ and ``finalize()``, so this helper should not normally be needed.
377
+ Provided for defensive coding.
378
+
379
+ Returns:
380
+ The state for the given group_id.
381
+
382
+ """
383
+ if group_id not in states:
384
+ states[group_id] = cls.initial_state(params)
385
+ return states[group_id]
386
+
387
+ # ------------------------------------------------------------------
388
+ # Optional windowed-aggregate callbacks
389
+ # ------------------------------------------------------------------
390
+ # Enable by setting ``Meta.supports_window = True`` and overriding
391
+ # ``window()`` (and optionally ``window_init()``).
392
+ #
393
+ # The C++ extension ships the full partition once per ``OVER`` partition
394
+ # via ``aggregate_window_init``; the worker serialises it to
395
+ # ``FunctionStorage`` keyed by ``(execution_id, partition_id)``. Each
396
+ # subsequent ``aggregate_window`` RPC carries just ``(rid, subframes)``
397
+ # and re-hydrates the partition from storage before calling ``window()``.
398
+ # See ``plan`` for the per-call flushing rationale (DuckDB's window
399
+ # callback has no per-Evaluate finalize hook).
400
+
401
+ @classmethod
402
+ def window_init(
403
+ cls,
404
+ partition: WindowPartition,
405
+ params: ProcessParams[Any],
406
+ ) -> Any:
407
+ """Derive optional per-partition state from the raw partition.
408
+
409
+ Called once per partition before any ``window()`` call. Return any
410
+ ``ArrowSerializableDataclass`` (so it can round-trip through storage),
411
+ or ``None`` if no derived state is required. The return value is
412
+ passed back to ``window()`` as ``window_state``.
413
+
414
+ Default implementation returns ``None``.
415
+ """
416
+ return None
417
+
418
+ @classmethod
419
+ def window_prepare(
420
+ cls,
421
+ partition: WindowPartition,
422
+ window_state: Any,
423
+ params: ProcessParams[Any],
424
+ ) -> Any:
425
+ """Derive per-partition state for the window() loop (optional hook).
426
+
427
+ Called once per partition, after ``window_init`` (or after the state
428
+ is rehydrated from storage on a cold reload), before any
429
+ ``window()`` call. The return value is passed as ``window_state``
430
+ to every ``window()`` call against this partition, replacing the
431
+ opaque ``_WindowStatePlaceholder`` user code would otherwise
432
+ receive.
433
+
434
+ Use this hook for one-shot per-partition work that ``window()``
435
+ would otherwise have to redo on every call: deserialise the
436
+ ``_WindowStatePlaceholder``, reshape NumPy buffers from
437
+ ``window_init``'s state, build symbol→index lookups, etc.
438
+ Anything you would otherwise be tempted to memoise via a
439
+ module-level dict.
440
+
441
+ The result lives in the framework's per-partition cache and is
442
+ dropped automatically when the partition is evicted from the LRU
443
+ or its destructor fires.
444
+
445
+ Default implementation returns ``window_state`` unchanged — for
446
+ aggregates that don't define this hook, ``window()`` receives the
447
+ placeholder (or ``None``) exactly as it did before. Backward
448
+ compatible.
449
+ """
450
+ return window_state
451
+
452
+ @classmethod
453
+ def window(
454
+ cls,
455
+ rid: int,
456
+ subframes: list[tuple[int, int]],
457
+ partition: WindowPartition,
458
+ window_state: Any,
459
+ params: ProcessParams[Any],
460
+ ) -> Any:
461
+ """Compute the aggregate value for one output row.
462
+
463
+ Args:
464
+ rid: Partition-local row index being filled.
465
+ subframes: Frame ranges ``[(begin, end), ...]`` — 1 for the default
466
+ frame, 3 when ``EXCLUDE`` produces multiple subframes.
467
+ partition: The cached partition data.
468
+ window_state: ``window_prepare()``'s return value if the function
469
+ defines that hook; otherwise the value returned by
470
+ ``window_init()`` (may be ``None``), wrapped in a
471
+ ``_WindowStatePlaceholder`` on cold reload.
472
+ params: Shared ``ProcessParams``.
473
+
474
+ Returns:
475
+ A Python scalar or Arrow-compatible value; the worker wraps it
476
+ into an IPC batch matching the function's output schema.
477
+
478
+ """
479
+ raise NotImplementedError(f"{cls.__name__}: Meta.supports_window=True requires overriding window()")
480
+
481
+ @classmethod
482
+ def window_batch(
483
+ cls,
484
+ row_ids: list[int],
485
+ subframes: list[list[tuple[int, int]]],
486
+ partition: WindowPartition,
487
+ window_state: Any,
488
+ params: ProcessParams[Any],
489
+ ) -> pa.Array[Any] | list[Any]:
490
+ """Compute the aggregate value for ``count`` consecutive output rows.
491
+
492
+ Default implementation calls :meth:`window` once per row. Override
493
+ when per-row Python object construction dominates the call cost
494
+ and you want to build the output as an Arrow array directly,
495
+ bypassing the framework's default ``pa.array(results, ...)``
496
+ conversion.
497
+
498
+ Args:
499
+ row_ids: Partition-local row indices being filled. Length is
500
+ the batch size.
501
+ subframes: ``subframes[i]`` is the frame ranges for output
502
+ row ``row_ids[i]``. Same shape as :meth:`window`'s
503
+ ``subframes`` argument, one per row.
504
+ partition: The cached partition data.
505
+ window_state: As :meth:`window`.
506
+ params: As :meth:`window`.
507
+
508
+ Returns:
509
+ Either a :class:`pa.Array` of length ``len(row_ids)`` matching
510
+ the function's output type — shipped directly as the response
511
+ with no further conversion — or a ``list[Any]`` of the same
512
+ length, fed through ``pa.array(results, type=output_type)``
513
+ (equivalent to the default per-row path).
514
+
515
+ """
516
+ return [
517
+ cls.window(rid, frames, partition, window_state, params)
518
+ for rid, frames in zip(row_ids, subframes, strict=True)
519
+ ]
520
+
521
+ # ------------------------------------------------------------------
522
+ # Optional streaming-partitioned callbacks
523
+ # ------------------------------------------------------------------
524
+ # Enable by setting ``Meta.streaming_partitioned = True`` and overriding
525
+ # ``streaming_chunk()`` (and optionally ``streaming_open`` /
526
+ # ``streaming_close``).
527
+ #
528
+ # Streaming-partitioned aggregates handle queries shaped like
529
+ # ``f(...) OVER (PARTITION BY p ORDER BY o)`` with a cumulative frame
530
+ # (``UNBOUNDED PRECEDING -> CURRENT ROW``) where the input is too large
531
+ # to materialise in DuckDB memory but compresses heavily into per-
532
+ # partition state. The framework streams input chunks to the worker;
533
+ # the worker maintains concurrent per-partition state in a hash map and
534
+ # emits one output row per input row.
535
+
536
+ @classmethod
537
+ def streaming_open(cls, params: ProcessParams[Any]) -> Any:
538
+ """Build cross-partition global state for a streaming session.
539
+
540
+ Called once when ``aggregate_streaming_open`` arrives, before any
541
+ chunk is processed. Return any object (it lives in an in-process
542
+ cache keyed by ``execution_id`` for the duration of the session).
543
+
544
+ Typical contents: a ``dict`` of per-partition aggregate states
545
+ (populated lazily as new partition keys appear in input chunks),
546
+ plus any cross-partition resources to share — symbol intern
547
+ tables, allocator pools, prepared output buffers.
548
+
549
+ Default implementation returns ``None`` (no shared state); the
550
+ function still works if ``streaming_chunk`` keeps everything in
551
+ local variables, but per-partition state would have to live
552
+ somewhere caller-supplied.
553
+ """
554
+ return None
555
+
556
+ @classmethod
557
+ def streaming_chunk(
558
+ cls,
559
+ chunk: pa.RecordBatch,
560
+ streaming_state: Any,
561
+ partition_key_count: int,
562
+ order_key_count: int,
563
+ params: ProcessParams[Any],
564
+ ) -> pa.Array[Any] | list[Any]:
565
+ """Process one chunk of streaming input.
566
+
567
+ Args:
568
+ chunk: Input rows for this batch. Schema layout is
569
+ ``[partition_key_cols..., order_key_cols..., value_cols...]``
570
+ — the first ``partition_key_count`` columns are partition
571
+ keys (used to dispatch to the right per-partition state),
572
+ the next ``order_key_count`` are order keys (informational;
573
+ may be used to verify monotonicity), the rest are the
574
+ function's value arguments in declaration order.
575
+ streaming_state: Whatever ``streaming_open`` returned. The
576
+ framework passes the same object on every chunk; mutate
577
+ in place to accumulate state across chunks.
578
+ partition_key_count: Number of leading columns that form the
579
+ partition key.
580
+ order_key_count: Number of columns following the partition key
581
+ that form the order key.
582
+ params: Shared ``ProcessParams``.
583
+
584
+ Returns:
585
+ Either a :class:`pa.Array` of length ``chunk.num_rows`` matching
586
+ the function's output type, or a list of the same length
587
+ (which the framework converts via ``pa.array``). Each output
588
+ value is the cumulative aggregate snapshot at that input
589
+ row's position in its partition's order.
590
+
591
+ """
592
+ raise NotImplementedError(
593
+ f"{cls.__name__}: Meta.streaming_partitioned=True requires overriding streaming_chunk()"
594
+ )
595
+
596
+ @classmethod
597
+ def streaming_close(cls, streaming_state: Any, params: ProcessParams[Any]) -> None:
598
+ """Tear down streaming session state.
599
+
600
+ Called once when ``aggregate_streaming_close`` arrives, after the
601
+ last chunk. Use to release any external resources held by
602
+ ``streaming_state``. The framework drops its reference after this
603
+ call, so anything not held elsewhere is GCed naturally.
604
+
605
+ Default implementation is a no-op.
606
+ """
607
+ return None