vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,607 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Framework for implementing aggregate functions.
|
|
4
|
+
|
|
5
|
+
AggregateFunction provides a batch-oriented API for DuckDB aggregate functions
|
|
6
|
+
(e.g., ``SELECT my_agg(col) FROM t GROUP BY category``). The C++ side manages
|
|
7
|
+
trivial per-group state (just an int64 group_id), while Python holds the real
|
|
8
|
+
accumulation state in FunctionStorage.
|
|
9
|
+
|
|
10
|
+
Three phases:
|
|
11
|
+
- UPDATE: accumulate input rows into per-group state
|
|
12
|
+
- COMBINE: merge states from parallel workers
|
|
13
|
+
- FINALIZE: produce one result per group
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import contextlib
|
|
19
|
+
import inspect
|
|
20
|
+
from abc import abstractmethod
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Any, Final, TypeVar, final, get_args, get_origin
|
|
23
|
+
|
|
24
|
+
import pyarrow as pa
|
|
25
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
26
|
+
from vgi_rpc.rpc import AuthContext
|
|
27
|
+
|
|
28
|
+
import vgi.function
|
|
29
|
+
from vgi.arguments import Arguments
|
|
30
|
+
from vgi.invocation import (
|
|
31
|
+
BindResponse,
|
|
32
|
+
)
|
|
33
|
+
from vgi.schema_utils import schema
|
|
34
|
+
from vgi.table_function import (
|
|
35
|
+
ProcessParams,
|
|
36
|
+
SecretsAccessor,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"AggregateBindParams",
|
|
41
|
+
"AggregateFunction",
|
|
42
|
+
"GROUP_COLUMN_NAME",
|
|
43
|
+
"WindowPartition",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
class AggregateBindParams:
    """Immutable, keyword-only bundle handed to ``AggregateFunction.on_bind()``.

    Attributes:
        args: The ``Arguments`` object for the call, or ``None``.
        input_schema: Arrow schema of the inputs, or ``None``.
        settings: String-keyed mapping of settings values.
        secrets: ``SecretsAccessor`` for the call.
        auth_context: Caller's ``AuthContext``; defaults to
            ``AuthContext.anonymous()``.

    """

    args: Arguments | None
    input_schema: pa.Schema | None
    settings: dict[str, Any]
    secrets: SecretsAccessor
    auth_context: AuthContext = AuthContext.anonymous()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(slots=True, frozen=True)
class WindowPartition:
    """Complete partition payload handed to a windowed aggregate callback.

    The worker builds this from the ``aggregate_window_init`` RPC payload and
    re-hydrates it from storage on every ``aggregate_window`` call.

    Attributes:
        inputs: The partition's input RecordBatch (all input columns, all rows).
        row_count: Total number of rows in the partition.
        filter_mask: Boolean mask from an optional ``FILTER (WHERE ...)`` clause.
            Length equals ``row_count``.
        frame_stats: ``((begin_delta, end_delta), (begin_delta, end_delta))`` —
            DuckDB's per-partition frame statistics for planning.
        all_valid: Per-input-column validity flag (True if no nulls in column).

    """

    inputs: pa.RecordBatch
    row_count: int
    filter_mask: pa.BooleanArray
    frame_stats: tuple[tuple[int, int], tuple[int, int]]
    all_valid: list[bool]

    def filter(self, start: int, end: int) -> pa.RecordBatch:
        """Return the slice of ``inputs`` covering the half-open range ``[start, end)``."""
        # RecordBatch.slice takes (offset, length), not (start, end).
        length = end - start
        return self.inputs.slice(start, length)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
GROUP_COLUMN_NAME: Final[str] = "__vgi_group_id"
"""Reserved column name prepended by C++ to UPDATE exchange batches."""

# Module-level type variable for per-group state; bound to ArrowSerializableDataclass.
TState = TypeVar("TState", bound=ArrowSerializableDataclass)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class AggregateFunction[TState: ArrowSerializableDataclass](vgi.function.Function):
|
|
94
|
+
"""Base class for aggregate functions.
|
|
95
|
+
|
|
96
|
+
Aggregate functions accumulate input rows into per-group state during
|
|
97
|
+
UPDATE, merge parallel worker states during COMBINE, and produce one
|
|
98
|
+
result row per group during FINALIZE.
|
|
99
|
+
|
|
100
|
+
Input columns are declared via ``Param`` annotations on ``update()``,
|
|
101
|
+
and the output type via ``Returns`` annotation — the same pattern as
|
|
102
|
+
``ScalarFunction.compute()``.
|
|
103
|
+
|
|
104
|
+
Type Parameters:
|
|
105
|
+
TState: ``ArrowSerializableDataclass`` for per-group accumulation state.
|
|
106
|
+
|
|
107
|
+
Example::
|
|
108
|
+
|
|
109
|
+
class SumFunction(AggregateFunction[SumState]):
|
|
110
|
+
class Meta:
|
|
111
|
+
name = "vgi_sum"
|
|
112
|
+
|
|
113
|
+
@classmethod
|
|
114
|
+
def initial_state(cls, params):
|
|
115
|
+
return SumState()
|
|
116
|
+
|
|
117
|
+
@classmethod
|
|
118
|
+
def update(
|
|
119
|
+
cls,
|
|
120
|
+
states: dict[int, SumState],
|
|
121
|
+
group_ids: pa.Int64Array,
|
|
122
|
+
value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
|
|
123
|
+
) -> None:
|
|
124
|
+
...
|
|
125
|
+
|
|
126
|
+
@classmethod
|
|
127
|
+
def combine(cls, source, target, params):
|
|
128
|
+
return SumState(total=source.total + target.total)
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
def finalize(
|
|
132
|
+
cls,
|
|
133
|
+
group_ids: pa.Int64Array,
|
|
134
|
+
states: dict[int, SumState],
|
|
135
|
+
params: ProcessParams,
|
|
136
|
+
) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
|
|
137
|
+
...
|
|
138
|
+
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
state_class: type[TState] | None = None
|
|
142
|
+
_compute_params: dict[str, Any] = {} # noqa: RUF012
|
|
143
|
+
_const_params: dict[str, Any] = {} # noqa: RUF012
|
|
144
|
+
_setting_params: dict[str, str] = {} # noqa: RUF012
|
|
145
|
+
_secret_params: dict[str, Any] = {} # noqa: RUF012
|
|
146
|
+
_const_param_phases: dict[str, str] = {} # noqa: RUF012
|
|
147
|
+
_returns_output_type: pa.DataType | None = None
|
|
148
|
+
|
|
149
|
+
def __init_subclass__(cls, **kwargs: object) -> None:
|
|
150
|
+
"""Extract state_class, Param annotations, and Returns type."""
|
|
151
|
+
super().__init_subclass__(**kwargs)
|
|
152
|
+
|
|
153
|
+
from typing import cast, get_type_hints
|
|
154
|
+
|
|
155
|
+
from vgi.arguments import ARRAY_CLASS_TO_DATATYPE, Arg, ConstParam, Param, Returns
|
|
156
|
+
from vgi.scalar_function import _const_param_to_arg, _param_to_arg
|
|
157
|
+
|
|
158
|
+
# Skip abstract classes
|
|
159
|
+
if inspect.isabstract(cls):
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
# Extract TState from generic type parameters
|
|
163
|
+
orig_bases = getattr(cls, "__orig_bases__", ())
|
|
164
|
+
for base in orig_bases:
|
|
165
|
+
origin = get_origin(base)
|
|
166
|
+
if origin is None:
|
|
167
|
+
continue
|
|
168
|
+
if not (isinstance(origin, type) and issubclass(origin, AggregateFunction)):
|
|
169
|
+
continue
|
|
170
|
+
type_args = get_args(base)
|
|
171
|
+
if type_args:
|
|
172
|
+
state_type = type_args[0]
|
|
173
|
+
if not isinstance(state_type, TypeVar):
|
|
174
|
+
cls.state_class = state_type
|
|
175
|
+
break
|
|
176
|
+
|
|
177
|
+
# Parse Param and ConstParam annotations from update() method.
|
|
178
|
+
# Single interleaved loop to get correct overall_position values.
|
|
179
|
+
update_method = getattr(cls, "update", None)
|
|
180
|
+
if update_method is None:
|
|
181
|
+
return
|
|
182
|
+
|
|
183
|
+
hints: dict[str, Any] = {}
|
|
184
|
+
try:
|
|
185
|
+
hints = get_type_hints(update_method, include_extras=True)
|
|
186
|
+
except Exception as exc:
|
|
187
|
+
import warnings
|
|
188
|
+
|
|
189
|
+
warnings.warn(
|
|
190
|
+
f"{cls.__name__}.update() type hints could not be resolved: {exc!r}. "
|
|
191
|
+
"Param/ConstParam annotations will be ignored, leaving the function "
|
|
192
|
+
"registered with no input columns.",
|
|
193
|
+
stacklevel=2,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
compute_params: dict[str, Arg[Any]] = {}
|
|
197
|
+
const_params: dict[str, Arg[Any]] = {}
|
|
198
|
+
const_param_phases: dict[str, str] = {}
|
|
199
|
+
overall_position = 0
|
|
200
|
+
column_index = 0
|
|
201
|
+
const_index = 0
|
|
202
|
+
|
|
203
|
+
sig = inspect.signature(update_method)
|
|
204
|
+
skip_params = {"self", "cls", "states", "group_ids", "params"}
|
|
205
|
+
|
|
206
|
+
for name in sig.parameters:
|
|
207
|
+
if name in skip_params:
|
|
208
|
+
continue
|
|
209
|
+
|
|
210
|
+
hint = hints.get(name)
|
|
211
|
+
if hint is None:
|
|
212
|
+
continue
|
|
213
|
+
|
|
214
|
+
if hasattr(hint, "__metadata__"):
|
|
215
|
+
for meta in hint.__metadata__:
|
|
216
|
+
if isinstance(meta, Param):
|
|
217
|
+
hint_args = get_args(hint)
|
|
218
|
+
base_type = hint_args[0] if hint_args else pa.Array
|
|
219
|
+
arg = _param_to_arg(meta, base_type, overall_position)
|
|
220
|
+
arg._name = name
|
|
221
|
+
arg._resolution_index = column_index
|
|
222
|
+
compute_params[name] = arg
|
|
223
|
+
overall_position += 1
|
|
224
|
+
column_index += 1
|
|
225
|
+
break
|
|
226
|
+
if isinstance(meta, ConstParam):
|
|
227
|
+
hint_args = get_args(hint)
|
|
228
|
+
base_type = cast(type, hint_args[0] if hint_args else Any)
|
|
229
|
+
arg = _const_param_to_arg(meta, base_type, overall_position)
|
|
230
|
+
arg._name = name
|
|
231
|
+
arg._resolution_index = const_index
|
|
232
|
+
const_params[name] = arg
|
|
233
|
+
const_param_phases[name] = getattr(meta, "phase", "all")
|
|
234
|
+
overall_position += 1
|
|
235
|
+
const_index += 1
|
|
236
|
+
break
|
|
237
|
+
|
|
238
|
+
cls._compute_params = compute_params
|
|
239
|
+
cls._const_params = const_params
|
|
240
|
+
cls._const_param_phases = const_param_phases
|
|
241
|
+
|
|
242
|
+
# Parse Returns annotation from finalize() return type
|
|
243
|
+
finalize_method = getattr(cls, "finalize", None)
|
|
244
|
+
returns_output_type: pa.DataType | None = None
|
|
245
|
+
if finalize_method is not None:
|
|
246
|
+
finalize_hints: dict[str, Any] = {}
|
|
247
|
+
with contextlib.suppress(Exception):
|
|
248
|
+
finalize_hints = get_type_hints(finalize_method, include_extras=True)
|
|
249
|
+
return_hint = finalize_hints.get("return")
|
|
250
|
+
if return_hint is not None and hasattr(return_hint, "__metadata__"):
|
|
251
|
+
for meta in return_hint.__metadata__:
|
|
252
|
+
if isinstance(meta, Returns):
|
|
253
|
+
if meta.arrow_type is not None:
|
|
254
|
+
returns_output_type = meta.arrow_type
|
|
255
|
+
else:
|
|
256
|
+
ret_args = get_args(return_hint)
|
|
257
|
+
if ret_args and ret_args[0] in ARRAY_CLASS_TO_DATATYPE:
|
|
258
|
+
returns_output_type = ARRAY_CLASS_TO_DATATYPE[ret_args[0]]
|
|
259
|
+
break
|
|
260
|
+
|
|
261
|
+
cls._returns_output_type = returns_output_type
|
|
262
|
+
|
|
263
|
+
# Parse on_bind() signature for Setting/Secret annotations
|
|
264
|
+
from vgi.table_function import _extract_setting_secret_params
|
|
265
|
+
|
|
266
|
+
on_bind_method = getattr(cls, "on_bind", None)
|
|
267
|
+
if on_bind_method is not None and "on_bind" in cls.__dict__:
|
|
268
|
+
cls._setting_params, cls._secret_params = _extract_setting_secret_params(on_bind_method)
|
|
269
|
+
else:
|
|
270
|
+
cls._setting_params = getattr(cls, "_setting_params", {})
|
|
271
|
+
cls._secret_params = getattr(cls, "_secret_params", {})
|
|
272
|
+
|
|
273
|
+
@classmethod
|
|
274
|
+
def on_bind(cls, params: AggregateBindParams, **kwargs: Any) -> BindResponse:
|
|
275
|
+
"""Override to provide output schema and optional bind-time logic.
|
|
276
|
+
|
|
277
|
+
Must return a ``BindResponse`` with an ``output_schema`` containing
|
|
278
|
+
exactly one field (the aggregate result column).
|
|
279
|
+
"""
|
|
280
|
+
# Default: use Returns annotation if available
|
|
281
|
+
if cls._returns_output_type is not None:
|
|
282
|
+
return BindResponse(output_schema=schema(result=cls._returns_output_type))
|
|
283
|
+
raise NotImplementedError(
|
|
284
|
+
f"{cls.__name__} must either implement on_bind() or annotate finalize() with Returns(arrow_type=...)"
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
@final
|
|
288
|
+
@classmethod
|
|
289
|
+
def catalog_output_schema(cls) -> pa.Schema:
|
|
290
|
+
"""Return output schema for catalog introspection."""
|
|
291
|
+
if cls._returns_output_type is not None:
|
|
292
|
+
return schema(result=cls._returns_output_type)
|
|
293
|
+
# Dynamic type (Returns() with no arrow_type) — mark as "any" for C++
|
|
294
|
+
field = pa.field("result", pa.null(), metadata={b"vgi:any": b"true"})
|
|
295
|
+
return pa.schema([field])
|
|
296
|
+
|
|
297
|
+
@classmethod
|
|
298
|
+
@abstractmethod
|
|
299
|
+
def initial_state(cls, params: ProcessParams[Any]) -> TState:
|
|
300
|
+
"""Create the initial state for a new group.
|
|
301
|
+
|
|
302
|
+
Called when a group_id is first encountered during UPDATE.
|
|
303
|
+
Must return a valid ``TState`` instance representing the identity
|
|
304
|
+
element (e.g., 0 for SUM, empty list for LISTAGG).
|
|
305
|
+
"""
|
|
306
|
+
...
|
|
307
|
+
|
|
308
|
+
@classmethod
|
|
309
|
+
@abstractmethod
|
|
310
|
+
def update(cls, *args: Any, **kwargs: Any) -> None:
|
|
311
|
+
"""Accumulate input rows into per-group state.
|
|
312
|
+
|
|
313
|
+
Declare input columns as ``Param``-annotated parameters::
|
|
314
|
+
|
|
315
|
+
@classmethod
|
|
316
|
+
def update(
|
|
317
|
+
cls,
|
|
318
|
+
states: dict[int, MyState],
|
|
319
|
+
group_ids: pa.Int64Array,
|
|
320
|
+
value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
|
|
321
|
+
) -> None:
|
|
322
|
+
...
|
|
323
|
+
|
|
324
|
+
The ``states`` dict is pre-populated with ``initial_state()`` for
|
|
325
|
+
all new group_ids. ``group_ids`` is parallel to each column array.
|
|
326
|
+
|
|
327
|
+
"""
|
|
328
|
+
...
|
|
329
|
+
|
|
330
|
+
@classmethod
|
|
331
|
+
@abstractmethod
|
|
332
|
+
def combine(
|
|
333
|
+
cls,
|
|
334
|
+
source: TState,
|
|
335
|
+
target: TState,
|
|
336
|
+
params: ProcessParams[Any],
|
|
337
|
+
) -> TState:
|
|
338
|
+
"""Merge two partial states from parallel workers.
|
|
339
|
+
|
|
340
|
+
Returns the merged ``TState``. Framework replaces target and removes source.
|
|
341
|
+
|
|
342
|
+
"""
|
|
343
|
+
...
|
|
344
|
+
|
|
345
|
+
@classmethod
|
|
346
|
+
@abstractmethod
|
|
347
|
+
def finalize(cls, *args: Any, **kwargs: Any) -> Any:
|
|
348
|
+
"""Produce results for the requested group_ids.
|
|
349
|
+
|
|
350
|
+
Annotate the return type with ``Returns``::
|
|
351
|
+
|
|
352
|
+
@classmethod
|
|
353
|
+
def finalize(
|
|
354
|
+
cls,
|
|
355
|
+
group_ids: pa.Int64Array,
|
|
356
|
+
states: dict[int, MyState],
|
|
357
|
+
params: ProcessParams,
|
|
358
|
+
) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
|
|
359
|
+
...
|
|
360
|
+
|
|
361
|
+
Must return a RecordBatch with one row per ``group_id``.
|
|
362
|
+
|
|
363
|
+
"""
|
|
364
|
+
...
|
|
365
|
+
|
|
366
|
+
@classmethod
|
|
367
|
+
def ensure_state(
|
|
368
|
+
cls,
|
|
369
|
+
states: dict[int, TState],
|
|
370
|
+
group_id: int,
|
|
371
|
+
params: ProcessParams[Any],
|
|
372
|
+
) -> TState:
|
|
373
|
+
"""Get or create state for a group_id.
|
|
374
|
+
|
|
375
|
+
The framework pre-populates the states dict before calling ``update()``
|
|
376
|
+
and ``finalize()``, so this helper should not normally be needed.
|
|
377
|
+
Provided for defensive coding.
|
|
378
|
+
|
|
379
|
+
Returns:
|
|
380
|
+
The state for the given group_id.
|
|
381
|
+
|
|
382
|
+
"""
|
|
383
|
+
if group_id not in states:
|
|
384
|
+
states[group_id] = cls.initial_state(params)
|
|
385
|
+
return states[group_id]
|
|
386
|
+
|
|
387
|
+
# ------------------------------------------------------------------
|
|
388
|
+
# Optional windowed-aggregate callbacks
|
|
389
|
+
# ------------------------------------------------------------------
|
|
390
|
+
# Enable by setting ``Meta.supports_window = True`` and overriding
|
|
391
|
+
# ``window()`` (and optionally ``window_init()``).
|
|
392
|
+
#
|
|
393
|
+
# The C++ extension ships the full partition once per ``OVER`` partition
|
|
394
|
+
# via ``aggregate_window_init``; the worker serialises it to
|
|
395
|
+
# ``FunctionStorage`` keyed by ``(execution_id, partition_id)``. Each
|
|
396
|
+
# subsequent ``aggregate_window`` RPC carries just ``(rid, subframes)``
|
|
397
|
+
# and re-hydrates the partition from storage before calling ``window()``.
|
|
398
|
+
# See ``plan`` for the per-call flushing rationale (DuckDB's window
|
|
399
|
+
# callback has no per-Evaluate finalize hook).
|
|
400
|
+
|
|
401
|
+
@classmethod
|
|
402
|
+
def window_init(
|
|
403
|
+
cls,
|
|
404
|
+
partition: WindowPartition,
|
|
405
|
+
params: ProcessParams[Any],
|
|
406
|
+
) -> Any:
|
|
407
|
+
"""Derive optional per-partition state from the raw partition.
|
|
408
|
+
|
|
409
|
+
Called once per partition before any ``window()`` call. Return any
|
|
410
|
+
``ArrowSerializableDataclass`` (so it can round-trip through storage),
|
|
411
|
+
or ``None`` if no derived state is required. The return value is
|
|
412
|
+
passed back to ``window()`` as ``window_state``.
|
|
413
|
+
|
|
414
|
+
Default implementation returns ``None``.
|
|
415
|
+
"""
|
|
416
|
+
return None
|
|
417
|
+
|
|
418
|
+
@classmethod
|
|
419
|
+
def window_prepare(
|
|
420
|
+
cls,
|
|
421
|
+
partition: WindowPartition,
|
|
422
|
+
window_state: Any,
|
|
423
|
+
params: ProcessParams[Any],
|
|
424
|
+
) -> Any:
|
|
425
|
+
"""Derive per-partition state for the window() loop (optional hook).
|
|
426
|
+
|
|
427
|
+
Called once per partition, after ``window_init`` (or after the state
|
|
428
|
+
is rehydrated from storage on a cold reload), before any
|
|
429
|
+
``window()`` call. The return value is passed as ``window_state``
|
|
430
|
+
to every ``window()`` call against this partition, replacing the
|
|
431
|
+
opaque ``_WindowStatePlaceholder`` user code would otherwise
|
|
432
|
+
receive.
|
|
433
|
+
|
|
434
|
+
Use this hook for one-shot per-partition work that ``window()``
|
|
435
|
+
would otherwise have to redo on every call: deserialise the
|
|
436
|
+
``_WindowStatePlaceholder``, reshape NumPy buffers from
|
|
437
|
+
``window_init``'s state, build symbol→index lookups, etc.
|
|
438
|
+
Anything you would otherwise be tempted to memoise via a
|
|
439
|
+
module-level dict.
|
|
440
|
+
|
|
441
|
+
The result lives in the framework's per-partition cache and is
|
|
442
|
+
dropped automatically when the partition is evicted from the LRU
|
|
443
|
+
or its destructor fires.
|
|
444
|
+
|
|
445
|
+
Default implementation returns ``window_state`` unchanged — for
|
|
446
|
+
aggregates that don't define this hook, ``window()`` receives the
|
|
447
|
+
placeholder (or ``None``) exactly as it did before. Backward
|
|
448
|
+
compatible.
|
|
449
|
+
"""
|
|
450
|
+
return window_state
|
|
451
|
+
|
|
452
|
+
@classmethod
|
|
453
|
+
def window(
|
|
454
|
+
cls,
|
|
455
|
+
rid: int,
|
|
456
|
+
subframes: list[tuple[int, int]],
|
|
457
|
+
partition: WindowPartition,
|
|
458
|
+
window_state: Any,
|
|
459
|
+
params: ProcessParams[Any],
|
|
460
|
+
) -> Any:
|
|
461
|
+
"""Compute the aggregate value for one output row.
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
rid: Partition-local row index being filled.
|
|
465
|
+
subframes: Frame ranges ``[(begin, end), ...]`` — 1 for the default
|
|
466
|
+
frame, 3 when ``EXCLUDE`` produces multiple subframes.
|
|
467
|
+
partition: The cached partition data.
|
|
468
|
+
window_state: ``window_prepare()``'s return value if the function
|
|
469
|
+
defines that hook; otherwise the value returned by
|
|
470
|
+
``window_init()`` (may be ``None``), wrapped in a
|
|
471
|
+
``_WindowStatePlaceholder`` on cold reload.
|
|
472
|
+
params: Shared ``ProcessParams``.
|
|
473
|
+
|
|
474
|
+
Returns:
|
|
475
|
+
A Python scalar or Arrow-compatible value; the worker wraps it
|
|
476
|
+
into an IPC batch matching the function's output schema.
|
|
477
|
+
|
|
478
|
+
"""
|
|
479
|
+
raise NotImplementedError(f"{cls.__name__}: Meta.supports_window=True requires overriding window()")
|
|
480
|
+
|
|
481
|
+
@classmethod
|
|
482
|
+
def window_batch(
|
|
483
|
+
cls,
|
|
484
|
+
row_ids: list[int],
|
|
485
|
+
subframes: list[list[tuple[int, int]]],
|
|
486
|
+
partition: WindowPartition,
|
|
487
|
+
window_state: Any,
|
|
488
|
+
params: ProcessParams[Any],
|
|
489
|
+
) -> pa.Array[Any] | list[Any]:
|
|
490
|
+
"""Compute the aggregate value for ``count`` consecutive output rows.
|
|
491
|
+
|
|
492
|
+
Default implementation calls :meth:`window` once per row. Override
|
|
493
|
+
when per-row Python object construction dominates the call cost
|
|
494
|
+
and you want to build the output as an Arrow array directly,
|
|
495
|
+
bypassing the framework's default ``pa.array(results, ...)``
|
|
496
|
+
conversion.
|
|
497
|
+
|
|
498
|
+
Args:
|
|
499
|
+
row_ids: Partition-local row indices being filled. Length is
|
|
500
|
+
the batch size.
|
|
501
|
+
subframes: ``subframes[i]`` is the frame ranges for output
|
|
502
|
+
row ``row_ids[i]``. Same shape as :meth:`window`'s
|
|
503
|
+
``subframes`` argument, one per row.
|
|
504
|
+
partition: The cached partition data.
|
|
505
|
+
window_state: As :meth:`window`.
|
|
506
|
+
params: As :meth:`window`.
|
|
507
|
+
|
|
508
|
+
Returns:
|
|
509
|
+
Either a :class:`pa.Array` of length ``len(row_ids)`` matching
|
|
510
|
+
the function's output type — shipped directly as the response
|
|
511
|
+
with no further conversion — or a ``list[Any]`` of the same
|
|
512
|
+
length, fed through ``pa.array(results, type=output_type)``
|
|
513
|
+
(equivalent to the default per-row path).
|
|
514
|
+
|
|
515
|
+
"""
|
|
516
|
+
return [
|
|
517
|
+
cls.window(rid, frames, partition, window_state, params)
|
|
518
|
+
for rid, frames in zip(row_ids, subframes, strict=True)
|
|
519
|
+
]
|
|
520
|
+
|
|
521
|
+
# ------------------------------------------------------------------
|
|
522
|
+
# Optional streaming-partitioned callbacks
|
|
523
|
+
# ------------------------------------------------------------------
|
|
524
|
+
# Enable by setting ``Meta.streaming_partitioned = True`` and overriding
|
|
525
|
+
# ``streaming_chunk()`` (and optionally ``streaming_open`` /
|
|
526
|
+
# ``streaming_close``).
|
|
527
|
+
#
|
|
528
|
+
# Streaming-partitioned aggregates handle queries shaped like
|
|
529
|
+
# ``f(...) OVER (PARTITION BY p ORDER BY o)`` with a cumulative frame
|
|
530
|
+
# (``UNBOUNDED PRECEDING -> CURRENT ROW``) where the input is too large
|
|
531
|
+
# to materialise in DuckDB memory but compresses heavily into per-
|
|
532
|
+
# partition state. The framework streams input chunks to the worker;
|
|
533
|
+
# the worker maintains concurrent per-partition state in a hash map and
|
|
534
|
+
# emits one output row per input row.
|
|
535
|
+
|
|
536
|
+
@classmethod
|
|
537
|
+
def streaming_open(cls, params: ProcessParams[Any]) -> Any:
|
|
538
|
+
"""Build cross-partition global state for a streaming session.
|
|
539
|
+
|
|
540
|
+
Called once when ``aggregate_streaming_open`` arrives, before any
|
|
541
|
+
chunk is processed. Return any object (it lives in an in-process
|
|
542
|
+
cache keyed by ``execution_id`` for the duration of the session).
|
|
543
|
+
|
|
544
|
+
Typical contents: a ``dict`` of per-partition aggregate states
|
|
545
|
+
(populated lazily as new partition keys appear in input chunks),
|
|
546
|
+
plus any cross-partition resources to share — symbol intern
|
|
547
|
+
tables, allocator pools, prepared output buffers.
|
|
548
|
+
|
|
549
|
+
Default implementation returns ``None`` (no shared state); the
|
|
550
|
+
function still works if ``streaming_chunk`` keeps everything in
|
|
551
|
+
local variables, but per-partition state would have to live
|
|
552
|
+
somewhere caller-supplied.
|
|
553
|
+
"""
|
|
554
|
+
return None
|
|
555
|
+
|
|
556
|
+
@classmethod
|
|
557
|
+
def streaming_chunk(
|
|
558
|
+
cls,
|
|
559
|
+
chunk: pa.RecordBatch,
|
|
560
|
+
streaming_state: Any,
|
|
561
|
+
partition_key_count: int,
|
|
562
|
+
order_key_count: int,
|
|
563
|
+
params: ProcessParams[Any],
|
|
564
|
+
) -> pa.Array[Any] | list[Any]:
|
|
565
|
+
"""Process one chunk of streaming input.
|
|
566
|
+
|
|
567
|
+
Args:
|
|
568
|
+
chunk: Input rows for this batch. Schema layout is
|
|
569
|
+
``[partition_key_cols..., order_key_cols..., value_cols...]``
|
|
570
|
+
— the first ``partition_key_count`` columns are partition
|
|
571
|
+
keys (used to dispatch to the right per-partition state),
|
|
572
|
+
the next ``order_key_count`` are order keys (informational;
|
|
573
|
+
may be used to verify monotonicity), the rest are the
|
|
574
|
+
function's value arguments in declaration order.
|
|
575
|
+
streaming_state: Whatever ``streaming_open`` returned. The
|
|
576
|
+
framework passes the same object on every chunk; mutate
|
|
577
|
+
in place to accumulate state across chunks.
|
|
578
|
+
partition_key_count: Number of leading columns that form the
|
|
579
|
+
partition key.
|
|
580
|
+
order_key_count: Number of columns following the partition key
|
|
581
|
+
that form the order key.
|
|
582
|
+
params: Shared ``ProcessParams``.
|
|
583
|
+
|
|
584
|
+
Returns:
|
|
585
|
+
Either a :class:`pa.Array` of length ``chunk.num_rows`` matching
|
|
586
|
+
the function's output type, or a list of the same length
|
|
587
|
+
(which the framework converts via ``pa.array``). Each output
|
|
588
|
+
value is the cumulative aggregate snapshot at that input
|
|
589
|
+
row's position in its partition's order.
|
|
590
|
+
|
|
591
|
+
"""
|
|
592
|
+
raise NotImplementedError(
|
|
593
|
+
f"{cls.__name__}: Meta.streaming_partitioned=True requires overriding streaming_chunk()"
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
@classmethod
|
|
597
|
+
def streaming_close(cls, streaming_state: Any, params: ProcessParams[Any]) -> None:
|
|
598
|
+
"""Tear down streaming session state.
|
|
599
|
+
|
|
600
|
+
Called once when ``aggregate_streaming_close`` arrives, after the
|
|
601
|
+
last chunk. Use to release any external resources held by
|
|
602
|
+
``streaming_state``. The framework drops its reference after this
|
|
603
|
+
call, so anything not held elsewhere is GCed naturally.
|
|
604
|
+
|
|
605
|
+
Default implementation is a no-op.
|
|
606
|
+
"""
|
|
607
|
+
return None
|