vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Dynamic-code aggregate fixtures (DynamicAggregate, DynamicMLAggregate)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Annotated, Any
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pyarrow as pa
|
|
12
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
13
|
+
|
|
14
|
+
from vgi.aggregate_function import AggregateFunction, WindowPartition
|
|
15
|
+
from vgi.arguments import Param, Returns
|
|
16
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
17
|
+
from vgi.table_function import ProcessParams
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(kw_only=True)
class DynamicState(ArrowSerializableDataclass):
    """Per-group state shared by the dynamic aggregate fixtures.

    Bundles the rows accumulated so far with the code string and parameter
    map that were seen for the group.
    """

    # Accumulated input rows as Arrow IPC stream bytes; empty until a
    # non-empty update has been stored for the group.
    state_bytes: Annotated[bytes, ArrowType(pa.binary())] = b""
    # Python source that defines the user's ``Aggregate`` class; empty until set.
    code: Annotated[str, ArrowType(pa.string())] = ""
    # Parameter map; the ML variant passes it to the user's ``finalize``.
    params: Annotated[dict[str, float], ArrowType(pa.map_(pa.string(), pa.float64()))] = field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _serialize_table(table: pa.Table) -> bytes:
    """Encode ``table`` as an Arrow IPC stream and return the raw bytes.

    Record batches are written one at a time, in the order
    ``table.to_batches()`` yields them.
    """
    buffer = pa.BufferOutputStream()
    stream_writer = pa.ipc.new_stream(buffer, table.schema)
    with stream_writer:
        for record_batch in table.to_batches():
            stream_writer.write_batch(record_batch)
    payload = buffer.getvalue()
    return payload.to_pybytes()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _deserialize_table(data: bytes) -> pa.Table:
    """Decode Arrow IPC stream bytes back into a single ``pa.Table``."""
    reader = pa.ipc.open_stream(data)
    return reader.read_all()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Aggregate functions
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Names pre-bound in the globals of every exec'd dynamic-aggregate snippet.
# ``_get_aggregate_class`` copies this dict before each exec, so a snippet
# cannot add or rebind entries here.
_DYNAMIC_EXEC_NAMESPACE: dict[str, Any] = {
    "dataclass": dataclass,
    "field": field,
    "Annotated": Annotated,
    "pa": pa,
    "np": np,
    "ArrowSerializableDataclass": ArrowSerializableDataclass,
    "ArrowType": ArrowType,
}

# Validated ``Aggregate`` objects keyed by the exact source string they were
# built from. Nothing in this module evicts entries.
_dynamic_class_cache: dict[str, Any] = {}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _get_aggregate_class(code: str) -> Any:
    """Return the ``Aggregate`` object defined by ``code``, exec'ing it at most once.

    The snippet runs with a private copy of ``_DYNAMIC_EXEC_NAMESPACE`` as its
    globals. The resulting ``Aggregate`` is validated and cached under the
    exact source string, so later calls with the same code skip the exec.

    Raises:
        ValueError: If the snippet defines no ``Aggregate`` name, or if that
            object has no ``finalize`` attribute.
    """
    try:
        return _dynamic_class_cache[code]
    except KeyError:
        pass  # first sighting of this snippet: build it below

    # dont_inherit=True keeps this module's `from __future__ import annotations`
    # out of the snippet, so its annotations stay real objects, not strings.
    code_obj = compile(code, "<dynamic_aggregate>", "exec", dont_inherit=True)
    exec_globals: dict[str, Any] = {**_DYNAMIC_EXEC_NAMESPACE}
    exec(code_obj, exec_globals)  # noqa: S102

    if "Aggregate" not in exec_globals:
        raise ValueError("Dynamic aggregate code must define a class named 'Aggregate'")
    agg_cls = exec_globals["Aggregate"]

    required_methods = ("finalize",)
    missing = [method for method in required_methods if not hasattr(agg_cls, method)]
    if missing:
        raise ValueError(f"Aggregate class must define a '{missing[0]}' method")

    _dynamic_class_cache[code] = agg_cls
    return agg_cls
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _pack_dynamic_state(
    dynamic_state: ArrowSerializableDataclass,
    code: str = "",
    params: dict[str, float] | None = None,
) -> DynamicState:
    """Wrap a user state object in a ``DynamicState``.

    The user state is stored as the bytes returned by its
    ``serialize_to_bytes()``; a missing or empty ``params`` becomes ``{}``.
    """
    payload = dynamic_state.serialize_to_bytes()
    effective_params = params if params else {}
    return DynamicState(state_bytes=payload, code=code, params=effective_params)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _unpack_dynamic_state(
    wrapper: DynamicState, state_cls: type[ArrowSerializableDataclass]
) -> ArrowSerializableDataclass:
    """Rebuild a ``state_cls`` instance from the bytes held in ``wrapper``."""
    payload = wrapper.state_bytes
    return state_cls.deserialize_from_bytes(payload)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class _DynamicAggregateBase(AggregateFunction[DynamicState]):
    """Shared logic for dynamic aggregate functions.

    Incoming rows are accumulated per group as an Arrow table, stored in the
    state as Arrow IPC stream bytes. The user's ``Aggregate`` class is not
    called during update; it only sees the data when a subclass calls its
    ``finalize(table)`` (or ``window(table)`` on the windowed path).

    For the ML variant, the user's ``finalize``/``window`` additionally
    receives the params dict.
    """

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> DynamicState:
        """Return an empty state: no rows, no code, no params."""
        return DynamicState()

    @staticmethod
    def _coerce_params(raw_params: Any) -> dict[str, float]:
        """Normalise a params value to ``dict[str, float]``.

        Accepts a list of ``(key, value)`` pairs or a dict; any other value
        (including ``None``) yields an empty dict.
        """
        if isinstance(raw_params, list):
            return {str(k): float(v) for k, v in raw_params}
        if isinstance(raw_params, dict):
            return {str(k): float(v) for k, v in raw_params.items()}
        return {}

    @classmethod
    def _do_update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code_col: pa.StringArray,
        columns: list[pa.Array[Any]],
        params_col: pa.Array[Any] | None = None,
    ) -> None:
        """Append the non-NULL rows of this batch to each group's stored table.

        Args:
            states: Per-group states, replaced in place for groups that
                receive at least one row.
            group_ids: Group id of every input row.
            code_col: Code column; only its first value is used.
            columns: Data columns, stored as ``c0, c1, ...``.
            params_col: Optional params column; only its first value is used.
        """
        # `import pyarrow` alone does not guarantee the compute submodule is
        # loaded, so import it explicitly instead of relying on `pa.compute`.
        import pyarrow.compute as pc

        if len(group_ids) == 0:
            # Nothing to accumulate; also avoids indexing row 0 of empty columns.
            return

        code: str = code_col[0].as_py()
        raw_params = params_col[0].as_py() if params_col is not None else None
        params = cls._coerce_params(raw_params)
        _get_aggregate_class(code)  # validate + cache the code early

        # Build a table from the incoming columns.
        col_names = [f"c{i}" for i in range(len(columns))]
        incoming = pa.table({col_names[i]: columns[i] for i in range(len(columns))})

        # A row is kept only when every data column is non-NULL.
        valid_rows: pa.ChunkedArray[pa.BooleanScalar] | pa.BooleanArray | None = None
        for col in incoming.columns:
            col_valid = col.is_valid()
            valid_rows = col_valid if valid_rows is None else pc.and_(valid_rows, col_valid)

        gids: Any = group_ids
        if valid_rows is not None:
            incoming = incoming.filter(valid_rows)
            # Drop the same rows from the group ids. Otherwise the per-group
            # mask below would have the pre-filter length and no longer line
            # up with `incoming` once any NULL row has been removed.
            gids = pa.table({"gid": group_ids}).filter(valid_rows).column("gid")

        # Group by group_id and dispatch. For window aggregates there's
        # typically one group, so this is just one iteration.
        for gid_scalar in gids.unique():
            gid: int = gid_scalar.as_py()
            wrapper = states[gid]
            group_table = incoming.filter(pc.equal(gids, gid_scalar))
            if group_table.num_rows == 0:
                continue

            # Accumulate: concat with existing state data.
            if wrapper.state_bytes:
                combined = pa.concat_tables([_deserialize_table(wrapper.state_bytes), group_table])
            else:
                combined = group_table

            states[gid] = DynamicState(
                state_bytes=_serialize_table(combined),
                code=code,
                params=params,
            )

    @classmethod
    def combine(cls, source: DynamicState, target: DynamicState, params: ProcessParams[None]) -> DynamicState:
        """Merge two partial states; target rows come first, then source rows.

        Code and params are taken from ``target`` when set, else from
        ``source``. If neither side carries code, ``target`` is returned as is.
        """
        code = target.code or source.code
        if not code:
            return target
        merged_params = target.params or source.params
        src_table = _deserialize_table(source.state_bytes) if source.state_bytes else None
        tgt_table = _deserialize_table(target.state_bytes) if target.state_bytes else None

        # Compare against None explicitly: the truth value of a pa.Table
        # follows its row count, which is not what is being asked here.
        combined: pa.Table | None
        if src_table is not None and tgt_table is not None:
            combined = pa.concat_tables([tgt_table, src_table])
        elif tgt_table is not None:
            combined = tgt_table
        else:
            combined = src_table

        return DynamicState(
            state_bytes=_serialize_table(combined) if combined is not None else b"",
            code=code,
            params=merged_params,
        )

    # ------------------------------------------------------------------
    # Windowed path
    # ------------------------------------------------------------------
    # Shared logic for both vgi_dynamic_agg and vgi_dynamic_ml_agg.
    # Each subclass overrides window() directly — the shared helper below just
    # slices all partition columns to the current frame with filter_mask and
    # NULL-drop applied.  Reading code/params from the sliced frame (rather
    # than partition.inputs.column(X)[0]) avoids aliasing across partitions
    # when DuckDB batches many partitions into shared buffers.

    @staticmethod
    def _slice_to_frame(  # noqa: D417
        partition: WindowPartition,
        subframes: list[tuple[int, int]],
        data_start: int,
    ) -> pa.Table:
        """Slice all partition columns to the frame rows.

        Each non-empty ``(begin, end)`` subframe is sliced out of every
        column, filtered by ``partition.filter_mask`` when one is present,
        NULL-dropped, and the pieces are concatenated in subframe order.

        Args:
            data_start: Index where data columns begin (header columns are
                ``[0 .. data_start)``).  NULL-drop is applied on data columns
                only — matches the filtering ``_do_update`` performs in the
                non-window path.

        Returns:
            A table with columns ``c0, c1, ...`` (header columns included);
            it has zero rows when every subframe is empty.
        """
        # See _do_update for why the compute submodule is imported explicitly.
        import pyarrow.compute as pc

        num_cols = partition.inputs.num_columns
        cols = [partition.inputs.column(i) for i in range(num_cols)]
        col_names = [f"c{i}" for i in range(num_cols)]
        slices: list[pa.Table] = []
        for begin, end in subframes:
            if end <= begin:
                continue
            length = end - begin
            t = pa.table({col_names[i]: cols[i].slice(begin, length) for i in range(num_cols)})
            if partition.filter_mask is not None:
                t = t.filter(partition.filter_mask.slice(begin, length))
            null_mask: pa.ChunkedArray[pa.BooleanScalar] | pa.BooleanArray | None = None
            for col in t.columns[data_start:]:
                valid = col.is_valid()
                null_mask = valid if null_mask is None else pc.and_(null_mask, valid)
            if null_mask is not None:
                t = t.filter(null_mask)
            slices.append(t)
        if not slices:
            # No usable subframe: return an empty table with the right column types.
            return pa.table({c: pa.array([], type=cols[i].type) for i, c in enumerate(col_names)})
        return pa.concat_tables(slices)

    @staticmethod
    def _data_table_from(frame: pa.Table, data_start: int) -> pa.Table:
        """Rebuild a 0-indexed ``c0, c1, …`` data-only table for user code."""
        data_cols = frame.columns[data_start:]
        return pa.table({f"c{i}": col for i, col in enumerate(data_cols)})

    @staticmethod
    def _call_user(agg_cls: Any, data_table: pa.Table, user_params: dict[str, float] | None) -> Any:
        """Prefer the user's ``window()``; fall back to ``finalize()``.

        ``user_params`` is passed as a second argument only when it is not
        ``None`` (an empty dict is still passed).
        """
        fn = getattr(agg_cls, "window", None) or agg_cls.finalize
        if user_params is None:
            return fn(data_table)
        return fn(data_table, user_params)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class DynamicAggregateFunction(_DynamicAggregateBase):
    """Aggregate whose behaviour is supplied as a Python code string.

    ``vgi_dynamic_agg(code, col1, col2, ...)``

    Both the code and the data columns are ordinary (non-constant)
    parameters, so the code may come from a table lookup, a subquery, or a
    variable.

    Names pre-bound for the exec'd code: ``dataclass``, ``field``,
    ``Annotated``, ``pa``, ``np``, ``ArrowSerializableDataclass``,
    ``ArrowType``.
    """

    class Meta:
        name = "vgi_dynamic_agg"
        description = "Dynamic aggregate defined by Python code string"
        null_handling = NullHandling.DEFAULT
        # The user code is arbitrary Python and may rely on row order (for
        # example data[-1] as "last row", or slices such as data[:-1]).
        # It cannot be inspected here, so order is assumed to matter.
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code: Annotated[pa.StringArray, Param(doc="Python code defining Aggregate class")],
        columns: Annotated[list[pa.Array], Param(doc="Input columns", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Accumulate this batch into the per-group states (no params column)."""
        cls._do_update(states, group_ids, code_col=code, columns=columns)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, DynamicState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Run the user's ``finalize(table)`` per group and return float64 results.

        A group yields NULL when its state is missing, has no code, or has
        no accumulated rows, or when the user code returns ``None``.
        """
        outputs: list[float | None] = []
        for gid in group_ids:
            wrapper = states[gid.as_py()]
            if wrapper is None or not wrapper.code or not wrapper.state_bytes:
                outputs.append(None)
                continue
            accumulated = _deserialize_table(wrapper.state_bytes)
            value = _get_aggregate_class(wrapper.code).finalize(accumulated)
            outputs.append(None if value is None else float(value))
        return pa.record_batch({"result": pa.array(outputs, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Evaluate the user code over the rows of the current window frame.

        Returns ``None`` when the frame has no rows after filtering, or when
        the user code returns ``None``.
        """
        # Column layout: [code, col1, col2, ...] -> one header column.
        header_cols = 1
        frame = cls._slice_to_frame(partition, subframes, data_start=header_cols)
        if frame.num_rows == 0:
            return None
        first_code = frame.column(0)[0].as_py()
        data_table = cls._data_table_from(frame, data_start=header_cols)
        value = cls._call_user(_get_aggregate_class(first_code), data_table, user_params=None)
        return None if value is None else float(value)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class DynamicMLAggregateFunction(_DynamicAggregateBase):
    """Dynamic aggregate that also carries a params dict.

    ``vgi_dynamic_ml_agg(code, params, col1, col2, ...)``

    Works like ``vgi_dynamic_agg``, with an extra ``MAP(VARCHAR, DOUBLE)``
    params column that is handed to the user's ``finalize(table, params)``,
    so the dynamic code can read arbitrary parameters (seed, lookback,
    alpha, etc.).

    SQL::

        SELECT vgi_dynamic_ml_agg(
            code,
            MAP {'seed': 42, 'lb': 5, 'alpha': 1.0},
            col1, col2
        ) ...
    """

    class Meta:
        name = "vgi_dynamic_ml_agg"
        description = "Dynamic ML aggregate with params dict"
        null_handling = NullHandling.DEFAULT
        # The user code is arbitrary Python and may rely on row order (for
        # example data[-1] as "last row", or slices such as data[:-1]).
        # It cannot be inspected here, so order is assumed to matter.
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code: Annotated[pa.StringArray, Param(doc="Python code defining Aggregate class")],
        params_col: Annotated[pa.Array, Param(doc="MAP(VARCHAR, DOUBLE) parameters")],  # type: ignore[type-arg]
        columns: Annotated[list[pa.Array], Param(doc="Input columns", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Accumulate this batch, recording the params map alongside the rows."""
        cls._do_update(states, group_ids, code_col=code, columns=columns, params_col=params_col)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, DynamicState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Run the user's ``finalize(table, params)`` per group; return float64 results.

        A group yields NULL when its state is missing, has no code, or has
        no accumulated rows, or when the user code returns ``None``.
        """
        outputs: list[float | None] = []
        for gid in group_ids:
            wrapper = states[gid.as_py()]
            if wrapper is None or not wrapper.code or not wrapper.state_bytes:
                outputs.append(None)
                continue
            accumulated = _deserialize_table(wrapper.state_bytes)
            value = _get_aggregate_class(wrapper.code).finalize(accumulated, wrapper.params)
            outputs.append(None if value is None else float(value))
        return pa.record_batch({"result": pa.array(outputs, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Evaluate the user code, with params, over the current window frame.

        Code and params are read from the first row of the sliced frame.
        Returns ``None`` when the frame has no rows after filtering, or when
        the user code returns ``None``.
        """
        # Column layout: [code, params_map, col1, col2, ...] -> two header columns.
        header_cols = 2
        frame = cls._slice_to_frame(partition, subframes, data_start=header_cols)
        if frame.num_rows == 0:
            return None
        first_code = frame.column(0)[0].as_py()
        first_params = frame.column(1)[0].as_py()

        # The map value may arrive as a dict or a list of (key, value) pairs;
        # anything else is treated as "no params".
        if isinstance(first_params, dict):
            pairs: Any = first_params.items()
        elif isinstance(first_params, list):
            pairs = first_params
        else:
            pairs = ()
        user_params: dict[str, float] = {str(k): float(v) for k, v in pairs}

        data_table = cls._data_table_from(frame, data_start=header_cols)
        value = cls._call_user(_get_aggregate_class(first_code), data_table, user_params=user_params)
        return None if value is None else float(value)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# ---------------------------------------------------------------------------
|
|
406
|
+
# Window-capable aggregates (Meta.supports_window = True)
|
|
407
|
+
# ---------------------------------------------------------------------------
|
|
408
|
+
# These demonstrate the window() callback which lets DuckDB ship the whole
|
|
409
|
+
# partition once and call the worker per output row with frame bounds.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""GenericSumFunction — any-type aggregate (uses on_bind to derive output type)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Annotated
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
12
|
+
|
|
13
|
+
from vgi.aggregate_function import AggregateBindParams, AggregateFunction
|
|
14
|
+
from vgi.arguments import Param, Returns
|
|
15
|
+
from vgi.invocation import BindResponse
|
|
16
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
17
|
+
from vgi.schema_utils import schema
|
|
18
|
+
from vgi.table_function import ProcessParams
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(kw_only=True)
class GenericSumState(ArrowSerializableDataclass):
    """Running total for one group of ``vgi_generic_sum``."""

    # Always held as float64, whatever the input column type is.
    total: Annotated[float, ArrowType(pa.float64())] = 0.0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GenericSumFunction(AggregateFunction[GenericSumState]):
    """Sum aggregate taking any numeric type and returning that same type.

    Shows an any-type input whose output type is resolved in ``on_bind()``.
    SQL: ``SELECT vgi_generic_sum(value) FROM t``
    """

    class Meta:
        name = "vgi_generic_sum"
        description = "Sum any numeric type"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def on_bind(cls, params: AggregateBindParams, **kwargs: object) -> BindResponse:
        """Use the first input field's type as the result type (float64 if no input schema)."""
        result_type = params.input_schema.field(0).type if params.input_schema else pa.float64()
        return BindResponse(output_schema=schema(result=result_type))

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> GenericSumState:
        """Start every group at a total of 0.0."""
        return GenericSumState()

    @classmethod
    def update(
        cls,
        states: dict[int, GenericSumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Array, Param(doc="Numeric value to sum")],  # type: ignore[type-arg]
    ) -> None:
        """Add this batch's per-group sums (computed in float64) to the states."""
        as_double = value.cast(pa.float64())
        batch = pa.table({"gid": group_ids, "value": as_double})
        per_group = batch.group_by("gid").aggregate([("value", "sum")])
        gid_values = per_group.column("gid").to_pylist()
        subtotals = per_group.column("value_sum").to_pylist()
        for gid, subtotal in zip(gid_values, subtotals):
            if subtotal is None:
                # Group sum is NULL for this batch: leave the state untouched.
                continue
            states[gid] = GenericSumState(total=states[gid].total + subtotal)

    @classmethod
    def combine(cls, source: GenericSumState, target: GenericSumState, params: ProcessParams[None]) -> GenericSumState:
        """Merge two partial states by adding their totals."""
        merged_total = source.total + target.total
        return GenericSumState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, GenericSumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns()]:
        """Emit each group's total, typed as the bound output type."""
        # The output type was chosen in on_bind() and arrives via params.output_schema.
        if params.output_schema:
            output_type = params.output_schema.field(0).type
        else:
            output_type = pa.float64()
        totals = []
        for gid in group_ids:
            state = states[gid.as_py()]
            totals.append(None if state is None else state.total)
        return pa.record_batch({"result": pa.array(totals, type=output_type)})
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# SumAllFunction — demonstrates varargs aggregate (sums all numeric columns)
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""ListAgg aggregate fixture (order-dependent string concatenation)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import Annotated
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
|
|
11
|
+
from vgi._test_fixtures.aggregate._common import ListAggState
|
|
12
|
+
from vgi.aggregate_function import AggregateFunction
|
|
13
|
+
from vgi.arguments import Param, Returns
|
|
14
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
15
|
+
from vgi.table_function import ProcessParams
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ListAggFunction(AggregateFunction[ListAggState]):
    """Order-dependent list aggregate that joins strings with a comma.

    SQL: ``SELECT vgi_listagg(name ORDER BY name) FROM t GROUP BY category``
    """

    class Meta:
        name = "vgi_listagg"
        description = "Concatenate strings with comma separator"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> ListAggState:
        """Start every group with a default (empty) state."""
        return ListAggState()

    @classmethod
    def update(
        cls,
        states: dict[int, ListAggState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.StringArray, Param(doc="String column")],
    ) -> None:
        """Append each non-NULL string to its group's text, in row order."""
        for row, gid_scalar in enumerate(group_ids):
            text = value[row].as_py()
            if text is None:
                continue
            gid: int = gid_scalar.as_py()
            so_far = states[gid].values
            # No separator while the stored text is still empty.
            joined = so_far + "," + text if so_far else text
            states[gid] = ListAggState(values=joined)

    @classmethod
    def combine(cls, source: ListAggState, target: ListAggState, params: ProcessParams[None]) -> ListAggState:
        """Merge two partial states; target text first, then source text."""
        if not (source.values and target.values):
            # At most one side has text: keep whichever is non-empty.
            return ListAggState(values=target.values or source.values)
        return ListAggState(values=target.values + "," + source.values)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, ListAggState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.string())]:
        """Emit the joined text per group; NULL for a missing state or empty text."""
        joined_values = []
        for gid in group_ids:
            state = states[gid.as_py()]
            joined_values.append(None if state is None else (state.values or None))
        return pa.record_batch({"result": pa.array(joined_values, type=pa.string())})
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# PercentileFunction — demonstrates ConstParam on aggregate
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Percentile aggregate fixture (sorted-quantile demo)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Annotated
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
12
|
+
|
|
13
|
+
from vgi.aggregate_function import AggregateFunction
|
|
14
|
+
from vgi.arguments import ConstParam, Param, Returns
|
|
15
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
16
|
+
from vgi.table_function import ProcessParams
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(kw_only=True)
class PercentileState(ArrowSerializableDataclass):
    """Per-group accumulator used by the percentile aggregate fixture."""

    # Every value seen so far, kept as one comma-joined string (a simple
    # serialization); the empty string means no values have been recorded.
    values_csv: Annotated[str, ArrowType(pa.string())] = ""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PercentileFunction(AggregateFunction[PercentileState]):
    """Approximate percentile — demonstrates ConstParam on aggregate functions.

    SQL: ``SELECT vgi_percentile(value, 0.5) FROM t GROUP BY category``
    The percentile parameter (0.5) is constant-folded at bind time.

    Each group's non-null values are accumulated as comma-separated text;
    ``finalize`` parses and sorts them, then returns the element at index
    ``min(int(pct * n), n - 1)``.
    """

    class Meta:
        # Registration metadata for this fixture.
        name = "vgi_percentile"
        description = "Approximate percentile (demonstrates ConstParam)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> PercentileState:
        """Return an empty accumulator (no values recorded)."""
        return PercentileState()

    @classmethod
    def update(
        cls,
        states: dict[int, PercentileState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.DoubleArray, Param(doc="Values")],
        percentile: Annotated[float, ConstParam("Percentile (0-1)", phase="finalize")] = 0.5,
    ) -> None:
        """Append each non-null input value to its group's accumulator.

        ``states`` is modified in place: the entry for a group is replaced
        by a new ``PercentileState`` carrying the extended text. Null input
        values are skipped.
        """
        # percentile is NOT injected here (phase="finalize") — only needed in finalize
        for row, group in enumerate(group_ids):
            gid: int = group.as_py()
            sample = value[row].as_py()
            if sample is None:
                continue
            current = states[gid]
            text = str(sample)
            if current.values_csv:
                text = ",".join((current.values_csv, text))
            states[gid] = PercentileState(values_csv=text)

    @classmethod
    def combine(cls, source: PercentileState, target: PercentileState, params: ProcessParams[None]) -> PercentileState:
        """Merge two partial states; target values precede source values."""
        left, right = target.values_csv, source.values_csv
        if not (right and left):
            # At most one side has data; carry it over unchanged.
            return PercentileState(values_csv=left or right)
        return PercentileState(values_csv=",".join((left, right)))

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, PercentileState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Compute the requested percentile for every group id.

        Returns a single-column ``result`` batch of float64. Groups with a
        ``None`` state or no accumulated values produce null.

        Raises:
            ValueError: if the percentile constant is NULL, not a number,
                not finite, or outside ``[0, 1]``.
        """
        import math
        from decimal import Decimal

        # Access percentile via params.args (loaded from FunctionStorage)
        call_args = params.args
        if call_args and call_args.positional:
            raw_pct = call_args.positional[0].as_py()
        else:
            raw_pct = 0.5

        # Validate the percentile constant explicitly so callers see a clear
        # error instead of an opaque NumPy/builtin TypeError downstream.
        if raw_pct is None:
            raise ValueError("vgi_percentile: percentile must not be NULL")
        # Accept Python int/float and Decimal (DuckDB DECIMAL literals decode as Decimal).
        if not isinstance(raw_pct, (Decimal, int, float)):
            raise ValueError(f"vgi_percentile: percentile must be a number, got {type(raw_pct).__name__}")
        pct = float(raw_pct)
        if not math.isfinite(pct):
            raise ValueError(f"vgi_percentile: percentile must be a finite number, got {raw_pct!r}")
        if not 0.0 <= pct <= 1.0:
            raise ValueError(f"vgi_percentile: percentile must be in [0, 1], got {pct}")

        picked: list[float | None] = []
        for group in group_ids:
            state = states[group.as_py()]
            if state is None or not state.values_csv:
                picked.append(None)
                continue
            ordered = [float(token) for token in state.values_csv.split(",")]
            ordered.sort()
            # Clamp so pct == 1.0 selects the last element instead of overrunning.
            position = min(int(pct * len(ordered)), len(ordered) - 1)
            picked.append(ordered[position])
        return pa.record_batch({"result": pa.array(picked, type=pa.float64())})
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# GenericSumFunction — demonstrates AnyArrow / dynamic output type
|
|
107
|
+
# ---------------------------------------------------------------------------
|