vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Streaming-partitioned aggregate fixtures.
|
|
4
|
+
|
|
5
|
+
Exercise the ``streaming_partitioned`` opt-in: ``streaming_open`` /
|
|
6
|
+
``streaming_chunk`` / ``streaming_close``. These are routed through the
|
|
7
|
+
VGI DuckDB extension's custom streaming operator, which pipes input
|
|
8
|
+
chunks straight to the worker without materialising the partition on
|
|
9
|
+
the DuckDB side. State is bounded by partitions × per-partition state,
|
|
10
|
+
not by row count — the structural answer to "running aggregate over
|
|
11
|
+
unbounded ordered input."
|
|
12
|
+
|
|
13
|
+
These fixtures are reference implementations for the protocol. Real
|
|
14
|
+
production aggregates (e.g. ``portfolio_agg``) follow the same shape
|
|
15
|
+
but with domain-specific state and I/O optimisations (Decimal128 buffer
|
|
16
|
+
tricks, etc.).
|
|
17
|
+
|
|
18
|
+
When the optimizer rule rejects a query (non-cumulative frame, EXCLUDE
|
|
19
|
+
clause, DISTINCT/FILTER, etc.) DuckDB falls back to the standard
|
|
20
|
+
windowed path — so the fixture class in this module also implements
|
|
21
|
+
update/combine/finalize for plain GROUP BY usage and (optionally) the
|
|
22
|
+
windowed callbacks. The streaming methods are additive.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Annotated, Any
|
|
28
|
+
|
|
29
|
+
import pyarrow as pa
|
|
30
|
+
|
|
31
|
+
from vgi._test_fixtures.aggregate._common import SumState
|
|
32
|
+
from vgi.aggregate_function import AggregateFunction
|
|
33
|
+
from vgi.arguments import Param, Returns
|
|
34
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
35
|
+
from vgi.table_function import ProcessParams
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class StreamingSumFunction(AggregateFunction[SumState]):
    """Streaming-partitioned running sum.

    Cumulative across each `(PARTITION BY key)` group, in `ORDER BY` order.
    For every input row, emits the running sum of the value column at that
    row's position in its partition.

    Also wired for ``GROUP BY`` via ``update`` / ``combine`` / ``finalize``,
    so the same function works in both shapes::

        -- streaming-partitioned (one running value per input row):
        SELECT k, v, vgi_streaming_sum(v) OVER (PARTITION BY k ORDER BY ts)
        FROM trades;

        -- group-by (one final value per partition):
        SELECT k, vgi_streaming_sum(v) FROM trades GROUP BY k;

    State persistence: the per-partition dict lives in worker memory in an
    in-process LRU and is also persisted to ``FunctionStorage`` after each
    chunk so a follow-up chunk landing on a different worker pool entry
    can rehydrate. No special handling required from this class — the
    framework does it.
    """

    class Meta:
        name = "vgi_streaming_sum"
        description = (
            "Running sum across PARTITION BY keys via the streaming-partitioned "
            "protocol. Each input row emits the cumulative sum at its position."
        )
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = False
        # Opt into the streaming-partitioned operator. The optimizer rule
        # will route eligible OVER queries through it; ineligible shapes
        # (sliding frames, EXCLUDE, DISTINCT, FILTER) fall back to the
        # standard windowed path automatically.
        streaming_partitioned = True

    # ------------------------------------------------------------------
    # GROUP BY path — required for plain aggregation queries.
    # ------------------------------------------------------------------

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a default-constructed ``SumState`` for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Fold one batch of rows into the per-group states, in place.

        The batch is pre-aggregated per group id with Arrow, so each group's
        state is replaced at most once per call. Groups whose batch sum is
        NULL are left untouched.
        """
        table = pa.table({"gid": group_ids, "value": value})
        grouped = table.group_by("gid").aggregate([("value", "sum")])
        # Materialise both result columns once instead of boxing a scalar
        # per row per column lookup.
        gids = grouped.column("gid").to_pylist()
        sums = grouped.column("value_sum").to_pylist()
        for gid, batch_sum in zip(gids, sums):
            if batch_sum is not None:
                states[gid] = SumState(total=states[gid].total + batch_sum)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial states into a new state holding their combined total."""
        return SumState(total=source.total + target.total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Build the single-column ``result`` batch, one row per requested group id.

        A group id with no entry in ``states`` yields NULL.
        """
        results = [(s.total if (s := states.get(gid.as_py())) is not None else None) for gid in group_ids]
        return pa.record_batch({"result": pa.array(results, type=pa.int64())})

    # ------------------------------------------------------------------
    # Streaming-partitioned path.
    # ------------------------------------------------------------------
    #
    # Three callbacks: open / chunk / close. The framework handles session
    # lifecycle (allocates execution_id, persists to FunctionStorage,
    # rehydrates across pool workers); user code only owns the in-memory
    # state object.

    @classmethod
    def streaming_open(cls, params: ProcessParams[None]) -> dict[str, Any]:
        """Create the session state: an empty partition-key -> running-sum map."""
        # Session state. Free shape — anything picklable. The framework
        # passes this object back to streaming_chunk and streaming_close
        # unchanged. For multi-partition aggregates, hold a per-partition
        # dict here; for single-partition, just hold the running scalar.
        return {
            # partition_key_tuple -> running int sum
            "partition_states": {},
        }

    @classmethod
    def streaming_chunk(
        cls,
        chunk: pa.RecordBatch,
        streaming_state: dict[str, Any],
        partition_key_count: int,
        order_key_count: int,
        params: ProcessParams[None],
    ) -> pa.Array[Any]:
        """Consume one chunk and return the running sum for each of its rows.

        Args:
            chunk: Input rows laid out as
                ``[partition_key_cols..., order_key_cols..., value_cols...]``.
            streaming_state: The dict produced by ``streaming_open``; its
                ``"partition_states"`` map is updated in place.
            partition_key_count: Number of leading partition-key columns.
            order_key_count: Number of order-key columns that follow them.
            params: Unused by this fixture.

        Returns:
            An int64 array with one cumulative sum per input row.
        """
        # Column layout from the operator:
        #   [partition_key_cols..., order_key_cols..., value_cols...]
        # We don't actually need the order keys at runtime here — the
        # input arrives in (partition, order) order already, so cumulative
        # state is naturally correct.
        value_idx = partition_key_count + order_key_count

        # range(0) already yields an empty list, so no special case is needed.
        pk_columns = [chunk.column(i).to_pylist() for i in range(partition_key_count)]
        values = chunk.column(value_idx).to_pylist()

        # The key shape depends only on partition_key_count, so decide it
        # once per chunk rather than once per row. Shapes are kept exactly
        # as before (() / bare scalar / tuple) so previously stored state
        # keeps matching.
        if partition_key_count == 0:
            keys: Any = [()] * chunk.num_rows
        elif partition_key_count == 1:
            keys = pk_columns[0]
        else:
            keys = list(zip(*pk_columns))

        partition_states: dict[Any, int] = streaming_state["partition_states"]

        # Returns one cumulative-sum int per input row. NULL value rows
        # leave state unchanged but still emit the current sum (matches
        # the GROUP BY path's NullHandling.DEFAULT semantics).
        out: list[int] = []
        for key, v in zip(keys, values):
            running = partition_states.get(key, 0)
            if v is not None:
                running += v
                partition_states[key] = running
            out.append(running)

        return pa.array(out, type=pa.int64())

    @classmethod
    def streaming_close(
        cls,
        streaming_state: dict[str, Any],
        params: ProcessParams[None],
    ) -> None:
        """Session cleanup hook; a no-op for this fixture."""
        # Cleanup hook. For this fixture there's nothing to release;
        # state is plain Python objects that GC collects when the
        # session is dropped from the framework's cache. Real
        # implementations might release file handles, close DB
        # connections, or flush logs here.
        return None
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""SumAllFunction — varargs aggregate (sums any number of numeric columns)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Annotated
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
12
|
+
|
|
13
|
+
from vgi.aggregate_function import AggregateFunction
|
|
14
|
+
from vgi.arguments import Param, Returns
|
|
15
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
16
|
+
from vgi.table_function import ProcessParams
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(kw_only=True)
class SumAllState(ArrowSerializableDataclass):
    """Per-group accumulator used by ``SumAllFunction``."""

    # Running float64 total for the group; starts at 0.0.
    total: Annotated[float, ArrowType(pa.float64())] = 0.0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SumAllFunction(AggregateFunction[SumAllState]):
    """Varargs aggregate fixture that adds up every numeric argument column.

    Any number of numeric columns may be passed; all of their non-NULL
    values are folded into one float64 total per group.
    SQL: ``SELECT vgi_sum_all(a, b, c) FROM t GROUP BY category``
    """

    class Meta:
        name = "vgi_sum_all"
        description = "Sum all numeric columns"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumAllState:
        """Return a fresh accumulator (total of 0.0) for a new group."""
        return SumAllState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumAllState],
        group_ids: pa.Int64Array,
        columns: Annotated[pa.Array, Param(doc="Numeric columns to sum", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Add each row's cross-column total to the state of that row's group.

        NULL cells contribute nothing; every row still rewrites its group's
        state, even when all of its cells are NULL.
        """
        for row in range(len(group_ids)):
            group: int = group_ids[row].as_py()
            across_columns = 0.0
            for column in columns:
                cell = column[row].as_py()
                if cell is None:
                    continue
                across_columns += float(cell)
            previous = states[group].total
            states[group] = SumAllState(total=previous + across_columns)

    @classmethod
    def combine(cls, source: SumAllState, target: SumAllState, params: ProcessParams[None]) -> SumAllState:
        """Merge two partial accumulators into a new one."""
        merged_total = source.total + target.total
        return SumAllState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumAllState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Emit a float64 ``result`` column with one total per requested group id."""
        totals = []
        for gid in group_ids:
            state = states[gid.as_py()]
            if state is None:
                totals.append(None)
            else:
                totals.append(state.total)
        return pa.record_batch({"result": pa.array(totals, type=pa.float64())})
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# DynamicAggregateFunction — aggregate behavior defined by Python code string
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Windowed aggregate fixtures (window_sum, window_median, window_listagg)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Annotated, Any
|
|
9
|
+
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
12
|
+
|
|
13
|
+
from vgi._test_fixtures.aggregate._common import ListAggState, SumState
|
|
14
|
+
from vgi.aggregate_function import AggregateFunction, WindowPartition
|
|
15
|
+
from vgi.arguments import Param, Returns
|
|
16
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
17
|
+
from vgi.table_function import ProcessParams
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(kw_only=True)
class _EmptyWindowState(ArrowSerializableDataclass):
    """Placeholder for functions that don't need derived per-partition state.

    Declares no fields of its own; the docstring alone is a valid class
    body, so no ``pass`` statement is needed.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class WindowSumFunction(AggregateFunction[SumState]):
    """Windowed sum fixture built on a straightforward ``window()`` callback.

    ``update`` / ``combine`` / ``finalize`` are provided as well, so the
    function keeps working in plain ``GROUP BY`` queries (DuckDB chooses the
    window path on its own via ``WindowCustomAggregator::CanAggregate``).

    SQL::

        SELECT x, vgi_window_sum(x) OVER (ORDER BY x ROWS BETWEEN 2 PRECEDING AND CURRENT ROW)
        FROM generate_series(1, 10) t(x);
    """

    class Meta:
        name = "vgi_window_sum"
        description = "Windowed sum that uses the per-partition window() callback"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a default-constructed ``SumState`` for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Pre-aggregate the batch per group id and add each sum to its state.

        Groups whose batch sum is NULL keep their existing state.
        """
        per_group = pa.table({"gid": group_ids, "value": value}).group_by("gid").aggregate([("value", "sum")])
        gid_column = per_group.column("gid")
        sum_column = per_group.column("value_sum")
        for row in range(per_group.num_rows):
            batch_sum = sum_column[row].as_py()
            group: int = gid_column[row].as_py()
            if batch_sum is None:
                continue
            states[group] = SumState(total=states[group].total + batch_sum)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial states into a new state with the combined total."""
        combined = source.total + target.total
        return SumState(total=combined)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Emit an int64 ``result`` column with one total per requested group id."""
        totals = []
        for gid in group_ids:
            state = states[gid.as_py()]
            if state is None:
                totals.append(None)
            else:
                totals.append(state.total)
        return pa.record_batch({"result": pa.array(totals, type=pa.int64())})

    # --- Window path ---

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> int | None:
        """Sum the first input column over every ``[begin, end)`` subframe.

        Empty or inverted subframes are skipped. When the partition carries
        a filter mask, it is applied to each slice before summing. Returns
        ``None`` if no subframe produced a valid (non-NULL) sum.
        """
        import pyarrow.compute as pc

        column = partition.inputs.column(0)
        running = 0
        saw_value = False
        for start, stop in subframes:
            length = stop - start
            if length <= 0:
                continue
            window_values = column.slice(start, length)
            if partition.filter_mask is not None:
                window_values = window_values.filter(partition.filter_mask.slice(start, length))
            partial = pc.sum(window_values)
            if not partial.is_valid:
                continue
            running += partial.as_py()
            saw_value = True
        if not saw_value:
            return None
        return running
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class WindowMedianFunction(AggregateFunction[_EmptyWindowState]):
    """Windowed median — non-incremental, benefits from caching the partition.

    Only the ``window()`` callback computes anything (no incremental update
    path makes sense for median). The ``update`` / ``combine`` / ``finalize``
    methods are stubs kept so the function can be registered and called
    outside an OVER clause: ``update`` ignores its input and ``finalize``
    returns NULL for every group, so a plain ``GROUP BY`` call never yields
    a median.

    SQL::

        SELECT x, vgi_window_median(x) OVER (ORDER BY x ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING)
        FROM generate_series(1, 20) t(x);
    """

    class Meta:
        name = "vgi_window_median"
        description = "Windowed median (window() callback demonstrates non-incremental aggregates)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> _EmptyWindowState:
        """Return the field-less placeholder state."""
        return _EmptyWindowState()

    @classmethod
    def update(
        cls,
        states: dict[int, _EmptyWindowState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.DoubleArray, Param(doc="Column to compute median of")],
    ) -> None:
        """Do nothing: the GROUP BY path is a stub.

        Input values are discarded and no state is touched; ``finalize``
        consequently reports NULL for every group.
        """
        # Intentional no-op — the state type has no fields to accumulate into.

    @classmethod
    def combine(
        cls, source: _EmptyWindowState, target: _EmptyWindowState, params: ProcessParams[None]
    ) -> _EmptyWindowState:
        """Return ``target`` unchanged; there is nothing to merge."""
        return target

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, _EmptyWindowState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Emit a float64 ``result`` column that is NULL for every requested group id."""
        results = [None] * len(group_ids)
        return pa.record_batch({"result": pa.array(results, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Return the median of the non-NULL values covered by ``subframes``.

        Values come from the partition's first input column. Each
        ``[begin, end)`` subframe is sliced out, filtered through the
        partition's filter mask when one is present, and pooled. With an
        even number of values the two middle values are averaged. Returns
        ``None`` when no non-NULL value is in frame.
        """
        # Local import, matching how this module pulls in pyarrow.compute.
        import statistics

        value_col = partition.inputs.column(0)
        values: list[float] = []
        for begin, end in subframes:
            if end <= begin:
                continue
            slice_ = value_col.slice(begin, end - begin)
            if partition.filter_mask is not None:
                mask = partition.filter_mask.slice(begin, end - begin)
                slice_ = slice_.filter(mask)
            values.extend(float(v) for v in slice_.to_pylist() if v is not None)
        if not values:
            # statistics.median raises on empty input; an empty frame is NULL.
            return None
        # Sorts a copy and returns the middle value, or the mean of the two
        # middle values for an even count.
        return statistics.median(values)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class WindowListAggFunction(AggregateFunction[ListAggState]):
    """Windowed, ORDER_DEPENDENT string concatenation — exercises the fallback handoff.

    With ``vgi_window_listagg(s) OVER (ORDER BY x ...)`` DuckDB calls our
    ``window()`` callback (arg_orders is empty; the frame ordering is
    supplied by the OVER clause).

    With ``vgi_window_listagg(s ORDER BY x) OVER (...)`` DuckDB's
    ``WindowCustomAggregator::CanAggregate`` turns the window path down
    because ``wexpr.arg_orders`` is non-empty, and update/combine/finalize
    are used instead. The answer is still correct — only slower.
    """

    class Meta:
        name = "vgi_window_listagg"
        description = "Windowed string concat (ORDER_DEPENDENT; tests fallback handoff)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> ListAggState:
        """Return a default-constructed ``ListAggState`` for a new group."""
        return ListAggState()

    @classmethod
    def update(
        cls,
        states: dict[int, ListAggState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.StringArray, Param(doc="String column")],
    ) -> None:
        """Append each non-NULL string to its group's comma-separated text.

        A group whose accumulated text is still empty takes the incoming
        string as-is, without a leading comma.
        """
        for row in range(len(group_ids)):
            group: int = group_ids[row].as_py()
            text = value[row].as_py()
            if text is None:
                continue
            accumulated = states[group].values
            if not accumulated:
                states[group] = ListAggState(values=text)
            else:
                states[group] = ListAggState(values=accumulated + "," + text)

    @classmethod
    def combine(cls, source: ListAggState, target: ListAggState, params: ProcessParams[None]) -> ListAggState:
        """Join two partial states, target text first, then source text.

        If either side is empty, the other side's text is returned alone.
        """
        first, second = target.values, source.values
        if not (second and first):
            return ListAggState(values=first or second)
        return ListAggState(values=first + "," + second)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, ListAggState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.string())]:
        """Emit a string ``result`` column; groups with empty text become NULL."""
        texts = []
        for gid in group_ids:
            state = states[gid.as_py()]
            if state is None:
                texts.append(None)
            else:
                texts.append(state.values or None)
        return pa.record_batch({"result": pa.array(texts, type=pa.string())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> str | None:
        """Comma-join the non-NULL strings covered by ``subframes``, in frame order.

        Strings are read from the partition's first input column; the
        partition's filter mask, when present, is applied per slice.
        Returns ``None`` when nothing is in frame.
        """
        column = partition.inputs.column(0)
        pieces: list[str] = []
        for start, stop in subframes:
            length = stop - start
            if length <= 0:
                continue
            in_frame = column.slice(start, length)
            if partition.filter_mask is not None:
                in_frame = in_frame.filter(partition.filter_mask.slice(start, length))
            pieces.extend(item for item in in_frame.to_pylist() if item is not None)
        if not pieces:
            return None
        return ",".join(pieces)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class WindowSumBatchFunction(AggregateFunction[SumState]):
    """Windowed sum whose batch callback hands back a ready-made ``pa.Array``.

    ``window_batch`` is overridden to return a pre-built ``pa.Array``
    instead of a Python list.

    Functionally equivalent to :class:`WindowSumFunction`. This fixture
    exists to exercise the framework's polymorphic batch return: when user
    code returns a ``pa.Array``, the worker should ship it directly without
    round-tripping through ``pa.array(list, type=...)``.

    The unit tests for ``window_batch`` use it to confirm that the
    dispatcher accepts both a list and a pa.Array, and that the pa.Array
    path yields identical answers.
    """

    class Meta:
        name = "vgi_window_sum_batch"
        description = "Windowed sum demonstrating window_batch returning pa.Array"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a default-constructed ``SumState`` for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Pre-aggregate the batch per group id and add each sum to its state.

        Groups whose batch sum is NULL keep their existing state.
        """
        per_group = pa.table({"gid": group_ids, "value": value}).group_by("gid").aggregate([("value", "sum")])
        gid_column = per_group.column("gid")
        sum_column = per_group.column("value_sum")
        for row in range(per_group.num_rows):
            batch_sum = sum_column[row].as_py()
            group: int = gid_column[row].as_py()
            if batch_sum is None:
                continue
            states[group] = SumState(total=states[group].total + batch_sum)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial states into a new state with the combined total."""
        combined = source.total + target.total
        return SumState(total=combined)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Emit an int64 ``result`` column with one total per requested group id."""
        totals = []
        for gid in group_ids:
            state = states[gid.as_py()]
            if state is None:
                totals.append(None)
            else:
                totals.append(state.total)
        return pa.record_batch({"result": pa.array(totals, type=pa.int64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> int | None:
        """Compute the windowed sum for a single row by delegating to ``_sum_one``."""
        # Single-row fallback (still required so plain window() invocations
        # work in unit tests). Production callers go through window_batch.
        return cls._sum_one(subframes, partition)

    @classmethod
    def window_batch(
        cls,
        row_ids: list[int],
        subframes: list[list[tuple[int, int]]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> pa.Array[Any]:
        """Compute one windowed sum per entry of ``subframes`` as an int64 ``pa.Array``."""
        sums = []
        for row_frames in subframes:
            sums.append(cls._sum_one(row_frames, partition))
        return pa.array(sums, type=pa.int64())

    @staticmethod
    def _sum_one(
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
    ) -> int | None:
        """Sum the first input column over every ``[begin, end)`` subframe.

        Empty or inverted subframes are skipped, and the partition's filter
        mask (when present) is applied per slice. Returns ``None`` if no
        subframe produced a valid (non-NULL) sum.
        """
        import pyarrow.compute as pc

        column = partition.inputs.column(0)
        running = 0
        saw_value = False
        for start, stop in subframes:
            length = stop - start
            if length <= 0:
                continue
            window_values = column.slice(start, length)
            if partition.filter_mask is not None:
                window_values = window_values.filter(partition.filter_mask.slice(start, length))
            partial = pc.sum(window_values)
            if not partial.is_valid:
                continue
            running += partial.as_py()
            saw_value = True
        if not saw_value:
            return None
        return running
|