vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,192 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Streaming-partitioned aggregate fixtures.
4
+
5
+ Exercise the ``streaming_partitioned`` opt-in: ``streaming_open`` /
6
+ ``streaming_chunk`` / ``streaming_close``. These are routed through the
7
+ VGI DuckDB extension's custom streaming operator, which pipes input
8
+ chunks straight to the worker without materialising the partition on
9
+ the DuckDB side. State is bounded by partitions × per-partition state,
10
+ not by row count — the structural answer to "running aggregate over
11
+ unbounded ordered input."
12
+
13
+ These fixtures are reference implementations for the protocol. Real
14
+ production aggregates (e.g. ``portfolio_agg``) follow the same shape
15
+ but with domain-specific state and I/O optimisations (Decimal128 buffer
16
+ tricks, etc.).
17
+
18
+ When the optimizer rule rejects a query (non-cumulative frame, EXCLUDE
19
+ clause, DISTINCT/FILTER, etc.) DuckDB falls back to the standard
20
+ windowed path — so each fixture class here also implements
21
+ update/combine/finalize for plain GROUP BY usage and (optionally) the
22
+ windowed callbacks. The streaming methods are additive.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import Annotated, Any
28
+
29
+ import pyarrow as pa
30
+
31
+ from vgi._test_fixtures.aggregate._common import SumState
32
+ from vgi.aggregate_function import AggregateFunction
33
+ from vgi.arguments import Param, Returns
34
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
35
+ from vgi.table_function import ProcessParams
36
+
37
+
38
class StreamingSumFunction(AggregateFunction[SumState]):
    """Reference fixture: running sum over the streaming-partitioned protocol.

    For each input row the streaming path emits the cumulative sum of the
    value column within that row's partition, in arrival order.

    The class also provides ``update`` / ``combine`` / ``finalize`` so the
    same SQL function can be used as a plain aggregate::

        -- streaming-partitioned (one running value per input row):
        SELECT k, v, vgi_streaming_sum(v) OVER (PARTITION BY k ORDER BY ts)
        FROM trades;

        -- group-by (one final value per group):
        SELECT k, vgi_streaming_sum(v) FROM trades GROUP BY k;

    Session state is a plain dict created by ``streaming_open`` and mutated
    in place by ``streaming_chunk``; this class does no persistence of its
    own and relies on the framework to carry that dict between chunks.
    """

    class Meta:
        name = "vgi_streaming_sum"
        description = (
            "Running sum across PARTITION BY keys via the streaming-partitioned "
            "protocol. Each input row emits the cumulative sum at its position."
        )
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = False
        # Opt-in flag for the streaming-partitioned operator. Query shapes
        # the optimizer rule does not accept (sliding frames, EXCLUDE,
        # DISTINCT, FILTER) are expected to take the standard path instead.
        streaming_partitioned = True

    # ------------------------------------------------------------------
    # Plain aggregate (GROUP BY) callbacks.
    # ------------------------------------------------------------------

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a fresh accumulator for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Fold one batch of values into ``states``, keyed by group id.

        The batch is pre-aggregated per group with Arrow so each group's
        state is replaced at most once. Groups whose batch sum is NULL are
        left untouched.
        """
        batch = pa.table({"gid": group_ids, "value": value})
        per_group = batch.group_by("gid").aggregate([("value", "sum")])
        gids = per_group.column("gid").to_pylist()
        partial_sums = per_group.column("value_sum").to_pylist()
        for gid, partial in zip(gids, partial_sums):
            if partial is None:
                continue
            states[gid] = SumState(total=states[gid].total + partial)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial accumulators into a new one."""
        merged_total = source.total + target.total
        return SumState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Build the single-column ``result`` batch, one row per group id.

        A group id with no entry in ``states`` yields NULL.
        """
        totals: list[int | None] = []
        for scalar in group_ids:
            state = states.get(scalar.as_py())
            totals.append(None if state is None else state.total)
        return pa.record_batch({"result": pa.array(totals, type=pa.int64())})

    # ------------------------------------------------------------------
    # Streaming-partitioned callbacks: open / chunk / close.
    # ------------------------------------------------------------------

    @classmethod
    def streaming_open(cls, params: ProcessParams[None]) -> dict[str, Any]:
        """Create the session state handed back to every later callback.

        The only entry maps a partition key to that partition's running
        integer sum.
        """
        session: dict[str, Any] = {"partition_states": {}}
        return session

    @classmethod
    def streaming_chunk(
        cls,
        chunk: pa.RecordBatch,
        streaming_state: dict[str, Any],
        partition_key_count: int,
        order_key_count: int,
        params: ProcessParams[None],
    ) -> pa.Array[Any]:
        """Return the running sum for every row of ``chunk``.

        Columns are read positionally as
        ``[partition keys..., order keys..., value...]``; only the first
        value column is used and the order-key columns are never read, so
        rows are accumulated in the order they appear. A NULL value leaves
        the partition's running sum unchanged but the row still emits the
        current sum (0 if the partition has no stored sum yet).
        """
        row_count = chunk.num_rows
        key_columns = [chunk.column(idx).to_pylist() for idx in range(partition_key_count)]
        values = chunk.column(partition_key_count + order_key_count).to_pylist()
        running_by_key: dict[Any, int] = streaming_state["partition_states"]

        # Decide the key shape once per chunk: the bare value for a single
        # key column, a tuple for several, and the empty tuple when there
        # are no partition keys.
        if partition_key_count == 1:
            keys: list[Any] = key_columns[0]
        elif key_columns:
            keys = list(zip(*key_columns))
        else:
            keys = [()] * row_count

        cumulative: list[int] = []
        for key, delta in zip(keys, values):
            current = running_by_key.get(key, 0)
            if delta is not None:
                current += delta
                running_by_key[key] = current
            cumulative.append(current)

        return pa.array(cumulative, type=pa.int64())

    @classmethod
    def streaming_close(
        cls,
        streaming_state: dict[str, Any],
        params: ProcessParams[None],
    ) -> None:
        """Cleanup hook; this fixture holds no resources, so it does nothing."""
        return None
@@ -0,0 +1,75 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """SumAllFunction — varargs aggregate (sums any number of numeric columns)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Annotated
9
+
10
+ import pyarrow as pa
11
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
12
+
13
+ from vgi.aggregate_function import AggregateFunction
14
+ from vgi.arguments import Param, Returns
15
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
16
+ from vgi.table_function import ProcessParams
17
+
18
+
19
@dataclass(kw_only=True)
class SumAllState(ArrowSerializableDataclass):
    """Per-group accumulator used by ``SumAllFunction``."""

    # Running total for the group; starts at 0.0 and is annotated with an
    # Arrow float64 type.
    total: Annotated[float, ArrowType(pa.float64())] = 0.0
22
+
23
+
24
class SumAllFunction(AggregateFunction[SumAllState]):
    """Varargs aggregate fixture: one grand total over several columns.

    Every non-NULL cell of every argument column is converted to ``float``
    and added to the group's total.
    SQL: ``SELECT vgi_sum_all(a, b, c) FROM t GROUP BY category``
    """

    class Meta:
        name = "vgi_sum_all"
        description = "Sum all numeric columns"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumAllState:
        """Return a fresh accumulator for a new group."""
        return SumAllState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumAllState],
        group_ids: pa.Int64Array,
        columns: Annotated[pa.Array, Param(doc="Numeric columns to sum", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Add each row's cross-column sum to that row's group state.

        NULL cells contribute nothing. A row whose cells are all NULL still
        rewrites its group's state, with an unchanged total.
        """
        gids = group_ids.to_pylist()
        cells_by_column = [col.to_pylist() for col in columns]
        # Rows are addressed by index (not zipped) so a call with no
        # argument columns still visits every row.
        for row, gid in enumerate(gids):
            row_total = 0.0
            for cells in cells_by_column:
                cell = cells[row]
                if cell is None:
                    continue
                row_total += float(cell)
            states[gid] = SumAllState(total=states[gid].total + row_total)

    @classmethod
    def combine(cls, source: SumAllState, target: SumAllState, params: ProcessParams[None]) -> SumAllState:
        """Merge two partial accumulators into a new one."""
        merged_total = source.total + target.total
        return SumAllState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumAllState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Build the single-column ``result`` batch, one row per group id.

        Each group id is looked up by subscript, so an id missing from a
        plain dict raises ``KeyError``; a stored ``None`` yields NULL.
        """
        totals: list[float | None] = []
        for scalar in group_ids:
            state = states[scalar.as_py()]
            totals.append(state.total if state is not None else None)
        return pa.record_batch({"result": pa.array(totals, type=pa.float64())})
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # DynamicAggregateFunction — aggregate behavior defined by Python code string
75
+ # ---------------------------------------------------------------------------
@@ -0,0 +1,380 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Windowed aggregate fixtures (window_sum, window_median, window_listagg)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Annotated, Any
9
+
10
+ import pyarrow as pa
11
+ from vgi_rpc import ArrowSerializableDataclass
12
+
13
+ from vgi._test_fixtures.aggregate._common import ListAggState, SumState
14
+ from vgi.aggregate_function import AggregateFunction, WindowPartition
15
+ from vgi.arguments import Param, Returns
16
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
17
+ from vgi.table_function import ProcessParams
18
+
19
+
20
@dataclass(kw_only=True)
class _EmptyWindowState(ArrowSerializableDataclass):
    """Field-less state for aggregates that keep nothing per group."""
25
+
26
+
27
class WindowSumFunction(AggregateFunction[SumState]):
    """Windowed sum fixture built on the per-row ``window()`` callback.

    ``update`` / ``combine`` / ``finalize`` are implemented as well, so the
    function also works in plain ``GROUP BY`` contexts (DuckDB picks the
    window path automatically via ``WindowCustomAggregator::CanAggregate``).

    SQL::

        SELECT x, vgi_window_sum(x) OVER (ORDER BY x ROWS BETWEEN 2 PRECEDING AND CURRENT ROW)
        FROM generate_series(1, 10) t(x);
    """

    class Meta:
        name = "vgi_window_sum"
        description = "Windowed sum that uses the per-partition window() callback"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a fresh accumulator for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Fold one batch of values into ``states``, keyed by group id.

        The batch is pre-aggregated per group with Arrow; groups whose
        batch sum is NULL are left untouched.
        """
        batch = pa.table({"gid": group_ids, "value": value})
        per_group = batch.group_by("gid").aggregate([("value", "sum")])
        gids = per_group.column("gid").to_pylist()
        partial_sums = per_group.column("value_sum").to_pylist()
        for gid, partial in zip(gids, partial_sums):
            if partial is None:
                continue
            states[gid] = SumState(total=states[gid].total + partial)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial accumulators into a new one."""
        merged_total = source.total + target.total
        return SumState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Build the single-column ``result`` batch, one row per group id.

        Each group id is looked up by subscript; a stored ``None`` yields
        NULL.
        """
        totals: list[int | None] = []
        for scalar in group_ids:
            state = states[scalar.as_py()]
            totals.append(state.total if state is not None else None)
        return pa.record_batch({"result": pa.array(totals, type=pa.int64())})

    # --- Window path ---

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> int | None:
        """Sum the first input column over the half-open ``subframes``.

        Empty or inverted ranges are skipped. When the partition has a
        filter mask, the matching mask slice is applied before summing.
        Returns ``None`` when no subframe produced a non-NULL sum.
        """
        import pyarrow.compute as pc

        column = partition.inputs.column(0)
        running = 0
        saw_value = False
        for begin, end in subframes:
            length = end - begin
            if length <= 0:
                continue
            frame = column.slice(begin, length)
            mask = partition.filter_mask
            if mask is not None:
                frame = frame.filter(mask.slice(begin, length))
            frame_sum = pc.sum(frame)
            if not frame_sum.is_valid:
                continue
            running += frame_sum.as_py()
            saw_value = True
        if saw_value:
            return running
        return None
109
+
110
+
111
class WindowMedianFunction(AggregateFunction[_EmptyWindowState]):
    """Windowed median fixture — a non-incremental aggregate.

    The median is computed only in ``window()``. The GROUP BY callbacks are
    placeholders so the function can be called outside an OVER clause:
    ``update`` ignores its input and ``finalize`` returns NULL for every
    group.

    SQL::

        SELECT x, vgi_window_median(x) OVER (ORDER BY x ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING)
        FROM generate_series(1, 20) t(x);
    """

    class Meta:
        name = "vgi_window_median"
        description = "Windowed median (window() callback demonstrates non-incremental aggregates)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> _EmptyWindowState:
        """Return the (field-less) per-group state."""
        return _EmptyWindowState()

    @classmethod
    def update(
        cls,
        states: dict[int, _EmptyWindowState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.DoubleArray, Param(doc="Column to compute median of")],
    ) -> None:
        """Do nothing: the GROUP BY path does not accumulate values."""
        return None

    @classmethod
    def combine(
        cls, source: _EmptyWindowState, target: _EmptyWindowState, params: ProcessParams[None]
    ) -> _EmptyWindowState:
        """Return ``target`` unchanged; there is nothing to merge."""
        return target

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, _EmptyWindowState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Return a ``result`` column of NULLs, one per group id."""
        nulls = [None for _ in range(len(group_ids))]
        return pa.record_batch({"result": pa.array(nulls, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Return the median of the first input column over ``subframes``.

        Empty or inverted ranges are skipped, the partition's filter mask
        (if any) is applied, and NULLs are dropped. With an even number of
        values the two middle values are averaged. Returns ``None`` when no
        values remain.
        """
        column = partition.inputs.column(0)
        collected: list[float] = []
        for begin, end in subframes:
            length = end - begin
            if length <= 0:
                continue
            frame = column.slice(begin, length)
            mask = partition.filter_mask
            if mask is not None:
                frame = frame.filter(mask.slice(begin, length))
            collected.extend(float(item) for item in frame.to_pylist() if item is not None)

        if not collected:
            return None

        ordered = sorted(collected)
        half, is_odd = divmod(len(ordered), 2)
        if is_odd:
            return ordered[half]
        return (ordered[half - 1] + ordered[half]) / 2.0
193
+
194
+
195
class WindowListAggFunction(AggregateFunction[ListAggState]):
    """Windowed, order-dependent string concatenation fixture.

    For ``vgi_window_listagg(s) OVER (ORDER BY x ...)`` DuckDB picks the
    ``window()`` callback (arg_orders is empty; frame ordering comes from
    the OVER clause).

    For ``vgi_window_listagg(s ORDER BY x) OVER (...)`` DuckDB's
    ``WindowCustomAggregator::CanAggregate`` rejects the window path
    because ``wexpr.arg_orders`` is non-empty, and falls back to
    update/combine/finalize. The result is still correct — just slower.
    """

    class Meta:
        name = "vgi_window_listagg"
        description = "Windowed string concat (ORDER_DEPENDENT; tests fallback handoff)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> ListAggState:
        """Return a fresh accumulator for a new group."""
        return ListAggState()

    @classmethod
    def update(
        cls,
        states: dict[int, ListAggState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.StringArray, Param(doc="String column")],
    ) -> None:
        """Append each non-NULL string to its group's comma-joined text.

        A group whose accumulated text is empty is overwritten by the next
        value rather than extended.
        """
        for row in range(len(group_ids)):
            gid: int = group_ids[row].as_py()
            text = value[row].as_py()
            if text is None:
                continue
            existing = states[gid].values
            joined = ",".join((existing, text)) if existing else text
            states[gid] = ListAggState(values=joined)

    @classmethod
    def combine(cls, source: ListAggState, target: ListAggState, params: ProcessParams[None]) -> ListAggState:
        """Merge two partial states, placing ``target``'s text first.

        When either side is empty the other side's text is used as is.
        """
        first, second = target.values, source.values
        if second and first:
            merged = ",".join((first, second))
        else:
            merged = first or second
        return ListAggState(values=merged)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, ListAggState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.string())]:
        """Build the single-column ``result`` batch, one row per group id.

        Empty accumulated text (or a stored ``None`` state) yields NULL.
        """
        texts: list[str | None] = []
        for scalar in group_ids:
            state = states[scalar.as_py()]
            if state is None:
                texts.append(None)
            else:
                texts.append(state.values or None)
        return pa.record_batch({"result": pa.array(texts, type=pa.string())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> str | None:
        """Comma-join the first input column's values over ``subframes``.

        Empty or inverted ranges are skipped, the partition's filter mask
        (if any) is applied, and NULLs are dropped. Returns ``None`` when
        nothing is left to join.
        """
        column = partition.inputs.column(0)
        pieces: list[str] = []
        for begin, end in subframes:
            length = end - begin
            if length <= 0:
                continue
            frame = column.slice(begin, length)
            mask = partition.filter_mask
            if mask is not None:
                frame = frame.filter(mask.slice(begin, length))
            pieces.extend(item for item in frame.to_pylist() if item is not None)
        if not pieces:
            return None
        return ",".join(pieces)
275
+
276
+
277
class WindowSumBatchFunction(AggregateFunction[SumState]):
    """Windowed sum fixture whose ``window_batch`` returns a ``pa.Array``.

    Functionally equivalent to :class:`WindowSumFunction`. The point of this
    fixture is to exercise the framework's polymorphic batch return: when
    user code returns a ``pa.Array``, the worker should ship it directly
    without round-tripping through ``pa.array(list, type=...)``.

    Used by the unit tests for ``window_batch`` to confirm the dispatcher
    accepts both a list and a pa.Array, and that the pa.Array path
    produces identical answers.
    """

    class Meta:
        name = "vgi_window_sum_batch"
        description = "Windowed sum demonstrating window_batch returning pa.Array"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> SumState:
        """Return a fresh accumulator for a new group."""
        return SumState()

    @classmethod
    def update(
        cls,
        states: dict[int, SumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Int64Array, Param(doc="Column to sum")],
    ) -> None:
        """Fold one batch of values into ``states``, keyed by group id.

        The batch is pre-aggregated per group with Arrow; groups whose
        batch sum is NULL are left untouched.
        """
        batch = pa.table({"gid": group_ids, "value": value})
        per_group = batch.group_by("gid").aggregate([("value", "sum")])
        gids = per_group.column("gid").to_pylist()
        partial_sums = per_group.column("value_sum").to_pylist()
        for gid, partial in zip(gids, partial_sums):
            if partial is None:
                continue
            states[gid] = SumState(total=states[gid].total + partial)

    @classmethod
    def combine(cls, source: SumState, target: SumState, params: ProcessParams[None]) -> SumState:
        """Merge two partial accumulators into a new one."""
        merged_total = source.total + target.total
        return SumState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, SumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.int64())]:
        """Build the single-column ``result`` batch, one row per group id.

        Each group id is looked up by subscript; a stored ``None`` yields
        NULL.
        """
        totals: list[int | None] = []
        for scalar in group_ids:
            state = states[scalar.as_py()]
            totals.append(state.total if state is not None else None)
        return pa.record_batch({"result": pa.array(totals, type=pa.int64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> int | None:
        """Single-row entry point; delegates to ``_sum_one``.

        Kept so plain ``window()`` invocations still work alongside
        ``window_batch``.
        """
        return cls._sum_one(subframes, partition)

    @classmethod
    def window_batch(
        cls,
        row_ids: list[int],
        subframes: list[list[tuple[int, int]]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> pa.Array[Any]:
        """Compute one sum per entry of ``subframes`` as an int64 array.

        ``row_ids`` is not consulted; the output order follows
        ``subframes``.
        """
        sums: list[int | None] = []
        for frames in subframes:
            sums.append(cls._sum_one(frames, partition))
        return pa.array(sums, type=pa.int64())

    @staticmethod
    def _sum_one(
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
    ) -> int | None:
        """Sum the first input column over the half-open ``subframes``.

        Empty or inverted ranges are skipped and the partition's filter
        mask (if any) is applied. Returns ``None`` when no subframe
        produced a non-NULL sum.
        """
        import pyarrow.compute as pc

        column = partition.inputs.column(0)
        running = 0
        saw_value = False
        for begin, end in subframes:
            length = end - begin
            if length <= 0:
                continue
            frame = column.slice(begin, length)
            mask = partition.filter_mask
            if mask is not None:
                frame = frame.filter(mask.slice(begin, length))
            frame_sum = pc.sum(frame)
            if not frame_sum.is_valid:
                continue
            running += frame_sum.as_py()
            saw_value = True
        if saw_value:
            return running
        return None