vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,409 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Dynamic-code aggregate fixtures (DynamicAggregate, DynamicMLAggregate)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import Annotated, Any
9
+
10
+ import numpy as np
11
+ import pyarrow as pa
12
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
13
+
14
+ from vgi.aggregate_function import AggregateFunction, WindowPartition
15
+ from vgi.arguments import Param, Returns
16
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
17
+ from vgi.table_function import ProcessParams
18
+
19
+
20
@dataclass(kw_only=True)
class DynamicState(ArrowSerializableDataclass):
    """Per-group state shared by the dynamic aggregate fixtures."""

    # Accumulated input rows as Arrow IPC stream bytes; empty until rows arrive.
    state_bytes: Annotated[bytes, ArrowType(pa.binary())] = b""
    # Source text of the snippet that defines the user's ``Aggregate`` class.
    code: Annotated[str, ArrowType(pa.string())] = ""
    # Extra name -> number parameters carried alongside the rows.
    params: Annotated[
        dict[str, float],
        ArrowType(pa.map_(pa.string(), pa.float64())),
    ] = field(default_factory=dict)
25
+
26
+
27
def _serialize_table(table: pa.Table) -> bytes:
    """Encode ``table`` as an Arrow IPC stream and return the raw bytes."""
    buffer_stream = pa.BufferOutputStream()
    ipc_writer = pa.ipc.new_stream(buffer_stream, table.schema)
    # Leaving the ``with`` block closes the writer before the sink is read.
    with ipc_writer:
        for record_batch in table.to_batches():
            ipc_writer.write_batch(record_batch)
    encoded = buffer_stream.getvalue()
    return encoded.to_pybytes()
34
+
35
+
36
def _deserialize_table(data: bytes) -> pa.Table:
    """Decode Arrow IPC stream bytes into a Table containing every batch."""
    stream_reader = pa.ipc.open_stream(data)
    return stream_reader.read_all()
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Aggregate functions
43
+ # ---------------------------------------------------------------------------
44
+
45
+
46
# Names pre-bound in the globals of every exec'd dynamic aggregate snippet, so
# the snippet can use them without importing anything itself.
_DYNAMIC_EXEC_NAMESPACE: dict[str, Any] = {
    # dataclass / typing helpers
    "dataclass": dataclass,
    "field": field,
    "Annotated": Annotated,
    # numeric / columnar libraries
    "pa": pa,
    "np": np,
    # Arrow-serializable state helpers
    "ArrowSerializableDataclass": ArrowSerializableDataclass,
    "ArrowType": ArrowType,
}

# Already exec'd and validated ``Aggregate`` classes, keyed by exact source text.
_dynamic_class_cache: dict[str, Any] = {}
57
+
58
+
59
def _get_aggregate_class(code: str) -> Any:
    """Return the ``Aggregate`` class defined by ``code``, exec'ing it on first use.

    The source is run with ``exec`` in a copy of ``_DYNAMIC_EXEC_NAMESPACE``,
    so it executes with the privileges of this process. A class that passes
    validation is memoised in ``_dynamic_class_cache`` under the exact source
    text; source that fails validation is not cached.

    Raises:
        ValueError: If the code does not define ``Aggregate``, or that object
            has no ``finalize`` attribute.
    """
    try:
        return _dynamic_class_cache[code]
    except KeyError:
        pass  # First time this source is seen: build it below.

    exec_globals: dict[str, Any] = {**_DYNAMIC_EXEC_NAMESPACE}
    # dont_inherit=True keeps this module's `from __future__ import annotations`
    # from turning the snippet's annotations into strings.
    code_obj = compile(code, "<dynamic_aggregate>", "exec", dont_inherit=True)
    exec(code_obj, exec_globals)  # noqa: S102

    if "Aggregate" not in exec_globals:
        raise ValueError("Dynamic aggregate code must define a class named 'Aggregate'")
    agg_cls = exec_globals["Aggregate"]

    required_methods = ("finalize",)
    missing = [method for method in required_methods if not hasattr(agg_cls, method)]
    if missing:
        method = missing[0]
        raise ValueError(f"Aggregate class must define a '{method}' method")

    _dynamic_class_cache[code] = agg_cls
    return agg_cls
75
+
76
+
77
def _pack_dynamic_state(
    dynamic_state: ArrowSerializableDataclass,
    code: str = "",
    params: dict[str, float] | None = None,
) -> DynamicState:
    """Wrap a serializable state object in a ``DynamicState`` envelope.

    The state is stored via its ``serialize_to_bytes()``; a missing or empty
    ``params`` is stored as an empty dict.
    """
    payload = dynamic_state.serialize_to_bytes()
    resolved_params = params if params else {}
    return DynamicState(state_bytes=payload, code=code, params=resolved_params)
87
+
88
+
89
def _unpack_dynamic_state(
    wrapper: DynamicState,
    state_cls: type[ArrowSerializableDataclass],
) -> ArrowSerializableDataclass:
    """Rebuild a ``state_cls`` instance from the bytes held in ``wrapper``."""
    raw_state = wrapper.state_bytes
    return state_cls.deserialize_from_bytes(raw_state)
93
+
94
+
95
class _DynamicAggregateBase(AggregateFunction[DynamicState]):
    """Shared logic for the dynamic aggregate fixtures.

    Incoming rows are accumulated per group as an Arrow table (data columns
    renamed ``c0, c1, ...``; rows with a NULL in any data column dropped) and
    kept in ``DynamicState.state_bytes`` as Arrow IPC stream bytes. This base
    class only validates the user's code; subclasses hand the accumulated
    table to the user's ``finalize`` (or ``window``).

    For the ML variant the user's function additionally receives the params
    dict.
    """

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> DynamicState:
        """Return an empty state (no rows, no code, no params)."""
        return DynamicState()

    @staticmethod
    def _normalize_params(raw_params: Any) -> dict[str, float]:
        """Coerce a decoded MAP value into ``dict[str, float]``.

        Accepts a list of ``(key, value)`` pairs or a dict; anything else
        (including ``None``) yields an empty dict.
        """
        if isinstance(raw_params, list):
            return {str(k): float(v) for k, v in raw_params}
        if isinstance(raw_params, dict):
            return {str(k): float(v) for k, v in raw_params.items()}
        return {}

    @staticmethod
    def _value_for_group(column: Any, in_group: Any, row_count: int) -> Any:
        """Return the Python value of ``column`` at the first row of a group.

        ``in_group`` is a boolean mask over the batch rows. If ``column`` is
        not row-aligned with the batch, fall back to its first element.
        """
        if len(column) != row_count:
            return column[0].as_py()
        return column.filter(in_group)[0].as_py()

    @classmethod
    def _do_update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code_col: pa.StringArray,
        columns: list[pa.Array[Any]],
        params_col: pa.Array[Any] | None = None,
    ) -> None:
        """Append this batch's non-NULL rows to each group's accumulated table.

        Args:
            states: Per-group states, keyed by group id; updated in place.
            group_ids: Group id of every row in the batch.
            code_col: Per-row source text of the user's ``Aggregate`` class.
            columns: Data columns; stored as ``c0, c1, ...``.
            params_col: Optional per-row MAP of parameters (ML variant).

        Raises:
            ValueError: If a group's code fails ``_get_aggregate_class``
                validation.
        """
        row_count = len(group_ids)
        if row_count == 0:
            # Nothing to accumulate, and no row to read code/params from.
            return

        col_names = [f"c{i}" for i in range(len(columns))]
        incoming = pa.table(dict(zip(col_names, columns)))

        # Row-level validity: a row is kept only if every data column is non-NULL.
        valid_rows: pa.BooleanArray | None = None
        for col in columns:
            col_valid = col.is_valid()
            valid_rows = col_valid if valid_rows is None else pa.compute.and_(valid_rows, col_valid)

        # Group by group_id and dispatch. For window aggregates there's
        # typically one group, so this is just one iteration.
        for gid_scalar in group_ids.unique():
            gid: int = gid_scalar.as_py()
            wrapper = states[gid]
            in_group = pa.compute.equal(group_ids, gid_scalar)

            # Read code/params from this group's own rows: a batch can mix
            # groups that carry different code or params.
            code: str = cls._value_for_group(code_col, in_group, row_count)
            _get_aggregate_class(code)  # validate + cache the code early
            raw_params = cls._value_for_group(params_col, in_group, row_count) if params_col is not None else None
            params = cls._normalize_params(raw_params)

            # Apply the group mask and the NULL mask together against the
            # unfiltered table, so both masks have the table's length.
            keep = in_group if valid_rows is None else pa.compute.and_(in_group, valid_rows)
            group_table = incoming.filter(keep)
            if group_table.num_rows == 0:
                continue

            # Accumulate: concat with existing state data.
            if wrapper.state_bytes:
                combined = pa.concat_tables([_deserialize_table(wrapper.state_bytes), group_table])
            else:
                combined = group_table

            states[gid] = DynamicState(
                state_bytes=_serialize_table(combined),
                code=code,
                params=params,
            )

    @classmethod
    def combine(cls, source: DynamicState, target: DynamicState, params: ProcessParams[None]) -> DynamicState:
        """Merge two partial states, keeping target rows before source rows.

        ``code`` and ``params`` are taken from ``target`` when set, otherwise
        from ``source``. If neither side has code, ``target`` is returned
        unchanged.
        """
        code = target.code or source.code
        if not code:
            return target
        merged_params = target.params or source.params

        if source.state_bytes and target.state_bytes:
            merged = pa.concat_tables(
                [_deserialize_table(target.state_bytes), _deserialize_table(source.state_bytes)]
            )
            state_bytes = _serialize_table(merged)
        else:
            # At most one side holds rows: reuse its bytes as-is instead of
            # relying on the truthiness of a deserialized table.
            state_bytes = target.state_bytes or source.state_bytes

        return DynamicState(
            state_bytes=state_bytes,
            code=code,
            params=merged_params,
        )

    # ------------------------------------------------------------------
    # Windowed path
    # ------------------------------------------------------------------
    # Shared logic for both vgi_dynamic_agg and vgi_dynamic_ml_agg.
    # Each subclass overrides window() directly — the shared helper below just
    # slices all partition columns to the current frame with filter_mask and
    # NULL-drop applied. Reading code/params from the sliced frame (rather
    # than partition.inputs.column(X)[0]) avoids aliasing across partitions
    # when DuckDB batches many partitions into shared buffers.

    @staticmethod
    def _slice_to_frame(
        partition: WindowPartition,
        subframes: list[tuple[int, int]],
        data_start: int,
    ) -> pa.Table:
        """Slice all partition columns to the frame rows.

        Args:
            partition: Window partition providing ``inputs`` and an optional
                ``filter_mask``.
            subframes: ``(begin, end)`` row ranges with ``end`` exclusive;
                ranges where ``end <= begin`` are skipped.
            data_start: Index where data columns begin (header columns are
                ``[0 .. data_start)``). NULL-drop is applied on data columns
                only — matches the filtering ``_do_update`` performs in the
                non-window path.

        Returns:
            A table with columns ``c0, c1, ...`` covering every input column,
            holding the surviving frame rows; zero rows if nothing survives.
        """
        num_cols = partition.inputs.num_columns
        cols = [partition.inputs.column(i) for i in range(num_cols)]
        col_names = [f"c{i}" for i in range(num_cols)]
        slices: list[pa.Table] = []
        for begin, end in subframes:
            if end <= begin:
                continue
            length = end - begin
            t = pa.table({name: col.slice(begin, length) for name, col in zip(col_names, cols)})
            if partition.filter_mask is not None:
                t = t.filter(partition.filter_mask.slice(begin, length))
            # The NULL mask is built after the filter above, so it matches t's length.
            null_mask: pa.ChunkedArray[pa.BooleanScalar] | pa.BooleanArray | None = None
            for col in t.columns[data_start:]:
                valid = col.is_valid()
                null_mask = valid if null_mask is None else pa.compute.and_(null_mask, valid)
            if null_mask is not None:
                t = t.filter(null_mask)
            slices.append(t)
        if not slices:
            # Preserve the column types so callers still get a well-typed empty table.
            return pa.table({c: pa.array([], type=cols[i].type) for i, c in enumerate(col_names)})
        return pa.concat_tables(slices)

    @staticmethod
    def _data_table_from(frame: pa.Table, data_start: int) -> pa.Table:
        """Rebuild a 0-indexed ``c0, c1, …`` data-only table for user code."""
        data_cols = frame.columns[data_start:]
        return pa.table({f"c{i}": col for i, col in enumerate(data_cols)})

    @staticmethod
    def _call_user(agg_cls: Any, data_table: pa.Table, user_params: dict[str, float] | None) -> Any:
        """Prefer the user's ``window()``; fall back to ``finalize()``.

        ``user_params`` is passed as a second positional argument only when it
        is not ``None``.
        """
        fn = getattr(agg_cls, "window", None) or agg_cls.finalize
        if user_params is None:
            return fn(data_table)
        return fn(data_table, user_params)
245
+
246
+
247
class DynamicAggregateFunction(_DynamicAggregateBase):
    """Dynamic aggregate — behavior defined by a Python code string.

    ``vgi_dynamic_agg(code, col1, col2, ...)``

    The code and columns are regular parameters (not constants), so the code
    can come from a table lookup, subquery, or variable.

    The exec namespace pre-provides: ``dataclass``, ``field``, ``Annotated``,
    ``pa``, ``np``, ``ArrowSerializableDataclass``, ``ArrowType``.
    """

    class Meta:
        name = "vgi_dynamic_agg"
        description = "Dynamic aggregate defined by Python code string"
        null_handling = NullHandling.DEFAULT
        # User code is free-form Python that may depend on input order (e.g. data[-1]
        # for "last row", slicing like data[:-1] / data[1:]). The framework can't
        # introspect what the user does, so conservatively assume order matters.
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code: Annotated[pa.StringArray, Param(doc="Python code defining Aggregate class")],
        columns: Annotated[list[pa.Array], Param(doc="Input columns", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Accumulate a batch of rows; delegates to the shared ``_do_update``."""
        cls._do_update(states, group_ids, code, columns)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, DynamicState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Run the user's ``Aggregate.finalize(table)`` once per group.

        A group with no code or no accumulated rows yields NULL; so does a
        user function that returns ``None``.
        """
        outputs: list[float | None] = []
        for gid_scalar in group_ids:
            state = states[gid_scalar.as_py()]
            if state is None or not state.code or not state.state_bytes:
                outputs.append(None)
                continue
            accumulated = _deserialize_table(state.state_bytes)
            user_cls = _get_aggregate_class(state.code)
            value = user_cls.finalize(accumulated)
            outputs.append(None if value is None else float(value))
        return pa.record_batch({"result": pa.array(outputs, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Evaluate the user's code over one window frame.

        Returns ``None`` when the frame has no surviving rows or the user
        function returns ``None``.
        """
        # Column layout: [code, col1, col2, ...]
        first_data_col = 1
        frame = cls._slice_to_frame(partition, subframes, data_start=first_data_col)
        if frame.num_rows == 0:
            return None
        # Read the code from the frame's own first row.
        source_text = frame.column(0)[0].as_py()
        data_table = cls._data_table_from(frame, data_start=first_data_col)
        user_cls = _get_aggregate_class(source_text)
        value = cls._call_user(user_cls, data_table, user_params=None)
        if value is None:
            return None
        return float(value)
317
+
318
+
319
class DynamicMLAggregateFunction(_DynamicAggregateBase):
    """Dynamic ML aggregate with params dict.

    ``vgi_dynamic_ml_agg(code, params, col1, col2, ...)``

    Like ``vgi_dynamic_agg`` but with a ``MAP(VARCHAR, DOUBLE)`` params
    column forwarded to ``Aggregate.finalize(state, params)`` so the
    dynamic code can access arbitrary parameters (seed, lookback, alpha, etc.).

    SQL::

        SELECT vgi_dynamic_ml_agg(
            code,
            MAP {'seed': 42, 'lb': 5, 'alpha': 1.0},
            col1, col2
        ) ...
    """

    class Meta:
        name = "vgi_dynamic_ml_agg"
        description = "Dynamic ML aggregate with params dict"
        null_handling = NullHandling.DEFAULT
        # User code is free-form Python that may depend on input order (e.g. data[-1]
        # for "last row", slicing like data[:-1] / data[1:]). The framework can't
        # introspect what the user does, so conservatively assume order matters.
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT
        supports_window = True

    @classmethod
    def update(
        cls,
        states: dict[int, DynamicState],
        group_ids: pa.Int64Array,
        code: Annotated[pa.StringArray, Param(doc="Python code defining Aggregate class")],
        params_col: Annotated[pa.Array, Param(doc="MAP(VARCHAR, DOUBLE) parameters")],  # type: ignore[type-arg]
        columns: Annotated[list[pa.Array], Param(doc="Input columns", varargs=True)],  # type: ignore[type-arg]
    ) -> None:
        """Accumulate a batch of rows and params; delegates to ``_do_update``."""
        cls._do_update(states, group_ids, code, columns, params_col=params_col)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, DynamicState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Run the user's ``Aggregate.finalize(table, params)`` once per group.

        A group with no code or no accumulated rows yields NULL; so does a
        user function that returns ``None``.
        """
        outputs: list[float | None] = []
        for gid_scalar in group_ids:
            state = states[gid_scalar.as_py()]
            if state is None or not state.code or not state.state_bytes:
                outputs.append(None)
                continue
            accumulated = _deserialize_table(state.state_bytes)
            user_cls = _get_aggregate_class(state.code)
            value = user_cls.finalize(accumulated, state.params)
            outputs.append(None if value is None else float(value))
        return pa.record_batch({"result": pa.array(outputs, type=pa.float64())})

    @classmethod
    def window(
        cls,
        rid: int,
        subframes: list[tuple[int, int]],
        partition: WindowPartition,
        window_state: Any,
        params: ProcessParams[None],
    ) -> float | None:
        """Evaluate the user's code, with its params, over one window frame.

        Returns ``None`` when the frame has no surviving rows or the user
        function returns ``None``.
        """
        # Column layout: [code, params_map, col1, col2, ...]
        first_data_col = 2
        frame = cls._slice_to_frame(partition, subframes, data_start=first_data_col)
        if frame.num_rows == 0:
            return None
        # Read code and params from the frame's own first row.
        source_text = frame.column(0)[0].as_py()
        raw_params = frame.column(1)[0].as_py()
        # The decoded MAP may be a list of (key, value) pairs or a dict;
        # anything else is treated as "no params".
        if isinstance(raw_params, dict):
            pairs: Any = raw_params.items()
        elif isinstance(raw_params, list):
            pairs = raw_params
        else:
            pairs = ()
        user_params: dict[str, float] = {str(k): float(v) for k, v in pairs}
        data_table = cls._data_table_from(frame, data_start=first_data_col)
        user_cls = _get_aggregate_class(source_text)
        value = cls._call_user(user_cls, data_table, user_params=user_params)
        if value is None:
            return None
        return float(value)
403
+
404
+
405
+ # ---------------------------------------------------------------------------
406
+ # Window-capable aggregates (Meta.supports_window = True)
407
+ # ---------------------------------------------------------------------------
408
+ # These demonstrate the window() callback which lets DuckDB ship the whole
409
+ # partition once and call the worker per output row with frame bounds.
@@ -0,0 +1,86 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """GenericSumFunction — any-type aggregate (uses on_bind to derive output type)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Annotated
9
+
10
+ import pyarrow as pa
11
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
12
+
13
+ from vgi.aggregate_function import AggregateBindParams, AggregateFunction
14
+ from vgi.arguments import Param, Returns
15
+ from vgi.invocation import BindResponse
16
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
17
+ from vgi.schema_utils import schema
18
+ from vgi.table_function import ProcessParams
19
+
20
+
21
@dataclass(kw_only=True)
class GenericSumState(ArrowSerializableDataclass):
    """Running total for one group of ``vgi_generic_sum``."""

    # Kept as float64 regardless of the input column's type.
    total: Annotated[float, ArrowType(pa.float64())] = 0.0
24
+
25
+
26
class GenericSumFunction(AggregateFunction[GenericSumState]):
    """Sum aggregate that accepts any numeric type and returns the same type.

    Demonstrates AnyArrow input with dynamic output type resolved in on_bind().
    SQL: ``SELECT vgi_generic_sum(value) FROM t``
    """

    class Meta:
        name = "vgi_generic_sum"
        description = "Sum any numeric type"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def on_bind(cls, params: AggregateBindParams, **kwargs: object) -> BindResponse:
        """Resolve output type from input type.

        Uses the type of the first input field, or float64 when no input
        schema is available.
        """
        input_schema = params.input_schema
        result_type = input_schema.field(0).type if input_schema else pa.float64()
        return BindResponse(output_schema=schema(result=result_type))

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> GenericSumState:
        """Return a state whose total is zero."""
        return GenericSumState()

    @classmethod
    def update(
        cls,
        states: dict[int, GenericSumState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Array, Param(doc="Numeric value to sum")],  # type: ignore[type-arg]
    ) -> None:
        """Add this batch's per-group sums to the running totals."""
        as_double = value.cast(pa.float64())
        batch = pa.table({"gid": group_ids, "value": as_double})
        per_group = batch.group_by("gid").aggregate([("value", "sum")])
        gids = per_group.column("gid").to_pylist()
        partial_sums = per_group.column("value_sum").to_pylist()
        for gid, partial in zip(gids, partial_sums):
            if partial is None:
                # No usable sum for this group in this batch; leave its state untouched.
                continue
            states[gid] = GenericSumState(total=states[gid].total + partial)

    @classmethod
    def combine(cls, source: GenericSumState, target: GenericSumState, params: ProcessParams[None]) -> GenericSumState:
        """Merge two partial states by adding their totals."""
        merged_total = source.total + target.total
        return GenericSumState(total=merged_total)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, GenericSumState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns()]:
        """Emit each group's total in the output type chosen at bind time."""
        # Output type determined by on_bind(), available via params.output_schema
        if params.output_schema:
            output_type = params.output_schema.field(0).type
        else:
            output_type = pa.float64()
        totals: list[float | None] = []
        for gid_scalar in group_ids:
            state = states[gid_scalar.as_py()]
            totals.append(None if state is None else state.total)
        return pa.record_batch({"result": pa.array(totals, type=output_type)})
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # SumAllFunction — demonstrates varargs aggregate (sums all numeric columns)
86
+ # ---------------------------------------------------------------------------
@@ -0,0 +1,71 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """ListAgg aggregate fixture (order-dependent string concatenation)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Annotated
8
+
9
+ import pyarrow as pa
10
+
11
+ from vgi._test_fixtures.aggregate._common import ListAggState
12
+ from vgi.aggregate_function import AggregateFunction
13
+ from vgi.arguments import Param, Returns
14
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
15
+ from vgi.table_function import ProcessParams
16
+
17
+
18
class ListAggFunction(AggregateFunction[ListAggState]):
    """List aggregate — order-dependent, concatenates strings with comma separator.

    SQL: ``SELECT vgi_listagg(name ORDER BY name) FROM t GROUP BY category``
    """

    class Meta:
        name = "vgi_listagg"
        description = "Concatenate strings with comma separator"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> ListAggState:
        """Return a fresh, default-constructed state."""
        return ListAggState()

    @classmethod
    def update(
        cls,
        states: dict[int, ListAggState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.StringArray, Param(doc="String column")],
    ) -> None:
        """Append each non-NULL string to its group's comma-separated text."""
        for row, gid_scalar in enumerate(group_ids):
            gid: int = gid_scalar.as_py()
            text = value[row].as_py()
            if text is None:
                continue
            current = states[gid]
            # A falsy ``values`` means nothing has been stored yet, so no
            # separator is needed.
            joined = current.values + "," + text if current.values else text
            states[gid] = ListAggState(values=joined)

    @classmethod
    def combine(cls, source: ListAggState, target: ListAggState, params: ProcessParams[None]) -> ListAggState:
        """Merge two partial states, keeping target's text before source's."""
        if not (source.values and target.values):
            return ListAggState(values=target.values or source.values)
        return ListAggState(values=target.values + "," + source.values)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, ListAggState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.string())]:
        """Emit each group's text, or NULL when nothing was stored."""
        outputs: list[str | None] = []
        for gid_scalar in group_ids:
            state = states[gid_scalar.as_py()]
            if state is None:
                outputs.append(None)
            else:
                outputs.append(state.values or None)
        return pa.record_batch({"result": pa.array(outputs, type=pa.string())})
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # PercentileFunction — demonstrates ConstParam on aggregate
71
+ # ---------------------------------------------------------------------------
@@ -0,0 +1,107 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Percentile aggregate fixture (sorted-quantile demo)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Annotated
9
+
10
+ import pyarrow as pa
11
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
12
+
13
+ from vgi.aggregate_function import AggregateFunction
14
+ from vgi.arguments import ConstParam, Param, Returns
15
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
16
+ from vgi.table_function import ProcessParams
17
+
18
+
19
@dataclass(kw_only=True)
class PercentileState(ArrowSerializableDataclass):
    """Values collected so far for one group of ``vgi_percentile``."""

    # Store values as comma-separated string (simple serialization)
    values_csv: Annotated[str, ArrowType(pa.string())] = ""
23
+
24
+
25
class PercentileFunction(AggregateFunction[PercentileState]):
    """Approximate percentile — demonstrates ConstParam on aggregate functions.

    SQL: ``SELECT vgi_percentile(value, 0.5) FROM t GROUP BY category``
    The percentile parameter (0.5) is constant-folded at bind time.
    """

    class Meta:
        name = "vgi_percentile"
        description = "Approximate percentile (demonstrates ConstParam)"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    @classmethod
    def initial_state(cls, params: ProcessParams[None]) -> PercentileState:
        """Return a state with no collected values."""
        return PercentileState()

    @classmethod
    def update(
        cls,
        states: dict[int, PercentileState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.DoubleArray, Param(doc="Values")],
        percentile: Annotated[float, ConstParam("Percentile (0-1)", phase="finalize")] = 0.5,
    ) -> None:
        """Append each non-NULL value to its group's comma-separated list."""
        # percentile is NOT injected here (phase="finalize") — only needed in finalize
        for row, gid_scalar in enumerate(group_ids):
            gid: int = gid_scalar.as_py()
            number = value[row].as_py()
            if number is None:
                continue
            current = states[gid]
            text = str(number)
            joined = current.values_csv + "," + text if current.values_csv else text
            states[gid] = PercentileState(values_csv=joined)

    @classmethod
    def combine(cls, source: PercentileState, target: PercentileState, params: ProcessParams[None]) -> PercentileState:
        """Merge two partial states, keeping target's values before source's."""
        if not (source.values_csv and target.values_csv):
            return PercentileState(values_csv=target.values_csv or source.values_csv)
        return PercentileState(values_csv=target.values_csv + "," + source.values_csv)

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, PercentileState],
        params: ProcessParams[None],
    ) -> Annotated[pa.RecordBatch, Returns(pa.float64())]:
        """Pick, per group, the sorted value at index ``int(pct * count)``.

        The index is clamped to the last element. Groups with no collected
        values yield NULL.

        Raises:
            ValueError: If the percentile constant is NULL, not a number, not
                finite, or outside ``[0, 1]``.
        """
        import math
        from decimal import Decimal

        # Access percentile via params.args (loaded from FunctionStorage)
        call_args = params.args
        if call_args and call_args.positional:
            raw_pct = call_args.positional[0].as_py()
        else:
            raw_pct = 0.5

        # Validate the percentile constant explicitly so callers see a clear
        # error instead of an opaque NumPy/builtin TypeError downstream.
        if raw_pct is None:
            raise ValueError("vgi_percentile: percentile must not be NULL")
        # Accept Python int/float and Decimal (DuckDB DECIMAL literals decode as Decimal).
        if not isinstance(raw_pct, (Decimal, int, float)):
            raise ValueError(f"vgi_percentile: percentile must be a number, got {type(raw_pct).__name__}")
        pct = float(raw_pct)
        if not math.isfinite(pct):
            raise ValueError(f"vgi_percentile: percentile must be a finite number, got {raw_pct!r}")
        if not 0.0 <= pct <= 1.0:
            raise ValueError(f"vgi_percentile: percentile must be in [0, 1], got {pct}")

        outputs: list[float | None] = []
        for gid_scalar in group_ids:
            state = states[gid_scalar.as_py()]
            if state is None or not state.values_csv:
                outputs.append(None)
                continue
            ordered = sorted(map(float, state.values_csv.split(",")))
            # pct == 1.0 would index one past the end, hence the clamp.
            position = min(int(pct * len(ordered)), len(ordered) - 1)
            outputs.append(ordered[position])
        return pa.record_batch({"result": pa.array(outputs, type=pa.float64())})
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # GenericSumFunction — demonstrates AnyArrow / dynamic output type
107
+ # ---------------------------------------------------------------------------