vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
# ruff: noqa: D102, D106
|
|
4
|
+
"""Aggregate that collects rows into a dense N-D tensor, plus its inverse.
|
|
5
|
+
|
|
6
|
+
Two functions that work as a pair:
|
|
7
|
+
|
|
8
|
+
- ``nest_tensor(value, {axis1: ..., axis2: ...})`` aggregate — collects rows
|
|
9
|
+
from a group into a struct ``{tensor, axes}`` where ``tensor`` is a dense
|
|
10
|
+
nested-list representation of the values keyed by the axis coordinates, and
|
|
11
|
+
``axes`` is a struct mirroring the input axes argument with each field
|
|
12
|
+
holding that axis's sorted, distinct coordinate values.
|
|
13
|
+
- ``unnest_tensor(t)`` scalar function — inverts the aggregate, returning a list with one
|
|
14
|
+
``{value, axes}`` struct per cell of the Cartesian product (including null-valued cells).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import itertools
|
|
20
|
+
import os
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Annotated, Any
|
|
23
|
+
|
|
24
|
+
import pyarrow as pa
|
|
25
|
+
from vgi_rpc import ArrowSerializableDataclass, ArrowType
|
|
26
|
+
from vgi_rpc.rpc import OutputCollector
|
|
27
|
+
|
|
28
|
+
from vgi.aggregate_function import AggregateBindParams, AggregateFunction
|
|
29
|
+
from vgi.arguments import Arg, Param, Returns, TableInput
|
|
30
|
+
from vgi.invocation import BindResponse
|
|
31
|
+
from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
|
|
32
|
+
from vgi.scalar_function import BindParameters, BindResult, ScalarFunction
|
|
33
|
+
from vgi.schema_utils import schema
|
|
34
|
+
from vgi.table_function import BindParams, ProcessParams
|
|
35
|
+
from vgi.table_in_out_function import TableInOutGenerator
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"NestTensorError",
|
|
39
|
+
"NestTensorFunction",
|
|
40
|
+
"UnnestTensorFunction",
|
|
41
|
+
"UnnestTensorRowsFunction",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Errors
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class NestTensorError(ValueError):
    """Error raised by the nest_tensor / unnest_tensor functions in this module.

    Subclasses ``ValueError`` so callers that already handle value errors
    also catch these.
    """
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Helpers
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
_DEFAULT_MAX_CELLS = 10_000_000
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _max_cells() -> int:
|
|
63
|
+
raw = os.environ.get("VGI_NEST_TENSOR_MAX_CELLS")
|
|
64
|
+
if raw is None:
|
|
65
|
+
return _DEFAULT_MAX_CELLS
|
|
66
|
+
try:
|
|
67
|
+
value = int(raw)
|
|
68
|
+
except ValueError as exc:
|
|
69
|
+
raise NestTensorError(f"VGI_NEST_TENSOR_MAX_CELLS must be an integer, got {raw!r}") from exc
|
|
70
|
+
if value <= 0:
|
|
71
|
+
raise NestTensorError("VGI_NEST_TENSOR_MAX_CELLS must be positive")
|
|
72
|
+
return value
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _validate_coord_type(name: str, arrow_type: pa.DataType) -> None:
    """Reject Arrow types that cannot serve as axis coordinates.

    Floating-point types are refused (NaN breaks equality/ordering), as are
    struct, list, large list, fixed-size list and map types. Every other
    type passes through silently.

    Args:
        name: Axis name, used only in the error message.
        arrow_type: Arrow type of that axis's coordinate values.

    Raises:
        NestTensorError: ``arrow_type`` is one of the rejected types.
    """
    if pa.types.is_floating(arrow_type):
        raise NestTensorError(
            f"nest_tensor: axis '{name}' has floating-point type {arrow_type}; "
            f"floats are not supported as coord types (NaN breaks equality)"
        )

    # Evaluated lazily in this order, mirroring an ``or`` chain.
    nested_predicates = (
        pa.types.is_struct,
        pa.types.is_list,
        pa.types.is_large_list,
        pa.types.is_fixed_size_list,
        pa.types.is_map,
    )
    if any(is_nested(arrow_type) for is_nested in nested_predicates):
        raise NestTensorError(
            f"nest_tensor: axis '{name}' has nested type {arrow_type}; only scalar coord types are supported"
        )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _serialize_table(table: pa.Table) -> bytes:
    """Encode ``table`` as Arrow IPC stream bytes (schema followed by its batches)."""
    buffer_stream = pa.BufferOutputStream()
    stream_writer = pa.ipc.new_stream(buffer_stream, table.schema)
    # Leaving the ``with`` block closes the writer before the buffer is read.
    with stream_writer:
        for record_batch in table.to_batches():
            stream_writer.write_batch(record_batch)
    payload = buffer_stream.getvalue()
    return payload.to_pybytes()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _deserialize_table(data: bytes) -> pa.Table:
    """Decode Arrow IPC stream bytes into a table, reading every batch."""
    stream_reader = pa.ipc.open_stream(data)
    return stream_reader.read_all()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _read_rows(state: NestTensorState) -> pa.Table | None:
    """Return the rows stored in ``state``, or ``None`` when ``rows_ipc`` is empty."""
    payload = state.rows_ipc
    return _deserialize_table(payload) if payload else None
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _make_nested_lists(shape: list[int], fill: Any = None) -> Any:
|
|
117
|
+
"""Build a nested Python list of the given shape, filled with ``fill``."""
|
|
118
|
+
if not shape:
|
|
119
|
+
return fill
|
|
120
|
+
head, *rest = shape
|
|
121
|
+
return [_make_nested_lists(rest, fill) for _ in range(head)]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _nested_list_type(inner: pa.DataType, depth: int) -> pa.DataType:
    """Wrap ``inner`` in ``depth`` levels of ``pa.list_``; depth 0 returns ``inner``."""
    wrapped = inner
    levels_left = depth
    while levels_left > 0:
        wrapped = pa.list_(wrapped)
        levels_left -= 1
    return wrapped
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _output_struct_type(value_type: pa.DataType, axes_type: pa.StructType) -> pa.StructType:
    """Build the ``{tensor, axes}`` struct type returned by nest_tensor.

    ``tensor`` nests ``value_type`` in one list level per axis; ``axes``
    keeps each input axis name and wraps its type in a list.
    """
    axis_list_fields = []
    for axis_field in axes_type:
        axis_list_fields.append(pa.field(axis_field.name, pa.list_(axis_field.type)))

    tensor_field_type = _nested_list_type(value_type, len(axes_type))
    return pa.struct([("tensor", tensor_field_type), ("axes", pa.struct(axis_list_fields))])
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# Aggregate state
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@dataclass(kw_only=True)
class NestTensorState(ArrowSerializableDataclass):
    """Per-group accumulator for ``nest_tensor``."""

    # Arrow IPC stream bytes for the rows gathered so far (a table with
    # ``value`` and ``axes`` columns); empty bytes mean no rows are stored.
    rows_ipc: Annotated[bytes, ArrowType(pa.binary())] = b""
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# ---------------------------------------------------------------------------
|
|
149
|
+
# NestTensorFunction
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class NestTensorFunction(AggregateFunction[NestTensorState]):
    """Collect rows into an N-D tensor plus per-axis coordinate lists.

    SQL::

        SELECT nest_tensor(value, {x: col_x, y: col_y}) FROM t GROUP BY g;

    Returns a struct ``{tensor, axes}`` where ``tensor`` is a nested
    ``list<list<...>>`` (one level per axis) and ``axes`` is a struct
    mirroring the input axes argument with each field holding that axis's
    sorted distinct coordinate values.
    """

    class Meta:
        name = "nest_tensor"
        description = "Collect rows into a dense N-D tensor plus per-axis coordinates"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    # ------------------------------------------------------------------ bind

    @classmethod
    def on_bind(cls, params: AggregateBindParams, **kwargs: object) -> BindResponse:
        """Validate the argument types and derive the output struct type.

        Raises:
            NestTensorError: fewer than two input columns, a non-struct or
                empty axes argument, or an unsupported axis coordinate type.
        """
        input_schema = params.input_schema
        if input_schema is None or len(input_schema) < 2:
            raise NestTensorError("nest_tensor: expected 2 arguments (value, axes struct)")
        value_type = input_schema.field(0).type
        axes_type = input_schema.field(1).type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"nest_tensor: second argument must be a struct, got {axes_type}")
        if len(axes_type) == 0:
            raise NestTensorError("nest_tensor: axes struct must have at least one field")
        for f in axes_type:
            _validate_coord_type(f.name, f.type)

        out = _output_struct_type(value_type, axes_type)
        return BindResponse(output_schema=schema(result=out))

    # -------------------------------------------------------------- lifecycle

    @classmethod
    def initial_state(cls, params: ProcessParams[Any]) -> NestTensorState:
        """Return an empty state (no accumulated rows)."""
        return NestTensorState()

    # ------------------------------------------------------------------ update

    @classmethod
    def update(
        cls,
        states: dict[int, NestTensorState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Array, Param(doc="Tensor cell value")],  # type: ignore[type-arg]
        axes: Annotated[pa.Array, Param(doc="Struct of axis coordinates")],  # type: ignore[type-arg]
    ) -> None:
        """Append this batch's rows to the state of each group they belong to.

        Rows with a null group id or a null axes struct are skipped. For
        every group touched, ``states[gid]`` is replaced by a new state whose
        ``rows_ipc`` holds the previously stored rows followed by the new ones.

        Raises:
            NestTensorError: ``axes`` is not a struct array, an axis
                coordinate is null, or the same coordinate appears twice for
                one group within this batch.
        """
        if not isinstance(axes, pa.StructArray):
            raise NestTensorError(f"nest_tensor: axes argument must be a struct array, got {type(axes).__name__}")

        axis_names = [f.name for f in axes.type]

        # Convert each column to plain Python values once, up front. Indexing
        # the Arrow arrays row by row would allocate a scalar object per row
        # and per axis; a bulk conversion yields the same values far cheaper.
        gids_py = group_ids.to_pylist()
        struct_valid = axes.is_valid().to_pylist()
        axis_columns = {name: axes.field(i).to_pylist() for i, name in enumerate(axis_names)}

        # Group rows by group_id, validating nulls and intra-batch duplicates.
        per_group_rows: dict[int, list[int]] = {}
        per_group_seen: dict[int, set[tuple[Any, ...]]] = {}
        for i, gid in enumerate(gids_py):
            if gid is None:
                continue  # Null group_id — shouldn't happen but skip defensively.
            if not struct_valid[i]:
                continue  # Null axes struct → skip.
            coord = []
            for name in axis_names:
                coord_value = axis_columns[name][i]
                # A null child value converts to None.
                if coord_value is None:
                    raise NestTensorError(f"nest_tensor: null coord value for axis '{name}' at row {i} (group {gid})")
                coord.append(coord_value)
            coord_t = tuple(coord)
            seen = per_group_seen.setdefault(gid, set())
            if coord_t in seen:
                raise NestTensorError(
                    f"nest_tensor: duplicate coordinate {dict(zip(axis_names, coord, strict=True))} in group {gid}"
                )
            seen.add(coord_t)
            per_group_rows.setdefault(gid, []).append(i)

        if not per_group_rows:
            return

        # Build per-group mini-tables and append to rows_ipc.
        parent_schema = pa.schema(
            [
                pa.field("value", value.type),
                pa.field("axes", axes.type),
            ]
        )
        for gid, indices in per_group_rows.items():
            idx = pa.array(indices, type=pa.int64())
            value_slice = value.take(idx)
            axes_slice = axes.take(idx)
            batch = pa.RecordBatch.from_arrays([value_slice, axes_slice], schema=parent_schema)
            table = pa.Table.from_batches([batch])
            prior_bytes = states[gid].rows_ipc
            if prior_bytes:
                prior = _deserialize_table(prior_bytes)
                table = pa.concat_tables([prior, table])
            states[gid] = NestTensorState(rows_ipc=_serialize_table(table))

    # ---------------------------------------------------------------- combine

    @classmethod
    def combine(
        cls,
        source: NestTensorState,
        target: NestTensorState,
        params: ProcessParams[Any],
    ) -> NestTensorState:
        """Merge two partial states; target rows come first, then source rows.

        No duplicate-coordinate check happens here. ``target`` is returned
        unchanged when ``source`` holds no rows.
        """
        if not source.rows_ipc:
            return target
        if not target.rows_ipc:
            return NestTensorState(rows_ipc=source.rows_ipc)
        s = _deserialize_table(source.rows_ipc)
        t = _deserialize_table(target.rows_ipc)
        return NestTensorState(rows_ipc=_serialize_table(pa.concat_tables([t, s])))

    # ---------------------------------------------------------------- finalize

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, NestTensorState],
        params: ProcessParams[Any],
    ) -> Annotated[pa.RecordBatch, Returns()]:
        """Produce one ``{tensor, axes}`` struct per entry of ``group_ids``.

        Groups with no stored rows yield an empty tensor and empty axis
        lists. All other groups are materialised by ``_materialise_group``,
        which may raise ``NestTensorError``.
        """
        output_schema = params.output_schema
        assert output_schema is not None, "nest_tensor: finalize called without output_schema"
        out_type = output_schema.field(0).type
        assert pa.types.is_struct(out_type)
        tensor_type = out_type.field("tensor").type
        axes_out_type = out_type.field("axes").type
        axis_names = [axes_out_type.field(i).name for i in range(len(axes_out_type))]

        max_cells = _max_cells()

        tensors: list[Any] = []
        axes_rows: list[dict[str, list[Any]]] = []
        for gid in group_ids.to_pylist():
            state = states.get(gid)
            table = _read_rows(state) if state is not None else None
            if table is None or table.num_rows == 0:
                # No rows for this group (e.g., filtered-out during update or
                # an empty group). Emit zero-shape tensor + empty axes lists.
                tensors.append(_make_nested_lists([0] * len(axis_names)))
                axes_rows.append({name: [] for name in axis_names})
                continue

            tensors_entry, axes_entry = _materialise_group(
                table=table,
                axis_names=axis_names,
                gid=gid,
                max_cells=max_cells,
            )
            tensors.append(tensors_entry)
            axes_rows.append(axes_entry)

        tensor_array = pa.array(tensors, type=tensor_type)
        axes_array = pa.array(axes_rows, type=axes_out_type)
        result_array = pa.StructArray.from_arrays(
            [tensor_array, axes_array], fields=[out_type.field("tensor"), out_type.field("axes")]
        )
        return pa.record_batch([result_array], schema=output_schema)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def _materialise_group(
    *,
    table: pa.Table,
    axis_names: list[str],
    gid: int,
    max_cells: int,
) -> tuple[Any, dict[str, list[Any]]]:
    """Build the nested tensor + axes dict for a single group's accumulated rows.

    Args:
        table: The group's rows, with a ``value`` column and a struct
            ``axes`` column.
        axis_names: Names of the fields to read from ``axes``, in tensor
            nesting order (outermost first).
        gid: Group id, used only in error messages.
        max_cells: Maximum allowed product of the per-axis distinct counts.

    Returns:
        ``(tensor, axes_entry)``. ``tensor`` is a nested list with one level
        per axis, ``None`` in cells that no row filled. ``axes_entry`` maps
        each axis name to its sorted distinct coordinate values.

    Raises:
        NestTensorError: the tensor would exceed ``max_cells`` cells, or two
            rows share the same coordinate.
    """
    # Combine chunks into a single StructArray for easier field access.
    axes_combined = table.column("axes").combine_chunks()
    assert isinstance(axes_combined, pa.StructArray)

    # Convert each axis column to Python values exactly once. The fill loop
    # below then does plain list indexing instead of re-extracting the
    # struct field and boxing a scalar for every (row, axis) pair.
    coord_columns = [axes_combined.field(name).to_pylist() for name in axis_names]

    # Collect distinct coord values per axis, sorted ascending. We sort here
    # (rather than preserve insertion order) for deterministic output across
    # parallel combine orderings.
    axis_values: list[list[Any]] = [sorted(set(column)) for column in coord_columns]
    axis_idx: list[dict[Any, int]] = [{v: i for i, v in enumerate(distinct)} for distinct in axis_values]

    shape = [len(v) for v in axis_values]
    total = 1
    for s in shape:
        total *= s
    if total > max_cells:
        raise NestTensorError(
            f"nest_tensor: tensor has {total} cells (shape {shape}) "
            f"exceeds VGI_NEST_TENSOR_MAX_CELLS={max_cells} (group {gid})"
        )

    tensor = _make_nested_lists(shape, fill=None)
    # Filled positions are tracked in a set, so memory scales with the number
    # of rows. A stored None value cannot mark a cell as filled, because
    # None is also the "unfilled" marker in the tensor.
    occupied: set[tuple[int, ...]] = set()

    values = table.column("value").to_pylist()
    n_axes = len(axis_names)
    for row, cell_value in enumerate(values):
        idx_tuple = tuple(axis_idx[a][coord_columns[a][row]] for a in range(n_axes))
        if idx_tuple in occupied:
            coord = {name: coord_columns[a][row] for a, name in enumerate(axis_names)}
            raise NestTensorError(
                f"nest_tensor: duplicate coordinate {coord} in group {gid} (arrived from parallel partitions)"
            )
        occupied.add(idx_tuple)
        # Walk down to the innermost list, then assign the leaf.
        cell = tensor
        for d in idx_tuple[:-1]:
            cell = cell[d]
        cell[idx_tuple[-1]] = cell_value

    axes_entry = {name: axis_values[i] for i, name in enumerate(axis_names)}
    return tensor, axes_entry
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# ---------------------------------------------------------------------------
|
|
394
|
+
# UnnestTensorFunction
|
|
395
|
+
# ---------------------------------------------------------------------------
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
class UnnestTensorFunction(ScalarFunction):
    """Invert ``nest_tensor``: return a list of ``{value, axes}`` structs.

    SQL::

        SELECT u.value, u.axes.x, u.axes.y
        FROM (SELECT nest_tensor(v, {x: a, y: b}) AS t FROM rows GROUP BY g) r,
             UNNEST(unnest_tensor(r.t)) AS u(value, axes);

    Every cell of the axes Cartesian product is returned, including cells
    whose ``value`` is null (unfilled slots or null input values).

    Implemented as a scalar (not table) function because DuckDB table
    functions cannot accept correlated column inputs from a lateral join.
    """

    class Meta:
        name = "unnest_tensor"
        description = "Invert nest_tensor: list of {value, axes} structs per cell"

    @classmethod
    def on_bind(cls, params: BindParameters) -> BindResult:
        """Validate the argument type and derive the ``list<{value, axes}>`` result type.

        Raises:
            NestTensorError: the argument is not a struct with ``tensor`` and
                ``axes`` fields, ``axes`` is not a struct, the tensor nesting
                depth differs from the number of axes, or an ``axes`` field
                is not a list type.
        """
        struct_type = params.arguments_schema.field(0).type
        if not pa.types.is_struct(struct_type):
            raise NestTensorError(f"unnest_tensor: argument must be a struct, got {struct_type}")
        field_names = {struct_type.field(i).name for i in range(len(struct_type))}
        if "tensor" not in field_names or "axes" not in field_names:
            raise NestTensorError(
                f"unnest_tensor: struct must have 'tensor' and 'axes' fields, got {sorted(field_names)}"
            )
        axes_type = struct_type.field("axes").type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"unnest_tensor: 'axes' field must be a struct, got {axes_type}")

        # Count the list levels around the cell type; ``inner`` ends as the cell type.
        tensor_type = struct_type.field("tensor").type
        depth = 0
        inner = tensor_type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            depth += 1
            inner = inner.value_type
        if depth != len(axes_type):
            raise NestTensorError(
                f"unnest_tensor: tensor nesting depth {depth} does not match number of axes {len(axes_type)}"
            )

        # Each axes field must be a list of coordinates. Checking here gives a
        # NestTensorError instead of an AttributeError from ``.value_type``.
        for axis_field in axes_type:
            axis_type = axis_field.type
            if not (
                pa.types.is_list(axis_type)
                or pa.types.is_large_list(axis_type)
                or pa.types.is_fixed_size_list(axis_type)
            ):
                raise NestTensorError(
                    f"unnest_tensor: axis '{axis_field.name}' must be a list of coordinates, got {axis_type}"
                )

        out_axes_type = pa.struct([pa.field(f.name, f.type.value_type) for f in axes_type])
        row_type = pa.struct([pa.field("value", inner), pa.field("axes", out_axes_type)])
        return BindResult(pa.list_(row_type))

    @classmethod
    def compute(
        cls,
        tensor: Annotated[pa.Array, Param(doc="Struct produced by nest_tensor")],  # type: ignore[type-arg]
    ) -> Annotated[pa.Array, Returns()]:  # type: ignore[type-arg]
        """Expand each input struct into a list of per-cell ``{value, axes}`` structs.

        A null input row yields a null list. A row in which any axis has no
        coordinates yields an empty list. Otherwise cells are emitted in
        ``itertools.product`` order over the axis index ranges.

        Raises:
            NestTensorError: the input is not a struct array.
        """
        struct_array = tensor
        if not pa.types.is_struct(struct_array.type):
            raise NestTensorError("unnest_tensor: input must be a struct array")

        axes_type = struct_array.type.field("axes").type
        axis_names = [axes_type.field(i).name for i in range(len(axes_type))]

        out_axes_type = pa.struct(
            [pa.field(axes_type.field(i).name, axes_type.field(i).type.value_type) for i in range(len(axes_type))]
        )
        # Determine cell type by walking tensor nesting.
        tensor_type = struct_array.type.field("tensor").type
        inner = tensor_type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            inner = inner.value_type
        row_type = pa.struct([pa.field("value", inner), pa.field("axes", out_axes_type)])

        result_rows: list[list[dict[str, Any]] | None] = []
        for scalar in struct_array:
            if not scalar.is_valid:
                result_rows.append(None)
                continue
            struct_value = scalar.as_py()
            tensor_val = struct_value["tensor"]
            axes_dict = struct_value["axes"]
            # A missing or null axis list is treated as having no coordinates.
            coord_lists = [axes_dict.get(name) or [] for name in axis_names]
            if any(len(v) == 0 for v in coord_lists):
                result_rows.append([])
                continue
            rows: list[dict[str, Any]] = []
            for index_tuple in itertools.product(*(range(len(v)) for v in coord_lists)):
                # Descend one list level per axis to reach the cell value.
                cell: Any = tensor_val
                for d in index_tuple:
                    cell = cell[d]
                rows.append(
                    {
                        "value": cell,
                        "axes": {name: coord_lists[a][index_tuple[a]] for a, name in enumerate(axis_names)},
                    }
                )
            result_rows.append(rows)

        return pa.array(result_rows, type=pa.list_(row_type))
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# ---------------------------------------------------------------------------
|
|
501
|
+
# UnnestTensorRowsFunction (table-in-out variant for LATERAL joins)
|
|
502
|
+
# ---------------------------------------------------------------------------
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
class UnnestTensorRowsArgs:
    """Arguments accepted by ``unnest_tensor_rows``."""

    # Positional argument 0: the input table carrying the nest_tensor struct column.
    data: Annotated[TableInput, Arg(0, doc="Input table: one column of nest_tensor structs")]
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
class UnnestTensorRowsFunction(TableInOutGenerator[UnnestTensorRowsArgs]):
    """Invert ``nest_tensor`` as a table-in-out function.

    Accepts a one-column input table whose column is a nest_tensor-shaped
    struct. Emits one output row per cell of the Cartesian product for every
    input row. Unlike the scalar ``unnest_tensor``, this streams output
    without materialising a full list column, and composes with DuckDB's
    ``LATERAL`` joins on correlated columns.

    SQL::

        SELECT u.value, u.axes.x, u.axes.y
        FROM (SELECT nest_tensor(v, {x: a, y: b}) AS t FROM rows GROUP BY g) r,
             LATERAL unnest_tensor_rows((SELECT r.t)) u;
    """

    class Meta:
        name = "unnest_tensor_rows"
        description = "Invert nest_tensor, streaming one row per cell (LATERAL-friendly)"

    @classmethod
    def on_bind(cls, params: BindParams[UnnestTensorRowsArgs]) -> BindResponse:
        """Validate the input table's schema and derive the ``(value, axes)`` output schema.

        Raises:
            NestTensorError: the input does not have exactly one column, that
                column is not a struct with ``tensor`` and ``axes`` fields,
                ``axes`` is not a struct, the tensor nesting depth differs
                from the number of axes, or an ``axes`` field is not a list
                type.
        """
        input_schema = params.bind_call.input_schema
        if input_schema is None or len(input_schema) != 1:
            raise NestTensorError(
                "unnest_tensor_rows: input table must have exactly one column (the nest_tensor struct)"
            )
        struct_type = input_schema.field(0).type
        if not pa.types.is_struct(struct_type):
            raise NestTensorError(f"unnest_tensor_rows: input column must be a struct, got {struct_type}")
        field_names = {struct_type.field(i).name for i in range(len(struct_type))}
        if "tensor" not in field_names or "axes" not in field_names:
            raise NestTensorError(
                f"unnest_tensor_rows: struct must have 'tensor' and 'axes' fields, got {sorted(field_names)}"
            )
        axes_type = struct_type.field("axes").type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"unnest_tensor_rows: 'axes' field must be a struct, got {axes_type}")

        # Count the list levels around the cell type; ``inner`` ends as the cell type.
        tensor_type = struct_type.field("tensor").type
        depth = 0
        inner = tensor_type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            depth += 1
            inner = inner.value_type
        if depth != len(axes_type):
            raise NestTensorError(
                f"unnest_tensor_rows: tensor nesting depth {depth} does not match number of axes {len(axes_type)}"
            )

        # Each axes field must be a list of coordinates. Checking here gives a
        # NestTensorError instead of an AttributeError from ``.value_type``.
        for axis_field in axes_type:
            axis_type = axis_field.type
            if not (
                pa.types.is_list(axis_type)
                or pa.types.is_large_list(axis_type)
                or pa.types.is_fixed_size_list(axis_type)
            ):
                raise NestTensorError(
                    f"unnest_tensor_rows: axis '{axis_field.name}' must be a list of coordinates, got {axis_type}"
                )

        out_axes_type = pa.struct([pa.field(f.name, f.type.value_type) for f in axes_type])
        output_schema = schema(value=inner, axes=out_axes_type)
        return BindResponse(output_schema=output_schema)

    @classmethod
    def process(
        cls,
        params: ProcessParams[UnnestTensorRowsArgs],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Emit one output batch holding every cell of every struct in ``batch``.

        Null input rows, and rows in which any axis has no coordinates,
        contribute no output rows. Exactly one batch is emitted per call,
        and it may be empty.
        """
        output_schema = params.output_schema
        value_type = output_schema.field("value").type
        axes_out_type = output_schema.field("axes").type
        axis_names = [axes_out_type.field(i).name for i in range(len(axes_out_type))]

        # An empty input emits an empty batch without reading column 0.
        if batch.num_rows == 0:
            out.emit(
                pa.RecordBatch.from_arrays(
                    [pa.array([], type=value_type), pa.array([], type=axes_out_type)],
                    schema=output_schema,
                )
            )
            return

        struct_array = batch.column(0)
        values_buf: list[Any] = []
        axes_buf: list[dict[str, Any]] = []

        for scalar in struct_array:
            if not scalar.is_valid:
                continue
            struct_value = scalar.as_py()
            tensor_val = struct_value["tensor"]
            axes_dict = struct_value["axes"]
            # A missing or null axis list is treated as having no coordinates.
            coord_lists = [axes_dict.get(name) or [] for name in axis_names]
            if any(len(v) == 0 for v in coord_lists):
                continue
            for index_tuple in itertools.product(*(range(len(v)) for v in coord_lists)):
                # Descend one list level per axis to reach the cell value.
                cell: Any = tensor_val
                for d in index_tuple:
                    cell = cell[d]
                values_buf.append(cell)
                axes_buf.append({name: coord_lists[a][index_tuple[a]] for a, name in enumerate(axis_names)})

        out.emit(
            pa.RecordBatch.from_arrays(
                [pa.array(values_buf, type=value_type), pa.array(axes_buf, type=axes_out_type)],
                schema=output_schema,
            )
        )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Test fixture: an Orchard-style catalog worker that advertises a secret-service URL.
|
|
4
|
+
|
|
5
|
+
Serves an in-memory catalog named ``orchard`` whose ``catalog_attach`` response
|
|
6
|
+
carries ``tags["vgi_secret_service_url"]`` (taken from the ``VGI_ORCHARD_SECRET_URL``
|
|
7
|
+
environment variable). The C++ VGI extension reads that tag at ATTACH time and
|
|
8
|
+
auto-registers a ``VgiRemoteSecretStorage`` pointing at the secret microservice.
|
|
9
|
+
|
|
10
|
+
Run with::
|
|
11
|
+
|
|
12
|
+
VGI_ORCHARD_SECRET_URL=http://127.0.0.1:<port>/ \
|
|
13
|
+
vgi-serve vgi._test_fixtures.orchard_catalog:OrchardCatalogWorker --http
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
from vgi._test_fixtures.catalog import CatalogData, InMemoryCatalog, SchemaData
|
|
21
|
+
from vgi.catalog import AttachOpaqueData, SchemaInfo
|
|
22
|
+
from vgi.worker import Worker
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OrchardCatalog(InMemoryCatalog):
    """In-memory catalog pre-populated with a single ``orchard`` catalog.

    When ``VGI_ORCHARD_SECRET_URL`` is set to a non-empty value, the catalog
    is tagged with ``vgi_secret_service_url``; otherwise it has no tags.
    """

    def __init__(self) -> None:
        super().__init__()

        secret_url = os.environ.get("VGI_ORCHARD_SECRET_URL", "")
        tags: dict[str, str] = {}
        if secret_url:
            tags["vgi_secret_service_url"] = secret_url

        orchard = CatalogData(name="orchard", tags=tags)
        # The ``main`` schema uses sixteen zero bytes as its attach-opaque data.
        main_info = SchemaInfo(
            attach_opaque_data=AttachOpaqueData(b"\x00" * 16),
            name="main",
            comment=None,
            tags={},
        )
        orchard.schemas["main"] = SchemaData(info=main_info)
        self._catalogs["orchard"] = orchard
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class OrchardCatalogWorker(Worker):
    """Worker whose catalog interface is :class:`OrchardCatalog`."""

    # The class itself is assigned here, not an instance.
    catalog_interface = OrchardCatalog
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
    # Script entry point: start the orchard catalog worker.
    OrchardCatalogWorker.main()
|