vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,614 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ # ruff: noqa: D102, D106
4
+ """Aggregate that collects rows into a dense N-D tensor, plus its inverse.
5
+
6
+ Two functions that work as a pair:
7
+
8
+ - ``nest_tensor(value, {axis1: ..., axis2: ...})`` aggregate — collects rows
9
+ from a group into a struct ``{tensor, axes}`` where ``tensor`` is a dense
10
+ nested-list representation of the values keyed by the axis coordinates, and
11
+ ``axes`` is a struct mirroring the input axes argument with each field
12
+ holding that axis's sorted, distinct coordinate values.
13
+ - ``unnest_tensor(t)`` scalar function — inverts the aggregate, returning a
14
+ list with one ``{value, axes}`` struct per cell of the Cartesian product (including null-valued cells).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import itertools
20
+ import os
21
+ from dataclasses import dataclass
22
+ from typing import Annotated, Any
23
+
24
+ import pyarrow as pa
25
+ from vgi_rpc import ArrowSerializableDataclass, ArrowType
26
+ from vgi_rpc.rpc import OutputCollector
27
+
28
+ from vgi.aggregate_function import AggregateBindParams, AggregateFunction
29
+ from vgi.arguments import Arg, Param, Returns, TableInput
30
+ from vgi.invocation import BindResponse
31
+ from vgi.metadata import DistinctDependence, NullHandling, OrderDependence
32
+ from vgi.scalar_function import BindParameters, BindResult, ScalarFunction
33
+ from vgi.schema_utils import schema
34
+ from vgi.table_function import BindParams, ProcessParams
35
+ from vgi.table_in_out_function import TableInOutGenerator
36
+
37
+ __all__ = [
38
+ "NestTensorError",
39
+ "NestTensorFunction",
40
+ "UnnestTensorFunction",
41
+ "UnnestTensorRowsFunction",
42
+ ]
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Errors
47
+ # ---------------------------------------------------------------------------
48
+
49
+
50
class NestTensorError(ValueError):
    """Root exception raised by the nest_tensor / unnest_tensor functions."""
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Helpers
56
+ # ---------------------------------------------------------------------------
57
+
58
+
59
# Cell-count ceiling used when VGI_NEST_TENSOR_MAX_CELLS is not set.
_DEFAULT_MAX_CELLS = 10_000_000


def _max_cells() -> int:
    """Return the largest number of tensor cells allowed for one group.

    The limit is read from the ``VGI_NEST_TENSOR_MAX_CELLS`` environment
    variable on every call; ``_DEFAULT_MAX_CELLS`` applies when it is unset.
    Raises ``NestTensorError`` when the variable is set but is not a
    positive integer.
    """
    configured = os.environ.get("VGI_NEST_TENSOR_MAX_CELLS")
    if configured is None:
        return _DEFAULT_MAX_CELLS

    try:
        limit = int(configured)
    except ValueError as exc:
        raise NestTensorError(f"VGI_NEST_TENSOR_MAX_CELLS must be an integer, got {configured!r}") from exc

    if limit <= 0:
        raise NestTensorError("VGI_NEST_TENSOR_MAX_CELLS must be positive")
    return limit
73
+
74
+
75
def _validate_coord_type(name: str, arrow_type: pa.DataType) -> None:
    """Reject axis coordinate types that nest_tensor cannot key on.

    Floating-point types are refused (NaN breaks equality/ordering), as are
    the nested types: struct, list, large list, fixed-size list and map.
    Any other type passes silently.  Raises ``NestTensorError`` on rejection.
    """
    if pa.types.is_floating(arrow_type):
        raise NestTensorError(
            f"nest_tensor: axis '{name}' has floating-point type {arrow_type}; "
            f"floats are not supported as coord types (NaN breaks equality)"
        )

    nested_predicates = (
        pa.types.is_struct,
        pa.types.is_list,
        pa.types.is_large_list,
        pa.types.is_fixed_size_list,
        pa.types.is_map,
    )
    if any(is_nested(arrow_type) for is_nested in nested_predicates):
        raise NestTensorError(
            f"nest_tensor: axis '{name}' has nested type {arrow_type}; only scalar coord types are supported"
        )
96
+
97
+
98
def _serialize_table(table: pa.Table) -> bytes:
    """Encode ``table`` as an Arrow IPC stream and return the raw bytes."""
    buffer = pa.BufferOutputStream()
    with pa.ipc.new_stream(buffer, table.schema) as stream_writer:
        for record_batch in table.to_batches():
            stream_writer.write_batch(record_batch)
    # Read the buffer only after the writer has closed the stream.
    encoded = buffer.getvalue()
    return encoded.to_pybytes()
104
+
105
+
106
def _deserialize_table(data: bytes) -> pa.Table:
    """Decode Arrow IPC stream bytes back into a table."""
    reader = pa.ipc.open_stream(data)
    return reader.read_all()
108
+
109
+
110
def _read_rows(state: NestTensorState) -> pa.Table | None:
    """Return the rows stored in ``state``, or ``None`` when it holds no bytes."""
    payload = state.rows_ipc
    return _deserialize_table(payload) if payload else None
114
+
115
+
116
def _make_nested_lists(shape: list[int], fill: Any = None) -> Any:
    """Return a freshly allocated nested list with dimensions ``shape``.

    Every leaf holds ``fill``; an empty ``shape`` yields ``fill`` itself.
    Each sub-list is a distinct object, so cells can be assigned
    independently.
    """
    if len(shape) == 0:
        return fill
    outer_size = shape[0]
    inner_shape = shape[1:]
    return [_make_nested_lists(inner_shape, fill) for _ in range(outer_size)]
122
+
123
+
124
def _nested_list_type(inner: pa.DataType, depth: int) -> pa.DataType:
    """Wrap ``inner`` in ``depth`` levels of ``pa.list_``; depth <= 0 returns it unchanged."""
    wrapped = inner
    remaining = depth
    while remaining > 0:
        wrapped = pa.list_(wrapped)
        remaining -= 1
    return wrapped
129
+
130
+
131
def _output_struct_type(value_type: pa.DataType, axes_type: pa.StructType) -> pa.StructType:
    """Build the ``{tensor, axes}`` struct type returned by nest_tensor.

    ``tensor`` is ``value_type`` nested in one list level per axis; ``axes``
    keeps the input axis field names, each wrapped in a list of its type.
    """
    axis_list_fields = [pa.field(axis.name, pa.list_(axis.type)) for axis in axes_type]
    tensor_type = _nested_list_type(value_type, len(axes_type))
    return pa.struct([("tensor", tensor_type), ("axes", pa.struct(axis_list_fields))])
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Aggregate state
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
@dataclass(kw_only=True)
class NestTensorState(ArrowSerializableDataclass):
    """Per-group aggregate state: the rows seen so far, kept as serialized bytes."""

    # Arrow IPC stream holding a table with ``value`` and ``axes`` columns;
    # empty bytes mean no rows have been accumulated yet.
    rows_ipc: Annotated[bytes, ArrowType(pa.binary())] = b""
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # NestTensorFunction
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
class NestTensorFunction(AggregateFunction[NestTensorState]):
    """Collect rows into an N-D tensor plus per-axis coordinate lists.

    SQL::

        SELECT nest_tensor(value, {x: col_x, y: col_y}) FROM t GROUP BY g;

    Returns a struct ``{tensor, axes}`` where ``tensor`` is a nested
    ``list<list<...>>`` (one level per axis) and ``axes`` is a struct
    mirroring the input axes argument with each field holding that axis's
    sorted distinct coordinate values.
    """

    class Meta:
        name = "nest_tensor"
        description = "Collect rows into a dense N-D tensor plus per-axis coordinates"
        null_handling = NullHandling.DEFAULT
        order_dependent = OrderDependence.NOT_ORDER_DEPENDENT
        distinct_dependent = DistinctDependence.NOT_DISTINCT_DEPENDENT

    # ------------------------------------------------------------------ bind

    @classmethod
    def on_bind(cls, params: AggregateBindParams, **kwargs: object) -> BindResponse:
        """Validate the argument types and derive the ``{tensor, axes}`` output type.

        Raises ``NestTensorError`` when fewer than two arguments are bound,
        when the second argument is not a non-empty struct, or when any axis
        field has an unsupported coordinate type.
        """
        input_schema = params.input_schema
        if input_schema is None or len(input_schema) < 2:
            raise NestTensorError("nest_tensor: expected 2 arguments (value, axes struct)")
        value_type = input_schema.field(0).type
        axes_type = input_schema.field(1).type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"nest_tensor: second argument must be a struct, got {axes_type}")
        if len(axes_type) == 0:
            raise NestTensorError("nest_tensor: axes struct must have at least one field")
        for f in axes_type:
            _validate_coord_type(f.name, f.type)

        out = _output_struct_type(value_type, axes_type)
        return BindResponse(output_schema=schema(result=out))

    # -------------------------------------------------------------- lifecycle

    @classmethod
    def initial_state(cls, params: ProcessParams[Any]) -> NestTensorState:
        """Return an empty state (no accumulated rows)."""
        return NestTensorState()

    # ------------------------------------------------------------------ update

    @classmethod
    def update(
        cls,
        states: dict[int, NestTensorState],
        group_ids: pa.Int64Array,
        value: Annotated[pa.Array, Param(doc="Tensor cell value")],  # type: ignore[type-arg]
        axes: Annotated[pa.Array, Param(doc="Struct of axis coordinates")],  # type: ignore[type-arg]
    ) -> None:
        """Append one batch of rows to the per-group states.

        Rows with a null group id or a null axes struct are skipped. A null
        coordinate inside a non-null axes struct, or a coordinate tuple that
        repeats within the same group in this batch, raises
        ``NestTensorError``. Duplicates spread over several batches are not
        visible here; they are detected when the group is materialised.
        """
        if not isinstance(axes, pa.StructArray):
            raise NestTensorError(f"nest_tensor: axes argument must be a struct array, got {type(axes).__name__}")

        axis_names = [f.name for f in axes.type]

        # Convert every column to Python once, instead of boxing one Arrow
        # scalar per cell inside the row loop.
        axis_columns = [axes.field(i).to_pylist() for i in range(len(axis_names))]
        gids_py = group_ids.to_pylist()
        axes_valid = axes.is_valid().to_pylist()

        # Group rows by group_id, validating nulls and intra-batch duplicates.
        per_group_rows: dict[int, list[int]] = {}
        per_group_seen: dict[int, set[tuple[Any, ...]]] = {}
        for i, gid in enumerate(gids_py):
            if gid is None:
                continue  # Null group_id — shouldn't happen but skip defensively.
            if not axes_valid[i]:
                continue  # Null axes struct → skip.
            coord = []
            for name, column in zip(axis_names, axis_columns, strict=True):
                coord_value = column[i]
                if coord_value is None:
                    raise NestTensorError(f"nest_tensor: null coord value for axis '{name}' at row {i} (group {gid})")
                coord.append(coord_value)
            coord_t = tuple(coord)
            seen = per_group_seen.setdefault(gid, set())
            if coord_t in seen:
                raise NestTensorError(
                    f"nest_tensor: duplicate coordinate {dict(zip(axis_names, coord, strict=True))} in group {gid}"
                )
            seen.add(coord_t)
            per_group_rows.setdefault(gid, []).append(i)

        if not per_group_rows:
            return

        # Build per-group mini-tables and append to rows_ipc.
        parent_schema = pa.schema(
            [
                pa.field("value", value.type),
                pa.field("axes", axes.type),
            ]
        )
        for gid, indices in per_group_rows.items():
            idx = pa.array(indices, type=pa.int64())
            batch = pa.RecordBatch.from_arrays([value.take(idx), axes.take(idx)], schema=parent_schema)
            table = pa.Table.from_batches([batch])
            # Tolerate a group with no state entry yet (same defensive lookup
            # finalize uses); a missing entry is treated as "no prior rows".
            prior_state = states.get(gid)
            if prior_state is not None and prior_state.rows_ipc:
                table = pa.concat_tables([_deserialize_table(prior_state.rows_ipc), table])
            states[gid] = NestTensorState(rows_ipc=_serialize_table(table))

    # ---------------------------------------------------------------- combine

    @classmethod
    def combine(
        cls,
        source: NestTensorState,
        target: NestTensorState,
        params: ProcessParams[Any],
    ) -> NestTensorState:
        """Merge two partial states; target rows come first, then source rows."""
        if not source.rows_ipc:
            return target
        if not target.rows_ipc:
            return NestTensorState(rows_ipc=source.rows_ipc)
        s = _deserialize_table(source.rows_ipc)
        t = _deserialize_table(target.rows_ipc)
        return NestTensorState(rows_ipc=_serialize_table(pa.concat_tables([t, s])))

    # ---------------------------------------------------------------- finalize

    @classmethod
    def finalize(
        cls,
        group_ids: pa.Int64Array,
        states: dict[int, NestTensorState],
        params: ProcessParams[Any],
    ) -> Annotated[pa.RecordBatch, Returns()]:
        """Materialise one ``{tensor, axes}`` struct per requested group.

        Groups without any accumulated rows produce a zero-shape tensor and
        empty axis lists. Raises ``NestTensorError`` when the output schema
        is missing or not a struct, and propagates errors raised while
        materialising a group (cell limit exceeded, duplicate coordinates).
        """
        output_schema = params.output_schema
        # Explicit raises rather than ``assert``: assertions are stripped
        # under ``python -O`` and would surface as an opaque AttributeError.
        if output_schema is None:
            raise NestTensorError("nest_tensor: finalize called without output_schema")
        out_type = output_schema.field(0).type
        if not pa.types.is_struct(out_type):
            raise NestTensorError(f"nest_tensor: output type must be a struct, got {out_type}")
        tensor_type = out_type.field("tensor").type
        axes_out_type = out_type.field("axes").type
        axis_names = [axes_out_type.field(i).name for i in range(len(axes_out_type))]

        max_cells = _max_cells()

        tensors: list[Any] = []
        axes_rows: list[dict[str, list[Any]]] = []
        for gid in group_ids.to_pylist():
            state = states.get(gid)
            table = _read_rows(state) if state is not None else None
            if table is None or table.num_rows == 0:
                # No rows for this group (e.g., filtered-out during update or
                # an empty group). Emit zero-shape tensor + empty axes lists.
                tensors.append(_make_nested_lists([0] * len(axis_names)))
                axes_rows.append({name: [] for name in axis_names})
                continue

            tensors_entry, axes_entry = _materialise_group(
                table=table,
                axis_names=axis_names,
                gid=gid,
                max_cells=max_cells,
            )
            tensors.append(tensors_entry)
            axes_rows.append(axes_entry)

        tensor_array = pa.array(tensors, type=tensor_type)
        axes_array = pa.array(axes_rows, type=axes_out_type)
        result_array = pa.StructArray.from_arrays(
            [tensor_array, axes_array], fields=[out_type.field("tensor"), out_type.field("axes")]
        )
        return pa.record_batch([result_array], schema=output_schema)
331
+
332
+
333
def _materialise_group(
    *,
    table: pa.Table,
    axis_names: list[str],
    gid: int,
    max_cells: int,
) -> tuple[Any, dict[str, list[Any]]]:
    """Build the nested tensor + axes dict for a single group's accumulated rows.

    ``table`` must have ``value`` and ``axes`` columns. Returns
    ``(tensor, axes)`` where ``tensor`` is a nested list with one level per
    axis (``None`` in cells no row addressed) and ``axes`` maps each axis name
    to its sorted distinct coordinates. ``gid`` is used only in error
    messages.

    Raises ``NestTensorError`` when the tensor would hold more than
    ``max_cells`` cells, or when two rows address the same coordinate.
    """
    # Combine chunks into a single StructArray for easier field access.
    axes_combined = table.column("axes").combine_chunks()
    assert isinstance(axes_combined, pa.StructArray)

    # Convert each column to Python once; the fill loop below then does plain
    # list indexing instead of re-extracting the struct field and boxing an
    # Arrow scalar for every row/axis pair.
    axis_coords = [axes_combined.field(name).to_pylist() for name in axis_names]
    values = table.column("value").to_pylist()

    # Distinct coord values per axis, sorted ascending. We sort here (rather
    # than preserve insertion order) for deterministic output across parallel
    # combine orderings.
    axis_values: list[list[Any]] = [sorted(set(coords)) for coords in axis_coords]
    axis_idx: list[dict[Any, int]] = [{v: i for i, v in enumerate(distinct)} for distinct in axis_values]

    shape = [len(v) for v in axis_values]
    total = 1
    for s in shape:
        total *= s
    if total > max_cells:
        raise NestTensorError(
            f"nest_tensor: tensor has {total} cells (shape {shape}) "
            f"exceeds VGI_NEST_TENSOR_MAX_CELLS={max_cells} (group {gid})"
        )

    tensor = _make_nested_lists(shape, fill=None)
    # Parallel structure recording which cells were written, so a legitimately
    # null value is still distinguishable from an unfilled slot.
    filled = _make_nested_lists(shape, fill=False)

    for row, cell_value in enumerate(values):
        idx_tuple = tuple(lookup[coords[row]] for lookup, coords in zip(axis_idx, axis_coords, strict=True))
        cell = tensor
        flag = filled
        for d in idx_tuple[:-1]:
            cell = cell[d]
            flag = flag[d]
        last = idx_tuple[-1]
        if flag[last]:
            coord = {name: coords[row] for name, coords in zip(axis_names, axis_coords, strict=True)}
            raise NestTensorError(
                f"nest_tensor: duplicate coordinate {coord} in group {gid} (arrived from parallel partitions)"
            )
        cell[last] = cell_value
        flag[last] = True

    axes_entry = dict(zip(axis_names, axis_values, strict=True))
    return tensor, axes_entry
391
+
392
+
393
+ # ---------------------------------------------------------------------------
394
+ # UnnestTensorFunction
395
+ # ---------------------------------------------------------------------------
396
+
397
+
398
class UnnestTensorFunction(ScalarFunction):
    """Invert ``nest_tensor``: return a list of ``{value, axes}`` structs.

    SQL::

        SELECT u.value, u.axes.x, u.axes.y
        FROM (SELECT nest_tensor(v, {x: a, y: b}) AS t FROM rows GROUP BY g) r,
             UNNEST(unnest_tensor(r.t)) AS u(value, axes);

    Every cell of the axes Cartesian product is returned, including cells
    whose ``value`` is null (unfilled slots or null input values).

    Implemented as a scalar (not table) function because DuckDB table
    functions cannot accept correlated column inputs from a lateral join.
    """

    class Meta:
        name = "unnest_tensor"
        description = "Invert nest_tensor: list of {value, axes} structs per cell"

    @classmethod
    def on_bind(cls, params: BindParameters) -> BindResult:
        """Validate the input struct type and derive the ``list<{value, axes}>`` result type.

        Raises ``NestTensorError`` when the argument is not a struct with
        ``tensor`` and ``axes`` fields, when ``axes`` is not a struct of
        lists, or when the tensor nesting depth differs from the axis count.
        """
        struct_type = params.arguments_schema.field(0).type
        if not pa.types.is_struct(struct_type):
            raise NestTensorError(f"unnest_tensor: argument must be a struct, got {struct_type}")
        field_names = {struct_type.field(i).name for i in range(len(struct_type))}
        if "tensor" not in field_names or "axes" not in field_names:
            raise NestTensorError(
                f"unnest_tensor: struct must have 'tensor' and 'axes' fields, got {sorted(field_names)}"
            )
        axes_type = struct_type.field("axes").type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"unnest_tensor: 'axes' field must be a struct, got {axes_type}")

        # Each axis must be a list of coordinates. Without this check a
        # malformed struct fails below with a raw AttributeError on
        # ``.value_type`` instead of a readable bind error.
        axis_fields = [axes_type.field(i) for i in range(len(axes_type))]
        for axis_field in axis_fields:
            axis_type = axis_field.type
            if not (
                pa.types.is_list(axis_type)
                or pa.types.is_large_list(axis_type)
                or pa.types.is_fixed_size_list(axis_type)
            ):
                raise NestTensorError(
                    f"unnest_tensor: axes field '{axis_field.name}' must be a list, got {axis_type}"
                )

        tensor_type = struct_type.field("tensor").type
        depth = 0
        inner = tensor_type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            depth += 1
            inner = inner.value_type
        if depth != len(axes_type):
            raise NestTensorError(
                f"unnest_tensor: tensor nesting depth {depth} does not match number of axes {len(axes_type)}"
            )

        out_axes_type = pa.struct([pa.field(f.name, f.type.value_type) for f in axis_fields])
        row_type = pa.struct([pa.field("value", inner), pa.field("axes", out_axes_type)])
        return BindResult(pa.list_(row_type))

    @classmethod
    def compute(
        cls,
        tensor: Annotated[pa.Array, Param(doc="Struct produced by nest_tensor")],  # type: ignore[type-arg]
    ) -> Annotated[pa.Array, Returns()]:  # type: ignore[type-arg]
        """Expand each input struct into a list with one ``{value, axes}`` entry per cell.

        A null input struct yields a null list. A struct whose ``axes`` is
        null, or which has any null or empty axis list, yields an empty list.
        """
        struct_array = tensor
        if not pa.types.is_struct(struct_array.type):
            raise NestTensorError("unnest_tensor: input must be a struct array")

        axes_type = struct_array.type.field("axes").type
        axis_fields = [axes_type.field(i) for i in range(len(axes_type))]
        axis_names = [f.name for f in axis_fields]
        out_axes_type = pa.struct([pa.field(f.name, f.type.value_type) for f in axis_fields])

        # Determine cell type by walking tensor nesting.
        inner = struct_array.type.field("tensor").type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            inner = inner.value_type
        row_type = pa.struct([pa.field("value", inner), pa.field("axes", out_axes_type)])

        result_rows: list[list[dict[str, Any]] | None] = []
        for i in range(len(struct_array)):
            scalar = struct_array[i]
            if not scalar.is_valid:
                result_rows.append(None)
                continue
            struct_value = scalar.as_py()
            tensor_val = struct_value["tensor"]
            # A null ``axes`` struct is handled like empty axes rather than
            # crashing on ``None.get``.
            axes_dict = struct_value["axes"] or {}
            coord_lists = [axes_dict.get(name) or [] for name in axis_names]
            if any(len(v) == 0 for v in coord_lists):
                result_rows.append([])
                continue
            rows: list[dict[str, Any]] = []
            for index_tuple in itertools.product(*(range(len(v)) for v in coord_lists)):
                cell: Any = tensor_val
                for d in index_tuple:
                    cell = cell[d]
                rows.append(
                    {
                        "value": cell,
                        "axes": {name: coord_lists[a][index_tuple[a]] for a, name in enumerate(axis_names)},
                    }
                )
            result_rows.append(rows)

        return pa.array(result_rows, type=pa.list_(row_type))
498
+
499
+
500
+ # ---------------------------------------------------------------------------
501
+ # UnnestTensorRowsFunction (table-in-out variant for LATERAL joins)
502
+ # ---------------------------------------------------------------------------
503
+
504
+
505
@dataclass(slots=True, frozen=True, kw_only=True)
class UnnestTensorRowsArgs:
    """Arguments accepted by ``unnest_tensor_rows``."""

    # Positional argument 0: the input table carrying the struct column.
    data: Annotated[TableInput, Arg(0, doc="Input table: one column of nest_tensor structs")]
508
+
509
+
510
class UnnestTensorRowsFunction(TableInOutGenerator[UnnestTensorRowsArgs]):
    """Invert ``nest_tensor`` as a table-in-out function.

    Accepts a one-column input table whose column is a nest_tensor-shaped
    struct. Emits one output row per cell of the Cartesian product for every
    input row. Unlike the scalar ``unnest_tensor``, this streams output
    without materialising a full list column, and composes with DuckDB's
    ``LATERAL`` joins on correlated columns.

    SQL::

        SELECT u.value, u.axes.x, u.axes.y
        FROM (SELECT nest_tensor(v, {x: a, y: b}) AS t FROM rows GROUP BY g) r,
             LATERAL unnest_tensor_rows((SELECT r.t)) u;
    """

    class Meta:
        name = "unnest_tensor_rows"
        description = "Invert nest_tensor, streaming one row per cell (LATERAL-friendly)"

    @classmethod
    def on_bind(cls, params: BindParams[UnnestTensorRowsArgs]) -> BindResponse:
        """Validate the input table and derive the ``(value, axes)`` output schema.

        Raises ``NestTensorError`` when the input is not exactly one struct
        column with ``tensor`` and ``axes`` fields, when ``axes`` is not a
        struct of lists, or when the tensor nesting depth differs from the
        axis count.
        """
        input_schema = params.bind_call.input_schema
        if input_schema is None or len(input_schema) != 1:
            raise NestTensorError(
                "unnest_tensor_rows: input table must have exactly one column (the nest_tensor struct)"
            )
        struct_type = input_schema.field(0).type
        if not pa.types.is_struct(struct_type):
            raise NestTensorError(f"unnest_tensor_rows: input column must be a struct, got {struct_type}")
        field_names = {struct_type.field(i).name for i in range(len(struct_type))}
        if "tensor" not in field_names or "axes" not in field_names:
            raise NestTensorError(
                f"unnest_tensor_rows: struct must have 'tensor' and 'axes' fields, got {sorted(field_names)}"
            )
        axes_type = struct_type.field("axes").type
        if not pa.types.is_struct(axes_type):
            raise NestTensorError(f"unnest_tensor_rows: 'axes' field must be a struct, got {axes_type}")

        # Each axis must be a list of coordinates. Without this check a
        # malformed struct fails below with a raw AttributeError on
        # ``.value_type`` instead of a readable bind error.
        axis_fields = [axes_type.field(i) for i in range(len(axes_type))]
        for axis_field in axis_fields:
            axis_type = axis_field.type
            if not (
                pa.types.is_list(axis_type)
                or pa.types.is_large_list(axis_type)
                or pa.types.is_fixed_size_list(axis_type)
            ):
                raise NestTensorError(
                    f"unnest_tensor_rows: axes field '{axis_field.name}' must be a list, got {axis_type}"
                )

        tensor_type = struct_type.field("tensor").type
        depth = 0
        inner = tensor_type
        while pa.types.is_list(inner) or pa.types.is_large_list(inner) or pa.types.is_fixed_size_list(inner):
            depth += 1
            inner = inner.value_type
        if depth != len(axes_type):
            raise NestTensorError(
                f"unnest_tensor_rows: tensor nesting depth {depth} does not match number of axes {len(axes_type)}"
            )

        out_axes_type = pa.struct([pa.field(f.name, f.type.value_type) for f in axis_fields])
        output_schema = schema(value=inner, axes=out_axes_type)
        return BindResponse(output_schema=output_schema)

    @classmethod
    def process(
        cls,
        params: ProcessParams[UnnestTensorRowsArgs],
        state: None,
        batch: pa.RecordBatch,
        out: OutputCollector,
    ) -> None:
        """Emit one output batch holding every cell of every struct in ``batch``.

        Null input structs, structs whose ``axes`` is null, and structs with
        any null or empty axis list contribute no rows. Exactly one batch is
        emitted per call, possibly empty.
        """
        output_schema = params.output_schema
        value_type = output_schema.field("value").type
        axes_out_type = output_schema.field("axes").type
        axis_names = [axes_out_type.field(i).name for i in range(len(axes_out_type))]

        if batch.num_rows == 0:
            out.emit(
                pa.RecordBatch.from_arrays(
                    [pa.array([], type=value_type), pa.array([], type=axes_out_type)],
                    schema=output_schema,
                )
            )
            return

        struct_array = batch.column(0)
        values_buf: list[Any] = []
        axes_buf: list[dict[str, Any]] = []

        for i in range(batch.num_rows):
            scalar = struct_array[i]
            if not scalar.is_valid:
                continue
            struct_value = scalar.as_py()
            tensor_val = struct_value["tensor"]
            # A null ``axes`` struct is handled like empty axes rather than
            # crashing on ``None.get``.
            axes_dict = struct_value["axes"] or {}
            coord_lists = [axes_dict.get(name) or [] for name in axis_names]
            if any(len(v) == 0 for v in coord_lists):
                continue
            for index_tuple in itertools.product(*(range(len(v)) for v in coord_lists)):
                cell: Any = tensor_val
                for d in index_tuple:
                    cell = cell[d]
                values_buf.append(cell)
                axes_buf.append({name: coord_lists[a][index_tuple[a]] for a, name in enumerate(axis_names)})

        out.emit(
            pa.RecordBatch.from_arrays(
                [pa.array(values_buf, type=value_type), pa.array(axes_buf, type=axes_out_type)],
                schema=output_schema,
            )
        )
@@ -0,0 +1,47 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Test fixture: an Orchard-style catalog worker that advertises a secret-service URL.
4
+
5
+ Serves an in-memory catalog named ``orchard`` whose ``catalog_attach`` response
6
+ carries ``tags["vgi_secret_service_url"]`` (taken from the ``VGI_ORCHARD_SECRET_URL``
7
+ environment variable). The C++ VGI extension reads that tag at ATTACH time and
8
+ auto-registers a ``VgiRemoteSecretStorage`` pointing at the secret microservice.
9
+
10
+ Run with::
11
+
12
+ VGI_ORCHARD_SECRET_URL=http://127.0.0.1:<port>/ \
13
+ vgi-serve vgi._test_fixtures.orchard_catalog:OrchardCatalogWorker --http
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+
20
+ from vgi._test_fixtures.catalog import CatalogData, InMemoryCatalog, SchemaData
21
+ from vgi.catalog import AttachOpaqueData, SchemaInfo
22
+ from vgi.worker import Worker
23
+
24
+
25
class OrchardCatalog(InMemoryCatalog):
    """In-memory catalog exposing one ``orchard`` catalog that advertises the secret URL."""

    def __init__(self) -> None:
        """Register the ``orchard`` catalog with a single ``main`` schema.

        The ``vgi_secret_service_url`` tag is set from the
        ``VGI_ORCHARD_SECRET_URL`` environment variable and is omitted when
        that variable is unset or empty.
        """
        super().__init__()

        secret_url = os.environ.get("VGI_ORCHARD_SECRET_URL", "")
        catalog_tags = {}
        if secret_url:
            catalog_tags["vgi_secret_service_url"] = secret_url
        orchard = CatalogData(name="orchard", tags=catalog_tags)

        # The schema carries 16 zero bytes as placeholder opaque data.
        main_info = SchemaInfo(
            attach_opaque_data=AttachOpaqueData(b"\x00" * 16),
            name="main",
            comment=None,
            tags={},
        )
        orchard.schemas["main"] = SchemaData(info=main_info)

        self._catalogs["orchard"] = orchard
38
+
39
+
40
class OrchardCatalogWorker(Worker):
    """Worker whose catalog interface is :class:`OrchardCatalog`."""

    # Catalog implementation class this worker is configured with.
    catalog_interface = OrchardCatalog
44
+
45
+
46
# Running this module as a script hands control to the worker's entry point.
if __name__ == "__main__":
    OrchardCatalogWorker.main()
@@ -0,0 +1,6 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Reproducer fixtures for projection-pushdown bugs.
4
+
5
+ See ``tests/test_projection_repro.py`` for the test cases.
6
+ """