vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Deliberately-broken PartitionColumns fixtures for v2 contract testing.
|
|
4
|
+
|
|
5
|
+
Each fixture violates one specific clause of the PartitionColumns contract
|
|
6
|
+
documented at ``vgi/_test_fixtures/table/partition_columns.py`` /
|
|
7
|
+
``vgi/src/vgi_table_function_impl.cpp::InstallBatch``.
|
|
8
|
+
|
|
9
|
+
* :class:`BrokenMissingPartitionValuesFunction` — declares
|
|
10
|
+
``partition_kind = SINGLE_VALUE_PARTITIONS`` and an annotated bind-
|
|
11
|
+
schema field, but bypasses the framework's wrapper validation by
|
|
12
|
+
reaching the inner OutputCollector directly. The C++ extension's
|
|
13
|
+
``InstallBatch`` catches the missing ``vgi_partition_values#b64``
|
|
14
|
+
metadata.
|
|
15
|
+
|
|
16
|
+
* :class:`BrokenPartitionMinNeqMaxFunction` — declares
|
|
17
|
+
``SINGLE_VALUE_PARTITIONS`` but emits a chunk whose partition
|
|
18
|
+
column has multiple distinct values. The framework's auto-extract
|
|
19
|
+
path would catch this client-side, so the fixture supplies an
|
|
20
|
+
explicit ``partition_values={"col": (min, max)}`` with min != max
|
|
21
|
+
to defeat the worker check and reach the C++ defense-in-depth
|
|
22
|
+
validation in ``InstallBatch``. The C++ check is what guarantees
|
|
23
|
+
this fires on release builds where DuckDB's own
|
|
24
|
+
``BatchedDataCollection::Append`` assertion is compiled out.
|
|
25
|
+
|
|
26
|
+
* :class:`BrokenPartitionValuesNoAnnotationFunction` — no
|
|
27
|
+
``vgi.partition_column`` annotation on any bind-schema field and
|
|
28
|
+
``partition_kind = NOT_PARTITIONED``, but the worker passes
|
|
29
|
+
``partition_values=`` on ``out.emit`` anyway. The framework
|
|
30
|
+
rejects with RuntimeError at the emit site.
|
|
31
|
+
|
|
32
|
+
* :class:`BrokenPartitionColumnAbsentFromBatchFunction` — declares
|
|
33
|
+
``partition_kind`` and annotates a bind-schema field, but the
|
|
34
|
+
worker emits a batch that DOES NOT include that column AND does
|
|
35
|
+
not supply an explicit ``partition_values=`` override. The
|
|
36
|
+
framework's ``_merge_partition_values`` raises RuntimeError at
|
|
37
|
+
the emit site (auto-extract can't find the column).
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
from dataclasses import dataclass
|
|
43
|
+
from typing import Annotated, Any, ClassVar, cast
|
|
44
|
+
|
|
45
|
+
import pyarrow as pa
|
|
46
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
47
|
+
from vgi_rpc.rpc import OutputCollector
|
|
48
|
+
|
|
49
|
+
from vgi._test_fixtures.table._common import _cardinality_from_count
|
|
50
|
+
from vgi.arguments import Arg
|
|
51
|
+
from vgi.metadata import PartitionKind
|
|
52
|
+
from vgi.protocol import VgiOutputCollector
|
|
53
|
+
from vgi.schema_utils import partition_field
|
|
54
|
+
from vgi.table_function import (
|
|
55
|
+
ProcessParams,
|
|
56
|
+
TableFunctionGenerator,
|
|
57
|
+
bind_fixed_schema,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(slots=True, frozen=True)
class _BrokenArgs:
    """Arguments shared by every deliberately-broken fixture in this module."""

    # Positional argument 0. Each fixture uses it as the number of rows in the
    # single batch it tries to emit; ``ge=1`` is forwarded to ``Arg``
    # (presumably enforced as a lower bound by the framework — confirm).
    count: Annotated[int, Arg(0, doc="Rows to attempt to emit", ge=1)]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(kw_only=True)
class _BrokenState(ArrowSerializableDataclass):
    """Per-stream state shared by the broken fixtures in this module."""

    # Flipped to True by a fixture's ``process`` after its one emit attempt;
    # the next ``process`` call sees it and finishes the stream instead.
    emitted: bool = False
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# =============================================================================
|
|
72
|
+
# 1. Missing partition_values metadata (C++ side raises)
|
|
73
|
+
# =============================================================================
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@bind_fixed_schema
@_cardinality_from_count
class BrokenMissingPartitionValuesFunction(TableFunctionGenerator[_BrokenArgs, _BrokenState]):
    """Opt-in declared, but worker bypasses framework metadata merge."""

    # "country" carries the partition annotation; "sales" is ordinary data.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("country", pa.string()),
            pa.field("sales", pa.int64()),
        ]
    )

    class Meta:
        name = "broken_missing_partition_values"
        description = (
            "DELIBERATELY BROKEN: declares partition_kind + partition-annotated "
            "field but emits a data batch without vgi_partition_values#b64 "
            "metadata. C++ extension's contract check raises."
        )
        categories = ["testing", "broken"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS

    @classmethod
    def initial_state(cls, params: ProcessParams[_BrokenArgs]) -> _BrokenState:
        """Begin each stream with the single batch still pending."""
        return _BrokenState(emitted=False)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BrokenArgs],
        state: _BrokenState,
        out: OutputCollector,
    ) -> None:
        """Emit one batch through the innermost collector, then finish.

        The first call builds ``count`` rows and hands them to the collector
        found at the bottom of the ``_inner`` chain; the following call
        finishes the stream.
        """
        if not state.emitted:
            row_count = params.args.count
            columns = {
                "country": ["US"] * row_count,
                "sales": list(range(row_count)),
            }
            batch = pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA)
            # This is the deliberate breakage: walk down the wrapper stack
            # and emit on the innermost collector. The framework's
            # _merge_partition_values validator is skipped, so the batch
            # goes out without vgi_partition_values#b64 metadata and the
            # C++ extension's InstallBatch contract check fires.
            # Same pattern as v1's broken_missing_batch_index_tag fixture.
            raw_collector = out
            while hasattr(raw_collector, "_inner"):
                raw_collector = raw_collector._inner
            raw_collector.emit(batch)
            state.emitted = True
            return
        out.finish()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# =============================================================================
|
|
130
|
+
# 2. SINGLE_VALUE with min != max (C++ defense-in-depth raises)
|
|
131
|
+
# =============================================================================
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@bind_fixed_schema
@_cardinality_from_count
class BrokenPartitionMinNeqMaxFunction(TableFunctionGenerator[_BrokenArgs, _BrokenState]):
    """SINGLE_VALUE_PARTITIONS but emit min != max via explicit override."""

    # "country" carries the partition annotation; "sales" is ordinary data.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("country", pa.string()),
            pa.field("sales", pa.int64()),
        ]
    )

    class Meta:
        name = "broken_partition_min_neq_max"
        description = (
            "DELIBERATELY BROKEN: declares SINGLE_VALUE_PARTITIONS but "
            "supplies an explicit partition_values override with "
            "min != max. The framework's wrapper validation doesn't "
            "compare min vs max for SINGLE_VALUE; the C++ extension's "
            "defense-in-depth check in InstallBatch raises."
        )
        categories = ["testing", "broken"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS

    @classmethod
    def initial_state(cls, params: ProcessParams[_BrokenArgs]) -> _BrokenState:
        """Begin each stream with the single batch still pending."""
        return _BrokenState(emitted=False)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BrokenArgs],
        state: _BrokenState,
        out: OutputCollector,
    ) -> None:
        """Emit one batch with contradictory partition bounds, then finish."""
        if not state.emitted:
            row_count = params.args.count
            # The data itself is single-valued ("US" in every row), so the
            # framework's auto-extract WOULD pass. The explicit override
            # below is what forces min != max, defeating the framework
            # check and reaching the C++ defense-in-depth validation.
            columns = {
                "country": ["US"] * row_count,
                "sales": list(range(row_count)),
            }
            batch = pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA)
            lower_bound = pa.scalar("US", type=pa.string())
            upper_bound = pa.scalar("BR", type=pa.string())  # max != min — bug
            collector = cast(VgiOutputCollector, out)
            collector.emit(
                batch,
                partition_values={"country": (lower_bound, upper_bound)},
            )
            state.emitted = True
            return
        out.finish()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# =============================================================================
|
|
193
|
+
# 3. partition_values kwarg without any annotated field (worker-side raise)
|
|
194
|
+
# =============================================================================
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@bind_fixed_schema
@_cardinality_from_count
class BrokenPartitionValuesNoAnnotationFunction(TableFunctionGenerator[_BrokenArgs, _BrokenState]):
    """No partition annotation, but worker passes partition_values=."""

    # Both fields are plain pa.field() — no partition_field() — so the bind
    # schema has no partition columns.
    # The cast exists for mypy only: it joins Field[StringType] and
    # Field[Int64Type] to Field[object]; at runtime this is a plain list
    # of pa.Field.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        cast(
            "list[pa.Field[Any]]",
            [pa.field("country", pa.string()), pa.field("sales", pa.int64())],
        )
    )

    class Meta:
        name = "broken_partition_values_no_annotation"
        description = (
            "DELIBERATELY BROKEN: no field carries vgi.partition_column "
            "metadata (and partition_kind defaults to NOT_PARTITIONED), "
            "but the worker passes partition_values= on out.emit. The "
            "framework rejects with RuntimeError before the wire."
        )
        categories = ["testing", "broken"]
        # No partition_kind setting — defaults to NOT_PARTITIONED.

    @classmethod
    def initial_state(cls, params: ProcessParams[_BrokenArgs]) -> _BrokenState:
        """Begin each stream with the single batch still pending."""
        return _BrokenState(emitted=False)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BrokenArgs],
        state: _BrokenState,
        out: OutputCollector,
    ) -> None:
        """Emit one batch with an unexpected ``partition_values=``, then finish."""
        if not state.emitted:
            row_count = params.args.count
            columns = {
                "country": ["US"] * row_count,
                "sales": list(range(row_count)),
            }
            batch = pa.RecordBatch.from_pydict(columns, schema=cls.FIXED_SCHEMA)
            # The bounds are self-consistent (min == max); the breakage is
            # supplying partition_values at all when nothing is annotated.
            lower_bound = pa.scalar("US", type=pa.string())
            upper_bound = pa.scalar("US", type=pa.string())
            collector = cast(VgiOutputCollector, out)
            collector.emit(
                batch,
                partition_values={"country": (lower_bound, upper_bound)},
            )
            state.emitted = True
            return
        out.finish()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# =============================================================================
|
|
254
|
+
# 4. Annotated column missing from batch, no explicit override (worker-side raise)
|
|
255
|
+
# =============================================================================
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@bind_fixed_schema
@_cardinality_from_count
class BrokenPartitionColumnAbsentFromBatchFunction(TableFunctionGenerator[_BrokenArgs, _BrokenState]):
    """Annotated partition column not in emitted batch, no override."""

    # "category" carries the partition annotation; "revenue" is ordinary data.
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            partition_field("category", pa.string()),
            pa.field("revenue", pa.int64()),
        ]
    )

    class Meta:
        name = "broken_partition_column_absent_from_batch"
        description = (
            "DELIBERATELY BROKEN: declares partition_kind on "
            "'category' but emits a batch without 'category' AND "
            "doesn't supply an explicit partition_values override. The "
            "framework's auto-extract fails with RuntimeError before "
            "the wire."
        )
        categories = ["testing", "broken"]
        partition_kind = PartitionKind.SINGLE_VALUE_PARTITIONS

    @classmethod
    def initial_state(cls, params: ProcessParams[_BrokenArgs]) -> _BrokenState:
        """Begin each stream with the single batch still pending."""
        return _BrokenState(emitted=False)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_BrokenArgs],
        state: _BrokenState,
        out: OutputCollector,
    ) -> None:
        """Emit one batch that lacks the annotated column, then finish."""
        if not state.emitted:
            # The batch is built against a revenue-only schema, so
            # 'category' is absent. Framework's auto-extract tries to read
            # batch.column('category') and raises.
            revenue_only = pa.schema([pa.field("revenue", pa.int64())])
            revenue_values = list(range(params.args.count))
            batch = pa.RecordBatch.from_pydict(
                {"revenue": revenue_values},
                schema=revenue_only,
            )
            out.emit(batch)
            state.emitted = True
            return
        out.finish()
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Example function exercising the dynamic_to_string callback.
|
|
4
|
+
|
|
5
|
+
``ProfilingDemoFunction`` demonstrates the recommended persistence
|
|
6
|
+
pattern for diagnostics that should surface under ``EXPLAIN ANALYZE``:
|
|
7
|
+
|
|
8
|
+
1. ``process()`` keeps per-stream counters in user state (rows,
|
|
9
|
+
batches, start time), and after every tick writes a serialized
|
|
10
|
+
snapshot via ``params.storage.state_put(b"profile", pid_key, bytes)``.
|
|
11
|
+
2. ``dynamic_to_string()`` constructs a ``BoundStorage`` for the
|
|
12
|
+
given ``execution_id``, calls ``state_drain(b"profile")`` to gather every
|
|
13
|
+
worker's last snapshot, and sums them.
|
|
14
|
+
|
|
15
|
+
``BoundStorage`` defaults to the sqlite-backed shared storage (see
|
|
16
|
+
CLAUDE.md → ``VGI_WORKER_SHARED_STORAGE``), so the pattern works across
|
|
17
|
+
worker processes — both subprocess transport and HTTP transport with
|
|
18
|
+
``max_workers > 1``. No in-memory class state is involved.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import struct
|
|
24
|
+
import time
|
|
25
|
+
from collections.abc import Mapping
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from typing import Annotated, ClassVar
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pyarrow as pa
|
|
31
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
32
|
+
from vgi_rpc.rpc import OutputCollector
|
|
33
|
+
|
|
34
|
+
from vgi._test_fixtures.table._common import CountBatchArgs
|
|
35
|
+
from vgi.arguments import Arg
|
|
36
|
+
from vgi.function_storage import BoundStorage
|
|
37
|
+
from vgi.metadata import FunctionExample
|
|
38
|
+
from vgi.schema_utils import schema
|
|
39
|
+
from vgi.table_function import (
|
|
40
|
+
BindParams,
|
|
41
|
+
ProcessParams,
|
|
42
|
+
TableCardinality,
|
|
43
|
+
TableFunctionGenerator,
|
|
44
|
+
bind_fixed_schema,
|
|
45
|
+
init_single_worker,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(frozen=True)
class ProfilingDemoArgs(CountBatchArgs):
    """Arguments for ProfilingDemoFunction.

    Adds a step size to ``CountBatchArgs``; the ``count`` and ``batch_size``
    values that ``ProfilingDemoFunction`` reads are inherited from that base.
    """

    # Named argument "increment", default 1, with ``ge=1`` forwarded to
    # ``Arg``. Used as the spacing between consecutive generated values.
    increment: Annotated[int, Arg("increment", default=1, doc="Step between values", ge=1)]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass(kw_only=True)
class ProfilingState(ArrowSerializableDataclass):
    """Per-stream counters."""

    # Rows still to be generated; seeded from ``args.count`` and decremented
    # by each batch's size.
    remaining: int
    # Count of values generated so far; multiplied by the increment to get
    # the first value of the next batch.
    current_index: int = 0
    # Running total of rows handed to ``out.emit``.
    rows_emitted: int = 0
    # Running total of ``out.emit`` calls.
    batches_emitted: int = 0
    # ``time.monotonic_ns()`` captured in ``initial_state``; elapsed time is
    # measured against it.
    started_at_ns: int = 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Wire format of one serialized snapshot: rows, batches and elapsed_us,
# packed back-to-back as three little-endian uint64s (24 bytes in total).
_SNAPSHOT = struct.Struct("<QQQ")


def _pack_snapshot(rows: int, batches: int, elapsed_us: int) -> bytes:
    """Encode one snapshot into its fixed-size binary form.

    Raises ``struct.error`` if any value does not fit an unsigned 64-bit
    integer.
    """
    counters = (rows, batches, elapsed_us)
    return _SNAPSHOT.pack(*counters)


def _unpack_snapshot(data: bytes) -> tuple[int, int, int]:
    """Decode a snapshot into ``(rows, batches, elapsed_us)``.

    Raises ``struct.error`` if ``data`` is not exactly ``_SNAPSHOT.size``
    bytes long.
    """
    rows, batches, elapsed_us = _SNAPSHOT.unpack(data)
    return rows, batches, elapsed_us
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _publish_snapshot(params: ProcessParams[ProfilingDemoArgs], state: ProfilingState) -> None:
    """Write this process's current counters to execution-scoped storage.

    The snapshot goes under namespace ``b"profile"``, keyed by the OS pid, so
    each call from the same process replaces that process's previous entry
    (per the storage semantics the original comments describe — confirm
    against ``BoundStorage.state_put``).
    """
    import os  # function-scope import; the module's import block is left untouched

    elapsed_us = (time.monotonic_ns() - state.started_at_ns) // 1000
    params.storage.state_put(
        b"profile",
        BoundStorage.pack_int_key(os.getpid()),
        _pack_snapshot(state.rows_emitted, state.batches_emitted, elapsed_us),
    )


@init_single_worker
@bind_fixed_schema
class ProfilingDemoFunction(TableFunctionGenerator[ProfilingDemoArgs, ProfilingState]):
    """Sequence generator that publishes per-execution metrics under EXPLAIN ANALYZE.

    Output is identical to ``sequence(count, batch_size, increment)``.
    Additionally tracks ``rows_produced``, ``batches_emitted``, and
    ``elapsed_ms`` and surfaces them via ``dynamic_to_string``.
    """

    FunctionArguments = ProfilingDemoArgs

    class Meta:
        """Metadata for ProfilingDemoFunction."""

        name = "profiling_demo"
        description = "Sequence generator publishing diagnostics under EXPLAIN ANALYZE"
        categories = ["generator", "utility"]
        examples = [
            FunctionExample(
                sql="EXPLAIN ANALYZE SELECT count(*) FROM profiling_demo(500)",
                description="Run with diagnostics surfaced as Extra Info",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.int64

    @classmethod
    def cardinality(cls, params: BindParams[ProfilingDemoArgs]) -> TableCardinality:
        """Report ``count`` as both the estimated and the maximum row count."""
        count = params.args.count
        return TableCardinality(estimate=count, max=count)

    @classmethod
    def initial_state(cls, params: ProcessParams[ProfilingDemoArgs]) -> ProfilingState:
        """Create fresh counters and record the stream's start time."""
        return ProfilingState(
            remaining=params.args.count,
            started_at_ns=time.monotonic_ns(),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[ProfilingDemoArgs],
        state: ProfilingState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of the sequence and publish a metrics snapshot.

        Each call emits up to ``batch_size`` values, updates the counters in
        ``state`` and publishes a snapshot. Once nothing remains, one last
        snapshot is published and the stream is finished.
        """
        if state.remaining <= 0:
            # Final write so dynamic_to_string sees the totals even after
            # the stream finishes.
            _publish_snapshot(params, state)
            out.finish()
            return

        size = min(state.remaining, params.args.batch_size)
        increment = params.args.increment
        values = np.arange(
            state.current_index * increment,
            (state.current_index + size) * increment,
            increment,
            dtype=cls.NUMPY_DTYPE,
        )
        out.emit(pa.RecordBatch.from_arrays([pa.array(values)], schema=params.output_schema))
        state.current_index += size
        state.remaining -= size
        state.rows_emitted += size
        state.batches_emitted += 1

        # Per-tick snapshot, published only after the counters above have
        # been updated so it reflects the batch just emitted.
        _publish_snapshot(params, state)

    @classmethod
    def dynamic_to_string(
        cls,
        params: BindParams[ProfilingDemoArgs],
        execution_id: bytes,
    ) -> Mapping[str, str]:
        """Aggregate the published snapshots into display strings.

        Returns a mapping with ``rows_produced``, ``batches_emitted`` and
        ``elapsed_ms``, or an empty mapping when no snapshot is available or
        the storage read fails.
        """
        # BindParams doesn't carry a BoundStorage (no execution_id at bind
        # time). Construct one with the execution_id we received.
        storage = BoundStorage(cls.storage, execution_id, request=params.bind_call)
        try:
            # state_drain returns (key, value) pairs; we only want the values.
            snapshots = [value for _, value in storage.state_drain(b"profile")]
        except Exception:
            # Diagnostics are best-effort: a storage failure must not break
            # the query, so still return nothing — but leave a debug trace
            # rather than swallowing the error without any record.
            import logging

            logging.getLogger(__name__).debug(
                "profiling_demo: reading snapshots failed; omitting diagnostics",
                exc_info=True,
            )
            return {}
        if not snapshots:
            return {}

        # Rows and batches are summed across snapshots; elapsed time takes
        # the maximum.
        # NOTE(review): summing assumes each pid's snapshot counts a
        # disjoint set of rows — confirm for multi-worker transports.
        rows = 0
        batches = 0
        elapsed_us = 0
        for blob in snapshots:
            snap_rows, snap_batches, snap_elapsed = _unpack_snapshot(blob)
            rows += snap_rows
            batches += snap_batches
            elapsed_us = max(elapsed_us, snap_elapsed)
        return {
            "rows_produced": str(rows),
            "batches_emitted": str(batches),
            "elapsed_ms": f"{elapsed_us / 1000.0:.2f}",
        }
|