vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
|
@@ -0,0 +1,1005 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""Filter-pushdown demos (filter_echo, dynamic_filter_echo, expression_filter, spatial_filter)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import struct
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Annotated, Any, ClassVar
|
|
11
|
+
|
|
12
|
+
import pyarrow as pa
|
|
13
|
+
from vgi_rpc import ArrowSerializableDataclass
|
|
14
|
+
from vgi_rpc.rpc import OutputCollector
|
|
15
|
+
|
|
16
|
+
from vgi._test_fixtures.table._common import (
|
|
17
|
+
_cardinality_from_count,
|
|
18
|
+
_EmptyArgs,
|
|
19
|
+
)
|
|
20
|
+
from vgi.arguments import Arg
|
|
21
|
+
from vgi.invocation import GlobalInitResponse
|
|
22
|
+
from vgi.metadata import FunctionExample
|
|
23
|
+
from vgi.schema_utils import schema
|
|
24
|
+
from vgi.table_filter_pushdown import PushdownFilters
|
|
25
|
+
from vgi.table_function import (
|
|
26
|
+
InitParams,
|
|
27
|
+
ProcessParams,
|
|
28
|
+
TableFunctionGenerator,
|
|
29
|
+
bind_fixed_schema,
|
|
30
|
+
init_single_worker,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _format_pushed_filters(filters: PushdownFilters | None) -> str:
|
|
37
|
+
"""Format pushed-down filters as a human-readable SQL-like string.
|
|
38
|
+
|
|
39
|
+
Large IN lists (from join key pushdown) are truncated to avoid
|
|
40
|
+
generating multi-megabyte filter strings.
|
|
41
|
+
"""
|
|
42
|
+
if not filters:
|
|
43
|
+
return "(none)"
|
|
44
|
+
|
|
45
|
+
from vgi.table_filter_pushdown import AndFilter, InFilter, OrFilter, _filter_to_sql
|
|
46
|
+
|
|
47
|
+
def _format_one(f: object) -> str:
|
|
48
|
+
"""Format a single filter, truncating large InFilters."""
|
|
49
|
+
if isinstance(f, InFilter) and len(f.values) > 20:
|
|
50
|
+
return f"{f.column_name} IN ({len(f.values)} values)"
|
|
51
|
+
if isinstance(f, AndFilter):
|
|
52
|
+
child_parts = [_format_one(c) for c in f.children]
|
|
53
|
+
return "(" + " AND ".join(child_parts) + ")"
|
|
54
|
+
if isinstance(f, OrFilter):
|
|
55
|
+
child_parts = [_format_one(c) for c in f.children]
|
|
56
|
+
return "(" + " OR ".join(child_parts) + ")"
|
|
57
|
+
# Fall back to SQL rendering for other filter types
|
|
58
|
+
sql, params = _filter_to_sql(f, lambda s: s, "?", 0) # type: ignore[arg-type]
|
|
59
|
+
parts: list[str] = []
|
|
60
|
+
param_iter = iter(params)
|
|
61
|
+
for chunk in sql.split("?"):
|
|
62
|
+
parts.append(chunk)
|
|
63
|
+
try:
|
|
64
|
+
p = next(param_iter)
|
|
65
|
+
parts.append(repr(p) if isinstance(p, str) else str(p))
|
|
66
|
+
except StopIteration:
|
|
67
|
+
pass
|
|
68
|
+
return "".join(parts)
|
|
69
|
+
|
|
70
|
+
formatted_parts = [_format_one(f) for f in filters]
|
|
71
|
+
return " AND ".join(formatted_parts) if formatted_parts else "(none)"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(slots=True, frozen=True)
class FilterEchoFunctionArgs:
    """Immutable call arguments for ``FilterEchoFunction``.

    ``count`` is declared with ``Arg(0, ...)`` and ``batch_size`` with
    ``Arg("batch_size", ...)`` defaulting to 2048.
    """

    # How many rows the function generates in total (declared with ge=0).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Maximum rows emitted per process() call (declared with ge=1).
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(kw_only=True)
class FilterEchoState(ArrowSerializableDataclass):
    """Per-scan progress for ``FilterEchoFunction`` plus the cached filter text.

    ``filter_str`` is deliberately a serialized field rather than a Transient
    one. The framework's HTTP rehydrate path deserializes user state without
    calling ``initial_state`` again, so a Transient value would silently fall
    back to ``"(none)"`` after the first state-token round-trip.
    """

    # Rows still to be emitted.
    remaining: int
    # Value of ``n`` for the first row of the next batch.
    current_index: int = 0
    # Rendered pushed-down filters, echoed on every output row.
    filter_str: str = "(none)"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class FilterEchoFunction(TableFunctionGenerator[FilterEchoFunctionArgs, FilterEchoState]):
    """Echoes pushed-down filter predicates in output for diagnostic purposes.

    USE CASE
    --------
    Verify which filters DuckDB pushes down to the VGI worker. The
    ``pushed_filters`` column shows the SQL-like representation of all
    filters the engine sent. Filters are auto-applied by the worker so
    the result set is always correct.

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "pushed_filters": string}

    Example:
    -------
    SELECT * FROM filter_echo(10) WHERE n >= 8
    Returns: rows 8-9 with pushed_filters showing "n >= 8"

    """

    class Meta:
        """Metadata for FilterEchoFunction."""

        name = "filter_echo"
        description = "Echoes pushed-down filter predicates in output"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM filter_echo(10)",
                description="Generate 10 rows showing pushed filters",
            ),
            FunctionExample(
                sql="SELECT pushed_filters FROM filter_echo(10) WHERE n >= 8",
                description="See which filters were pushed down",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema({"n": pa.int64(), "s": pa.utf8(), "pushed_filters": pa.utf8()})

    @classmethod
    def initial_state(cls, params: ProcessParams[FilterEchoFunctionArgs]) -> FilterEchoState:
        """Build the starting state: full row budget plus the rendered filters.

        The filter text is computed once here and stored in the state so that
        ``process`` can echo it on every batch.
        """
        init_call = params.init_call
        assert init_call is not None
        raw_filters = init_call.pushdown_filters
        join_keys = init_call.join_keys

        # Only parse when the engine actually sent filters.
        parsed = None
        if raw_filters is not None:
            parsed = cls.pushdown_filters(raw_filters, join_keys=join_keys)

        return FilterEchoState(
            remaining=params.args.count,
            filter_str=_format_pushed_filters(parsed),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[FilterEchoFunctionArgs],
        state: FilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of ``n`` / ``s`` / ``pushed_filters`` rows.

        Calls ``out.finish()`` once no rows remain; otherwise emits up to
        ``batch_size`` rows and advances the state.
        """
        if state.remaining <= 0:
            out.finish()
            return

        first = state.current_index
        batch_rows = min(state.remaining, params.args.batch_size)
        indices = list(range(first, first + batch_rows))

        columns = {
            "n": indices,
            "s": [f"row_{i}" for i in indices],
            "pushed_filters": [state.filter_str] * batch_rows,
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))

        state.remaining -= batch_rows
        state.current_index += batch_rows
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ============================================================================
|
|
186
|
+
# ValuePruneFunction — exercises PushdownFilters.get_column_values('n'), the
|
|
187
|
+
# partition-pruning idiom (resolve the discrete value set up front, fetch only
|
|
188
|
+
# those keys). filter_echo can't cover this: it auto-applies the predicate
|
|
189
|
+
# row-by-row via Filter.evaluate, a different code path. Here the `resolved`
|
|
190
|
+
# column echoes exactly what get_column_values returned, so a regression in the
|
|
191
|
+
# AND/OR-descent of that accessor is directly observable — e.g. DuckDB pushing
|
|
192
|
+
# `n IN (...) AND n >= min AND n <= max` (an AndFilter) or `n = a OR n = b` (an
|
|
193
|
+
# OrFilter) must resolve to the discrete set, not collapse to "(scan)".
|
|
194
|
+
# ============================================================================
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@dataclass(slots=True, frozen=True)
class _ValuePruneArgs:
    """Immutable call arguments for ``ValuePruneFunction``."""

    # Size of the candidate key space: keys are 0..count-1 (declared with ge=0).
    count: Annotated[int, Arg(0, doc="Number of candidate rows (keys 0..count-1)", ge=0)]
    # Maximum rows emitted per process() call (declared with ge=1).
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@dataclass(kw_only=True)
class _ValuePruneState(ArrowSerializableDataclass):
    """Keys still to emit, plus the echoed ``get_column_values`` outcome.

    ``values`` and ``resolved`` are serialized fields (not Transient): the
    HTTP rehydrate path restores state without re-running ``initial_state``,
    so the resolution has to survive a state-token round-trip.
    """

    # Keys to emit, fixed at initial_state time.
    values: list[int]
    # Comma-joined resolved set, or "(scan)" when nothing was resolved.
    resolved: str
    # Index into ``values`` of the next key to emit.
    cursor: int = 0
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class ValuePruneFunction(TableFunctionGenerator[_ValuePruneArgs, _ValuePruneState]):
    """Emits only the keys that ``get_column_values('n')`` resolves to.

    The ``resolved`` column carries the sorted, comma-joined discrete set the
    accessor returned (or ``"(scan)"`` when it returned None, i.e. the predicate
    is not enumerable — no filter, a bare range, or an OR with a non-discrete
    branch). Assert on ``resolved`` to verify the accessor end-to-end,
    independent of any residual filtering.
    """

    class Meta:
        """Metadata for ValuePruneFunction."""

        name = "value_prune"
        description = "Prunes the key set via get_column_values('n'); echoes the resolved discrete values"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT DISTINCT resolved FROM value_prune(100) WHERE n IN (5, 50, 95)",
                description="Resolve a discrete key set from an IN predicate",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema({"n": pa.int64(), "resolved": pa.utf8()})

    @classmethod
    def initial_state(cls, params: ProcessParams[_ValuePruneArgs]) -> _ValuePruneState:
        """Work out which keys to emit from the pushed-down filters on ``n``.

        When ``get_column_values("n")`` yields a set, the non-null values are
        sorted and echoed in ``resolved``, and only those inside
        ``[0, count)`` are emitted. Otherwise every key ``0..count-1`` is
        emitted and ``resolved`` is ``"(scan)"``.
        """
        init_call = params.init_call
        assert init_call is not None
        key_space = params.args.count
        raw_filters = init_call.pushdown_filters
        join_keys = init_call.join_keys

        parsed = None if raw_filters is None else cls.pushdown_filters(raw_filters, join_keys=join_keys)
        discrete = None if parsed is None else parsed.get_column_values("n")

        # Not enumerable: fall back to the full key space.
        if discrete is None:
            return _ValuePruneState(values=list(range(key_space)), resolved="(scan)")

        ordered = [value for value in discrete.to_pylist() if value is not None]
        ordered.sort()
        return _ValuePruneState(
            # Echo everything that was resolved, but emit only in-range keys.
            values=[value for value in ordered if 0 <= value < key_space],
            resolved=",".join([str(value) for value in ordered]),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ValuePruneArgs],
        state: _ValuePruneState,
        out: OutputCollector,
    ) -> None:
        """Emit the next slice of resolved keys, each tagged with ``resolved``.

        Calls ``out.finish()`` once the cursor has passed the last key.
        """
        pending = len(state.values) - state.cursor
        if pending <= 0:
            out.finish()
            return

        batch_rows = min(pending, params.args.batch_size)
        keys = state.values[state.cursor : state.cursor + batch_rows]
        columns = {"n": keys, "resolved": [state.resolved] * len(keys)}
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))
        state.cursor += batch_rows
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# ============================================================================
|
|
291
|
+
# DictFilterEchoFunction — output column declared as a *dictionary* Arrow type
|
|
292
|
+
# (dictionary<int8, utf8>) with no ENUM metadata. DuckDB maps such a column to
|
|
293
|
+
# plain VARCHAR, so a `WHERE s = 'x'` / `s IN (...)` predicate pushes a VARCHAR
|
|
294
|
+
# (string) literal down to the worker. The worker then emits the column
|
|
295
|
+
# dictionary-encoded, producing a (dictionary column, string literal) pair that
|
|
296
|
+
# the filter evaluator must compare. Naively casting the literal up to the
|
|
297
|
+
# column's dictionary type makes `pc.is_in(dict, dict)` / `pc.equal(dict, dict)`
|
|
298
|
+
# throw `ArrowTypeError: Array type doesn't match type of values set`; the
|
|
299
|
+
# correct path decodes the column to its value type. This fixture pins that
|
|
300
|
+
# behavior so every language implementation handles it identically.
|
|
301
|
+
# ============================================================================
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# Output schema for dict_filter_echo: ``s`` is dictionary<int8, utf8>.
_DICT_FILTER_ECHO_SCHEMA = pa.schema(
    {
        "n": pa.int64(),
        "s": pa.dictionary(pa.int8(), pa.utf8()),
    }
)

# A small, fixed palette keeps dictionary encoding meaningful and makes the
# row-to-value mapping trivial to assert: row i carries _DICT_VALUES[i % len].
_DICT_VALUES: tuple[str, ...] = ("red", "green", "blue")
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@dataclass(slots=True, frozen=True)
class _DictFilterEchoArgs:
    """Immutable call arguments for ``DictFilterEchoFunction``."""

    # Total rows to generate (declared with ge=0).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Maximum rows emitted per process() call (declared with ge=1).
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Rows per batch", ge=1)]
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
@dataclass(kw_only=True)
class _DictFilterEchoState(ArrowSerializableDataclass):
    """Scan progress for ``DictFilterEchoFunction``."""

    # Rows still to be emitted.
    remaining: int
    # Value of ``n`` for the first row of the next batch.
    current_index: int = 0
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DictFilterEchoFunction(TableFunctionGenerator[_DictFilterEchoArgs, _DictFilterEchoState]):
    """Emits a dictionary-encoded VARCHAR column to exercise filter pushdown.

    USE CASE
    --------
    Regression coverage for filter pushdown over a dictionary-encoded
    column whose DuckDB-facing type is plain VARCHAR. The pushed literal
    arrives as a string while the emitted column is ``dictionary<int8,
    utf8>``; the auto-applied filter must compare the two without
    throwing. See the module comment above.

    SCHEMA
    ------
    Output: {"n": int64, "s": dictionary<int8, utf8> (VARCHAR to DuckDB)}

    Row i has s = ("red", "green", "blue")[i % 3].

    Example:
    -------
    SELECT * FROM dict_filter_echo(6) WHERE s = 'green'
    Returns: rows 1 and 4.

    """

    class Meta:
        """Metadata for DictFilterEchoFunction."""

        name = "dict_filter_echo"
        description = "Emits a dictionary-encoded VARCHAR column for filter-pushdown testing"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM dict_filter_echo(6) WHERE s = 'green'",
                description="Filter a dictionary-encoded column by an equality predicate",
            ),
            FunctionExample(
                sql="SELECT * FROM dict_filter_echo(6) WHERE s IN ('red', 'blue')",
                description="Filter a dictionary-encoded column by an IN predicate",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _DICT_FILTER_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_DictFilterEchoArgs]) -> _DictFilterEchoState:
        """Start with the full requested row count still to emit."""
        requested = params.args.count
        return _DictFilterEchoState(remaining=requested)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DictFilterEchoArgs],
        state: _DictFilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of ``n`` values with their cycled ``s`` label.

        Calls ``out.finish()`` once no rows remain.
        """
        if state.remaining <= 0:
            out.finish()
            return

        first = state.current_index
        batch_rows = min(state.remaining, params.args.batch_size)
        palette_size = len(_DICT_VALUES)

        indices = list(range(first, first + batch_rows))
        # Row i cycles through the palette: _DICT_VALUES[i % len(_DICT_VALUES)].
        labels = [_DICT_VALUES[i % palette_size] for i in indices]

        batch = pa.RecordBatch.from_pydict(
            {"n": indices, "s": labels},
            schema=params.output_schema,
        )
        out.emit(batch)

        state.remaining -= batch_rows
        state.current_index += batch_rows
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
# ============================================================================
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _make_wkb_point(x: float, y: float) -> bytes:
|
|
419
|
+
"""Encode a 2D point as little-endian WKB (byte_order=1, type=1=Point, x, y)."""
|
|
420
|
+
return struct.pack("<bI", 1, 1) + struct.pack("<dd", x, y)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
# Arrow field with geoarrow.wkb extension metadata so DuckDB recognizes it as GEOMETRY
|
|
424
|
+
# Binary ``geom`` field tagged with geoarrow.wkb extension metadata.
_GEOMETRY_FIELD = pa.field("geom", pa.binary()).with_metadata(
    {
        b"ARROW:extension:name": b"geoarrow.wkb",
        b"ARROW:extension:metadata": b"{}",
    }
)

# Output schema for spatial_filter_example: row index, grid coordinates, geometry.
_SPATIAL_FILTER_SCHEMA = pa.schema(
    [  # type: ignore[arg-type]  # pyarrow stubs: mixed-type fields
        pa.field("n", pa.int64()),
        pa.field("x", pa.float64()),
        pa.field("y", pa.float64()),
        _GEOMETRY_FIELD,
    ]
)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
@dataclass(slots=True, frozen=True)
class _SpatialFilterArgs:
    """Immutable call arguments for ``SpatialFilterExampleFunction``."""

    # Total points to generate (declared with ge=1).
    count: Annotated[int, Arg(0, doc="Number of points to generate", ge=1)]
    # Maximum rows emitted per process() call. Bounded with ge=1, matching the
    # other Args classes in this module: with a batch size of 0 or below,
    # process() would emit empty batches without ever reducing ``remaining``,
    # so the scan would never finish.
    batch_size: Annotated[int, Arg("batch_size", default=1024, doc="Rows per batch", ge=1)]
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
@dataclass(kw_only=True)
class _SpatialFilterState(ArrowSerializableDataclass):
    """Scan progress for ``SpatialFilterExampleFunction``."""

    # Points still to be emitted.
    remaining: int
    # Requested point count; process() derives the grid width from it.
    total_count: int
    # Index of the first point of the next batch.
    current_index: int = 0
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class SpatialFilterExampleFunction(TableFunctionGenerator[_SpatialFilterArgs, _SpatialFilterState]):
    """Generates points on a grid with geometry column for spatial filter testing.

    USE CASE
    --------
    Test expression filter pushdown with spatial predicates. Points are placed
    on a deterministic grid in [0, 1) x [0, 1) so that bounding box filter
    counts are predictable.

    SCHEMA
    ------
    Output: {"n": int64, "x": float64, "y": float64, "geom": GEOMETRY}

    Grid layout: For count=N, point i has coordinates:
        x = (i % cols) / cols
        y = (i // cols) / cols
    where cols = ceil(sqrt(N)).

    Example:
    -------
    SELECT * FROM spatial_filter_example(100) WHERE geom && ST_MakeEnvelope(0, 0, 0.5, 0.5)
    Returns: points in the lower-left quadrant of the unit square.

    """

    class Meta:
        """Metadata for SpatialFilterExampleFunction."""

        name = "spatial_filter_example"
        description = "Generates points on a grid with geometry for spatial filter testing"
        categories = ["generator", "spatial", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["&&", "st_intersects_extent"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM spatial_filter_example(100)",
                description="Generate 100 points on a 10x10 grid",
            ),
            FunctionExample(
                sql="SELECT COUNT(*) FROM spatial_filter_example(100) WHERE geom && ST_MakeEnvelope(0, 0, 0.5, 0.5)",
                description="Count points in the lower-left quadrant",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _SPATIAL_FILTER_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_SpatialFilterArgs]) -> _SpatialFilterState:
        """Start with every requested point pending and remember the total."""
        requested = params.args.count
        return _SpatialFilterState(remaining=requested, total_count=requested)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_SpatialFilterArgs],
        state: _SpatialFilterState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of grid points with their WKB geometry.

        Calls ``out.finish()`` once no points remain.
        """
        if state.remaining <= 0:
            out.finish()
            return

        import math

        # Grid width: ceil(sqrt(total)), never below 1.
        cols = max(1, math.ceil(math.sqrt(state.total_count)))
        first = state.current_index
        batch_rows = min(state.remaining, params.args.batch_size)

        ns: list[int] = []
        xs: list[float] = []
        ys: list[float] = []
        geoms: list[bytes] = []
        for i in range(first, first + batch_rows):
            # Row-major placement: column index drives x, row index drives y.
            row, col = divmod(i, cols)
            x = col / cols
            y = row / cols
            ns.append(i)
            xs.append(x)
            ys.append(y)
            geoms.append(_make_wkb_point(x, y))

        out.emit(
            pa.RecordBatch.from_pydict(
                {"n": ns, "x": xs, "y": ys, "geom": geoms},
                schema=params.output_schema,
            )
        )

        state.remaining -= batch_rows
        state.current_index += batch_rows
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# ============================================================================
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
@dataclass(slots=True, frozen=True)
class _DynFilterEchoArgs:
    """Immutable call arguments for ``DynamicFilterEchoFunction``."""

    # Total rows to generate (declared with ge=1).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=1)]
    # Maximum rows emitted per process() call. Bounded with ge=1, matching the
    # other Args classes in this module: with a batch size of 0 or below,
    # process() would emit empty batches without ever reducing ``remaining``,
    # so the scan would never finish.
    batch_size: Annotated[int, Arg("batch_size", default=100, doc="Rows per batch", ge=1)]
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
@dataclass(kw_only=True)
class _DynFilterEchoState(ArrowSerializableDataclass):
    """Scan progress for ``DynamicFilterEchoFunction``."""

    # Rows still to be emitted.
    remaining: int
    # Count of rows already emitted; process() maps it to a descending ``n``.
    current_index: int = 0
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def _format_pushed_filters_safe(filters: object) -> str:
|
|
570
|
+
"""Format PushdownFilters to readable string, returning '(none)' if empty/None."""
|
|
571
|
+
if filters is None:
|
|
572
|
+
return "(none)"
|
|
573
|
+
from vgi.table_filter_pushdown import PushdownFilters
|
|
574
|
+
|
|
575
|
+
if isinstance(filters, PushdownFilters) and filters:
|
|
576
|
+
return repr(filters)
|
|
577
|
+
return "(none)"
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# Output schema for dynamic_filter_echo: the row value and the echoed filter text.
_DYN_FILTER_ECHO_SCHEMA = pa.schema(
    {
        "n": pa.int64(),
        "pushed_filters": pa.utf8(),
    }
)
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DynamicFilterEchoFunction(TableFunctionGenerator[_DynFilterEchoArgs, _DynFilterEchoState]):
    """Generates descending integers and echoes the current tick filter per batch.

    USE CASE
    --------
    Demonstrates dynamic filter pushdown. Rows are generated in **descending**
    order (count-1, count-2, ..., 0) so that ``ORDER BY n ASC LIMIT K`` causes
    the Top-N heap to tighten gradually. Each batch's ``pushed_filters`` column
    shows the filter received from the most recent tick.

    SCHEMA
    ------
    Output: {"n": int64, "pushed_filters": string}

    """

    class Meta:
        """Metadata for DynamicFilterEchoFunction."""

        name = "dynamic_filter_echo"
        description = "Generates descending integers, echoes dynamic tick filter per batch"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = _DYN_FILTER_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_DynFilterEchoArgs]) -> _DynFilterEchoState:
        """Start with the full requested row count still to emit."""
        requested = params.args.count
        return _DynFilterEchoState(remaining=requested)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DynFilterEchoArgs],
        state: _DynFilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next descending batch, tagged with the current filter text.

        The filter text is re-read from ``params.current_pushdown_filters`` on
        every call. Calls ``out.finish()`` once no rows remain.
        """
        if state.remaining <= 0:
            out.finish()
            return

        emitted_so_far = state.current_index
        batch_rows = min(state.remaining, params.args.batch_size)

        # Counting down: the row at position p carries n = count - 1 - p.
        highest = params.args.count - 1 - emitted_so_far
        descending = list(range(highest, highest - batch_rows, -1))
        current_filter = _format_pushed_filters_safe(params.current_pushdown_filters)

        batch = pa.RecordBatch.from_pydict(
            {"n": descending, "pushed_filters": [current_filter] * batch_rows},
            schema=params.output_schema,
        )
        out.emit(batch)

        state.remaining -= batch_rows
        state.current_index += batch_rows
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
# ============================================================================
|
|
657
|
+
|
|
658
|
+
# Fixed output schema used by ExpressionFilterTestFunction: an integer id, a
# string name, a list-of-strings tag column and a floating-point score.
_EXPR_FILTER_TEST_SCHEMA = pa.schema(
    [  # type: ignore[arg-type]  # pyarrow stubs: mixed-type fields
        pa.field("id", pa.int64()),
        pa.field("name", pa.utf8()),
        pa.field("tags", pa.list_(pa.utf8())),
        pa.field("score", pa.float64()),
    ]
)
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
@dataclass(slots=True, frozen=True)
class _ExprFilterTestArgs:
    """Call arguments accepted by ExpressionFilterTestFunction."""

    # Declared at Arg index 0 with ge=1: how many rows to generate in total.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=1)]
    # Declared under the name "batch_size" (default 1024): rows per emitted batch.
    batch_size: Annotated[int, Arg("batch_size", default=1024, doc="Rows per batch")]
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
@dataclass(kw_only=True)
class _ExprFilterTestState(ArrowSerializableDataclass):
    """Progress cursor mutated by ExpressionFilterTestFunction between batches."""

    # Rows still to be emitted; generation finishes once this reaches zero.
    remaining: int
    # Id of the first row of the next batch.
    current_index: int = 0
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class ExpressionFilterTestFunction(TableFunctionGenerator[_ExprFilterTestArgs, _ExprFilterTestState]):
    """Generates rows with list and string columns for non-spatial expression filter testing.

    USE CASE
    --------
    Test expression filter pushdown with non-spatial functions like
    list_contains, prefix, starts_with, etc.

    SCHEMA
    ------
    Output: {"id": int64, "name": string, "tags": list<string>, "score": float64}

    Row i has:
        name = 'item_<i>'
        tags = ['tag_<i%5>', 'tag_<(i+1)%5>']
        score = i * 1.1

    Rows are emitted in id order, at most ``batch_size`` per batch. A
    ``batch_size`` below 1 is treated as 1 so that generation always makes
    progress.

    """

    class Meta:
        """Metadata for ExpressionFilterTestFunction."""

        name = "expression_filter_test"
        description = "Generates rows for non-spatial expression filter testing"
        categories = ["generator", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["list_contains", "prefix", "starts_with", "contains"]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _EXPR_FILTER_TEST_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_ExprFilterTestArgs]) -> _ExprFilterTestState:
        """Create the initial state with all ``count`` rows still to be emitted."""
        return _ExprFilterTestState(remaining=params.args.count)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ExprFilterTestArgs],
        state: _ExprFilterTestState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of rows, or finish when none remain.

        Args:
            params: Call parameters; ``args.batch_size`` caps the batch and
                ``output_schema`` is the schema the batch is built against.
            state: Progress cursor; advanced by the number of rows emitted.
            out: Collector that receives the batch or the finish signal.
        """
        if state.remaining <= 0:
            out.finish()
            return

        # ``batch_size`` is declared without a lower bound. A value of 0 would
        # emit empty batches without advancing the state, and a negative value
        # would make ``remaining`` grow, so ``finish()`` would never be reached.
        # Clamp to at least one row per batch to guarantee termination.
        batch_rows = min(state.remaining, max(1, params.args.batch_size))
        first_id = state.current_index

        ids = list(range(first_id, first_id + batch_rows))
        names = [f"item_{i}" for i in ids]
        tags = [[f"tag_{i % 5}", f"tag_{(i + 1) % 5}"] for i in ids]
        scores = [i * 1.1 for i in ids]

        out.emit(
            pa.RecordBatch.from_pydict(
                {"id": ids, "name": names, "tags": tags, "score": scores},
                schema=params.output_schema,
            )
        )

        state.current_index += batch_rows
        state.remaining -= batch_rows
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
# ============================================================================
|
|
756
|
+
# FilterEchoPartitionedFunction — multi-worker fixture that exercises filter
|
|
757
|
+
# pushdown across parallel workers. Combines the queue-based work distribution
|
|
758
|
+
# of PartitionedSequenceFunction with the filter-capture pattern of
|
|
759
|
+
# FilterEchoFunction so each worker echoes the filter it observed.
|
|
760
|
+
# ============================================================================
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
# Fixed output schema used by FilterEchoPartitionedFunction: the generated
# integer, the PID of the process that produced the row, and the echoed filter.
_FILTER_ECHO_PARTITIONED_SCHEMA = schema(
    {
        "n": pa.int64(),
        "worker_pid": pa.int64(),
        "pushed_filters": pa.utf8(),
    }
)
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
@dataclass(slots=True, frozen=True)
class _FilterEchoPartitionedArgs:
    """Call arguments accepted by FilterEchoPartitionedFunction."""

    # Declared at Arg index 0 with ge=0: size of the sequence 0..count-1 that is
    # split into work ranges.
    count: Annotated[int, Arg(0, doc="Total number of integers to generate", ge=0)]
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
@dataclass(kw_only=True)
class _FilterEchoPartitionedState(ArrowSerializableDataclass):
    """State held by each worker of FilterEchoPartitionedFunction.

    ``filter_str`` is deliberately a serialized field rather than a Transient
    one. The framework's HTTP rehydrate path deserializes user state without
    calling ``initial_state`` again, so a Transient value would silently fall
    back to ``"(none)"`` after the first state-token round-trip, and every
    batch produced after a resume would lose its pushed-filter echo.
    """

    # Bounds of the work range currently being emitted (end is exclusive);
    # both stay None until the first range has been popped from the queue.
    current_start: int | None = None
    current_end: int | None = None
    # Next value to emit within the current range.
    current_idx: int = 0
    # Formatted pushed filter captured in ``initial_state``.
    filter_str: str = "(none)"
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
@bind_fixed_schema
@_cardinality_from_count
class FilterEchoPartitionedFunction(TableFunctionGenerator[_FilterEchoPartitionedArgs, _FilterEchoPartitionedState]):
    """Multi-worker filter-echo: queue-distributed sequence with filter pushdown.

    Verifies that predicates DuckDB pushes down are observed *and* applied by
    every parallel worker. Each worker pulls ranges from a shared queue and
    independently deserializes the same pushed filter spec at init. The
    framework auto-applies filters per emitted batch.

    SCHEMA
    ------
    Output: {"n": int64, "worker_pid": int64, "pushed_filters": string}

    PARALLELIZATION
    ---------------
    Uses a shared work queue: ``on_init`` splits ``0..count`` into at most
    ``MAX_PARTITIONS`` ``(start, end)`` ranges of
    ``ceil(count / MAX_PARTITIONS)`` rows each. Workers (up to DuckDB's
    parallel scan limit) pop ranges atomically and emit them in batches of at
    most ``BATCH_SIZE`` rows.
    ``worker_pid`` reveals which OS process produced each row — under
    subprocess transport that is one PID per worker; HTTP workers share a
    process so the column collapses to a single value there.

    """

    class Meta:
        """Registration metadata for FilterEchoPartitionedFunction."""

        name = "filter_echo_partitioned"
        description = "Multi-worker partitioned sequence that echoes pushed-down filters"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM filter_echo_partitioned(10) WHERE n >= 8",
                description="Multi-worker generation with filter pushdown",
            ),
        ]

    # Cap the work queue at ~MAX_PARTITIONS items regardless of count, by sizing
    # each range as ceil(count / MAX_PARTITIONS). The queue is drained one item
    # per round-trip and serialized at the per-attach DO, so partition *count*
    # drives remote cost. A fixed range size can't serve both a large query and
    # a small distribution query (too-large ranges collapse the small one to one
    # partition and kill fan-out); capping the partition count keeps ~24
    # partitions at any scale. Each work item is a fixed-size (start, end) pair
    # — rows are generated locally and emitted in BATCH_SIZE batches — so this
    # changes only the *count* of tiny pops, never any HTTP body size. Output is
    # the echoed/filtered rows (partition-independent), so assertions hold.
    MAX_PARTITIONS: ClassVar[int] = 24
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = _FILTER_ECHO_PARTITIONED_SCHEMA

    @classmethod
    def on_init(
        cls,
        params: InitParams[_FilterEchoPartitionedArgs],
    ) -> GlobalInitResponse:
        """Fill the shared work queue with packed ``(start, end)`` ranges."""
        total = params.args.count
        # Integer ceil(total / MAX_PARTITIONS), never smaller than one row.
        step = max(1, (total + cls.MAX_PARTITIONS - 1) // cls.MAX_PARTITIONS)
        packed_ranges: list[bytes] = [
            struct.pack(">QQ", lo, min(lo + step, total)) for lo in range(0, total, step)
        ]
        params.storage.queue_push(packed_ranges)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_FilterEchoPartitionedArgs]) -> _FilterEchoPartitionedState:
        """Build the per-worker state, recording the formatted pushed filter."""
        assert params.init_call is not None
        raw_filters = params.init_call.pushdown_filters
        join_keys = params.init_call.join_keys
        if raw_filters is None:
            parsed = None
        else:
            parsed = cls.pushdown_filters(raw_filters, join_keys=join_keys)
        return _FilterEchoPartitionedState(filter_str=_format_pushed_filters(parsed))

    @classmethod
    def process(
        cls,
        params: ProcessParams[_FilterEchoPartitionedArgs],
        state: _FilterEchoPartitionedState,
        out: OutputCollector,
    ) -> None:
        """Emit one batch from the current range, popping a new range when needed."""
        range_exhausted = state.current_start is None or state.current_idx >= (state.current_end or 0)
        if range_exhausted:
            payload = params.storage.queue_pop()
            if payload is None:
                # Queue drained: this worker has nothing left to produce.
                out.finish()
                return
            range_start, range_end = struct.unpack(">QQ", payload)
            state.current_start = range_start
            state.current_end = range_end
            state.current_idx = range_start

        stop = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        values = list(range(state.current_idx, stop))
        row_count = len(values)
        worker_pid = os.getpid()

        batch = pa.RecordBatch.from_pydict(
            {
                "n": values,
                "worker_pid": [worker_pid] * row_count,
                "pushed_filters": [state.filter_str] * row_count,
            },
            schema=params.output_schema,
        )
        out.emit(batch)

        state.current_idx = stop
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
# ============================================================================
|
|
911
|
+
# FilterEchoTableScanFunction — catalog *table* (not table function) backing for
|
|
912
|
+
# example.data.filter_echo_table. Mirrors FilterEchoFunction's pushed_filters
|
|
913
|
+
# echo, but is invoked with no positional args (the catalog scan route in the
|
|
914
|
+
# fixture worker passes none) so a `SELECT ... FROM example.data.filter_echo_table`
|
|
915
|
+
# — and a VIEW over it — can be characterized for filter pushdown. Crucially it
|
|
916
|
+
# declares supported_expression_filters so a `col LIKE 'abc%'` predicate (which
|
|
917
|
+
# DuckDB lowers to a prefix/starts_with expression filter) actually reaches the
|
|
918
|
+
# worker and shows up in the pushed_filters column. See
|
|
919
|
+
# test/sql/integration/table/filter_pushdown_through_view.test.
|
|
920
|
+
# ============================================================================
|
|
921
|
+
|
|
922
|
+
|
|
923
|
+
# Fixed output schema used by FilterEchoTableScanFunction.
_FILTER_ECHO_TABLE_SCHEMA = schema(
    {
        "n": pa.int64(),
        "s": pa.utf8(),
        "pushed_filters": pa.utf8(),
    }
)

# Size of the fixed dataset: n in 0..99, s = "row_<n>". The "row_" prefix
# makes LIKE 'row_1%' meaningful (it matches row_1 and row_10..row_19).
_FILTER_ECHO_TABLE_ROWS = 100
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
@dataclass(kw_only=True)
class _FilterEchoTableState(ArrowSerializableDataclass):
    """Single-emission state that also holds the captured pushed-filter text.

    ``filter_str`` is deliberately a serialized field rather than a Transient
    one. The framework's HTTP rehydrate path deserializes user state without
    calling ``initial_state`` again, so a Transient value would silently fall
    back to ``"(none)"`` after the first state-token round-trip.
    """

    # Set once the dataset has been emitted; the next call then finishes.
    done: bool = False
    # Formatted pushed filter captured in ``initial_state``.
    filter_str: str = "(none)"
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
@init_single_worker
@bind_fixed_schema
class FilterEchoTableScanFunction(TableFunctionGenerator[_EmptyArgs, _FilterEchoTableState]):
    """Catalog-table scan whose rows echo the filters pushed down to it.

    Backs ``example.data.filter_echo_table``. As with
    :class:`FilterEchoFunction`, the ``pushed_filters`` column holds the
    SQL-like rendering of whatever DuckDB pushed down, while the framework
    auto-applies those filters so the result set stays correct. Unlike
    ``filter_echo`` this is a no-arg *table* scan that opts into
    expression-filter pushdown, which makes a ``LIKE 'prefix%'`` predicate
    observable here (and through a view over this table).

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "pushed_filters": string}, 100 rows
    (n in 0..99, s = "row_<n>").
    """

    class Meta:
        """Registration metadata for FilterEchoTableScanFunction."""

        name = "filter_echo_table_scan"
        description = "Catalog-table scan echoing pushed-down filters (backs example.data.filter_echo_table)"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["prefix", "starts_with"]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _FILTER_ECHO_TABLE_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_EmptyArgs]) -> _FilterEchoTableState:
        """Build the state, recording the formatted pushed filter for echoing."""
        assert params.init_call is not None
        raw_filters = params.init_call.pushdown_filters
        join_keys = params.init_call.join_keys
        if raw_filters is None:
            parsed = None
        else:
            parsed = cls.pushdown_filters(raw_filters, join_keys=join_keys)
        return _FilterEchoTableState(filter_str=_format_pushed_filters(parsed))

    @classmethod
    def process(
        cls,
        params: ProcessParams[_EmptyArgs],
        state: _FilterEchoTableState,
        out: OutputCollector,
    ) -> None:
        """Emit the fixed dataset on the first call and finish on the next."""
        if state.done:
            out.finish()
            return
        state.done = True

        row_ids = list(range(_FILTER_ECHO_TABLE_ROWS))
        every_column: dict[str, list[Any]] = {
            "n": row_ids,
            "s": [f"row_{i}" for i in row_ids],
            "pushed_filters": [state.filter_str] * _FILTER_ECHO_TABLE_ROWS,
        }
        # projection_pushdown=True: keep only the columns named in the output schema.
        requested = {}
        for field in params.output_schema:
            requested[field.name] = every_column[field.name]
        batch = pa.RecordBatch.from_pydict(requested, schema=params.output_schema)
        out.emit(batch)
|