vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,1005 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Filter-pushdown demo table functions (filter_echo, value_prune, dict_filter_echo, spatial_filter_example, dynamic_filter_echo, expression_filter_test, and related fixtures)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ import struct
9
+ from dataclasses import dataclass
10
+ from typing import Annotated, Any, ClassVar
11
+
12
+ import pyarrow as pa
13
+ from vgi_rpc import ArrowSerializableDataclass
14
+ from vgi_rpc.rpc import OutputCollector
15
+
16
+ from vgi._test_fixtures.table._common import (
17
+ _cardinality_from_count,
18
+ _EmptyArgs,
19
+ )
20
+ from vgi.arguments import Arg
21
+ from vgi.invocation import GlobalInitResponse
22
+ from vgi.metadata import FunctionExample
23
+ from vgi.schema_utils import schema
24
+ from vgi.table_filter_pushdown import PushdownFilters
25
+ from vgi.table_function import (
26
+ InitParams,
27
+ ProcessParams,
28
+ TableFunctionGenerator,
29
+ bind_fixed_schema,
30
+ init_single_worker,
31
+ )
32
+
33
+ # =============================================================================
34
+
35
+
36
+ def _format_pushed_filters(filters: PushdownFilters | None) -> str:
37
+ """Format pushed-down filters as a human-readable SQL-like string.
38
+
39
+ Large IN lists (from join key pushdown) are truncated to avoid
40
+ generating multi-megabyte filter strings.
41
+ """
42
+ if not filters:
43
+ return "(none)"
44
+
45
+ from vgi.table_filter_pushdown import AndFilter, InFilter, OrFilter, _filter_to_sql
46
+
47
+ def _format_one(f: object) -> str:
48
+ """Format a single filter, truncating large InFilters."""
49
+ if isinstance(f, InFilter) and len(f.values) > 20:
50
+ return f"{f.column_name} IN ({len(f.values)} values)"
51
+ if isinstance(f, AndFilter):
52
+ child_parts = [_format_one(c) for c in f.children]
53
+ return "(" + " AND ".join(child_parts) + ")"
54
+ if isinstance(f, OrFilter):
55
+ child_parts = [_format_one(c) for c in f.children]
56
+ return "(" + " OR ".join(child_parts) + ")"
57
+ # Fall back to SQL rendering for other filter types
58
+ sql, params = _filter_to_sql(f, lambda s: s, "?", 0) # type: ignore[arg-type]
59
+ parts: list[str] = []
60
+ param_iter = iter(params)
61
+ for chunk in sql.split("?"):
62
+ parts.append(chunk)
63
+ try:
64
+ p = next(param_iter)
65
+ parts.append(repr(p) if isinstance(p, str) else str(p))
66
+ except StopIteration:
67
+ pass
68
+ return "".join(parts)
69
+
70
+ formatted_parts = [_format_one(f) for f in filters]
71
+ return " AND ".join(formatted_parts) if formatted_parts else "(none)"
72
+
73
+
74
@dataclass(frozen=True, slots=True)
class FilterEchoFunctionArgs:
    """Call arguments accepted by FilterEchoFunction."""

    # Positional argument 0: how many rows the scan produces in total.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Named argument: maximum number of rows placed in one emitted batch.
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
80
+
81
+
82
@dataclass(kw_only=True)
class FilterEchoState(ArrowSerializableDataclass):
    """Scan progress for FilterEchoFunction plus the pre-rendered filter text.

    ``filter_str`` is deliberately part of the serialized state rather than
    Transient.  The framework's HTTP rehydrate path restores user state by
    deserializing it and does not call ``initial_state`` again, so a Transient
    value would quietly fall back to ``"(none)"`` after the first state-token
    round-trip.
    """

    remaining: int  # rows still to be emitted
    current_index: int = 0  # value of ``n`` for the next row
    filter_str: str = "(none)"  # rendered filters, computed once in initial_state
95
+
96
+
97
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class FilterEchoFunction(TableFunctionGenerator[FilterEchoFunctionArgs, FilterEchoState]):
    """Echoes pushed-down filter predicates in output for diagnostic purposes.

    USE CASE
    --------
    Verify which filters DuckDB pushes down to the VGI worker. The
    ``pushed_filters`` column shows the SQL-like representation of all
    filters the engine sent. Filters are auto-applied by the worker so
    the result set is always correct.

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "pushed_filters": string}

    Example:
    -------
        SELECT * FROM filter_echo(10) WHERE n >= 8
        Returns: rows 8-9 with pushed_filters showing "n >= 8"

    """

    class Meta:
        """Metadata for FilterEchoFunction."""

        name = "filter_echo"
        description = "Echoes pushed-down filter predicates in output"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM filter_echo(10)",
                description="Generate 10 rows showing pushed filters",
            ),
            FunctionExample(
                sql="SELECT pushed_filters FROM filter_echo(10) WHERE n >= 8",
                description="See which filters were pushed down",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema({"n": pa.int64(), "s": pa.utf8(), "pushed_filters": pa.utf8()})

    @classmethod
    def initial_state(cls, params: ProcessParams[FilterEchoFunctionArgs]) -> FilterEchoState:
        """Build the starting state: full row budget plus the rendered filters.

        The filter text is rendered once here and carried in the state, so
        ``process`` only has to repeat it for every row.
        """
        init_call = params.init_call
        assert init_call is not None
        raw_filters = init_call.pushdown_filters
        join_keys = init_call.join_keys
        if raw_filters is None:
            parsed = None
        else:
            parsed = cls.pushdown_filters(raw_filters, join_keys=join_keys)
        return FilterEchoState(
            remaining=params.args.count,
            filter_str=_format_pushed_filters(parsed),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[FilterEchoFunctionArgs],
        state: FilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of (n, s, pushed_filters) rows, or finish."""
        rows_left = state.remaining
        if rows_left <= 0:
            out.finish()
            return

        batch_rows = min(rows_left, params.args.batch_size)
        first = state.current_index
        stop = first + batch_rows

        columns = {
            "n": list(range(first, stop)),
            "s": [f"row_{i}" for i in range(first, stop)],
            # Same rendered filter text on every row of the batch.
            "pushed_filters": [state.filter_str] * batch_rows,
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))

        # Advance the cursor and shrink the budget by what was just emitted.
        state.current_index = stop
        state.remaining = rows_left - batch_rows
183
+
184
+
185
+ # ============================================================================
186
+ # ValuePruneFunction — exercises PushdownFilters.get_column_values('n'), the
187
+ # partition-pruning idiom (resolve the discrete value set up front, fetch only
188
+ # those keys). filter_echo can't cover this: it auto-applies the predicate
189
+ # row-by-row via Filter.evaluate, a different code path. Here the `resolved`
190
+ # column echoes exactly what get_column_values returned, so a regression in the
191
+ # AND/OR-descent of that accessor is directly observable — e.g. DuckDB pushing
192
+ # `n IN (...) AND n >= min AND n <= max` (an AndFilter) or `n = a OR n = b` (an
193
+ # OrFilter) must resolve to the discrete set, not collapse to "(scan)".
194
+ # ============================================================================
195
+
196
+
197
@dataclass(frozen=True, slots=True)
class _ValuePruneArgs:
    """Call arguments accepted by ValuePruneFunction."""

    # Positional argument 0: size of the candidate key space (keys 0..count-1).
    count: Annotated[int, Arg(0, doc="Number of candidate rows (keys 0..count-1)", ge=0)]
    # Named argument: maximum number of rows placed in one emitted batch.
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Batch size for output", ge=1)]
203
+
204
+
205
@dataclass(kw_only=True)
class _ValuePruneState(ArrowSerializableDataclass):
    """Keys left to emit together with the echoed ``get_column_values`` result.

    Both ``values`` and ``resolved`` are serialized (not Transient).  The HTTP
    rehydrate path deserializes state without running ``initial_state`` again,
    so the resolution has to survive a state-token round-trip.
    """

    values: list[int]  # keys to emit, in order
    resolved: str  # comma-joined resolved set, or "(scan)"
    cursor: int = 0  # index into ``values`` of the next key to emit
217
+
218
+
219
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class ValuePruneFunction(TableFunctionGenerator[_ValuePruneArgs, _ValuePruneState]):
    """Emits only the keys that ``get_column_values('n')`` resolves to.

    The ``resolved`` column carries the sorted, comma-joined discrete set the
    accessor returned (or ``"(scan)"`` when it returned None, i.e. the predicate
    is not enumerable — no filter, a bare range, or an OR with a non-discrete
    branch). Assert on ``resolved`` to verify the accessor end-to-end,
    independent of any residual filtering.
    """

    class Meta:
        """Metadata for ValuePruneFunction."""

        name = "value_prune"
        description = "Prunes the key set via get_column_values('n'); echoes the resolved discrete values"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT DISTINCT resolved FROM value_prune(100) WHERE n IN (5, 50, 95)",
                description="Resolve a discrete key set from an IN predicate",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema({"n": pa.int64(), "resolved": pa.utf8()})

    @classmethod
    def initial_state(cls, params: ProcessParams[_ValuePruneArgs]) -> _ValuePruneState:
        """Work out which keys to emit from the filters pushed down on `n`.

        When ``get_column_values("n")`` yields a discrete set, ``resolved``
        echoes the whole sorted set (nulls dropped) while only keys inside
        ``0..count-1`` are queued for emission.  Otherwise every key in
        ``0..count-1`` is queued and ``resolved`` is ``"(scan)"``.
        """
        init_call = params.init_call
        assert init_call is not None
        limit = params.args.count
        raw_filters = init_call.pushdown_filters
        join_keys = init_call.join_keys
        parsed = None if raw_filters is None else cls.pushdown_filters(raw_filters, join_keys=join_keys)
        key_array = None if parsed is None else parsed.get_column_values("n")

        if key_array is None:
            # Not enumerable: fall back to scanning the whole key space.
            return _ValuePruneState(values=list(range(limit)), resolved="(scan)")

        keys = [key for key in key_array.to_pylist() if key is not None]
        keys.sort()
        return _ValuePruneState(
            values=[key for key in keys if 0 <= key < limit],
            resolved=",".join(str(key) for key in keys),
        )

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ValuePruneArgs],
        state: _ValuePruneState,
        out: OutputCollector,
    ) -> None:
        """Emit the next slice of resolved keys, each tagged with `resolved`."""
        begin = state.cursor
        total = len(state.values)
        if begin >= total:
            out.finish()
            return

        step = min(total - begin, params.args.batch_size)
        keys = state.values[begin : begin + step]
        columns = {"n": keys, "resolved": [state.resolved] * len(keys)}
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))
        state.cursor = begin + step
288
+
289
+
290
+ # ============================================================================
291
+ # DictFilterEchoFunction — output column declared as a *dictionary* Arrow type
292
+ # (dictionary<int8, utf8>) with no ENUM metadata. DuckDB maps such a column to
293
+ # plain VARCHAR, so a `WHERE s = 'x'` / `s IN (...)` predicate pushes a VARCHAR
294
+ # (string) literal down to the worker. The worker then emits the column
295
+ # dictionary-encoded, producing a (dictionary column, string literal) pair that
296
+ # the filter evaluator must compare. Naively casting the literal up to the
297
+ # column's dictionary type makes `pc.is_in(dict, dict)` / `pc.equal(dict, dict)`
298
+ # throw `ArrowTypeError: Array type doesn't match type of values set`; the
299
+ # correct path decodes the column to its value type. This fixture pins that
300
+ # behavior so every language implementation handles it identically.
301
+ # ============================================================================
302
+
303
+
304
# "s" is declared as dictionary<int8, utf8>; "n" is a plain int64 row number.
_DICT_FILTER_ECHO_SCHEMA = pa.schema(
    [
        ("n", pa.int64()),
        ("s", pa.dictionary(pa.int8(), pa.utf8())),
    ]
)

# A small, fixed value set keeps dictionary encoding meaningful and makes the
# row -> value mapping trivial to assert: row i carries _DICT_VALUES[i % len].
_DICT_VALUES: tuple[str, ...] = ("red", "green", "blue")
314
+
315
+
316
@dataclass(frozen=True, slots=True)
class _DictFilterEchoArgs:
    """Call arguments accepted by DictFilterEchoFunction."""

    # Positional argument 0: how many rows the scan produces in total.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]
    # Named argument: maximum number of rows placed in one emitted batch.
    batch_size: Annotated[int, Arg("batch_size", default=2048, doc="Rows per batch", ge=1)]
322
+
323
+
324
@dataclass(kw_only=True)
class _DictFilterEchoState(ArrowSerializableDataclass):
    """Scan progress for DictFilterEchoFunction."""

    remaining: int  # rows still to be emitted
    current_index: int = 0  # value of ``n`` for the next row
330
+
331
+
332
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DictFilterEchoFunction(TableFunctionGenerator[_DictFilterEchoArgs, _DictFilterEchoState]):
    """Emits a dictionary-encoded VARCHAR column to exercise filter pushdown.

    USE CASE
    --------
    Regression coverage for filter pushdown over a dictionary-encoded
    column whose DuckDB-facing type is plain VARCHAR. The pushed literal
    arrives as a string while the emitted column is ``dictionary<int8,
    utf8>``; the auto-applied filter must compare the two without
    throwing. See the module comment above.

    SCHEMA
    ------
    Output: {"n": int64, "s": dictionary<int8, utf8> (VARCHAR to DuckDB)}

    Row i has s = ("red", "green", "blue")[i % 3].

    Example:
    -------
        SELECT * FROM dict_filter_echo(6) WHERE s = 'green'
        Returns: rows 1 and 4.

    """

    class Meta:
        """Metadata for DictFilterEchoFunction."""

        name = "dict_filter_echo"
        description = "Emits a dictionary-encoded VARCHAR column for filter-pushdown testing"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM dict_filter_echo(6) WHERE s = 'green'",
                description="Filter a dictionary-encoded column by an equality predicate",
            ),
            FunctionExample(
                sql="SELECT * FROM dict_filter_echo(6) WHERE s IN ('red', 'blue')",
                description="Filter a dictionary-encoded column by an IN predicate",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _DICT_FILTER_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_DictFilterEchoArgs]) -> _DictFilterEchoState:
        """Start with the full requested row count still to emit."""
        return _DictFilterEchoState(remaining=params.args.count)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DictFilterEchoArgs],
        state: _DictFilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of (n, s) rows, or finish when none remain."""
        rows_left = state.remaining
        if rows_left <= 0:
            out.finish()
            return

        batch_rows = min(rows_left, params.args.batch_size)
        first = state.current_index
        stop = first + batch_rows
        palette_size = len(_DICT_VALUES)

        columns = {
            "n": list(range(first, stop)),
            # Plain Python strings; the dictionary type comes from the schema.
            "s": [_DICT_VALUES[i % palette_size] for i in range(first, stop)],
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))

        state.current_index = stop
        state.remaining = rows_left - batch_rows
413
+
414
+
415
+ # ============================================================================
416
+
417
+
418
+ def _make_wkb_point(x: float, y: float) -> bytes:
419
+ """Encode a 2D point as little-endian WKB (byte_order=1, type=1=Point, x, y)."""
420
+ return struct.pack("<bI", 1, 1) + struct.pack("<dd", x, y)
421
+
422
+
423
+ # Arrow field with geoarrow.wkb extension metadata so DuckDB recognizes it as GEOMETRY
424
# Binary field tagged with geoarrow.wkb extension metadata so that DuckDB
# recognizes the column as GEOMETRY.
_GEOMETRY_FIELD = pa.field("geom", pa.binary()).with_metadata(
    {
        b"ARROW:extension:name": b"geoarrow.wkb",
        b"ARROW:extension:metadata": b"{}",
    }
)

# Row number, the point's coordinates, and the same point encoded as WKB.
_SPATIAL_FILTER_SCHEMA = pa.schema(
    [  # type: ignore[arg-type]  # pyarrow stubs: mixed-type fields
        pa.field("n", pa.int64()),
        pa.field("x", pa.float64()),
        pa.field("y", pa.float64()),
        _GEOMETRY_FIELD,
    ]
)
441
+
442
+
443
@dataclass(slots=True, frozen=True)
class _SpatialFilterArgs:
    """Arguments for SpatialFilterExampleFunction."""

    # Positional argument 0: number of grid points to generate (at least 1).
    count: Annotated[int, Arg(0, doc="Number of points to generate", ge=1)]
    # Named argument: rows per emitted batch.  ``ge=1`` matches the other
    # fixtures in this module and is required for termination: with a zero or
    # negative batch size, ``process`` computes ``size <= 0``, ``remaining``
    # never shrinks, and the scan would never call ``out.finish()``.
    batch_size: Annotated[int, Arg("batch_size", default=1024, doc="Rows per batch", ge=1)]
449
+
450
+
451
@dataclass(kw_only=True)
class _SpatialFilterState(ArrowSerializableDataclass):
    """Scan progress for SpatialFilterExampleFunction."""

    remaining: int  # points still to be emitted
    total_count: int  # requested point count; fixes the grid width for every batch
    current_index: int = 0  # index ``n`` of the next point
458
+
459
+
460
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class SpatialFilterExampleFunction(TableFunctionGenerator[_SpatialFilterArgs, _SpatialFilterState]):
    """Generates points on a grid with geometry column for spatial filter testing.

    USE CASE
    --------
    Test expression filter pushdown with spatial predicates. Points are placed
    on a deterministic grid in [0, 1) x [0, 1) so that bounding box filter
    counts are predictable.

    SCHEMA
    ------
    Output: {"n": int64, "x": float64, "y": float64, "geom": GEOMETRY}

    Grid layout: For count=N, point i has coordinates:
        x = (i % cols) / cols
        y = (i // cols) / cols
    where cols = ceil(sqrt(N)).

    Example:
    -------
        SELECT * FROM spatial_filter_example(100) WHERE geom && ST_MakeEnvelope(0, 0, 0.5, 0.5)
        Returns: points in the lower-left quadrant of the unit square.

    """

    class Meta:
        """Metadata for SpatialFilterExampleFunction."""

        name = "spatial_filter_example"
        description = "Generates points on a grid with geometry for spatial filter testing"
        categories = ["generator", "spatial", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["&&", "st_intersects_extent"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM spatial_filter_example(100)",
                description="Generate 100 points on a 10x10 grid",
            ),
            FunctionExample(
                sql="SELECT COUNT(*) FROM spatial_filter_example(100) WHERE geom && ST_MakeEnvelope(0, 0, 0.5, 0.5)",
                description="Count points in the lower-left quadrant",
            ),
        ]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _SPATIAL_FILTER_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_SpatialFilterArgs]) -> _SpatialFilterState:
        """Start with all points pending and remember the total for grid sizing."""
        requested = params.args.count
        return _SpatialFilterState(remaining=requested, total_count=requested)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_SpatialFilterArgs],
        state: _SpatialFilterState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of grid points with their WKB geometry, or finish."""
        points_left = state.remaining
        if points_left <= 0:
            out.finish()
            return

        import math

        # Grid width comes from the total count, not the batch, so every batch
        # places its points on the same grid.
        grid_side = max(1, math.ceil(math.sqrt(state.total_count)))
        batch_rows = min(points_left, params.args.batch_size)
        first = state.current_index
        stop = first + batch_rows

        ns: list[int] = []
        xs: list[float] = []
        ys: list[float] = []
        geoms: list[bytes] = []
        for index in range(first, stop):
            row, col = divmod(index, grid_side)
            x = col / grid_side
            y = row / grid_side
            ns.append(index)
            xs.append(x)
            ys.append(y)
            geoms.append(_make_wkb_point(x, y))

        out.emit(
            pa.RecordBatch.from_pydict(
                {"n": ns, "x": xs, "y": ys, "geom": geoms},
                schema=params.output_schema,
            )
        )

        state.current_index = stop
        state.remaining = points_left - batch_rows
548
+
549
+
550
+ # ============================================================================
551
+
552
+
553
@dataclass(slots=True, frozen=True)
class _DynFilterEchoArgs:
    """Arguments for DynamicFilterEchoFunction."""

    # Positional argument 0: number of rows to generate (at least 1).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=1)]
    # Named argument: rows per emitted batch.  ``ge=1`` matches the other
    # fixtures in this module and is required for termination: with a zero or
    # negative batch size, ``process`` computes ``size <= 0``, ``remaining``
    # never shrinks, and the scan would never call ``out.finish()``.
    batch_size: Annotated[int, Arg("batch_size", default=100, doc="Rows per batch", ge=1)]
559
+
560
+
561
@dataclass(kw_only=True)
class _DynFilterEchoState(ArrowSerializableDataclass):
    """Scan progress for DynamicFilterEchoFunction."""

    remaining: int  # rows still to be emitted
    current_index: int = 0  # number of rows emitted so far
567
+
568
+
569
+ def _format_pushed_filters_safe(filters: object) -> str:
570
+ """Format PushdownFilters to readable string, returning '(none)' if empty/None."""
571
+ if filters is None:
572
+ return "(none)"
573
+ from vgi.table_filter_pushdown import PushdownFilters
574
+
575
+ if isinstance(filters, PushdownFilters) and filters:
576
+ return repr(filters)
577
+ return "(none)"
578
+
579
+
580
# Output columns of dynamic_filter_echo: the row value and the echoed filter text.
_DYN_FILTER_ECHO_SCHEMA = pa.schema(
    [  # type: ignore[arg-type]  # pyarrow stubs: mixed-type fields
        ("n", pa.int64()),
        ("pushed_filters", pa.utf8()),
    ]
)
586
+
587
+
588
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DynamicFilterEchoFunction(TableFunctionGenerator[_DynFilterEchoArgs, _DynFilterEchoState]):
    """Generates descending integers and echoes the current tick filter per batch.

    USE CASE
    --------
    Demonstrates dynamic filter pushdown. Rows are generated in **descending**
    order (count-1, count-2, ..., 0) so that ``ORDER BY n ASC LIMIT K`` causes
    the Top-N heap to tighten gradually. Each batch's ``pushed_filters`` column
    shows the filter received from the most recent tick.

    SCHEMA
    ------
    Output: {"n": int64, "pushed_filters": string}

    """

    class Meta:
        """Metadata for DynamicFilterEchoFunction."""

        name = "dynamic_filter_echo"
        description = "Generates descending integers, echoes dynamic tick filter per batch"
        categories = ["generator", "diagnostic"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True

    FIXED_SCHEMA: ClassVar[pa.Schema] = _DYN_FILTER_ECHO_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_DynFilterEchoArgs]) -> _DynFilterEchoState:
        """Start with the full requested row count still to emit."""
        return _DynFilterEchoState(remaining=params.args.count)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_DynFilterEchoArgs],
        state: _DynFilterEchoState,
        out: OutputCollector,
    ) -> None:
        """Emit the next descending batch, tagged with the filter seen right now."""
        rows_left = state.remaining
        if rows_left <= 0:
            out.finish()
            return

        batch_rows = min(rows_left, params.args.batch_size)
        emitted_so_far = state.current_index

        # Count down from the highest value not yet emitted, so the first
        # batch carries the largest numbers.
        highest = params.args.count - 1 - emitted_so_far
        descending = list(range(highest, highest - batch_rows, -1))

        # Re-read on every call: the filter can differ from batch to batch.
        current_filter = _format_pushed_filters_safe(params.current_pushdown_filters)

        out.emit(
            pa.RecordBatch.from_pydict(
                {"n": descending, "pushed_filters": [current_filter] * batch_rows},
                schema=params.output_schema,
            )
        )

        state.current_index = emitted_so_far + batch_rows
        state.remaining = rows_left - batch_rows
654
+
655
+
656
+ # ============================================================================
657
+
658
# Output columns of expression_filter_test: a row id, a string, a list of
# strings, and a float score.
_EXPR_FILTER_TEST_SCHEMA = pa.schema(
    [  # type: ignore[arg-type]  # pyarrow stubs: mixed-type fields
        ("id", pa.int64()),
        ("name", pa.utf8()),
        ("tags", pa.list_(pa.utf8())),
        ("score", pa.float64()),
    ]
)
666
+
667
+
668
@dataclass(slots=True, frozen=True)
class _ExprFilterTestArgs:
    """Arguments for ExpressionFilterTestFunction."""

    # Positional argument 0: number of rows to generate (at least 1).
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=1)]
    # Named argument: rows per emitted batch.  ``ge=1`` matches the other
    # fixtures in this module and is required for termination: with a zero or
    # negative batch size, ``process`` computes ``size <= 0``, ``remaining``
    # never shrinks, and the scan would never call ``out.finish()``.
    batch_size: Annotated[int, Arg("batch_size", default=1024, doc="Rows per batch", ge=1)]
674
+
675
+
676
@dataclass(kw_only=True)
class _ExprFilterTestState(ArrowSerializableDataclass):
    """Scan progress for ExpressionFilterTestFunction."""

    remaining: int  # rows still to be emitted
    current_index: int = 0  # ``id`` of the next row
682
+
683
+
684
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class ExpressionFilterTestFunction(TableFunctionGenerator[_ExprFilterTestArgs, _ExprFilterTestState]):
    """Generates rows with list and string columns for non-spatial expression filter testing.

    USE CASE
    --------
    Test expression filter pushdown with non-spatial functions like
    list_contains, prefix, starts_with, etc.

    SCHEMA
    ------
    Output: {"id": int64, "name": string, "tags": list<string>, "score": float64}

    Row i has:
        name = 'item_<i>'
        tags = ['tag_<i%5>', 'tag_<(i+1)%5>']
        score = i * 1.1

    """

    class Meta:
        """Metadata for ExpressionFilterTestFunction."""

        name = "expression_filter_test"
        description = "Generates rows for non-spatial expression filter testing"
        categories = ["generator", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["list_contains", "prefix", "starts_with", "contains"]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _EXPR_FILTER_TEST_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_ExprFilterTestArgs]) -> _ExprFilterTestState:
        """Start with the full requested row count still to emit."""
        return _ExprFilterTestState(remaining=params.args.count)

    @classmethod
    def process(
        cls,
        params: ProcessParams[_ExprFilterTestArgs],
        state: _ExprFilterTestState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of (id, name, tags, score) rows, or finish."""
        rows_left = state.remaining
        if rows_left <= 0:
            out.finish()
            return

        batch_rows = min(rows_left, params.args.batch_size)
        first = state.current_index
        stop = first + batch_rows
        row_ids = list(range(first, stop))

        columns = {
            "id": row_ids,
            "name": [f"item_{i}" for i in row_ids],
            # Two tags per row, cycling through tag_0 .. tag_4.
            "tags": [[f"tag_{i % 5}", f"tag_{(i + 1) % 5}"] for i in row_ids],
            "score": [i * 1.1 for i in row_ids],
        }
        out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))

        state.current_index = stop
        state.remaining = rows_left - batch_rows
753
+
754
+
755
+ # ============================================================================
756
+ # FilterEchoPartitionedFunction — multi-worker fixture that exercises filter
757
+ # pushdown across parallel workers. Combines the queue-based work distribution
758
+ # of PartitionedSequenceFunction with the filter-capture pattern of
759
+ # FilterEchoFunction so each worker echoes the filter it observed.
760
+ # ============================================================================
761
+
762
+
763
# Output columns of FilterEchoPartitionedFunction: the generated integer, the PID
# of the process that produced the row, and the echoed pushed-filter string.
_FILTER_ECHO_PARTITIONED_SCHEMA = schema({"n": pa.int64(), "worker_pid": pa.int64(), "pushed_filters": pa.utf8()})
770
+
771
+
772
@dataclass(slots=True, frozen=True)
class _FilterEchoPartitionedArgs:
    """Argument set accepted by FilterEchoPartitionedFunction."""

    # Exclusive upper bound of the generated sequence: rows carry n in [0, count).
    count: Annotated[int, Arg(0, doc="Total number of integers to generate", ge=0)]
777
+
778
+
779
@dataclass(kw_only=True)
class _FilterEchoPartitionedState(ArrowSerializableDataclass):
    """State kept by each worker between ``process`` calls.

    ``filter_str`` is deliberately a serialized field rather than a Transient
    one. The framework's HTTP rehydrate path deserializes user state without
    calling ``initial_state`` again, so a Transient value would fall back to
    ``"(none)"`` after the first state-token round-trip, and every batch
    produced after a resume would lose the pushed-filter echo.
    """

    # Bounds of the chunk currently being emitted; None until a chunk is popped.
    current_start: int | None = None
    current_end: int | None = None
    # Next value of ``n`` to emit within the current chunk.
    current_idx: int = 0
    # Rendered pushed-down filters, captured once in ``initial_state``.
    filter_str: str = "(none)"
794
+
795
+
796
@bind_fixed_schema
@_cardinality_from_count
class FilterEchoPartitionedFunction(TableFunctionGenerator[_FilterEchoPartitionedArgs, _FilterEchoPartitionedState]):
    """Multi-worker filter-echo: queue-distributed sequence with filter pushdown.

    Verifies that predicates DuckDB pushes down are observed *and* applied by
    every parallel worker. Each worker pulls chunks from a shared queue and
    independently deserializes the same pushed filter spec at init. The
    framework auto-applies filters per emitted batch.

    SCHEMA
    ------
    Output: {"n": int64, "worker_pid": int64, "pushed_filters": string}

    PARALLELIZATION
    ---------------
    Uses a shared work queue: ``on_init`` splits ``[0, count)`` into at most
    ``MAX_PARTITIONS`` contiguous ``(start, end)`` chunks of
    ``ceil(count / MAX_PARTITIONS)`` rows each (the last one may be shorter).
    Workers (up to DuckDB's parallel scan limit) pop chunks atomically and
    emit each chunk in batches of at most ``BATCH_SIZE`` rows.
    ``worker_pid`` reveals which OS process produced each row — under
    subprocess transport that is one PID per worker; HTTP workers share a
    process so the column collapses to a single value there.

    """

    class Meta:
        """Metadata for FilterEchoPartitionedFunction."""

        name = "filter_echo_partitioned"
        description = "Multi-worker partitioned sequence that echoes pushed-down filters"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM filter_echo_partitioned(10) WHERE n >= 8",
                description="Multi-worker generation with filter pushdown",
            ),
        ]

    # Cap the work queue at ~MAX_PARTITIONS items regardless of count, by sizing
    # each chunk as ceil(count / MAX_PARTITIONS). The queue is drained one item
    # per round-trip and serialized at the per-attach DO, so partition *count*
    # drives remote cost. A fixed chunk size can't serve both a large query and
    # a small distribution query (too-large chunks collapse the small one to one
    # partition and kill fan-out); capping the partition count keeps ~24
    # partitions at any scale. Each work item is a fixed-size (start, end) range
    # — rows are generated locally and emitted in BATCH_SIZE batches — so this
    # changes only the *count* of tiny pops, never any HTTP body size. Output is
    # the echoed/filtered rows (partition-independent), so assertions hold.
    MAX_PARTITIONS: ClassVar[int] = 24
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = _FILTER_ECHO_PARTITIONED_SCHEMA

    @classmethod
    def on_init(
        cls,
        params: InitParams[_FilterEchoPartitionedArgs],
    ) -> GlobalInitResponse:
        """Populate the work queue with (start, end) chunks for parallel consumption.

        Each work item is two big-endian unsigned 64-bit integers
        (``struct`` format ``">QQ"``) holding a half-open range ``[start, end)``.
        """
        total = params.args.count
        # ceil(total / MAX_PARTITIONS); floored at 1 so range() still gets a
        # valid (non-zero) step when total == 0.
        chunk = max(1, -(-total // cls.MAX_PARTITIONS))
        work_items: list[bytes] = [
            struct.pack(">QQ", start_idx, min(start_idx + chunk, total)) for start_idx in range(0, total, chunk)
        ]
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[_FilterEchoPartitionedArgs]) -> _FilterEchoPartitionedState:
        """Initialize per-worker state and capture the pushed filter string."""
        assert params.init_call is not None
        pf = params.init_call.pushdown_filters
        jk = params.init_call.join_keys
        # No pushed filter spec means nothing to decode; the formatter receives None.
        filters = cls.pushdown_filters(pf, join_keys=jk) if pf is not None else None
        return _FilterEchoPartitionedState(filter_str=_format_pushed_filters(filters))

    @classmethod
    def process(
        cls,
        params: ProcessParams[_FilterEchoPartitionedArgs],
        state: _FilterEchoPartitionedState,
        out: OutputCollector,
    ) -> None:
        """Pop a work chunk and emit a batch tagged with worker_pid and pushed_filters.

        A new chunk is popped only when none is active or the active one has
        been fully emitted; otherwise the call continues the current chunk.
        Finishes the scan when the queue returns no more work.
        """
        # ``current_end or 0`` only narrows the Optional type: the bounds are
        # always assigned together when a chunk is popped.
        chunk_exhausted = state.current_start is None or state.current_idx >= (state.current_end or 0)
        if chunk_exhausted:
            work_data = params.storage.queue_pop()
            if work_data is None:
                out.finish()
                return
            state.current_start, state.current_end = struct.unpack(">QQ", work_data)
            assert state.current_start is not None
            state.current_idx = state.current_start

        batch_end_idx = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        size = batch_end_idx - state.current_idx
        ns = list(range(state.current_idx, batch_end_idx))
        pid = os.getpid()

        out.emit(
            pa.RecordBatch.from_pydict(
                {
                    "n": ns,
                    "worker_pid": [pid] * size,
                    "pushed_filters": [state.filter_str] * size,
                },
                schema=params.output_schema,
            )
        )

        state.current_idx = batch_end_idx
908
+
909
+
910
+ # ============================================================================
911
+ # FilterEchoTableScanFunction — catalog *table* (not table function) backing for
912
+ # example.data.filter_echo_table. Mirrors FilterEchoFunction's pushed_filters
913
+ # echo, but is invoked with no positional args (the catalog scan route in the
914
+ # fixture worker passes none) so a `SELECT ... FROM example.data.filter_echo_table`
915
+ # — and a VIEW over it — can be characterized for filter pushdown. Crucially it
916
+ # declares supported_expression_filters so a `col LIKE 'abc%'` predicate (which
917
+ # DuckDB lowers to a prefix/starts_with expression filter) actually reaches the
918
+ # worker and shows up in the pushed_filters column. See
919
+ # test/sql/integration/table/filter_pushdown_through_view.test.
920
+ # ============================================================================
921
+
922
+
923
# Output columns of FilterEchoTableScanFunction.
_FILTER_ECHO_TABLE_SCHEMA = schema(
    {
        "n": pa.int64(),
        "s": pa.utf8(),
        "pushed_filters": pa.utf8(),
    }
)

# Size of the fixed dataset: n runs over 0..99 and s is "row_<n>". The "row_"
# prefix gives LIKE 'row_1%' something meaningful to match (row_1 and
# row_10..row_19).
_FILTER_ECHO_TABLE_ROWS = 100
928
+
929
+
930
@dataclass(kw_only=True)
class _FilterEchoTableState(ArrowSerializableDataclass):
    """State for the one-shot table scan, including the captured filter string.

    ``filter_str`` is deliberately a serialized field rather than a Transient
    one. The framework's HTTP rehydrate path deserializes user state without
    calling ``initial_state`` again, so a Transient value would fall back to
    ``"(none)"`` after the first state-token round-trip.
    """

    # Set once the single batch has been emitted; the next call finishes the scan.
    done: bool = False
    # Rendered pushed-down filters, captured once in ``initial_state``.
    filter_str: str = "(none)"
942
+
943
+
944
@init_single_worker
@bind_fixed_schema
class FilterEchoTableScanFunction(TableFunctionGenerator[_EmptyArgs, _FilterEchoTableState]):
    """Catalog-table scan that reports the pushed-down filters it was given.

    Backs ``example.data.filter_echo_table``. As with :class:`FilterEchoFunction`,
    the ``pushed_filters`` column carries the SQL-like rendering of whatever
    DuckDB pushed down, and the framework auto-applies those filters so the
    result set stays correct. Unlike ``filter_echo`` this is a no-arg *table*
    scan that opts into expression-filter pushdown, which makes a
    ``LIKE 'prefix%'`` predicate observable here (and through a view over this
    table).

    SCHEMA
    ------
    Output: {"n": int64, "s": string, "pushed_filters": string}, 100 rows
    (n in 0..99, s = "row_<n>").
    """

    class Meta:
        """Registration metadata for FilterEchoTableScanFunction."""

        name = "filter_echo_table_scan"
        description = "Catalog-table scan echoing pushed-down filters (backs example.data.filter_echo_table)"
        categories = ["generator", "diagnostic", "testing"]
        filter_pushdown = True
        auto_apply_filters = True
        projection_pushdown = True
        supported_expression_filters = ["prefix", "starts_with"]

    FIXED_SCHEMA: ClassVar[pa.Schema] = _FILTER_ECHO_TABLE_SCHEMA

    @classmethod
    def initial_state(cls, params: ProcessParams[_EmptyArgs]) -> _FilterEchoTableState:
        """Render the pushed-down filters once so ``process`` can echo them."""
        assert params.init_call is not None
        init_call = params.init_call
        raw_filters = init_call.pushdown_filters
        join_keys = init_call.join_keys
        if raw_filters is None:
            decoded = None
        else:
            decoded = cls.pushdown_filters(raw_filters, join_keys=join_keys)
        return _FilterEchoTableState(filter_str=_format_pushed_filters(decoded))

    @classmethod
    def process(
        cls,
        params: ProcessParams[_EmptyArgs],
        state: _FilterEchoTableState,
        out: OutputCollector,
    ) -> None:
        """Emit the whole fixed dataset on the first call and finish on the next.

        Only the columns present in ``params.output_schema`` are emitted.
        """
        if state.done:
            out.finish()
            return
        state.done = True

        row_ids = list(range(_FILTER_ECHO_TABLE_ROWS))
        all_columns: dict[str, list[Any]] = {
            "n": row_ids,
            "s": [f"row_{row_id}" for row_id in row_ids],
            "pushed_filters": [state.filter_str] * _FILTER_ECHO_TABLE_ROWS,
        }
        # projection_pushdown=True: keep just the columns the caller asked for.
        projected = {field.name: all_columns[field.name] for field in params.output_schema}
        out.emit(pa.RecordBatch.from_pydict(projected, schema=params.output_schema))