vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,710 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Sequence-style table generators (sequence, double_sequence, partitioned_sequence, etc.)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import struct
8
+ from dataclasses import dataclass
9
+ from typing import Annotated, Any, ClassVar
10
+
11
+ import numpy as np
12
+ import pyarrow as pa
13
+ from vgi_rpc import ArrowSerializableDataclass
14
+ from vgi_rpc.rpc import OutputCollector
15
+
16
+ from vgi._test_fixtures.table._common import (
17
+ CountBatchArgs,
18
+ CountdownState,
19
+ _BaseSequenceFunction,
20
+ _cardinality_from_count,
21
+ )
22
+ from vgi.arguments import Arg
23
+ from vgi.catalog.catalog_interface import ColumnStatistics
24
+ from vgi.invocation import BindResponse, GlobalInitResponse
25
+ from vgi.metadata import FunctionExample
26
+ from vgi.schema_utils import schema
27
+ from vgi.table_function import (
28
+ BindParams,
29
+ InitParams,
30
+ ProcessParams,
31
+ TableCardinality,
32
+ TableFunctionGenerator,
33
+ bind_fixed_schema,
34
+ init_single_worker,
35
+ )
36
+
37
+
38
@dataclass(frozen=True)
class SequenceFunctionArgs(CountBatchArgs):
    """Argument set for ``SequenceFunction``.

    Adds one named parameter, ``increment``, on top of whatever
    ``CountBatchArgs`` already declares.
    """

    # Named argument "increment": defaults to 1, lower bound declared via ``ge=1``.
    increment: Annotated[
        int,
        Arg("increment", default=1, doc="Step between values", ge=1),
    ]
43
+
44
+
45
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class SequenceFunction(_BaseSequenceFunction):
    """Emit ``n`` int64 values starting at 0, spaced by an optional increment.

    USE CASE
    --------
    Handy for fabricating test data, numbering rows, or building a fixed
    sequence to join or filter against. Setting ``increment`` yields
    sequences such as 0, 2, 4, 6, ... or 0, 10, 20, 30, ...

    SCHEMA
    ------
    Output: {"n": int64}

    Example:
    -------
        SELECT * FROM sequence(5)
        Returns: [{"n": 0}, {"n": 1}, {"n": 2}, {"n": 3}, {"n": 4}]

        SELECT * FROM sequence(5, increment := 2)
        Returns: [{"n": 0}, {"n": 2}, {"n": 4}, {"n": 6}, {"n": 8}]

        SELECT * FROM sequence(1000, batch_size := 100)
        Returns: integers 0-999 in batches of 100 rows each

    """

    FunctionArguments = SequenceFunctionArgs

    # Complete (un-projected) output schema and the matching numpy element type.
    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.int64

    class Meta:
        """Registration metadata for ``sequence``."""

        name = "sequence"
        description = "Generates a sequence of integers from 0 to n-1"
        categories = ["generator", "utility"]
        tags = {"category": "generator", "type": "utility"}

        # Pushdown-related flags declared for this function.
        projection_pushdown = True
        filter_pushdown = True
        auto_apply_filters = True

        examples = [
            FunctionExample(
                sql="SELECT * FROM sequence(10)",
                description="Generate integers 0-9",
            ),
            FunctionExample(
                sql="SELECT * FROM sequence(1000, batch_size := 100)",
                description="Generate integers 0-999 in batches of 100",
            ),
            FunctionExample(
                sql="SELECT * FROM sequence(5, batch_size := 10000, increment := 10)",
                description="Generate 0, 10, 20, 30, 40",
            ),
        ]
104
+
105
+
106
@dataclass(slots=True, frozen=True)
class NamedParamsEchoFunctionArgs:
    """Argument set for ``NamedParamsEchoFunction``.

    Note: this class deliberately declares its own ``count`` and omits
    ``batch_size``, because the function relies on a fixed
    ``BATCH_SIZE_FALLBACK``. Deriving from CountBatchArgs would surface a
    user-facing knob that this fixture intentionally keeps hidden.
    """

    # Positional argument 0.
    count: Annotated[
        int,
        Arg(0, doc="Number of rows to generate", ge=0),
    ]

    # Named arguments, one per echoed output column type.
    greeting: Annotated[
        str,
        Arg("greeting", default="hello", doc="Greeting text echoed in output"),
    ]
    multiplier: Annotated[
        int,
        Arg("multiplier", default=1, doc="Multiplier for value column"),
    ]
    scale: Annotated[
        float,
        Arg("scale", default=1.0, doc="Scale factor for float_value column"),
    ]
    enabled: Annotated[
        bool,
        Arg("enabled", default=True, doc="Boolean echoed in output"),
    ]
120
+
121
+
122
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class NamedParamsEchoFunction(_BaseSequenceFunction):
    """Reflect each named parameter back to the caller as an output column.

    USE CASE
    --------
    Verifies that named parameters of several types (VARCHAR, BIGINT, DOUBLE,
    BOOLEAN) travel correctly from DuckDB to the worker. Every named
    parameter value appears verbatim (or multiplied by the row id) in an
    output column, so correctness is easy to assert.

    SCHEMA
    ------
    Output: {"id": int64, "greeting": string, "value": int64, "float_value": float64, "enabled": bool}

    Example:
    -------
        SELECT * FROM named_params_echo(3)
        Returns: rows with id=0..2, greeting='hello', value=id*1, float_value=id*1.0, enabled=true

        SELECT * FROM named_params_echo(3, greeting := 'hi', multiplier := 10)
        Returns: rows with id=0..2, greeting='hi', value=id*10, float_value=id*1.0, enabled=true

    """

    FunctionArguments = NamedParamsEchoFunctionArgs

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(
        {
            "id": pa.int64(),
            "greeting": pa.string(),
            "value": pa.int64(),
            "float_value": pa.float64(),
            "enabled": pa.bool_(),
        }
    )

    class Meta:
        """Registration metadata for ``named_params_echo``."""

        name = "named_params_echo"
        description = "Echoes named parameter values in output columns"
        categories = ["generator", "testing"]
        tags = {"category": "testing", "type": "params"}
        examples = [
            FunctionExample(
                sql="SELECT * FROM named_params_echo(3)",
                description="Echo default parameter values for 3 rows",
            ),
            FunctionExample(
                sql="SELECT * FROM named_params_echo(3, greeting := 'hi', multiplier := 10)",
                description="Echo custom greeting and multiplier",
            ),
        ]

    @classmethod
    def statistics(cls, params: BindParams[NamedParamsEchoFunctionArgs]) -> list[ColumnStatistics] | None:
        """Report no column statistics, overriding the inherited implementation."""
        return None

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[NamedParamsEchoFunctionArgs],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Emit one batch covering row ids ``start`` .. ``start + size - 1``.

        ``id`` is the row id; ``value`` and ``float_value`` are the row id
        multiplied by ``multiplier`` and ``scale``; ``greeting`` and
        ``enabled`` repeat the argument value on every row.
        """
        args = params.args
        row_ids = range(start, start + size)

        columns: dict[str, list[int] | list[str] | list[float] | list[bool]] = {
            "id": list(row_ids),
            "greeting": [args.greeting] * size,
            "value": [row_id * args.multiplier for row_id in row_ids],
            "float_value": [row_id * args.scale for row_id in row_ids],
            "enabled": [args.enabled] * size,
        }

        batch = pa.RecordBatch.from_pydict(columns, schema=params.output_schema)
        out.emit(batch)
203
+
204
+
205
@dataclass(frozen=True)
class NestedSequenceFunctionArguments(CountBatchArgs):
    """Argument set for ``NestedSequenceFunction``.

    Adds the named ``history_size`` parameter on top of ``CountBatchArgs``.
    """

    # Named argument "history_size": defaults to 20, lower bound declared via ``ge=1``.
    history_size: Annotated[
        int,
        Arg("history_size", default=20, doc="Max items in history list", ge=1),
    ]
210
+
211
+
212
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class NestedSequenceFunction(_BaseSequenceFunction):
    """Generates a sequence with nested struct and list columns.

    USE CASE
    --------
    Test filter pushdown with complex types (structs and lists). The function
    generates rows with:
    - n: sequence index (0 to count-1)
    - metadata: struct with {index: int64, label: string}, where label is
      ``row_<index>``
    - history: list of up to ``history_size`` consecutive sequence values
      ending at (and including) n; ``history_size`` defaults to 20

    SCHEMA
    ------
    Output: {
        "n": int64,
        "metadata": struct<index: int64, label: string>,
        "history": list<int64>
    }

    Example:
    -------
        SELECT * FROM nested_sequence(5)
        Returns rows with n=0..4, metadata structs, and history lists

        SELECT * FROM nested_sequence(100) WHERE n >= 50
        Test filter pushdown on the sequence column

        SELECT metadata.index FROM nested_sequence(10)
        Test projection pushdown with struct field access

    """

    class Meta:
        """Metadata for NestedSequenceFunction."""

        name = "nested_sequence"
        description = "Generates a sequence with nested struct and list columns"
        categories = ["generator", "utility", "testing"]
        tags = {"category": "generator", "type": "testing"}
        projection_pushdown = True
        filter_pushdown = True
        auto_apply_filters = True
        examples = [
            FunctionExample(
                sql="SELECT * FROM nested_sequence(10)",
                description="Generate 10 rows with nested columns",
            ),
            FunctionExample(
                sql="SELECT n, metadata FROM nested_sequence(100) WHERE n >= 50",
                description="Filter and project nested sequence",
            ),
        ]

    FunctionArguments = NestedSequenceFunctionArguments

    # Full schema before projection
    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("n", pa.int64()),
            pa.field(
                "metadata",
                pa.struct([("index", pa.int64()), ("label", pa.string())]),
            ),
            pa.field("history", pa.list_(pa.int64())),
        ]
    )

    @classmethod
    def statistics(cls, params: BindParams[NestedSequenceFunctionArguments]) -> list[ColumnStatistics] | None:
        """Nested sequence has multiple columns of varying types — opt out of base impl."""
        return None

    @classmethod
    def _get_projected_column_names(cls, projection_ids: list[int] | None) -> set[str]:
        """Return the names of the columns that must be generated.

        Args:
            projection_ids: Indices into ``FIXED_SCHEMA`` selected by the
                projection, or ``None`` when every column is wanted.

        Returns:
            The selected field names, or all field names of ``FIXED_SCHEMA``
            when ``projection_ids`` is ``None``.
        """
        if projection_ids is None:
            return {field.name for field in cls.FIXED_SCHEMA}
        return {cls.FIXED_SCHEMA.field(i).name for i in projection_ids}

    @classmethod
    def _emit_chunk(
        cls,
        params: ProcessParams[NestedSequenceFunctionArguments],
        state: CountdownState,
        out: OutputCollector,
        start: int,
        size: int,
    ) -> None:
        """Emit a batch of nested-sequence rows, honouring projection pushdown.

        Only the projected columns are materialised; the batch is built
        against ``params.output_schema``.

        Args:
            params: Process parameters; ``init_call`` must be set because it
                carries the projection ids.
            state: Countdown state (not used here).
            out: Collector that receives the record batch.
            start: First sequence index of this batch.
            size: Number of rows in this batch.
        """
        # Narrows the optional for the type checker; projection ids live on init_call.
        assert params.init_call is not None
        projected_cols = cls._get_projected_column_names(params.init_call.projection_ids)
        indices = list(range(start, start + size))
        data: dict[str, Any] = {}

        if "n" in projected_cols:
            data["n"] = indices

        if "metadata" in projected_cols:
            data["metadata"] = [{"index": i, "label": f"row_{i}"} for i in indices]

        if "history" in projected_cols:
            history_size = params.args.history_size
            # Each row holds the window [i - history_size + 1, i], clamped at 0.
            data["history"] = [list(range(max(0, i - history_size + 1), i + 1)) for i in indices]

        out.emit(pa.RecordBatch.from_pydict(data, schema=params.output_schema))
323
+
324
+
325
@dataclass(frozen=True)
class DoubleSequenceFunctionArguments(CountBatchArgs):
    """Argument set for ``DoubleSequenceFunction``.

    Adds a floating-point ``increment`` parameter on top of ``CountBatchArgs``.
    """

    # Named argument "increment": defaults to 1.0, bound declared via ``gt=0.0``.
    increment: Annotated[
        float,
        Arg("increment", default=1.0, doc="Step between values", gt=0.0),
    ]
330
+
331
+
332
@init_single_worker
@bind_fixed_schema
@_cardinality_from_count
class DoubleSequenceFunction(_BaseSequenceFunction):
    """Emit ``n`` float64 values starting at 0.0, spaced by an optional increment.

    USE CASE
    --------
    Produces floating-point test data, or sequences for interpolation and
    sampling. Setting ``increment`` yields sequences such as
    0.0, 0.5, 1.0, 1.5, ... or 0.0, 0.1, 0.2, 0.3, ...

    SCHEMA
    ------
    Output: {"n": float64}

    Example:
    -------
        SELECT * FROM double_sequence(5)
        Returns: [{"n": 0.0}, {"n": 1.0}, {"n": 2.0}, {"n": 3.0}, {"n": 4.0}]

        SELECT * FROM double_sequence(5, increment := 0.5)
        Returns: [{"n": 0.0}, {"n": 0.5}, {"n": 1.0}, {"n": 1.5}, {"n": 2.0}]

        SELECT * FROM double_sequence(1000, batch_size := 100)
        Returns: floats 0.0-999.0 in batches of 100 rows each

    """

    FunctionArguments = DoubleSequenceFunctionArguments

    # Output schema plus the numpy / Arrow element types that go with it.
    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.float64())
    NUMPY_DTYPE: ClassVar[type[np.generic]] = np.float64
    STATS_ARROW_TYPE: ClassVar[pa.DataType] = pa.float64()

    class Meta:
        """Registration metadata for ``double_sequence``."""

        name = "double_sequence"
        description = "Generates a sequence of floating-point numbers from 0 to n-1"
        categories = ["generator", "utility"]
        tags = {"category": "generator", "type": "utility"}
        examples = [
            FunctionExample(
                sql="SELECT * FROM double_sequence(10)",
                description="Generate floats 0.0-9.0",
            ),
            FunctionExample(
                sql="SELECT * FROM double_sequence(1000, batch_size := 100)",
                description="Generate floats 0.0-999.0 in batches of 100",
            ),
            FunctionExample(
                sql="SELECT * FROM double_sequence(5, increment := 0.5)",
                description="Generate 0.0, 0.5, 1.0, 1.5, 2.0",
            ),
        ]
388
+
389
+
390
@dataclass(slots=True, frozen=True)
class PartitionedSequenceFunctionArguments:
    """Argument set for ``PartitionedSequenceFunction``."""

    # Positional argument 0: how many integers to produce in total.
    count: Annotated[
        int,
        Arg(0, doc="Total number of integers to generate", ge=0),
    ]

    # Named argument "increment": defaults to 1, lower bound declared via ``ge=1``.
    increment: Annotated[
        int,
        Arg("increment", default=1, doc="Step between values", ge=1),
    ]
396
+
397
+
398
@dataclass(kw_only=True)
class PartitionedSequenceState(ArrowSerializableDataclass):
    """Per-worker progress tracker for ``PartitionedSequenceFunction``.

    Records the chunk currently being emitted and how far into it the
    worker has got.
    """

    # Bounds of the chunk in progress; both stay None until a chunk is popped.
    current_start: int | None = None
    current_end: int | None = None

    # Next index to emit within the current chunk.
    current_idx: int = 0
405
+
406
+
407
@bind_fixed_schema
@_cardinality_from_count
class PartitionedSequenceFunction(
    TableFunctionGenerator[PartitionedSequenceFunctionArguments, PartitionedSequenceState]
):
    """Generates a partitioned sequence of integers for multi-worker execution.

    USE CASE
    --------
    Generate a sequence of values using a work queue pattern. The primary worker
    populates a queue with work chunks during initialization. All workers
    (including the primary) pull chunks from the queue and generate output.

    This is resilient to fewer workers launching than expected - all work
    will still be completed by the available workers.

    SCHEMA
    ------
    Output: {"n": int64}

    PARALLELIZATION
    ---------------
    Fully parallelizable using a shared work queue. Each worker pulls chunks
    atomically from the queue and generates values for that chunk.

    The union of all workers' output produces the complete sequence.

    Example:
    -------
        With count=3000 and MAX_PARTITIONS=24 (chunk = ceil(3000/24) = 125):
        Queue is populated with 24 items: [(0, 125), (125, 250), ...].
        Workers pull chunks and generate values for each range.
        Combined output: [0, 1, 2, ..., 2999]

        With count=5 and increment=10:
        Combined output: [0, 10, 20, 30, 40]

    """

    class Meta:
        """Metadata for PartitionedSequenceFunction."""

        name = "partitioned_sequence"
        description = "Generates a partitioned sequence for multi-worker execution"
        categories = ["generator", "utility"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM partitioned_sequence(100)",
                description="Generate 0-99 in parallel across workers",
            ),
            FunctionExample(
                sql="SELECT * FROM partitioned_sequence(5, increment := 10)",
                description="Generate 0, 10, 20, 30, 40 in parallel",
            ),
        ]

    # Cap the work queue at ~MAX_PARTITIONS items regardless of count, by sizing
    # each chunk as ceil(count / MAX_PARTITIONS). The queue is drained one item
    # per round-trip and serialized at the per-attach DO, so partition *count*
    # drives remote cost. A fixed chunk size can't serve both a large query and
    # a small distribution query (too-large chunks collapse the small one to one
    # partition and kill fan-out); capping the partition count keeps ~24
    # partitions at any scale. Each work item is a fixed-size (start, end) range
    # — rows are generated locally and emitted in BATCH_SIZE batches — so this
    # changes only the *count* of tiny pops, never any HTTP body size. Output is
    # the plain sequence (partition-independent), so assertions are unchanged.
    MAX_PARTITIONS: ClassVar[int] = 24
    # Batch size for output within each chunk
    BATCH_SIZE: ClassVar[int] = 1000
    # Wire format of one queued work item: two big-endian unsigned 64-bit
    # integers, (start_idx, end_idx). Shared by on_init (pack) and process
    # (unpack) so the two sides cannot drift apart.
    _WORK_ITEM_FORMAT: ClassVar[str] = ">QQ"

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def on_init(
        cls,
        params: InitParams[PartitionedSequenceFunctionArguments],
    ) -> GlobalInitResponse:
        """Split ``[0, count)`` into chunks and push them onto the work queue.

        The chunk length is ``ceil(count / MAX_PARTITIONS)`` (at least 1), so
        at most ``MAX_PARTITIONS`` items are queued. Each item is a packed
        ``(start_idx, end_idx)`` half-open index range.

        Args:
            params: Init parameters; ``args.count`` is read and
                ``storage.queue_push`` receives the work items.

        Returns:
            A default ``GlobalInitResponse``.
        """
        count = params.args.count
        chunk = max(1, -(-count // cls.MAX_PARTITIONS))  # ceil(count / MAX_PARTITIONS)
        work_items: list[bytes] = [
            struct.pack(cls._WORK_ITEM_FORMAT, start_idx, min(start_idx + chunk, count))
            for start_idx in range(0, count, chunk)
        ]

        # Always enqueue (even if empty) to register the invocation
        params.storage.queue_push(work_items)
        return GlobalInitResponse()

    @classmethod
    def initial_state(cls, params: ProcessParams[PartitionedSequenceFunctionArguments]) -> PartitionedSequenceState:
        """Create initial state (no chunk claimed yet)."""
        return PartitionedSequenceState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[PartitionedSequenceFunctionArguments],
        state: PartitionedSequenceState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch, popping a new chunk from the queue when needed.

        Each call emits at most ``BATCH_SIZE`` rows from the current chunk;
        every emitted value is ``index * increment``. When the queue has no
        more chunks, the output is finished instead.

        Args:
            params: Process parameters; ``storage.queue_pop`` supplies chunks
                and ``args.increment`` scales the indices.
            state: Mutable progress state, updated in place.
            out: Collector that receives the batch or the finish signal.
        """
        # If we have no current chunk or finished current chunk, pop next
        if state.current_start is None or state.current_idx >= (state.current_end or 0):
            work_data = params.storage.queue_pop()
            if work_data is None:
                out.finish()
                return
            state.current_start, state.current_end = struct.unpack(cls._WORK_ITEM_FORMAT, work_data)
            # Narrows the optional for the type checker.
            assert state.current_start is not None
            state.current_idx = state.current_start

        batch_end_idx = min(state.current_idx + cls.BATCH_SIZE, state.current_end or 0)
        values = [idx * params.args.increment for idx in range(state.current_idx, batch_end_idx)]

        out.emit(pa.RecordBatch.from_pydict({"n": values}, schema=params.output_schema))

        state.current_idx = batch_end_idx
526
+
527
+
528
@dataclass(slots=True, frozen=True)
class TenThousandFunctionArguments:
    """Argument set for ``TenThousandFunction``; declares no fields."""
531
+
532
+
533
@dataclass(kw_only=True)
class TenThousandState(ArrowSerializableDataclass):
    """Progress tracker for ``TenThousandFunction``."""

    # First value of the next batch to emit.
    start: int = 0
538
+
539
+
540
@init_single_worker
@bind_fixed_schema
class TenThousandFunction(TableFunctionGenerator[TenThousandFunctionArguments, TenThousandState]):
    """Generates 10000 rows with integers from 0 to 9999.

    USE CASE
    --------
    Simple test data generator with a fixed row count. Useful for testing
    and benchmarking without needing to specify parameters.

    SCHEMA
    ------
    Output: {"n": int64}

    Example:
    -------
        SELECT * FROM ten_thousand()
        Returns: [{"n": 0}, {"n": 1}, ..., {"n": 9999}]

    """

    class Meta:
        """Metadata for TenThousandFunction."""

        name = "ten_thousand"
        description = "Generates 10000 integers from 0 to 9999"
        categories = ["generator", "utility"]
        examples = [
            FunctionExample(
                sql="SELECT * FROM ten_thousand()",
                description="Generate integers 0-9999",
            ),
        ]

    # Total number of rows produced; the single source of truth for both the
    # reported cardinality and the generation loop.
    ROW_COUNT: ClassVar[int] = 10000
    # Maximum number of rows emitted per process() call.
    BATCH_SIZE: ClassVar[int] = 1000

    FIXED_SCHEMA: ClassVar[pa.Schema] = schema(n=pa.int64())

    @classmethod
    def cardinality(cls, params: BindParams[TenThousandFunctionArguments]) -> TableCardinality:
        """Return exact cardinality: estimate and max both equal ``ROW_COUNT``."""
        return TableCardinality(estimate=cls.ROW_COUNT, max=cls.ROW_COUNT)

    @classmethod
    def initial_state(cls, params: ProcessParams[TenThousandFunctionArguments]) -> TenThousandState:
        """Create initial state (generation starts at 0)."""
        return TenThousandState()

    @classmethod
    def process(
        cls,
        params: ProcessParams[TenThousandFunctionArguments],
        state: TenThousandState,
        out: OutputCollector,
    ) -> None:
        """Emit the next batch of at most ``BATCH_SIZE`` integers.

        Finishes the output once ``state.start`` has reached ``ROW_COUNT``;
        otherwise emits ``[state.start, end)`` and advances ``state.start``.

        Args:
            params: Process parameters; ``output_schema`` shapes the batch.
            state: Mutable progress state, updated in place.
            out: Collector that receives the batch or the finish signal.
        """
        total = cls.ROW_COUNT
        if state.start >= total:
            out.finish()
            return

        end = min(state.start + cls.BATCH_SIZE, total)
        values = np.arange(state.start, end, dtype=np.int64)
        out.emit(pa.RecordBatch.from_pydict({"n": values}, schema=params.output_schema))
        state.start = end
604
+
605
+
606
@dataclass(slots=True, frozen=True)
class RowIdSequenceFunctionArgs:
    """Argument set for ``RowIdSequenceFunction``."""

    # Positional argument 0.
    count: Annotated[int, Arg(0, doc="Number of rows to generate", ge=0)]

    # Named argument selecting where the row_id column sits in the schema.
    layout: Annotated[
        str,
        Arg("layout", default="first", doc="Row ID column position", choices=("first", "middle", "last")),
    ]

    # Named argument selecting the Arrow type of the row_id column.
    row_id_type: Annotated[
        str,
        Arg("row_id_type", default="int64", doc="Row ID type", choices=("int64", "string", "struct")),
    ]
629
+
630
+
631
+ @init_single_worker
632
+ class RowIdSequenceFunction(_BaseSequenceFunction):
633
+ """Generates a sequence with a row_id column for testing row_id support.
634
+
635
+ The layout argument controls where the row_id column appears in the schema,
636
+ and row_id_type controls the type of the row_id column.
637
+
638
+ """
639
+
640
+ FunctionArguments = RowIdSequenceFunctionArgs
641
+
642
+ class Meta:
643
+ """Metadata for RowIdSequenceFunction."""
644
+
645
+ name = "rowid_sequence"
646
+ description = "Sequence with row_id column"
647
+ projection_pushdown = True
648
+
649
+ @classmethod
650
+ def statistics(cls, params: BindParams[RowIdSequenceFunctionArgs]) -> list[ColumnStatistics] | None:
651
+ """Skip the base ``int64`` arange stats — schema is dynamic per-args here."""
652
+ return None
653
+
654
+ @classmethod
655
+ def on_bind(cls, params: BindParams[RowIdSequenceFunctionArgs]) -> BindResponse:
656
+ """Build schema with is_row_id metadata on the appropriate field."""
657
+ layout = params.args.layout
658
+ row_id_type = params.args.row_id_type
659
+
660
+ # Build the row_id field with is_row_id metadata
661
+ rid_metadata = {b"is_row_id": b""}
662
+ rid_field: pa.Field[Any]
663
+ if row_id_type == "string":
664
+ rid_field = pa.field("row_id", pa.string(), metadata=rid_metadata)
665
+ elif row_id_type == "struct":
666
+ rid_field = pa.field(
667
+ "row_id",
668
+ pa.struct([("a", pa.int64()), ("b", pa.string())]),
669
+ metadata=rid_metadata,
670
+ )
671
+ else: # int64
672
+ rid_field = pa.field("row_id", pa.int64(), metadata=rid_metadata)
673
+
674
+ name_field = pa.field("name", pa.string())
675
+ value_field = pa.field("value", pa.string())
676
+
677
+ if layout == "middle":
678
+ fields = [name_field, rid_field, value_field]
679
+ elif layout == "last":
680
+ fields = [name_field, value_field, rid_field]
681
+ else: # first
682
+ fields = [rid_field, name_field, value_field]
683
+
684
+ return BindResponse(output_schema=pa.schema(fields))
685
+
686
+ @classmethod
687
+ def _emit_chunk(
688
+ cls,
689
+ params: ProcessParams[RowIdSequenceFunctionArgs],
690
+ state: CountdownState,
691
+ out: OutputCollector,
692
+ start: int,
693
+ size: int,
694
+ ) -> None:
695
+ """Emit a batch of row_id + data columns matching the dynamic output schema."""
696
+ columns: dict[str, Any] = {}
697
+ for f in params.output_schema:
698
+ if f.name == "row_id":
699
+ if pa.types.is_string(f.type):
700
+ columns["row_id"] = [f"rid_{i}" for i in range(start, start + size)]
701
+ elif pa.types.is_struct(f.type):
702
+ columns["row_id"] = [{"a": i, "b": f"s_{i}"} for i in range(start, start + size)]
703
+ else:
704
+ columns["row_id"] = list(range(start, start + size))
705
+ elif f.name == "name":
706
+ columns["name"] = [f"item_{i}" for i in range(start, start + size)]
707
+ elif f.name == "value":
708
+ columns["value"] = [f"val_{i}" for i in range(start, start + size)]
709
+
710
+ out.emit(pa.RecordBatch.from_pydict(columns, schema=params.output_schema))