vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
@@ -0,0 +1,735 @@
1
+ Metadata-Version: 2.4
2
+ Name: vgi-python
3
+ Version: 0.8.0
4
+ Summary: Vector Gateway Interface - Connect DuckDB to external programs via Apache Arrow
5
+ Project-URL: Homepage, https://query.farm
6
+ Project-URL: Repository, https://github.com/Query-farm/vgi-python
7
+ Project-URL: Documentation, https://github.com/Query-farm/vgi-python/tree/main/docs
8
+ Project-URL: Issues, https://github.com/Query-farm/vgi-python/issues
9
+ Author-email: Rusty Conover <rusty@query.farm>
10
+ Maintainer-email: Query Farm LLC <hello@query.farm>
11
+ License: Query Farm Source-Available License, Version 1.0
12
+
13
+ Copyright (c) 2025, 2026 Query Farm LLC. All rights reserved.
14
+
15
+ ## 1. Definitions
16
+
17
+ "Licensor" means Query Farm LLC (http://query.farm, hello@query.farm) and its
18
+ affiliates under common control.
19
+
20
+ "VGI" means the Vector Gateway Interface, the DuckDB extension technology developed
21
+ by the Licensor, also referred to by the Licensor as its "Hyperfederation" database
22
+ technology.
23
+
24
+ "Licensed Work" means VGI, including its source code, object code, and any
25
+ documentation distributed with it, in each version made available by the Licensor
26
+ under this License.
27
+
28
+ "You" (or "Your") means the individual or legal entity exercising rights under this
29
+ License, together with all affiliates under common control with that entity.
30
+
31
+ "Production Use" means any use of the Licensed Work other than for development,
32
+ testing, evaluation, experimentation, or other non-production purposes.
33
+
34
+ "Hyperfederation Services" means services relating to the federation, gateway,
35
+ integration, querying, or interoperation of data sources using VGI or
36
+ functionally equivalent technology, including services that expose, broker, or
37
+ provide access to such federated or gateway capabilities.
38
+
39
+ "Commercial Marketplace" means any platform, exchange, or intermediary service,
40
+ whether or not operated for a fee, that connects providers and consumers of
41
+ Hyperfederation Services, or that facilitates the offering, discovery, exchange,
42
+ sale, or licensing of Hyperfederation Services among third parties.
43
+
44
+ "Competing Offering" means a product or service that You make available to third
45
+ parties, on a paid basis (including through paid support, subscription, or hosting
46
+ arrangements), whose capabilities significantly overlap with those of the Licensor's
47
+ version(s) of the Licensed Work.
48
+
49
+ ## 2. Grant of Rights
50
+
51
+ Subject to the terms and limitations of this License, the Licensor grants You a
52
+ worldwide, royalty-free, non-exclusive license to:
53
+
54
+ (a) use, copy, and run the Licensed Work for any non-production purpose;
55
+
56
+ (b) modify the Licensed Work and create derivative works of it;
57
+
58
+ (c) redistribute the Licensed Work and Your derivative works, provided You comply
59
+ with Section 5; and
60
+
61
+ (d) make Production Use of the Licensed Work, except where such use is restricted by
62
+ Section 3 or reserved to the Licensor by Section 4.
63
+
64
+ ## 3. Production Use Conditions
65
+
66
+ The grant of Production Use in Section 2(d) does not extend to, and You may not
67
+ without a separate commercial license from the Licensor:
68
+
69
+ (a) provide a Competing Offering to third parties; or
70
+
71
+ (b) offer the Licensed Work, or any derivative work of it, to third parties on a
72
+ hosted, embedded, or as-a-service basis where doing so competes with the Licensor's
73
+ commercial interests in the Licensed Work.
74
+
75
+ "Embedded" includes incorporating the source or object code of the Licensed Work
76
+ into a Competing Offering, and packaging a Competing Offering such that the Licensed
77
+ Work must be accessed or downloaded for that offering to function.
78
+
79
+ Hosting or using the Licensed Work for Your own internal purposes is not a Competing
80
+ Offering and is permitted, including across Your affiliates under common control.
81
+
82
+ ## 4. Reserved Rights
83
+
84
+ Notwithstanding any other provision of this License, the Licensor reserves to itself
85
+ the exclusive right to build, operate, offer, or authorize a Commercial Marketplace
86
+ that incorporates, integrates, is built upon, or otherwise uses the Licensed Work.
87
+
88
+ This License grants You no right to construct, operate, or enable a Commercial
89
+ Marketplace using the Licensed Work, whether on a commercial or non-commercial basis,
90
+ and any such use requires a separate written agreement with the Licensor.
91
+
92
+ ## 5. Redistribution
93
+
94
+ If You redistribute the Licensed Work or any derivative work of it, in original or
95
+ modified form, You must:
96
+
97
+ (a) include a complete, unmodified copy of this License with each copy; and
98
+
99
+ (b) cause any recipient to receive the Licensed Work subject to the terms of this
100
+ License.
101
+
102
+ The conditions in Sections 3 and 4 apply to every recipient of the Licensed Work,
103
+ whether received directly from the Licensor or through a third party.
104
+
105
+ ## 6. Conversion to Open Source
106
+
107
+ For each version of the Licensed Work, on the tenth anniversary of the date the
108
+ Licensor first made that version publicly available (the "Change Date" for that
109
+ version), the Licensor additionally grants You the right to use that version under
110
+ the terms of the Apache License, Version 2.0, and on and after that version's Change
111
+ Date the restrictions in Sections 3 and 4 no longer apply to that version.
112
+
113
+ This License applies separately to each version of the Licensed Work, and the Change
114
+ Date may differ between versions.
115
+
116
+ ## 7. Commercial Licensing
117
+
118
+ If Your intended use is not permitted under this License, You may obtain a separate
119
+ commercial license from the Licensor by contacting hello@query.farm. Absent such a
120
+ license, You must refrain from the restricted use.
121
+
122
+ ## 8. Trademarks
123
+
124
+ This License does not grant You any right to use the names, trademarks, service
125
+ marks, or logos of the Licensor, including "Vector Gateway Interface," "VGI," and
126
+ "Hyperfederation," except as required for reasonable and customary use in describing
127
+ the origin of the Licensed Work.
128
+
129
+ ## 9. Termination
130
+
131
+ Any use of the Licensed Work in violation of this License automatically terminates
132
+ Your rights under this License for the current and all other versions of the Licensed
133
+ Work. Your rights may be reinstated only by a writing signed by the Licensor.
134
+
135
+ ## 10. Disclaimer of Warranty and Limitation of Liability
136
+
137
+ TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
138
+ AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED,
139
+ INCLUDING WITHOUT LIMITATION ANY WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS
140
+ FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, OR TITLE.
141
+
142
+ TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL THE LICENSOR BE
143
+ LIABLE TO YOU FOR ANY DAMAGES ARISING OUT OF OR RELATING TO THIS LICENSE OR THE USE
144
+ OF THE LICENSED WORK, WHETHER IN CONTRACT, TORT, OR OTHERWISE.
145
+ License-File: LICENSE
146
+ Classifier: Development Status :: 4 - Beta
147
+ Classifier: Intended Audience :: Developers
148
+ Classifier: License :: Other/Proprietary License
149
+ Classifier: Operating System :: OS Independent
150
+ Classifier: Programming Language :: Python :: 3
151
+ Classifier: Programming Language :: Python :: 3 :: Only
152
+ Classifier: Programming Language :: Python :: 3.13
153
+ Classifier: Programming Language :: Python :: 3.14
154
+ Classifier: Topic :: Database
155
+ Classifier: Topic :: Database :: Database Engines/Servers
156
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
157
+ Classifier: Typing :: Typed
158
+ Requires-Python: >=3.13
159
+ Requires-Dist: click
160
+ Requires-Dist: httpx>=0.24
161
+ Requires-Dist: platformdirs
162
+ Requires-Dist: pyarrow
163
+ Requires-Dist: typer>=0.9
164
+ Requires-Dist: vgi-rpc>=0.20.0
165
+ Provides-Extra: azure
166
+ Requires-Dist: azure-identity>=1.16.0; extra == 'azure'
167
+ Requires-Dist: pymssql>=2.3.0; extra == 'azure'
168
+ Provides-Extra: dev
169
+ Requires-Dist: azure-identity>=1.16.0; extra == 'dev'
170
+ Requires-Dist: duckdb>=1.5.3; extra == 'dev'
171
+ Requires-Dist: mypy; extra == 'dev'
172
+ Requires-Dist: numpy>=2.4.1; extra == 'dev'
173
+ Requires-Dist: pyarrow-stubs; extra == 'dev'
174
+ Requires-Dist: pymssql>=2.3.0; extra == 'dev'
175
+ Requires-Dist: pytest; extra == 'dev'
176
+ Requires-Dist: pytest-cov; extra == 'dev'
177
+ Requires-Dist: pytest-examples; extra == 'dev'
178
+ Requires-Dist: pytest-ruff; extra == 'dev'
179
+ Requires-Dist: pytest-xdist; extra == 'dev'
180
+ Requires-Dist: ruff; extra == 'dev'
181
+ Requires-Dist: sqlglot; extra == 'dev'
182
+ Requires-Dist: vgi-rpc[conformance,external,http,oauth,otel,sentry]; extra == 'dev'
183
+ Requires-Dist: vgi-rpc[http]; extra == 'dev'
184
+ Requires-Dist: vgi-rpc[oauth]; extra == 'dev'
185
+ Requires-Dist: vgi-rpc[otel]; extra == 'dev'
186
+ Requires-Dist: vgi-rpc[sentry]; extra == 'dev'
187
+ Provides-Extra: duckdb
188
+ Requires-Dist: duckdb>=1.5.3; extra == 'duckdb'
189
+ Provides-Extra: fixtures
190
+ Requires-Dist: vgi-fixtures; extra == 'fixtures'
191
+ Provides-Extra: haybarn
192
+ Requires-Dist: haybarn>=1.5.3rc10; extra == 'haybarn'
193
+ Provides-Extra: http
194
+ Requires-Dist: vgi-rpc[http]; extra == 'http'
195
+ Provides-Extra: oauth
196
+ Requires-Dist: vgi-rpc[oauth]; extra == 'oauth'
197
+ Provides-Extra: otel
198
+ Requires-Dist: vgi-rpc[otel]; extra == 'otel'
199
+ Provides-Extra: sentry
200
+ Requires-Dist: vgi-rpc[sentry]; extra == 'sentry'
201
+ Provides-Extra: test-fixtures
202
+ Requires-Dist: numpy>=2.4.1; extra == 'test-fixtures'
203
+ Provides-Extra: test-fixtures-writable
204
+ Requires-Dist: numpy>=2.4.1; extra == 'test-fixtures-writable'
205
+ Requires-Dist: sqlglot; extra == 'test-fixtures-writable'
206
+ Provides-Extra: transactor
207
+ Requires-Dist: sqlglot; extra == 'transactor'
208
+ Description-Content-Type: text/markdown
209
+
210
+ # VGI (Vector Gateway Interface)
211
+
212
+ <p align="center">
213
+ <img src="docs/vgi-logo.png" alt="VGI Logo" width="400">
214
+ </p>
215
+
216
+ <p align="center">
217
+ <strong>Apache Arrow-based protocol for extending DuckDB using any language.</strong><br/>
218
+ <strong>No C++/C/Zig/Rust or compilation/linking required (unless you want to).</strong>
219
+ </p>
220
+
221
+ <p align="center">
222
+ Created by <a href="https://query.farm">Query.Farm</a>
223
+ </p>
224
+
225
+ ---
226
+
227
+ ## See It in Action
228
+
229
+ ```python
230
+ # my_worker.py
231
+ from typing import Annotated
232
+ from vgi import ScalarFunction, Param, Returns, Worker
233
+ import pyarrow as pa
234
+ import pyarrow.compute as pc
235
+
236
+ class Greeting(ScalarFunction):
237
+ """Generate a greeting for each name."""
238
+
239
+ @classmethod
240
+ def compute(
241
+ cls,
242
+ name: Annotated[pa.StringArray, Param(doc="Column containing names")],
243
+ ) -> Annotated[pa.StringArray, Returns()]:
244
+ return pc.binary_join_element_wise("Hello, ", name, "!", "")
245
+
246
+ class MyWorker(Worker):
247
+ functions = [Greeting]
248
+
249
+ if __name__ == "__main__":
250
+ MyWorker().run()
251
+ ```
252
+
253
+ ```sql
254
+ -- First time only.
255
+ INSTALL vgi FROM COMMUNITY;
256
+ LOAD vgi;
257
+ ATTACH 'my_worker' (TYPE 'vgi', LOCATION './my_worker.py');
258
+
259
+ SELECT greeting(name) FROM users;
260
+ -- "Hello, Alice!"
261
+ -- "Hello, Bob!"
262
+ ```
263
+
264
+ Or you can launch the DuckDB CLI with
265
+
266
+ `duckdb vgi:my_worker.py` to start a new session with the functions you just added.
267
+
268
+ That's it. No C++ compilation, no extension versioning, no complex build process. Just a Python script that DuckDB can call.
269
+
270
+ ---
271
+
272
+ ## Installation
273
+
274
+ ```bash
275
+ pip install vgi-python
276
+ ```
277
+
278
+ Or with [uv](https://github.com/astral-sh/uv):
279
+
280
+ ```bash
281
+ uv add vgi-python
282
+ ```
283
+
284
+ ---
285
+
286
+ ## Why VGI?
287
+
288
+ VGI lets you extend DuckDB with Python functions that run in separate processes, communicating via Apache Arrow IPC. This means:
289
+
290
+ | Traditional Extensions | VGI Workers |
291
+ |----------------------|-------------|
292
+ | C/C++ compilation required | Any language, starting with Python, TypeScript, and Go |
293
+ | Tied to DuckDB version | Version independent |
294
+ | Complex build/release cycle | Ship a script or executable |
295
+ | Runs in-process | Process isolation |
296
+ | Single-threaded | Parallel workers |
297
+
298
+ **Use cases:**
299
+ - Call REST APIs or external services from SQL
300
+ - Run ML inference (PyTorch, scikit-learn, etc.)
301
+ - Process data with Python libraries (pandas, numpy)
302
+ - Build custom ETL transforms
303
+ - Create domain-specific functions for your team
304
+ - Expose external data sources as queryable tables and views
305
+
306
+ ---
307
+
308
+ ## Quick Start
309
+
310
+ ### Step 1: Create a Worker
311
+
312
+ A worker is a Python script that defines one or more functions:
313
+
314
+ ```python
315
+ #!/usr/bin/env python
316
+ # my_worker.py
317
+ from typing import Annotated
318
+ import pyarrow as pa
319
+ import pyarrow.compute as pc
320
+ from vgi import ScalarFunction, Param, Returns, Worker
321
+
322
+
323
+ class UpperCase(ScalarFunction):
324
+ """Convert string values to uppercase."""
325
+
326
+ @classmethod
327
+ def compute(
328
+ cls,
329
+ value: Annotated[pa.StringArray, Param(doc="String value to uppercase")],
330
+ ) -> Annotated[pa.StringArray, Returns()]:
331
+ return pc.utf8_upper(value)
332
+
333
+
334
+ class MyWorker(Worker):
335
+ catalog_name = "my_funcs"
336
+ functions = [UpperCase]
337
+
338
+
339
+ if __name__ == "__main__":
340
+ MyWorker().run()
341
+ ```
342
+
343
+ ### Step 2: Use from DuckDB
344
+
345
+ ```sql
346
+ -- Attach the worker as a catalog
347
+ ATTACH 'my_funcs' (TYPE 'vgi', LOCATION './my_worker.py');
348
+
349
+ -- Call your function
350
+ SELECT upper_case(name) FROM users;
351
+
352
+ -- Use in complex queries
353
+ SELECT id, upper_case(status) as status
354
+ FROM orders
355
+ WHERE created_at > '2024-01-01';
356
+ ```
357
+
358
+ ### Step 3: There is no step 3
359
+
360
+ Your function is now available in DuckDB. Ship the Python script to your team, and they can use it immediately.
361
+
362
+ ---
363
+
364
+ ## Going Further: Type-Safe Arguments
365
+
366
+ For production use, you'll want type validation. Use `Param` with `type_bound` to ensure columns have the correct type:
367
+
368
+ ```python
369
+ from typing import Annotated
370
+ from vgi import ScalarFunction, Param, Returns, Worker
371
+ import pyarrow as pa
372
+ import pyarrow.compute as pc
373
+
374
+
375
+ class AddValues(ScalarFunction):
376
+ """Add two integer values together."""
377
+
378
+ @classmethod
379
+ def compute(
380
+ cls,
381
+ left: Annotated[pa.Int64Array, Param(type_bound=pa.types.is_integer, doc="First integer value")],
382
+ right: Annotated[pa.Int64Array, Param(type_bound=pa.types.is_integer, doc="Second integer value")],
383
+ ) -> Annotated[pa.Int64Array, Returns()]:
384
+ return pc.add(left, right)
385
+ ```
386
+
387
+ ```sql
388
+ SELECT add_values(price, tax) as total FROM orders;
389
+
390
+ -- This would fail at bind time with a clear error:
391
+ -- SELECT add_values(name, price) FROM orders;
392
+ -- Error: Column 'name' has type string, expected integer
393
+ ```
394
+
395
+ Key features of the `Param`/`Returns` API:
396
+ - Types are inferred from PyArrow array annotations (`pa.Int64Array` -> `pa.int64()`)
397
+ - `type_bound` validates the column's Arrow type at bind time
398
+ - `ConstParam` receives scalar values (not columns) from SQL arguments
399
+ - `Returns` declares the output type
400
+
401
+ ---
402
+
403
+ ## Function Types
404
+
405
+ VGI supports three function types:
406
+
407
+ | Type | Base Class | SQL Pattern | Use Case |
408
+ |------|------------|-------------|----------|
409
+ | **Scalar** | `ScalarFunction` | `SELECT func(col) FROM t` | Per-row transforms (1:1) |
410
+ | **Table** | `TableFunctionGenerator` | `SELECT * FROM func(args)` | Generate data |
411
+ | **Table-In-Out** | `TableInOutFunction` | `SELECT * FROM func((SELECT ...))` | Aggregation, filtering |
412
+
413
+ ### Scalar Functions
414
+
415
+ Transform each row independently. Output has the same number of rows as input.
416
+
417
+ ```python
418
+ class Double(ScalarFunction):
419
+ """Double an integer value."""
420
+
421
+ @classmethod
422
+ def compute(
423
+ cls,
424
+ value: Annotated[pa.Int64Array, Param(doc="Value to double")],
425
+ ) -> Annotated[pa.Int64Array, Returns()]:
426
+ return pc.multiply(value, 2)
427
+ ```
428
+
429
+ ### Table Functions
430
+
431
+ Generate output data from arguments (no input table). Each call to `process()` emits
432
+ a batch via `out.emit()` or signals completion via `out.finish()`.
433
+
434
+ ```python
435
+ from dataclasses import dataclass
436
+ from typing import Annotated, ClassVar
437
+ import pyarrow as pa
438
+ from vgi import TableFunctionGenerator, Arg
439
+ from vgi.table_function import ProcessParams, OutputCollector
440
+
441
+
442
+ @dataclass
443
+ class CounterState:
444
+ remaining: int
445
+ current: int = 0
446
+
447
+
448
+ class Counter(TableFunctionGenerator):
449
+ """Generate a sequence of integers."""
450
+
451
+ count: Annotated[int, Arg(0, doc="Number of rows to generate")]
452
+ FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema([("n", pa.int64())])
453
+
454
+ @classmethod
455
+ def initial_state(cls, params: ProcessParams) -> CounterState:
456
+ return CounterState(remaining=params.args.count)
457
+
458
+ @classmethod
459
+ def process(cls, params: ProcessParams, state: CounterState, out: OutputCollector) -> None:
460
+ if state.remaining <= 0:
461
+ out.finish()
462
+ return
463
+ batch_size = min(state.remaining, 1000)
464
+ values = list(range(state.current, state.current + batch_size))
465
+ out.emit(pa.RecordBatch.from_pydict({"n": values}, schema=params.output_schema))
466
+ state.current += batch_size
467
+ state.remaining -= batch_size
468
+ ```
469
+
470
+ ### Table-In-Out Functions
471
+
472
+ Transform or aggregate input data. Override `transform()` for per-batch processing
473
+ and `finish()` for final output after all input is consumed.
474
+
475
+ ```python
476
+ import pyarrow as pa
477
+ import pyarrow.compute as pc
478
+ from vgi import TableInOutFunction
479
+
480
+
481
+ class FilterPositive(TableInOutFunction):
482
+ """Keep only rows where all numeric columns are positive."""
483
+
484
+ @property
485
+ def output_schema(self) -> pa.Schema:
486
+ return self.input_schema
487
+
488
+ def transform(self, batch: pa.RecordBatch) -> pa.RecordBatch:
489
+ mask = None
490
+ for i, field in enumerate(batch.schema):
491
+ if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
492
+ col_mask = pc.greater(batch.column(i), 0)
493
+ mask = col_mask if mask is None else pc.and_(mask, col_mask)
494
+ if mask is not None:
495
+ return pc.filter(batch, mask)
496
+ return batch
497
+ ```
498
+
499
+ ---
500
+
501
+ ## Beyond Functions: Full Catalog Support
502
+
503
+ VGI workers can expose more than just functions. A worker can provide a complete database catalog with:
504
+
505
+ - **Schemas** - Organize objects into namespaces
506
+ - **Tables** - Expose external data as queryable tables
507
+ - **Views** - Define SQL views over your data
508
+ - **Functions** - Scalar, table, and table-in-out functions
509
+
510
+ ```sql
511
+ ATTACH 'external_db' (TYPE 'vgi', LOCATION './my_catalog_worker.py');
512
+
513
+ -- Query tables from the attached catalog
514
+ SELECT * FROM external_db.main.users;
515
+
516
+ -- Use views
517
+ SELECT * FROM external_db.analytics.daily_summary;
518
+
519
+ -- Call functions
520
+ SELECT external_db.main.transform(col) FROM my_table;
521
+ ```
522
+
523
+ This enables VGI workers to act as bridges to external systems—databases, APIs, file systems—presenting them as native DuckDB catalogs.
524
+
525
+ See [Catalog Interface](docs/catalog-interface.md) for implementation details.
526
+
527
+ ---
528
+
529
+ ## Parallel Execution
530
+
531
+ Functions can run across multiple worker processes. The client automatically
532
+ distributes input batches round-robin across workers and collects results.
533
+
534
+ See [Function API Reference](docs/generator-api.md) for advanced patterns like distributed aggregation.
535
+
536
+ ---
537
+
538
+ ## Error Handling
539
+
540
+ Errors in your functions propagate to DuckDB with clear messages:
541
+
542
+ ```python test="skip"
543
+ @classmethod
544
+ def compute(cls, value: Annotated[pa.Int64Array, Param()]) -> Annotated[pa.Int64Array, Returns()]:
545
+ raise ValueError("Something went wrong")
546
+ ```
547
+
548
+ ```sql
549
+ SELECT my_func(col) FROM my_table;
550
+ -- Error: Something went wrong
551
+ ```
552
+
553
+ Type bound violations are caught at bind time (before processing starts):
554
+
555
+ ```sql
556
+ SELECT add_values(name, price) FROM orders;
557
+ -- Error: Argument 'left': Column 'name' has type string,
558
+ -- but type bound requires: is_integer
559
+ ```
560
+
561
+ ### Debugging Worker Failures
562
+
563
+ When a worker fails, the Python traceback is written to stderr. By default, the client captures this stderr and includes it in the error message (last 50 lines), so you get the full context:
564
+
565
+ ```
566
+ ClientError: Worker Exception: function 'my_func' raised ValueError
567
+
568
+ Worker stderr:
569
+ Traceback (most recent call last):
570
+ File "my_worker.py", line 42, in compute
571
+ ...
572
+ ValueError: Something went wrong
573
+ ```
574
+
575
+ For real-time debugging, set `VGI_WORKER_DEBUG=1` to stream worker logs directly to your terminal and enable DEBUG-level logging:
576
+
577
+ ```bash
578
+ VGI_WORKER_DEBUG=1 python my_script.py
579
+ ```
580
+
581
+ This is especially useful when integrating from C++ or other clients where stderr might otherwise be lost.
582
+
583
+ ---
584
+
585
+ ## Testing Your Functions
586
+
587
+ Use the VGI client for integration tests:
588
+
589
+ ```python
590
+ from vgi.client import Client
591
+ from vgi import Arguments
592
+ import pyarrow as pa
593
+
594
+ batch = pa.RecordBatch.from_pydict({"name": ["alice", "bob"]})
595
+
596
+ with Client("./my_worker.py") as client:
597
+ results = list(client.scalar_function(
598
+ function_name="upper_case",
599
+ input=iter([batch]),
600
+ arguments=Arguments(positional=[pa.scalar("name")]),
601
+ ))
602
+
603
+ assert results[0]["result"].to_pylist() == ["ALICE", "BOB"]
604
+ ```
605
+
606
+ ---
607
+
608
+ ## Protocol Overview
609
+
610
+ VGI uses `vgi_rpc`, an Apache Arrow IPC-based RPC framework, for all
611
+ client-worker communication over stdin/stdout pipes:
612
+
613
+ ```
614
+ Client Worker
615
+ │ │
616
+ │──── bind(request) ──────────────▶ │ Function name, args, input schema
617
+ │◀─── BindResponse ──────────────── │ Output schema, opaque data
618
+ │ │
619
+ │──── init(request) ──────────────▶ │ Start processing stream
620
+ │◀─── Stream header ─────────────── │ execution_id, max_workers
621
+ │ │
622
+ │──── exchange(batch1) ───────────▶ │
623
+ │◀─── output batch 1 ────────────── │ transform(batch)
624
+ │ ... │
625
+ │──── [stream close] ─────────────▶ │ Signal end of input
626
+ │ │
627
+ │──── init(phase=FINALIZE) ───────▶ │ Start finalize stream
628
+ │◀─── final output batches ──────── │ finish() results
629
+ └───────────────────────────────────┘
630
+ ```
631
+
632
+ ---
633
+
634
+ ## External Batch Offloading (Demo Storage)
635
+
636
+ When record batches are too large for HTTP request/response bodies, VGI supports
637
+ externalizing them to blob storage. The server replaces oversized batches with
638
+ pointer batches containing a URL, and the client transparently fetches the data.
639
+
640
+ The example HTTP server includes a built-in demo blob store for testing this
641
+ without S3 or any cloud infrastructure:
642
+
643
+ ```bash
644
+ # Start with demo storage (4 KiB threshold for testing)
645
+ vgi-fixture-http --demo-storage --externalize-threshold-bytes 4096
646
+
647
+ # With zstd compression
648
+ vgi-fixture-http --demo-storage --externalize-threshold-bytes 4096 --externalize-compression zstd
649
+ ```
650
+
651
+ When `--demo-storage` is enabled:
652
+ - Batches exceeding `--externalize-threshold-bytes` are stored in-memory and
653
+ served from `/__blobs__/{id}` endpoints on the same server
654
+ - Clients can request upload URLs for large inputs via the `__upload_url__` endpoint
655
+ - The server advertises `VGI-Max-Request-Bytes` and rejects oversized requests with 413
656
+
657
+ For production use, implement the `ExternalStorage` protocol from `vgi_rpc` against
658
+ your cloud storage (S3, GCS, etc.). The example server also supports S3 via `--s3-bucket`.
659
+
660
+ ---
661
+
662
+ ## Documentation
663
+
664
+ - [Function Lifecycle](docs/lifecycle.md) - Bind, init, process, finalize
665
+ - [Metadata API](docs/metadata.md) - Function introspection
666
+ - [Function API Reference](docs/generator-api.md) - Advanced function patterns
667
+ - [Catalog Interface](docs/catalog-interface.md) - DuckDB ATTACH integration
668
+
669
+ ---
670
+
671
+ ## Logging
672
+
673
+ Workers support `--debug`, `--log-level`, `--log-format`, and `--log-logger` options:
674
+
675
+ ```bash
676
+ # Enable debug logging
677
+ vgi-fixture-worker --debug
678
+
679
+ # JSON-formatted logs for structured pipelines
680
+ vgi-fixture-worker --log-format json
681
+
682
+ # Target a specific logger
683
+ vgi-fixture-worker --log-level DEBUG --log-logger vgi.worker
684
+ ```
685
+
686
+ You can also use the `VGI_WORKER_DEBUG=1` environment variable, which enables `--debug` on the worker and stderr passthrough on the client without changing any code or CLI flags:
687
+
688
+ ```bash
689
+ VGI_WORKER_DEBUG=1 python my_script.py
690
+ ```
691
+
692
+ See [CLI Reference](docs/cli.md#worker-logging) for the full list of loggers and options.
693
+
694
+ ---
695
+
696
+ ## Development
697
+
698
+ ```bash
699
+ git clone https://github.com/query-farm/vgi-python
700
+ cd vgi-python
701
+
702
+ uv sync --all-extras # Install dependencies
703
+ uv run pytest -n auto # Run tests
704
+ uv run ruff check --fix . # Lint
705
+ uv run ruff format . # Format
706
+ uv run mypy vgi/ # Type check
707
+ ```
708
+
709
+ ## Requirements
710
+
711
+ - Python >= 3.13
712
+ - pyarrow
713
+ - DuckDB (for SQL integration)
714
+
715
+ ---
716
+
717
+ ## License
718
+
719
+ Copyright (c) 2025, 2026 Query Farm LLC.
720
+
721
+ Licensed under the **Query Farm Source-Available License, Version 1.0** — see
722
+ [LICENSE](LICENSE) for the binding terms. In summary (the LICENSE text governs):
723
+
724
+ - ✅ **Use, copy, modify, and redistribute** the code freely, **including in
725
+ production and for commercial purposes** — your own internal use, and building
726
+ products and services on top of VGI.
727
+ - 🚫 Not permitted **without a separate commercial license**: offering a
728
+ *competing* VGI-equivalent product or service to third parties (hosted,
729
+ embedded, or as-a-service), or operating a commercial marketplace for such
730
+ services.
731
+ - ⏳ Each released version converts to the **Apache License, Version 2.0**, ten
732
+ years after its public release.
733
+
734
+ For a commercial license or any licensing questions, contact
735
+ [hello@query.farm](mailto:hello@query.farm).