vgi-python 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vgi/__init__.py +152 -0
- vgi/_duckdb.py +62 -0
- vgi/_storage_profile.py +132 -0
- vgi/_test_fixtures/__init__.py +20 -0
- vgi/_test_fixtures/accumulate/__init__.py +19 -0
- vgi/_test_fixtures/accumulate/worker.py +762 -0
- vgi/_test_fixtures/aggregate/__init__.py +62 -0
- vgi/_test_fixtures/aggregate/_common.py +21 -0
- vgi/_test_fixtures/aggregate/basic.py +232 -0
- vgi/_test_fixtures/aggregate/dynamic.py +409 -0
- vgi/_test_fixtures/aggregate/generic.py +86 -0
- vgi/_test_fixtures/aggregate/listagg.py +71 -0
- vgi/_test_fixtures/aggregate/percentile.py +107 -0
- vgi/_test_fixtures/aggregate/streaming.py +192 -0
- vgi/_test_fixtures/aggregate/varargs.py +75 -0
- vgi/_test_fixtures/aggregate/window.py +380 -0
- vgi/_test_fixtures/attach_options.py +308 -0
- vgi/_test_fixtures/bad_protocol.py +62 -0
- vgi/_test_fixtures/cancellable.py +336 -0
- vgi/_test_fixtures/catalog.py +813 -0
- vgi/_test_fixtures/http_server.py +394 -0
- vgi/_test_fixtures/nest_tensor.py +614 -0
- vgi/_test_fixtures/orchard_catalog.py +47 -0
- vgi/_test_fixtures/projection_repro/__init__.py +6 -0
- vgi/_test_fixtures/projection_repro/worker.py +454 -0
- vgi/_test_fixtures/scalar/__init__.py +116 -0
- vgi/_test_fixtures/scalar/_common.py +69 -0
- vgi/_test_fixtures/scalar/arithmetic.py +321 -0
- vgi/_test_fixtures/scalar/binary.py +120 -0
- vgi/_test_fixtures/scalar/formatting.py +176 -0
- vgi/_test_fixtures/scalar/geo.py +300 -0
- vgi/_test_fixtures/scalar/null_handling.py +107 -0
- vgi/_test_fixtures/scalar/random_demo.py +171 -0
- vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
- vgi/_test_fixtures/scalar/type_info.py +219 -0
- vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
- vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
- vgi/_test_fixtures/simple_writable.py +793 -0
- vgi/_test_fixtures/table/__init__.py +221 -0
- vgi/_test_fixtures/table/_common.py +162 -0
- vgi/_test_fixtures/table/batch_index.py +283 -0
- vgi/_test_fixtures/table/batch_index_broken.py +200 -0
- vgi/_test_fixtures/table/catalog_scans.py +162 -0
- vgi/_test_fixtures/table/filters.py +1005 -0
- vgi/_test_fixtures/table/late_materialization.py +249 -0
- vgi/_test_fixtures/table/make_series.py +273 -0
- vgi/_test_fixtures/table/misc.py +499 -0
- vgi/_test_fixtures/table/order_modes.py +164 -0
- vgi/_test_fixtures/table/pairs.py +437 -0
- vgi/_test_fixtures/table/partition_columns.py +472 -0
- vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
- vgi/_test_fixtures/table/profiling_example.py +195 -0
- vgi/_test_fixtures/table/required_filters.py +234 -0
- vgi/_test_fixtures/table/sequence.py +710 -0
- vgi/_test_fixtures/table/settings.py +426 -0
- vgi/_test_fixtures/table/transaction_storage.py +162 -0
- vgi/_test_fixtures/table/tt_pushdown.py +191 -0
- vgi/_test_fixtures/table/versioned.py +230 -0
- vgi/_test_fixtures/table_in_out.py +1392 -0
- vgi/_test_fixtures/versioned.py +155 -0
- vgi/_test_fixtures/versioned_tables.py +595 -0
- vgi/_test_fixtures/worker.py +1631 -0
- vgi/_test_fixtures/writable/__init__.py +8 -0
- vgi/_test_fixtures/writable/generic.py +236 -0
- vgi/_test_fixtures/writable/table.py +149 -0
- vgi/_test_fixtures/writable/worker.py +1148 -0
- vgi/aggregate_function.py +607 -0
- vgi/argument_spec.py +472 -0
- vgi/arguments.py +1747 -0
- vgi/auth.py +55 -0
- vgi/catalog/__init__.py +88 -0
- vgi/catalog/attach_option.py +206 -0
- vgi/catalog/catalog_interface.py +2767 -0
- vgi/catalog/descriptors.py +870 -0
- vgi/catalog/duckdb_statistics.py +377 -0
- vgi/catalog/secret_type.py +96 -0
- vgi/catalog/setting.py +253 -0
- vgi/catalog/storage.py +372 -0
- vgi/client/__init__.py +67 -0
- vgi/client/catalog_mixin.py +1251 -0
- vgi/client/cli.py +582 -0
- vgi/client/cli_catalog.py +182 -0
- vgi/client/cli_schema.py +270 -0
- vgi/client/cli_table.py +907 -0
- vgi/client/cli_transaction.py +97 -0
- vgi/client/cli_utils.py +441 -0
- vgi/client/cli_view.py +303 -0
- vgi/client/client.py +2183 -0
- vgi/exceptions.py +205 -0
- vgi/function.py +245 -0
- vgi/function_storage.py +1636 -0
- vgi/function_storage_azure_sql.py +922 -0
- vgi/function_storage_cf_do.py +740 -0
- vgi/http/__init__.py +25 -0
- vgi/http/demo_storage.py +212 -0
- vgi/http/worker_page.py +1252 -0
- vgi/invocation.py +154 -0
- vgi/logging_config.py +93 -0
- vgi/meta_worker.py +661 -0
- vgi/metadata.py +1403 -0
- vgi/otel.py +406 -0
- vgi/protocol.py +2418 -0
- vgi/protocol_version.txt +1 -0
- vgi/py.typed +0 -0
- vgi/scalar_function.py +1211 -0
- vgi/schema_utils.py +234 -0
- vgi/secret_protocol.py +124 -0
- vgi/secret_service.py +238 -0
- vgi/serve.py +769 -0
- vgi/table_buffering_function.py +443 -0
- vgi/table_filter_pushdown.py +1528 -0
- vgi/table_function.py +1130 -0
- vgi/table_in_out_function.py +383 -0
- vgi/transactor/__init__.py +24 -0
- vgi/transactor/_duckdb_compat.py +27 -0
- vgi/transactor/client.py +137 -0
- vgi/transactor/protocol.py +149 -0
- vgi/transactor/server.py +740 -0
- vgi/worker.py +4761 -0
- vgi_python-0.8.0.dist-info/METADATA +735 -0
- vgi_python-0.8.0.dist-info/RECORD +124 -0
- vgi_python-0.8.0.dist-info/WHEEL +4 -0
- vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
- vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/http/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""HTTP utilities for VGI workers.
|
|
4
|
+
|
|
5
|
+
Provides the worker description page and demo blob storage for HTTP-mode workers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from vgi.http.demo_storage import (
|
|
11
|
+
DemoBlobStorage,
|
|
12
|
+
MaxRequestBytesMiddleware,
|
|
13
|
+
add_blob_routes,
|
|
14
|
+
localhost_only_validator,
|
|
15
|
+
)
|
|
16
|
+
from vgi.http.worker_page import WorkerPageResource, build_worker_page
|
|
17
|
+
|
|
18
|
+
# Public API of ``vgi.http``: exactly the names this package imports from
# ``vgi.http.demo_storage`` and ``vgi.http.worker_page``.
__all__ = [
    "DemoBlobStorage",
    "MaxRequestBytesMiddleware",
    "WorkerPageResource",
    "add_blob_routes",
    "build_worker_page",
    "localhost_only_validator",
]
|
vgi/http/demo_storage.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
|
|
2
|
+
|
|
3
|
+
"""In-process blob storage for demonstrating and testing external batch offloading.
|
|
4
|
+
|
|
5
|
+
Provides a simple HTTP blob store that implements the ``ExternalStorage`` and
|
|
6
|
+
``UploadUrlProvider`` protocols from vgi_rpc, served from the same HTTP server
|
|
7
|
+
process. This allows the example worker to demonstrate external record batch
|
|
8
|
+
offloading without requiring S3 or any cloud infrastructure.
|
|
9
|
+
|
|
10
|
+
**Not for production use** — blobs are held in memory with LRU eviction.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import threading
|
|
16
|
+
import uuid
|
|
17
|
+
from collections import OrderedDict
|
|
18
|
+
from datetime import UTC, datetime, timedelta
|
|
19
|
+
from typing import TYPE_CHECKING, Any
|
|
20
|
+
from urllib.parse import urlparse
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from vgi_rpc.external import UploadUrl
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DemoBlobStorage:
|
|
27
|
+
"""In-memory blob store implementing ``ExternalStorage`` and ``UploadUrlProvider``.
|
|
28
|
+
|
|
29
|
+
Blobs are stored in an ``OrderedDict`` with LRU eviction when ``max_blobs``
|
|
30
|
+
is exceeded. Thread-safe for use with multi-threaded WSGI servers like
|
|
31
|
+
waitress.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, *, max_blobs: int = 1000) -> None: # noqa: D107
|
|
35
|
+
self._blobs: OrderedDict[str, tuple[bytes, str | None]] = OrderedDict()
|
|
36
|
+
self._lock = threading.Lock()
|
|
37
|
+
self._max_blobs = max_blobs
|
|
38
|
+
self._base_url = ""
|
|
39
|
+
|
|
40
|
+
def set_base_url(self, base_url: str) -> None:
|
|
41
|
+
"""Set the base URL for blob URLs. Call after port discovery."""
|
|
42
|
+
self._base_url = base_url.rstrip("/")
|
|
43
|
+
|
|
44
|
+
# -- ExternalStorage protocol --
|
|
45
|
+
|
|
46
|
+
def upload(self, data: bytes, schema: Any, *, content_encoding: str | None = None) -> str:
|
|
47
|
+
"""Upload IPC bytes and return a fetch URL.
|
|
48
|
+
|
|
49
|
+
Extension reflects the codec so that operators rummaging through
|
|
50
|
+
the in-memory blob store can tell at a glance what they're
|
|
51
|
+
looking at. Content-Encoding is what actually drives the GET
|
|
52
|
+
response header; the extension is cosmetic.
|
|
53
|
+
"""
|
|
54
|
+
ext_for_codec = {"zstd": ".arrow.zst", "gzip": ".arrow.gz"}
|
|
55
|
+
ext = ext_for_codec.get(content_encoding or "", ".arrow")
|
|
56
|
+
key = f"{uuid.uuid4().hex}{ext}"
|
|
57
|
+
with self._lock:
|
|
58
|
+
self._blobs[key] = (data, content_encoding)
|
|
59
|
+
self._evict()
|
|
60
|
+
return f"{self._base_url}/__blobs__/{key}"
|
|
61
|
+
|
|
62
|
+
# -- UploadUrlProvider protocol --
|
|
63
|
+
|
|
64
|
+
def generate_upload_url(self, schema: Any) -> UploadUrl:
|
|
65
|
+
"""Generate PUT/GET URL pair for client-side uploads."""
|
|
66
|
+
from vgi_rpc.external import UploadUrl
|
|
67
|
+
|
|
68
|
+
key = f"{uuid.uuid4().hex}.arrow"
|
|
69
|
+
# Create placeholder — will be filled by the client's PUT.
|
|
70
|
+
with self._lock:
|
|
71
|
+
self._blobs[key] = (b"", None)
|
|
72
|
+
self._evict()
|
|
73
|
+
url = f"{self._base_url}/__blobs__/{key}"
|
|
74
|
+
return UploadUrl(
|
|
75
|
+
upload_url=url,
|
|
76
|
+
download_url=url,
|
|
77
|
+
expires_at=datetime.now(UTC) + timedelta(hours=1),
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# -- Internal accessors for BlobResource --
|
|
81
|
+
|
|
82
|
+
def get(self, key: str) -> tuple[bytes, str | None] | None:
|
|
83
|
+
"""Return ``(data, content_encoding)`` or ``None``."""
|
|
84
|
+
with self._lock:
|
|
85
|
+
entry = self._blobs.get(key)
|
|
86
|
+
if entry is not None:
|
|
87
|
+
self._blobs.move_to_end(key)
|
|
88
|
+
return entry
|
|
89
|
+
|
|
90
|
+
def put(self, key: str, data: bytes, content_encoding: str | None = None) -> None:
|
|
91
|
+
"""Store blob data (used by PUT requests from clients)."""
|
|
92
|
+
with self._lock:
|
|
93
|
+
self._blobs[key] = (data, content_encoding)
|
|
94
|
+
self._blobs.move_to_end(key)
|
|
95
|
+
self._evict()
|
|
96
|
+
|
|
97
|
+
def _evict(self) -> None:
|
|
98
|
+
"""Evict oldest entries if over capacity. Caller must hold lock."""
|
|
99
|
+
while len(self._blobs) > self._max_blobs:
|
|
100
|
+
self._blobs.popitem(last=False)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class BlobResource:
    """Falcon resource serving blobs at ``/__blobs__/{blob_id}``.

    Handles ``GET`` (fetch a blob), ``HEAD`` (same headers, no body) and
    ``PUT`` (store a blob). GET and HEAD share their lookup and header
    logic so the two responses cannot drift apart.
    """

    def __init__(self, storage: DemoBlobStorage) -> None:
        """Bind the resource to the blob store it reads from and writes to."""
        self._storage = storage

    def _lookup(self, blob_id: str) -> tuple[bytes, str | None]:
        """Return the stored ``(data, content_encoding)`` for ``blob_id``.

        Raises:
            falcon.HTTPNotFound: If the store has no blob under ``blob_id``.
        """
        entry = self._storage.get(blob_id)
        if entry is None:
            # Imported lazily: falcon is only needed to build the 404.
            import falcon

            raise falcon.HTTPNotFound(description=f"Blob {blob_id!r} not found")
        return entry

    @staticmethod
    def _set_blob_headers(resp: Any, size: int, content_encoding: str | None) -> None:
        """Set the response headers common to GET and HEAD.

        ``Content-Encoding`` and ``X-VGI-Content-Encoding`` are only emitted
        when the blob was stored with an encoding.
        """
        resp.content_length = size
        resp.content_type = "application/octet-stream"
        resp.set_header("Accept-Ranges", "none")
        if content_encoding:
            resp.set_header("Content-Encoding", content_encoding)
            resp.set_header("X-VGI-Content-Encoding", content_encoding)

    def on_get(self, req: Any, resp: Any, blob_id: str) -> None:
        """Serve the stored bytes of ``blob_id``, or 404 if it is unknown."""
        data, content_encoding = self._lookup(blob_id)
        resp.data = data
        self._set_blob_headers(resp, len(data), content_encoding)

    def on_head(self, req: Any, resp: Any, blob_id: str) -> None:
        """Return the same headers as ``on_get`` without a body.

        Required so external_fetch._head_probe can discover Content-Encoding
        (zstd or gzip); otherwise a 405 forces a plain GET path that skips
        decompression.
        """
        data, content_encoding = self._lookup(blob_id)
        self._set_blob_headers(resp, len(data), content_encoding)

    def on_put(self, req: Any, resp: Any, blob_id: str) -> None:
        """Store the request body under ``blob_id`` and respond ``201 Created``."""
        # vgi_rpc's _CompressionMiddleware drains ``req.bounded_stream`` when
        # the request carries a supported ``Content-Encoding`` (zstd or gzip)
        # and stashes the decompressed payload on
        # ``req.context.decompressed_stream``. Prefer that stream when
        # present so we capture the raw IPC bytes; the producer's
        # SHA-256 in custom_metadata is computed pre-compression so
        # downstream verification still succeeds when we serve uncompressed.
        decompressed_stream = getattr(req.context, "decompressed_stream", None)
        if decompressed_stream is not None:
            data = decompressed_stream.read()
            # Already decompressed, so the blob is stored without an encoding.
            content_encoding: str | None = None
        else:
            data = req.bounded_stream.read()
            content_encoding = req.get_header("Content-Encoding")
        self._storage.put(blob_id, data, content_encoding)
        resp.status = "201 Created"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def add_blob_routes(app: Any, storage: DemoBlobStorage, prefix: str = "") -> None:
    """Register the blob GET/HEAD/PUT resource on a Falcon app.

    Args:
        app: Object exposing Falcon's ``add_route(uri_template, resource)``.
        storage: Blob store backing the registered ``BlobResource``.
        prefix: Optional path prefix placed in front of ``/__blobs__/``.
            Trailing slashes are stripped, so ``"/api"`` and ``"/api/"``
            both register ``/api/__blobs__/{blob_id}`` rather than a
            template containing ``//``.
    """
    base = prefix.rstrip("/")
    app.add_route(f"{base}/__blobs__/{{blob_id}}", BlobResource(storage))
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def localhost_only_validator(url: str) -> None:
    """URL validator that accepts only HTTP(S) URLs on ``127.0.0.1`` or ``localhost``.

    Both the scheme and the host are checked: a loopback host alone is not
    enough, so URLs such as ``file://localhost/...`` or ``ftp://127.0.0.1/...``
    are rejected. The comparison uses the parsed, lower-cased scheme and
    hostname, so the port, path and letter case do not matter.

    Use as the ``url_validator`` parameter of ``ExternalLocationConfig`` for
    demo/test use.

    Args:
        url: The URL to validate.

    Raises:
        ValueError: If the scheme is not ``http``/``https`` or the host is
            not ``127.0.0.1``/``localhost``.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        msg = f"Demo storage only accepts http(s) URLs, got: {url}"
        raise ValueError(msg)
    if parsed.hostname not in ("127.0.0.1", "localhost"):
        msg = f"Demo storage only accepts localhost URLs, got: {url}"
        raise ValueError(msg)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class MaxRequestBytesMiddleware:
    """WSGI middleware that rejects RPC requests exceeding a size limit with 413.

    The limit models ``VGI-Max-Request-Bytes`` — the cap that drives clients to
    offload oversized batches through an upload URL. The blob upload endpoint
    (``/__blobs__/``) is the escape hatch for exactly those oversized payloads,
    so it is exempt: enforcing the limit there would 413 the very requests the
    externalization protocol relies on it to accept.

    Only the declared ``CONTENT_LENGTH`` is inspected. Requests that omit it,
    or carry a value that is not an integer, are passed through unchecked.
    """

    # Fixed plain-text body of the 413 response; Content-Length is derived
    # from it so the two cannot get out of sync.
    _TOO_LARGE_BODY = b"Request body too large.\n"

    def __init__(self, app: Any, max_bytes: int) -> None:
        """Wrap ``app`` and reject requests declaring more than ``max_bytes``."""
        self._app = app
        self._max_bytes = max_bytes

    def __call__(self, environ: dict[str, Any], start_response: Any) -> Any:
        """Handle one WSGI request.

        Returns the wrapped app's response, or a ``413`` response when a
        non-blob request declares a body larger than the limit.
        """
        path = environ.get("PATH_INFO", "")
        if "/__blobs__/" in path:
            # Upload/download endpoint — must accept payloads larger than the
            # RPC request limit; that is its entire purpose.
            return self._app(environ, start_response)

        declared = self._declared_length(environ)
        if declared is not None and declared > self._max_bytes:
            body = self._TOO_LARGE_BODY
            start_response(
                "413 Request Entity Too Large",
                [
                    ("Content-Type", "text/plain"),
                    ("Content-Length", str(len(body))),
                ],
            )
            return [body]
        return self._app(environ, start_response)

    @staticmethod
    def _declared_length(environ: dict[str, Any]) -> int | None:
        """Return ``CONTENT_LENGTH`` as an int, or ``None`` if absent or unparsable."""
        raw = environ.get("CONTENT_LENGTH", "")
        if not raw:
            return None
        # Keep the try body to the parse alone so a ValueError raised while
        # sending the 413 is never mistaken for a malformed header.
        try:
            return int(raw)
        except ValueError:
            return None
|