vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/http/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """HTTP utilities for VGI workers.
4
+
5
+ Provides the worker description page and demo blob storage for HTTP-mode workers.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from vgi.http.demo_storage import (
11
+ DemoBlobStorage,
12
+ MaxRequestBytesMiddleware,
13
+ add_blob_routes,
14
+ localhost_only_validator,
15
+ )
16
+ from vgi.http.worker_page import WorkerPageResource, build_worker_page
17
+
18
+ __all__ = [
19
+ "DemoBlobStorage",
20
+ "MaxRequestBytesMiddleware",
21
+ "WorkerPageResource",
22
+ "add_blob_routes",
23
+ "build_worker_page",
24
+ "localhost_only_validator",
25
+ ]
vgi/http/demo_storage.py ADDED
@@ -0,0 +1,212 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """In-process blob storage for demonstrating and testing external batch offloading.
4
+
5
+ Provides a simple HTTP blob store that implements the ``ExternalStorage`` and
6
+ ``UploadUrlProvider`` protocols from vgi_rpc, served from the same HTTP server
7
+ process. This allows the example worker to demonstrate external record batch
8
+ offloading without requiring S3 or any cloud infrastructure.
9
+
10
+ **Not for production use** — blobs are held in memory with LRU eviction.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import threading
16
+ import uuid
17
+ from collections import OrderedDict
18
+ from datetime import UTC, datetime, timedelta
19
+ from typing import TYPE_CHECKING, Any
20
+ from urllib.parse import urlparse
21
+
22
+ if TYPE_CHECKING:
23
+ from vgi_rpc.external import UploadUrl
24
+
25
+
26
+ class DemoBlobStorage:
27
+ """In-memory blob store implementing ``ExternalStorage`` and ``UploadUrlProvider``.
28
+
29
+ Blobs are stored in an ``OrderedDict`` with LRU eviction when ``max_blobs``
30
+ is exceeded. Thread-safe for use with multi-threaded WSGI servers like
31
+ waitress.
32
+ """
33
+
34
+ def __init__(self, *, max_blobs: int = 1000) -> None: # noqa: D107
35
+ self._blobs: OrderedDict[str, tuple[bytes, str | None]] = OrderedDict()
36
+ self._lock = threading.Lock()
37
+ self._max_blobs = max_blobs
38
+ self._base_url = ""
39
+
40
+ def set_base_url(self, base_url: str) -> None:
41
+ """Set the base URL for blob URLs. Call after port discovery."""
42
+ self._base_url = base_url.rstrip("/")
43
+
44
+ # -- ExternalStorage protocol --
45
+
46
+ def upload(self, data: bytes, schema: Any, *, content_encoding: str | None = None) -> str:
47
+ """Upload IPC bytes and return a fetch URL.
48
+
49
+ Extension reflects the codec so that operators rummaging through
50
+ the in-memory blob store can tell at a glance what they're
51
+ looking at. Content-Encoding is what actually drives the GET
52
+ response header; the extension is cosmetic.
53
+ """
54
+ ext_for_codec = {"zstd": ".arrow.zst", "gzip": ".arrow.gz"}
55
+ ext = ext_for_codec.get(content_encoding or "", ".arrow")
56
+ key = f"{uuid.uuid4().hex}{ext}"
57
+ with self._lock:
58
+ self._blobs[key] = (data, content_encoding)
59
+ self._evict()
60
+ return f"{self._base_url}/__blobs__/{key}"
61
+
62
+ # -- UploadUrlProvider protocol --
63
+
64
+ def generate_upload_url(self, schema: Any) -> UploadUrl:
65
+ """Generate PUT/GET URL pair for client-side uploads."""
66
+ from vgi_rpc.external import UploadUrl
67
+
68
+ key = f"{uuid.uuid4().hex}.arrow"
69
+ # Create placeholder — will be filled by the client's PUT.
70
+ with self._lock:
71
+ self._blobs[key] = (b"", None)
72
+ self._evict()
73
+ url = f"{self._base_url}/__blobs__/{key}"
74
+ return UploadUrl(
75
+ upload_url=url,
76
+ download_url=url,
77
+ expires_at=datetime.now(UTC) + timedelta(hours=1),
78
+ )
79
+
80
+ # -- Internal accessors for BlobResource --
81
+
82
+ def get(self, key: str) -> tuple[bytes, str | None] | None:
83
+ """Return ``(data, content_encoding)`` or ``None``."""
84
+ with self._lock:
85
+ entry = self._blobs.get(key)
86
+ if entry is not None:
87
+ self._blobs.move_to_end(key)
88
+ return entry
89
+
90
+ def put(self, key: str, data: bytes, content_encoding: str | None = None) -> None:
91
+ """Store blob data (used by PUT requests from clients)."""
92
+ with self._lock:
93
+ self._blobs[key] = (data, content_encoding)
94
+ self._blobs.move_to_end(key)
95
+ self._evict()
96
+
97
+ def _evict(self) -> None:
98
+ """Evict oldest entries if over capacity. Caller must hold lock."""
99
+ while len(self._blobs) > self._max_blobs:
100
+ self._blobs.popitem(last=False)
101
+
102
+
class BlobResource:
    """Falcon resource that exposes stored blobs under ``/__blobs__/{blob_id}``."""

    def __init__(self, storage: DemoBlobStorage) -> None:  # noqa: D107
        self._storage = storage

    def _lookup(self, blob_id: str) -> tuple[bytes, str | None]:
        """Return the stored ``(data, content_encoding)`` pair or raise a 404."""
        import falcon

        found = self._storage.get(blob_id)
        if found is None:
            raise falcon.HTTPNotFound(description=f"Blob {blob_id!r} not found")
        return found

    @staticmethod
    def _write_headers(resp: Any, payload: bytes, encoding: str | None) -> None:
        """Set the length, type and encoding headers shared by GET and HEAD."""
        resp.content_length = len(payload)
        resp.content_type = "application/octet-stream"
        resp.set_header("Accept-Ranges", "none")
        if not encoding:
            return
        resp.set_header("Content-Encoding", encoding)
        resp.set_header("X-VGI-Content-Encoding", encoding)

    def on_get(self, req: Any, resp: Any, blob_id: str) -> None:  # noqa: D102
        payload, encoding = self._lookup(blob_id)
        resp.data = payload
        self._write_headers(resp, payload, encoding)

    def on_head(self, req: Any, resp: Any, blob_id: str) -> None:  # noqa: D102
        # Same headers as on_get (Content-Length/-Type/-Encoding), no body.
        # external_fetch._head_probe relies on this to learn the
        # Content-Encoding (zstd or gzip); a 405 here would push it onto a
        # plain GET path that skips decompression.
        payload, encoding = self._lookup(blob_id)
        self._write_headers(resp, payload, encoding)

    def on_put(self, req: Any, resp: Any, blob_id: str) -> None:  # noqa: D102
        # When the request carries a supported ``Content-Encoding`` (zstd or
        # gzip), vgi_rpc's _CompressionMiddleware has already drained
        # ``req.bounded_stream`` and left the decompressed payload on
        # ``req.context.decompressed_stream``. Use that stream if it exists so
        # the raw IPC bytes are stored; the producer's SHA-256 in
        # custom_metadata is computed pre-compression, so downstream
        # verification still succeeds when the blob is served uncompressed.
        inflated = getattr(req.context, "decompressed_stream", None)
        encoding: str | None
        if inflated is None:
            payload = req.bounded_stream.read()
            encoding = req.get_header("Content-Encoding")
        else:
            payload = inflated.read()
            encoding = None
        self._storage.put(blob_id, payload, encoding)
        resp.status = "201 Created"
159
+
160
+
def add_blob_routes(app: Any, storage: DemoBlobStorage, prefix: str = "") -> None:
    """Register a ``BlobResource`` for ``storage`` on a Falcon app.

    The resource is mounted at ``{prefix}/__blobs__/{blob_id}``; ``prefix``
    defaults to the empty string, i.e. the app root.
    """
    route_template = f"{prefix}/__blobs__/{{blob_id}}"
    resource = BlobResource(storage)
    app.add_route(route_template, resource)
164
+
165
+
def localhost_only_validator(url: str) -> None:
    """URL validator that accepts only HTTP(S) URLs on ``127.0.0.1`` or ``localhost``.

    Raises ``ValueError`` for any other URL. Use as the ``url_validator``
    parameter of ``ExternalLocationConfig`` for demo/test use.

    Both the host and the scheme are checked: a hostname match alone would
    let non-HTTP URLs such as ``file://localhost/etc/passwd`` through.
    """
    parsed = urlparse(url)
    # Host check first, so every URL that was rejected before the scheme check
    # existed still produces the same error message.
    if parsed.hostname not in ("127.0.0.1", "localhost"):
        msg = f"Demo storage only accepts localhost URLs, got: {url}"
        raise ValueError(msg)
    if parsed.scheme not in ("http", "https"):
        msg = f"Demo storage only accepts http(s) URLs, got: {url}"
        raise ValueError(msg)
176
+
177
+
class MaxRequestBytesMiddleware:
    """WSGI middleware that rejects RPC requests exceeding a size limit with 413.

    The limit models ``VGI-Max-Request-Bytes`` — the cap that drives clients to
    offload oversized batches through an upload URL. The blob upload endpoint
    (``/__blobs__/``) is the escape hatch for exactly those oversized payloads,
    so it is exempt: enforcing the limit there would 413 the very requests the
    externalization protocol relies on it to accept.

    Only the declared ``CONTENT_LENGTH`` is inspected; requests with a missing
    or unparseable value are passed through to the wrapped app unchanged.
    """

    _TOO_LARGE_BODY = b"Request body too large.\n"

    def __init__(self, app: Any, max_bytes: int) -> None:  # noqa: D107
        self._app = app
        self._max_bytes = max_bytes

    @staticmethod
    def _declared_length(environ: dict[str, Any]) -> int | None:
        """Return the request's declared body size, or ``None`` if unknown."""
        raw = environ.get("CONTENT_LENGTH", "")
        if not raw:
            return None
        # Keep the try body to the parse alone so that no other ValueError
        # (e.g. one raised by start_response) is ever swallowed here.
        try:
            return int(raw)
        except ValueError:
            # Best effort: a malformed header is left for the app to handle.
            return None

    def __call__(self, environ: dict[str, Any], start_response: Any) -> Any:
        """Answer 413 for oversized non-blob requests, else delegate to the app."""
        path = environ.get("PATH_INFO", "")
        if "/__blobs__/" in path:
            # Upload/download endpoint — must accept payloads larger than the
            # RPC request limit; that is its entire purpose.
            return self._app(environ, start_response)
        declared = self._declared_length(environ)
        if declared is not None and declared > self._max_bytes:
            body = self._TOO_LARGE_BODY
            start_response(
                "413 Request Entity Too Large",
                [
                    ("Content-Type", "text/plain"),
                    # Derived from the body so the two can never drift apart.
                    ("Content-Length", str(len(body))),
                ],
            )
            return [body]
        return self._app(environ, start_response)