vgi-python 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. vgi/__init__.py +152 -0
  2. vgi/_duckdb.py +62 -0
  3. vgi/_storage_profile.py +132 -0
  4. vgi/_test_fixtures/__init__.py +20 -0
  5. vgi/_test_fixtures/accumulate/__init__.py +19 -0
  6. vgi/_test_fixtures/accumulate/worker.py +762 -0
  7. vgi/_test_fixtures/aggregate/__init__.py +62 -0
  8. vgi/_test_fixtures/aggregate/_common.py +21 -0
  9. vgi/_test_fixtures/aggregate/basic.py +232 -0
  10. vgi/_test_fixtures/aggregate/dynamic.py +409 -0
  11. vgi/_test_fixtures/aggregate/generic.py +86 -0
  12. vgi/_test_fixtures/aggregate/listagg.py +71 -0
  13. vgi/_test_fixtures/aggregate/percentile.py +107 -0
  14. vgi/_test_fixtures/aggregate/streaming.py +192 -0
  15. vgi/_test_fixtures/aggregate/varargs.py +75 -0
  16. vgi/_test_fixtures/aggregate/window.py +380 -0
  17. vgi/_test_fixtures/attach_options.py +308 -0
  18. vgi/_test_fixtures/bad_protocol.py +62 -0
  19. vgi/_test_fixtures/cancellable.py +336 -0
  20. vgi/_test_fixtures/catalog.py +813 -0
  21. vgi/_test_fixtures/http_server.py +394 -0
  22. vgi/_test_fixtures/nest_tensor.py +614 -0
  23. vgi/_test_fixtures/orchard_catalog.py +47 -0
  24. vgi/_test_fixtures/projection_repro/__init__.py +6 -0
  25. vgi/_test_fixtures/projection_repro/worker.py +454 -0
  26. vgi/_test_fixtures/scalar/__init__.py +116 -0
  27. vgi/_test_fixtures/scalar/_common.py +69 -0
  28. vgi/_test_fixtures/scalar/arithmetic.py +321 -0
  29. vgi/_test_fixtures/scalar/binary.py +120 -0
  30. vgi/_test_fixtures/scalar/formatting.py +176 -0
  31. vgi/_test_fixtures/scalar/geo.py +300 -0
  32. vgi/_test_fixtures/scalar/null_handling.py +107 -0
  33. vgi/_test_fixtures/scalar/random_demo.py +171 -0
  34. vgi/_test_fixtures/scalar/settings_secrets.py +102 -0
  35. vgi/_test_fixtures/scalar/type_info.py +219 -0
  36. vgi/_test_fixtures/schema_reconcile/__init__.py +29 -0
  37. vgi/_test_fixtures/schema_reconcile/worker.py +653 -0
  38. vgi/_test_fixtures/simple_writable.py +793 -0
  39. vgi/_test_fixtures/table/__init__.py +221 -0
  40. vgi/_test_fixtures/table/_common.py +162 -0
  41. vgi/_test_fixtures/table/batch_index.py +283 -0
  42. vgi/_test_fixtures/table/batch_index_broken.py +200 -0
  43. vgi/_test_fixtures/table/catalog_scans.py +162 -0
  44. vgi/_test_fixtures/table/filters.py +1005 -0
  45. vgi/_test_fixtures/table/late_materialization.py +249 -0
  46. vgi/_test_fixtures/table/make_series.py +273 -0
  47. vgi/_test_fixtures/table/misc.py +499 -0
  48. vgi/_test_fixtures/table/order_modes.py +164 -0
  49. vgi/_test_fixtures/table/pairs.py +437 -0
  50. vgi/_test_fixtures/table/partition_columns.py +472 -0
  51. vgi/_test_fixtures/table/partition_columns_broken.py +304 -0
  52. vgi/_test_fixtures/table/profiling_example.py +195 -0
  53. vgi/_test_fixtures/table/required_filters.py +234 -0
  54. vgi/_test_fixtures/table/sequence.py +710 -0
  55. vgi/_test_fixtures/table/settings.py +426 -0
  56. vgi/_test_fixtures/table/transaction_storage.py +162 -0
  57. vgi/_test_fixtures/table/tt_pushdown.py +191 -0
  58. vgi/_test_fixtures/table/versioned.py +230 -0
  59. vgi/_test_fixtures/table_in_out.py +1392 -0
  60. vgi/_test_fixtures/versioned.py +155 -0
  61. vgi/_test_fixtures/versioned_tables.py +595 -0
  62. vgi/_test_fixtures/worker.py +1631 -0
  63. vgi/_test_fixtures/writable/__init__.py +8 -0
  64. vgi/_test_fixtures/writable/generic.py +236 -0
  65. vgi/_test_fixtures/writable/table.py +149 -0
  66. vgi/_test_fixtures/writable/worker.py +1148 -0
  67. vgi/aggregate_function.py +607 -0
  68. vgi/argument_spec.py +472 -0
  69. vgi/arguments.py +1747 -0
  70. vgi/auth.py +55 -0
  71. vgi/catalog/__init__.py +88 -0
  72. vgi/catalog/attach_option.py +206 -0
  73. vgi/catalog/catalog_interface.py +2767 -0
  74. vgi/catalog/descriptors.py +870 -0
  75. vgi/catalog/duckdb_statistics.py +377 -0
  76. vgi/catalog/secret_type.py +96 -0
  77. vgi/catalog/setting.py +253 -0
  78. vgi/catalog/storage.py +372 -0
  79. vgi/client/__init__.py +67 -0
  80. vgi/client/catalog_mixin.py +1251 -0
  81. vgi/client/cli.py +582 -0
  82. vgi/client/cli_catalog.py +182 -0
  83. vgi/client/cli_schema.py +270 -0
  84. vgi/client/cli_table.py +907 -0
  85. vgi/client/cli_transaction.py +97 -0
  86. vgi/client/cli_utils.py +441 -0
  87. vgi/client/cli_view.py +303 -0
  88. vgi/client/client.py +2183 -0
  89. vgi/exceptions.py +205 -0
  90. vgi/function.py +245 -0
  91. vgi/function_storage.py +1636 -0
  92. vgi/function_storage_azure_sql.py +922 -0
  93. vgi/function_storage_cf_do.py +740 -0
  94. vgi/http/__init__.py +25 -0
  95. vgi/http/demo_storage.py +212 -0
  96. vgi/http/worker_page.py +1252 -0
  97. vgi/invocation.py +154 -0
  98. vgi/logging_config.py +93 -0
  99. vgi/meta_worker.py +661 -0
  100. vgi/metadata.py +1403 -0
  101. vgi/otel.py +406 -0
  102. vgi/protocol.py +2418 -0
  103. vgi/protocol_version.txt +1 -0
  104. vgi/py.typed +0 -0
  105. vgi/scalar_function.py +1211 -0
  106. vgi/schema_utils.py +234 -0
  107. vgi/secret_protocol.py +124 -0
  108. vgi/secret_service.py +238 -0
  109. vgi/serve.py +769 -0
  110. vgi/table_buffering_function.py +443 -0
  111. vgi/table_filter_pushdown.py +1528 -0
  112. vgi/table_function.py +1130 -0
  113. vgi/table_in_out_function.py +383 -0
  114. vgi/transactor/__init__.py +24 -0
  115. vgi/transactor/_duckdb_compat.py +27 -0
  116. vgi/transactor/client.py +137 -0
  117. vgi/transactor/protocol.py +149 -0
  118. vgi/transactor/server.py +740 -0
  119. vgi/worker.py +4761 -0
  120. vgi_python-0.8.0.dist-info/METADATA +735 -0
  121. vgi_python-0.8.0.dist-info/RECORD +124 -0
  122. vgi_python-0.8.0.dist-info/WHEEL +4 -0
  123. vgi_python-0.8.0.dist-info/entry_points.txt +5 -0
  124. vgi_python-0.8.0.dist-info/licenses/LICENSE +134 -0
vgi/catalog/setting.py ADDED
@@ -0,0 +1,253 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Setting descriptor for declarative worker settings.
4
+
5
+ This module provides the Setting descriptor class for defining worker settings
6
+ using Python's Annotated type hints, similar to how Arg works for function arguments.
7
+
8
+ """
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import (
12
+ TYPE_CHECKING,
13
+ Annotated,
14
+ Any,
15
+ ClassVar,
16
+ cast,
17
+ get_args,
18
+ get_origin,
19
+ get_type_hints,
20
+ )
21
+
22
+ import pyarrow as pa
23
+ from vgi_rpc.utils import deserialize_record_batch, serialize_record_batch_bytes
24
+
25
+ from vgi.schema_utils import schema
26
+
27
+ if TYPE_CHECKING:
28
+ from typing import Self
29
+
30
+ __all__ = [
31
+ "Setting",
32
+ "SettingSpec",
33
+ "extract_setting_specs",
34
+ ]
35
+
36
+
37
@dataclass(frozen=True)
class SettingSpec:
    """Fully resolved description of a single worker setting.

    Unlike ``Setting``, a spec always carries a concrete Arrow type, and it
    can be round-tripped through a single-row Arrow record batch.

    Attributes:
        name: Name of the setting.
        desc: Human-readable description.
        type: Arrow data type of the setting's value.
        default: Default value as a Python object; ``None`` means "no default".

    """

    name: str
    desc: str
    type: pa.DataType
    default: Any

    # Layout of the one-row batch produced by serialize(). The type and the
    # default travel as opaque binary blobs; only the default may be null.
    ARROW_SCHEMA: ClassVar[pa.Schema] = pa.schema(
        [
            pa.field("name", pa.string(), nullable=False),
            pa.field("description", pa.string(), nullable=False),
            pa.field("type", pa.binary(), nullable=False),
            pa.field("default_value", pa.binary(), nullable=True),
        ]  # type: ignore[arg-type]
    )

    def serialize(self) -> bytes:
        """Encode this spec as the bytes of a one-row Arrow record batch.

        Returns:
            The serialized batch, laid out according to ``ARROW_SCHEMA``.

        """
        # The Arrow type is shipped as a schema holding one field named "value".
        value_schema = schema(value=self.type)
        encoded_type = value_schema.serialize().to_pybytes()

        # A default, when present, is shipped as a one-row batch of that schema.
        if self.default is None:
            encoded_default: bytes | None = None
        else:
            encoded_default = serialize_record_batch_bytes(
                pa.RecordBatch.from_pydict({"value": [self.default]}, schema=value_schema)
            )

        record = {
            "name": self.name,
            "description": self.desc,
            "type": encoded_type,
            "default_value": encoded_default,
        }
        return serialize_record_batch_bytes(pa.RecordBatch.from_pylist([record], schema=self.ARROW_SCHEMA))

    @classmethod
    def deserialize(cls, batch: pa.RecordBatch) -> "Self":
        """Rebuild a spec from a record batch produced by ``serialize``.

        Args:
            batch: Record batch holding exactly one serialized spec.

        Returns:
            A new instance populated from the batch's single row.

        """
        from vgi_rpc.utils import _validate_single_row_batch

        fields = _validate_single_row_batch(
            batch,
            cls.__name__,
            required_fields=["name", "description", "type"],
        )

        # Recover the Arrow type from the embedded one-field schema.
        value_schema = pa.ipc.read_schema(pa.py_buffer(cast(bytes, fields["type"])))
        arrow_type = value_schema.field("value").type

        # A null default_value means the setting has no default.
        restored_default: Any = None
        if fields["default_value"] is not None:
            default_rows, _ = deserialize_record_batch(cast(bytes, fields["default_value"]))
            restored_default = default_rows.column("value")[0].as_py()

        return cls(
            name=cast(str, fields["name"]),
            desc=cast(str, fields["description"]),
            type=arrow_type,
            default=restored_default,
        )
117
+
118
+
119
# Arrow equivalents for the builtin Python types accepted as setting
# annotations. Anything not listed here must be given as an Arrow DataType.
_PYTHON_TO_ARROW: dict[type, pa.DataType] = dict(
    (
        (bool, pa.bool_()),
        (int, pa.int64()),
        (float, pa.float64()),
        (str, pa.string()),
        (bytes, pa.binary()),
    )
)
127
+
128
+
129
def _resolve_arrow_type(type_hint: type | pa.DataType) -> pa.DataType:
    """Turn a setting's base annotation into an Arrow DataType.

    Args:
        type_hint: Either an Arrow DataType, which is returned unchanged, or
            one of the Python types listed in ``_PYTHON_TO_ARROW``.

    Returns:
        The Arrow DataType corresponding to ``type_hint``.

    Raises:
        TypeError: If ``type_hint`` is neither an Arrow DataType nor a
            supported Python type.

    """
    if isinstance(type_hint, pa.DataType):
        # Explicit Arrow types need no translation.
        return type_hint

    # Every mapped value is a DataType instance, so None reliably means "absent".
    mapped = _PYTHON_TO_ARROW.get(type_hint)
    if mapped is None:
        raise TypeError(
            f"Cannot resolve Arrow type from: {type_hint}. "
            "Use a Python type (bool, int, float, str, bytes) or Arrow DataType."
        )
    return mapped
154
+
155
+
156
@dataclass
class Setting:
    """Marker placed inside ``Annotated[...]`` to declare a worker setting.

    ``extract_setting_specs`` looks for instances of this class in the
    metadata of a Settings class's annotations.

    Attributes:
        desc: Human-readable description of the setting.
        arrow_type: Explicit Arrow type. When ``None``, the type is resolved
            from the base type of the ``Annotated`` hint instead.

    """

    desc: str = ""
    arrow_type: pa.DataType | None = None

    # Attribute name recorded by __set_name__; kept out of __init__ and repr.
    _name: str = field(default="", init=False, repr=False)

    def __set_name__(self, owner: type, name: str) -> None:
        """Record the attribute name this object was assigned to on a class."""
        self._name = name

    def __get__(self, obj: object | None, objtype: type | None = None) -> Any:
        """Implement descriptor access.

        Accessed on the class, this returns the descriptor itself. Accessed on
        an instance, it looks the recorded attribute name up on the instance's
        class and returns that result, or ``None`` if the class has no such
        attribute.

        NOTE(review): when this ``Setting`` is itself the class attribute, the
        class-level lookup goes back through ``__get__`` with ``obj=None``, so
        instance access appears to yield the descriptor rather than a stored
        default value -- confirm that this is intended.
        """
        if obj is not None:
            return getattr(type(obj), self._name, None)
        return self
189
+
190
+
191
def extract_setting_specs(settings_cls: type) -> list[SettingSpec]:
    """Collect a ``SettingSpec`` for every declared setting on a class.

    A setting is any attribute annotated as ``Annotated[base, ..., Setting(...)]``.
    Annotations that are not ``Annotated`` or that carry no ``Setting`` in
    their metadata are ignored.

    Args:
        settings_cls: Class whose annotations are inspected.

    Returns:
        One ``SettingSpec`` per declared setting, in annotation order. The
        list is empty when the class's type hints cannot be resolved.

    Raises:
        TypeError: If a setting gives no explicit Arrow type and its base
            type cannot be mapped to one.

    """
    try:
        # include_extras=True keeps the Annotated metadata that holds Setting.
        annotations = get_type_hints(settings_cls, include_extras=True)
    except Exception:
        # Best effort: unresolved hints are treated as "no settings declared".
        return []

    collected: list[SettingSpec] = []
    for attr_name, annotation in annotations.items():
        if get_origin(annotation) is not Annotated:
            continue

        parts = get_args(annotation)
        if len(parts) < 2:
            continue

        # The first Setting found in the metadata wins.
        marker = next((extra for extra in parts[1:] if isinstance(extra, Setting)), None)
        if marker is None:
            continue

        # The class attribute's value, if any, is the setting's default.
        fallback = getattr(settings_cls, attr_name, None)

        # An explicit Setting.arrow_type overrides inference from the base type.
        if marker.arrow_type is not None:
            resolved_type = marker.arrow_type
        else:
            resolved_type = _resolve_arrow_type(parts[0])

        collected.append(
            SettingSpec(
                name=attr_name,
                desc=marker.desc,
                type=resolved_type,
                default=fallback,
            )
        )

    return collected
vgi/catalog/storage.py ADDED
@@ -0,0 +1,372 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """Storage for VGI catalog state.
4
+
5
+ This module provides a storage protocol and implementation for persisting
6
+ catalog attach_opaque_data and transaction_opaque_data state across worker processes.
7
+
8
+ Protocol:
9
+ CatalogStorage: Protocol for catalog state persistence.
10
+
11
+ Implementation:
12
+ CatalogStorageSqlite: SQLite-backed storage implementation.
13
+
14
+ """
15
+
16
+ import random
17
+ import sqlite3
18
+ import uuid
19
+ from typing import Any, Protocol
20
+
21
+ from vgi.catalog.catalog_interface import AttachOpaqueData, TransactionOpaqueData
22
+
23
+ __all__ = [
24
+ "CatalogStorage",
25
+ "CatalogStorageSqlite",
26
+ ]
27
+
28
+
29
def _get_default_db_path() -> str:
    """Return the absolute path of the default catalog database file.

    The file lives in the per-user state directory for the "vgi" application;
    that directory is created if it does not exist yet.
    """
    from pathlib import Path

    from platformdirs import user_state_dir

    vgi_state = Path(user_state_dir("vgi"))
    vgi_state.mkdir(parents=True, exist_ok=True)

    db_file = vgi_state / "vgi_catalog.db"
    return str(db_file.resolve())
38
+
39
+
40
class CatalogStorage(Protocol):
    """Interface that a catalog-state persistence backend must provide.

    Two kinds of state are stored:

    **Attachments** - one record per catalog attachment, keyed by its
    attach_opaque_data and holding the catalog name and attach options.

    **Transactions** - one record per active transaction, keyed by its
    transaction_opaque_data and holding the owning attachment and a
    serialized state blob.

    """

    # --- Attachment State ---

    def attach_put(self, attach_opaque_data: AttachOpaqueData, catalog_name: str, options: dict[str, Any]) -> None:
        """Persist the state of an attachment.

        Args:
            attach_opaque_data: Key identifying the attachment.
            catalog_name: Name of the catalog that was attached.
            options: Options supplied when attaching.

        """
        ...

    def attach_get(self, attach_opaque_data: AttachOpaqueData) -> tuple[str, dict[str, Any]] | None:
        """Look up the state of an attachment.

        Args:
            attach_opaque_data: Key identifying the attachment.

        Returns:
            ``(catalog_name, options)`` for the attachment, or ``None`` when
            no such attachment is stored.

        """
        ...

    def attach_delete(self, attach_opaque_data: AttachOpaqueData) -> None:
        """Remove the state of an attachment.

        Args:
            attach_opaque_data: Key identifying the attachment.

        """
        ...

    def attach_list(self) -> list[AttachOpaqueData]:
        """Enumerate the stored attachments.

        Returns:
            The attach_opaque_data key of every attachment in storage.

        """
        ...

    # --- Transaction State ---

    def transaction_put(
        self, transaction_opaque_data: TransactionOpaqueData, attach_opaque_data: AttachOpaqueData, state: bytes
    ) -> None:
        """Persist the state of a transaction.

        Args:
            transaction_opaque_data: Key identifying the transaction.
            attach_opaque_data: Key of the attachment that owns the transaction.
            state: Serialized transaction state.

        """
        ...

    def transaction_get(self, transaction_opaque_data: TransactionOpaqueData) -> tuple[AttachOpaqueData, bytes] | None:
        """Look up the state of a transaction.

        Args:
            transaction_opaque_data: Key identifying the transaction.

        Returns:
            ``(attach_opaque_data, state)`` for the transaction, or ``None``
            when no such transaction is stored.

        """
        ...

    def transaction_delete(self, transaction_opaque_data: TransactionOpaqueData) -> None:
        """Remove the state of a transaction.

        Args:
            transaction_opaque_data: Key identifying the transaction.

        """
        ...
131
+
132
+
133
class CatalogStorageSqlite:
    """SQLite-backed storage for VGI catalog state.

    This implementation uses SQLite with WAL mode to allow multiple worker
    processes to share catalog state. It manages two tables:

    - catalog_attachments: Maps attach_opaque_data to catalog name and options
    - catalog_transactions: Tracks active transactions

    Every operation opens its own short-lived connection and closes it before
    returning. The foreign key declared on catalog_transactions is not relied
    on for integrity: SQLite only enforces foreign keys when
    ``PRAGMA foreign_keys`` is enabled, which this class does not do, so
    dependent transaction rows are removed explicitly wherever attachment
    rows are removed.

    """

    def __init__(self, db_path: str | None = None) -> None:
        """Initialize SQLite catalog storage.

        Args:
            db_path: Path to the SQLite database file. If None, uses a default
                location in the user's state directory.

        """
        self.db_path = db_path if db_path is not None else _get_default_db_path()
        self._ensure_tables()

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection with WAL journaling requested.

        Returns:
            An open connection; the caller is responsible for closing it.

        """
        conn = sqlite3.connect(self.db_path, timeout=30.0)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
        except BaseException:
            # Callers only close connections they actually received, so a
            # failure here must not leave this one open.
            conn.close()
            raise
        return conn

    def _ensure_tables(self) -> None:
        """Create the storage tables and index if they don't exist."""
        conn = self._connect()
        try:
            # Attachment table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS catalog_attachments (
                    attach_opaque_data BLOB PRIMARY KEY,
                    catalog_name TEXT NOT NULL,
                    options_json TEXT NOT NULL,
                    created_at REAL DEFAULT (julianday('now'))
                )
            """)
            # Transaction table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS catalog_transactions (
                    transaction_opaque_data BLOB PRIMARY KEY,
                    attach_opaque_data BLOB NOT NULL,
                    state_data BLOB NOT NULL,
                    created_at REAL DEFAULT (julianday('now')),
                    FOREIGN KEY (attach_opaque_data) REFERENCES catalog_attachments(attach_opaque_data)
                )
            """)
            # Supports the per-attachment deletes in attach_delete and cleanup.
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_transactions_attach
                ON catalog_transactions(attach_opaque_data)
            """)
            conn.commit()
        finally:
            conn.close()

    # --- Attachment State ---

    def attach_put(self, attach_opaque_data: AttachOpaqueData, catalog_name: str, options: dict[str, Any]) -> None:
        """Store attachment state, replacing any existing row for the same key.

        Args:
            attach_opaque_data: Unique identifier for the attachment.
            catalog_name: Name of the attached catalog.
            options: Options passed during attachment; must be JSON-serializable.

        """
        import json

        # Opportunistically clean old entries (1% of calls)
        if random.random() < 0.01:
            self.cleanup_old_entries(max_age_days=7.0)

        options_json = json.dumps(options)

        conn = self._connect()
        try:
            conn.execute(
                """
                INSERT OR REPLACE INTO catalog_attachments
                (attach_opaque_data, catalog_name, options_json, created_at)
                VALUES (?, ?, ?, julianday('now'))
                """,
                (attach_opaque_data, catalog_name, options_json),
            )
            conn.commit()
        finally:
            conn.close()

    def attach_get(self, attach_opaque_data: AttachOpaqueData) -> tuple[str, dict[str, Any]] | None:
        """Retrieve attachment state by attach_opaque_data.

        Args:
            attach_opaque_data: Unique identifier for the attachment.

        Returns:
            Tuple of (catalog_name, options), or None if not found.

        """
        import json

        conn = self._connect()
        try:
            cursor = conn.execute(
                """SELECT catalog_name, options_json
                   FROM catalog_attachments WHERE attach_opaque_data = ?""",
                (attach_opaque_data,),
            )
            row = cursor.fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        catalog_name: str = row[0]
        options: dict[str, Any] = json.loads(row[1])
        return (catalog_name, options)

    def attach_delete(self, attach_opaque_data: AttachOpaqueData) -> None:
        """Delete an attachment together with all of its transactions.

        Args:
            attach_opaque_data: Unique identifier for the attachment.

        """
        conn = self._connect()
        try:
            # Delete associated transactions first
            conn.execute(
                "DELETE FROM catalog_transactions WHERE attach_opaque_data = ?",
                (attach_opaque_data,),
            )
            conn.execute(
                "DELETE FROM catalog_attachments WHERE attach_opaque_data = ?",
                (attach_opaque_data,),
            )
            conn.commit()
        finally:
            conn.close()

    def attach_list(self) -> list[AttachOpaqueData]:
        """List all active attachment IDs.

        Returns:
            The attach_opaque_data of every row in catalog_attachments.

        """
        conn = self._connect()
        try:
            cursor = conn.execute("SELECT attach_opaque_data FROM catalog_attachments")
            return [AttachOpaqueData(row[0]) for row in cursor.fetchall()]
        finally:
            conn.close()

    # --- Transaction State ---

    def transaction_put(
        self, transaction_opaque_data: TransactionOpaqueData, attach_opaque_data: AttachOpaqueData, state: bytes
    ) -> None:
        """Store transaction state, replacing any existing row for the same key.

        Args:
            transaction_opaque_data: Unique identifier for the transaction.
            attach_opaque_data: Attachment the transaction belongs to.
            state: Serialized transaction state.

        """
        # Opportunistically clean old entries (1% of calls)
        if random.random() < 0.01:
            self.cleanup_old_entries(max_age_days=7.0)

        conn = self._connect()
        try:
            conn.execute(
                """
                INSERT OR REPLACE INTO catalog_transactions
                (transaction_opaque_data, attach_opaque_data, state_data, created_at)
                VALUES (?, ?, ?, julianday('now'))
                """,
                (transaction_opaque_data, attach_opaque_data, state),
            )
            conn.commit()
        finally:
            conn.close()

    def transaction_get(self, transaction_opaque_data: TransactionOpaqueData) -> tuple[AttachOpaqueData, bytes] | None:
        """Retrieve transaction state.

        Args:
            transaction_opaque_data: Unique identifier for the transaction.

        Returns:
            Tuple of (attach_opaque_data, state bytes), or None if not found.

        """
        conn = self._connect()
        try:
            cursor = conn.execute(
                """SELECT attach_opaque_data, state_data
                   FROM catalog_transactions WHERE transaction_opaque_data = ?""",
                (transaction_opaque_data,),
            )
            row = cursor.fetchone()
        finally:
            conn.close()

        if row is None:
            return None

        return (AttachOpaqueData(row[0]), row[1])

    def transaction_delete(self, transaction_opaque_data: TransactionOpaqueData) -> None:
        """Delete transaction state.

        Args:
            transaction_opaque_data: Unique identifier for the transaction.

        """
        conn = self._connect()
        try:
            conn.execute(
                "DELETE FROM catalog_transactions WHERE transaction_opaque_data = ?",
                (transaction_opaque_data,),
            )
            conn.commit()
        finally:
            conn.close()

    # --- Utility Methods ---

    def generate_attach_opaque_data(self) -> AttachOpaqueData:
        """Generate a new unique attach_opaque_data.

        Returns:
            A new AttachOpaqueData based on UUID4.

        """
        return AttachOpaqueData(uuid.uuid4().bytes)

    def generate_transaction_opaque_data(self) -> TransactionOpaqueData:
        """Generate a new unique transaction_opaque_data.

        Returns:
            A new TransactionOpaqueData based on UUID4.

        """
        return TransactionOpaqueData(uuid.uuid4().bytes)

    # --- Maintenance ---

    def cleanup_old_entries(self, max_age_days: float = 7.0) -> int:
        """Remove entries older than the specified age from all tables.

        Transactions that belong to an expiring attachment are removed as
        well, even if the transaction itself is younger than the limit, so
        that no transaction row is left pointing at a deleted attachment.
        This mirrors what ``attach_delete`` does for a single attachment.

        Args:
            max_age_days: Maximum age in days for entries to keep.

        Returns:
            Total number of entries deleted.

        """
        conn = self._connect()
        try:
            # Resolve the cutoff once so both deletes agree on which rows are
            # expired; re-evaluating julianday('now') per statement could let
            # an attachment expire between the two deletes.
            cutoff = conn.execute("SELECT julianday('now') - ?", (max_age_days,)).fetchone()[0]

            # Transactions go first: their own age, or their attachment's age,
            # may be what expires them.
            expired_transactions = conn.execute(
                """
                DELETE FROM catalog_transactions
                WHERE created_at < ?
                   OR attach_opaque_data IN (
                       SELECT attach_opaque_data
                       FROM catalog_attachments
                       WHERE created_at < ?
                   )
                """,
                (cutoff, cutoff),
            )
            expired_attachments = conn.execute(
                """
                DELETE FROM catalog_attachments
                WHERE created_at < ?
                """,
                (cutoff,),
            )
            conn.commit()
            return int(expired_transactions.rowcount) + int(expired_attachments.rowcount)
        finally:
            conn.close()
vgi/client/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ # Copyright 2025, 2026 Query Farm LLC - https://query.farm
2
+
3
+ """VGI client package for communicating with VGI workers.
4
+
5
+ This package provides:
6
+ - Client: A class for programmatic interaction with VGI workers, including
7
+ both function invocation and catalog operations
8
+ - ClientError: Exception raised by Client function operations
9
+ - CatalogClientMixin: Mixin class providing catalog operations
10
+ - OutputWriter: Helper for writing output in various formats
11
+ - main: CLI entry point
12
+
13
+ Usage (API):
14
+ from vgi.client import Client, ClientError
15
+ from vgi.arguments import Arguments
16
+
17
+ with Client("./my_worker.py") as client:
18
+ for batch in client.table_in_out_function(
19
+ function_name="echo",
20
+ arguments=Arguments(positional=[], named={}),
21
+ input=input_batches,
22
+ ):
23
+ process(batch)
24
+
25
+ Usage (Catalog API):
26
+ from vgi.client import Client
27
+
28
+ client = Client("./my_worker")
29
+ result = client.catalog_attach(
30
+ name="my_catalog", options={}, data_version_spec=None, implementation_version=None
31
+ )
32
+
33
+ Usage (CLI):
34
+ vgi-client --input data.parquet --function echo
35
+ vgi-client --input data.parquet --function sum_all_columns
36
+
37
+ """
38
+
39
+ from typing import TYPE_CHECKING, Any
40
+
41
+ from vgi.client.catalog_mixin import CatalogClientMixin
42
+ from vgi.client.client import Client, ClientError, ResumableTableScan, ResumeUnsupported
43
+
44
+ if TYPE_CHECKING:
45
+ from vgi.client.cli import OutputWriter, main
46
+
47
+ __all__ = [
48
+ "CatalogClientMixin",
49
+ "Client",
50
+ "ClientError",
51
+ "OutputWriter",
52
+ "ResumableTableScan",
53
+ "ResumeUnsupported",
54
+ "main",
55
+ ]
56
+
57
+
58
# The CLI names are resolved on first access instead of at import time.
# ``vgi.client.cli`` transitively imports ``pyarrow.parquet`` /
# ``pyarrow._s3fs`` / ``pyarrow._gcsfs`` etc., which add ~2 seconds to the
# cold import path. Only the ``vgi-client`` CLI entry point needs them;
# programmatic users of ``Client`` do not.
def __getattr__(name: str) -> Any:
    """Resolve the lazily loaded CLI attributes of this package.

    Args:
        name: Attribute that normal module lookup did not find.

    Returns:
        The attribute of the same name from ``vgi.client.cli``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily loaded names.

    """
    if name not in {"OutputWriter", "main"}:
        raise AttributeError(f"module 'vgi.client' has no attribute {name!r}")

    from vgi.client import cli

    return getattr(cli, name)