sqlspec 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
- sqlspec/__init__.py +16 -3
- sqlspec/_serialization.py +3 -10
- sqlspec/_sql.py +1147 -0
- sqlspec/_typing.py +343 -41
- sqlspec/adapters/adbc/__init__.py +2 -6
- sqlspec/adapters/adbc/config.py +474 -149
- sqlspec/adapters/adbc/driver.py +330 -621
- sqlspec/adapters/aiosqlite/__init__.py +2 -6
- sqlspec/adapters/aiosqlite/config.py +143 -57
- sqlspec/adapters/aiosqlite/driver.py +269 -431
- sqlspec/adapters/asyncmy/__init__.py +3 -8
- sqlspec/adapters/asyncmy/config.py +247 -202
- sqlspec/adapters/asyncmy/driver.py +218 -436
- sqlspec/adapters/asyncpg/__init__.py +4 -7
- sqlspec/adapters/asyncpg/config.py +329 -176
- sqlspec/adapters/asyncpg/driver.py +417 -487
- sqlspec/adapters/bigquery/__init__.py +2 -2
- sqlspec/adapters/bigquery/config.py +407 -0
- sqlspec/adapters/bigquery/driver.py +600 -553
- sqlspec/adapters/duckdb/__init__.py +4 -1
- sqlspec/adapters/duckdb/config.py +432 -321
- sqlspec/adapters/duckdb/driver.py +392 -406
- sqlspec/adapters/oracledb/__init__.py +3 -8
- sqlspec/adapters/oracledb/config.py +625 -0
- sqlspec/adapters/oracledb/driver.py +548 -921
- sqlspec/adapters/psqlpy/__init__.py +4 -7
- sqlspec/adapters/psqlpy/config.py +372 -203
- sqlspec/adapters/psqlpy/driver.py +197 -533
- sqlspec/adapters/psycopg/__init__.py +3 -8
- sqlspec/adapters/psycopg/config.py +741 -0
- sqlspec/adapters/psycopg/driver.py +734 -694
- sqlspec/adapters/sqlite/__init__.py +2 -6
- sqlspec/adapters/sqlite/config.py +146 -81
- sqlspec/adapters/sqlite/driver.py +242 -405
- sqlspec/base.py +220 -784
- sqlspec/config.py +354 -0
- sqlspec/driver/__init__.py +22 -0
- sqlspec/driver/_async.py +252 -0
- sqlspec/driver/_common.py +338 -0
- sqlspec/driver/_sync.py +261 -0
- sqlspec/driver/mixins/__init__.py +17 -0
- sqlspec/driver/mixins/_pipeline.py +523 -0
- sqlspec/driver/mixins/_result_utils.py +122 -0
- sqlspec/driver/mixins/_sql_translator.py +35 -0
- sqlspec/driver/mixins/_storage.py +993 -0
- sqlspec/driver/mixins/_type_coercion.py +131 -0
- sqlspec/exceptions.py +299 -7
- sqlspec/extensions/aiosql/__init__.py +10 -0
- sqlspec/extensions/aiosql/adapter.py +474 -0
- sqlspec/extensions/litestar/__init__.py +1 -6
- sqlspec/extensions/litestar/_utils.py +1 -5
- sqlspec/extensions/litestar/config.py +5 -6
- sqlspec/extensions/litestar/handlers.py +13 -12
- sqlspec/extensions/litestar/plugin.py +22 -24
- sqlspec/extensions/litestar/providers.py +37 -55
- sqlspec/loader.py +528 -0
- sqlspec/service/__init__.py +3 -0
- sqlspec/service/base.py +24 -0
- sqlspec/service/pagination.py +26 -0
- sqlspec/statement/__init__.py +21 -0
- sqlspec/statement/builder/__init__.py +54 -0
- sqlspec/statement/builder/_ddl_utils.py +119 -0
- sqlspec/statement/builder/_parsing_utils.py +135 -0
- sqlspec/statement/builder/base.py +328 -0
- sqlspec/statement/builder/ddl.py +1379 -0
- sqlspec/statement/builder/delete.py +80 -0
- sqlspec/statement/builder/insert.py +274 -0
- sqlspec/statement/builder/merge.py +95 -0
- sqlspec/statement/builder/mixins/__init__.py +65 -0
- sqlspec/statement/builder/mixins/_aggregate_functions.py +151 -0
- sqlspec/statement/builder/mixins/_case_builder.py +91 -0
- sqlspec/statement/builder/mixins/_common_table_expr.py +91 -0
- sqlspec/statement/builder/mixins/_delete_from.py +34 -0
- sqlspec/statement/builder/mixins/_from.py +61 -0
- sqlspec/statement/builder/mixins/_group_by.py +119 -0
- sqlspec/statement/builder/mixins/_having.py +35 -0
- sqlspec/statement/builder/mixins/_insert_from_select.py +48 -0
- sqlspec/statement/builder/mixins/_insert_into.py +36 -0
- sqlspec/statement/builder/mixins/_insert_values.py +69 -0
- sqlspec/statement/builder/mixins/_join.py +110 -0
- sqlspec/statement/builder/mixins/_limit_offset.py +53 -0
- sqlspec/statement/builder/mixins/_merge_clauses.py +405 -0
- sqlspec/statement/builder/mixins/_order_by.py +46 -0
- sqlspec/statement/builder/mixins/_pivot.py +82 -0
- sqlspec/statement/builder/mixins/_returning.py +37 -0
- sqlspec/statement/builder/mixins/_select_columns.py +60 -0
- sqlspec/statement/builder/mixins/_set_ops.py +122 -0
- sqlspec/statement/builder/mixins/_unpivot.py +80 -0
- sqlspec/statement/builder/mixins/_update_from.py +54 -0
- sqlspec/statement/builder/mixins/_update_set.py +91 -0
- sqlspec/statement/builder/mixins/_update_table.py +29 -0
- sqlspec/statement/builder/mixins/_where.py +374 -0
- sqlspec/statement/builder/mixins/_window_functions.py +86 -0
- sqlspec/statement/builder/protocols.py +20 -0
- sqlspec/statement/builder/select.py +206 -0
- sqlspec/statement/builder/update.py +178 -0
- sqlspec/statement/filters.py +571 -0
- sqlspec/statement/parameters.py +736 -0
- sqlspec/statement/pipelines/__init__.py +67 -0
- sqlspec/statement/pipelines/analyzers/__init__.py +9 -0
- sqlspec/statement/pipelines/analyzers/_analyzer.py +649 -0
- sqlspec/statement/pipelines/base.py +315 -0
- sqlspec/statement/pipelines/context.py +119 -0
- sqlspec/statement/pipelines/result_types.py +41 -0
- sqlspec/statement/pipelines/transformers/__init__.py +8 -0
- sqlspec/statement/pipelines/transformers/_expression_simplifier.py +256 -0
- sqlspec/statement/pipelines/transformers/_literal_parameterizer.py +623 -0
- sqlspec/statement/pipelines/transformers/_remove_comments.py +66 -0
- sqlspec/statement/pipelines/transformers/_remove_hints.py +81 -0
- sqlspec/statement/pipelines/validators/__init__.py +23 -0
- sqlspec/statement/pipelines/validators/_dml_safety.py +275 -0
- sqlspec/statement/pipelines/validators/_parameter_style.py +297 -0
- sqlspec/statement/pipelines/validators/_performance.py +703 -0
- sqlspec/statement/pipelines/validators/_security.py +990 -0
- sqlspec/statement/pipelines/validators/base.py +67 -0
- sqlspec/statement/result.py +527 -0
- sqlspec/statement/splitter.py +701 -0
- sqlspec/statement/sql.py +1198 -0
- sqlspec/storage/__init__.py +15 -0
- sqlspec/storage/backends/__init__.py +0 -0
- sqlspec/storage/backends/base.py +166 -0
- sqlspec/storage/backends/fsspec.py +315 -0
- sqlspec/storage/backends/obstore.py +464 -0
- sqlspec/storage/protocol.py +170 -0
- sqlspec/storage/registry.py +315 -0
- sqlspec/typing.py +157 -36
- sqlspec/utils/correlation.py +155 -0
- sqlspec/utils/deprecation.py +3 -6
- sqlspec/utils/fixtures.py +6 -11
- sqlspec/utils/logging.py +135 -0
- sqlspec/utils/module_loader.py +45 -43
- sqlspec/utils/serializers.py +4 -0
- sqlspec/utils/singleton.py +6 -8
- sqlspec/utils/sync_tools.py +15 -27
- sqlspec/utils/text.py +58 -26
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/METADATA +97 -26
- sqlspec-0.12.0.dist-info/RECORD +145 -0
- sqlspec/adapters/bigquery/config/__init__.py +0 -3
- sqlspec/adapters/bigquery/config/_common.py +0 -40
- sqlspec/adapters/bigquery/config/_sync.py +0 -87
- sqlspec/adapters/oracledb/config/__init__.py +0 -9
- sqlspec/adapters/oracledb/config/_asyncio.py +0 -186
- sqlspec/adapters/oracledb/config/_common.py +0 -131
- sqlspec/adapters/oracledb/config/_sync.py +0 -186
- sqlspec/adapters/psycopg/config/__init__.py +0 -19
- sqlspec/adapters/psycopg/config/_async.py +0 -169
- sqlspec/adapters/psycopg/config/_common.py +0 -56
- sqlspec/adapters/psycopg/config/_sync.py +0 -168
- sqlspec/filters.py +0 -331
- sqlspec/mixins.py +0 -305
- sqlspec/statement.py +0 -378
- sqlspec-0.11.1.dist-info/RECORD +0 -69
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/WHEEL +0 -0
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/licenses/NOTICE +0 -0
sqlspec/storage/backends/obstore.py (new file)

@@ -0,0 +1,464 @@
"""High-performance object storage using obstore.

This backend implements the ObjectStoreProtocol using obstore,
providing native support for S3, GCS, Azure, and local file storage
with excellent performance characteristics and native Arrow support.
"""

from __future__ import annotations

import fnmatch
import logging
from typing import TYPE_CHECKING, Any, cast

from sqlspec.exceptions import MissingDependencyError, StorageOperationFailedError
from sqlspec.storage.backends.base import ObjectStoreBase
from sqlspec.typing import OBSTORE_INSTALLED

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterator

    from sqlspec.typing import ArrowRecordBatch, ArrowTable

__all__ = ("ObStoreBackend",)

logger = logging.getLogger(__name__)


class ObStoreBackend(ObjectStoreBase):
    """High-performance object storage backend using obstore.

    This backend leverages obstore's Rust-based implementation for maximum
    performance, providing native support for:
    - AWS S3 and S3-compatible stores
    - Google Cloud Storage
    - Azure Blob Storage
    - Local filesystem
    - HTTP endpoints

    Features native Arrow support and ~9x better performance than fsspec.
    """

    def __init__(self, store_uri: str, base_path: str = "", **store_options: Any) -> None:
        """Initialize obstore backend.

        Args:
            store_uri: Storage URI (e.g., 's3://bucket', 'file:///path', 'gs://bucket')
            base_path: Base path prefix for all operations
            **store_options: Additional options for obstore configuration
        """

        if not OBSTORE_INSTALLED:
            raise MissingDependencyError(package="obstore", install_package="obstore")

        try:
            self.store_uri = store_uri
            self.base_path = base_path.rstrip("/") if base_path else ""
            self.store_options = store_options
            self.store: Any  # Will be set based on store_uri

            # Initialize obstore instance
            if store_uri.startswith("memory://"):
                # MemoryStore doesn't use from_url - create directly
                from obstore.store import MemoryStore

                self.store = MemoryStore()
            elif store_uri.startswith("file://"):
                # For file:// URIs, use LocalStore with root directory
                from obstore.store import LocalStore

                # LocalStore works with directory paths, so we use root
                self.store = LocalStore("/")
                # The full path will be handled in _resolve_path
            else:
                # Use obstore's from_url for automatic URI parsing
                from obstore.store import from_url

                self.store = from_url(store_uri, **store_options)  # pyright: ignore[reportAttributeAccessIssue]

            # Log successful initialization
            logger.debug("ObStore backend initialized for %s", store_uri)

        except Exception as exc:
            msg = f"Failed to initialize obstore backend for {store_uri}"
            raise StorageOperationFailedError(msg) from exc

    def _resolve_path(self, path: str) -> str:
        """Resolve path relative to base_path."""
        # For file:// URIs, the path passed in is already absolute
        if self.store_uri.startswith("file://") and path.startswith("/"):
            # Remove leading slash for LocalStore (it's relative to its root)
            return path.lstrip("/")

        if self.base_path:
            # Ensure no double slashes by stripping trailing slash from base_path
            clean_base = self.base_path.rstrip("/")
            clean_path = path.lstrip("/")
            return f"{clean_base}/{clean_path}"
        return path

    @property
    def backend_type(self) -> str:
        """Return backend type identifier."""
        return "obstore"

    # Implementation of abstract methods from ObjectStoreBase

    def read_bytes(self, path: str, **kwargs: Any) -> bytes:  # pyright: ignore[reportUnusedParameter]
        """Read bytes using obstore."""
        try:
            resolved_path = self._resolve_path(path)
            result = self.store.get(resolved_path)
            return result.bytes()  # type: ignore[no-any-return] # pyright: ignore[reportReturnType]
        except Exception as exc:
            msg = f"Failed to read bytes from {path}"
            raise StorageOperationFailedError(msg) from exc

    def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Write bytes using obstore."""
        try:
            resolved_path = self._resolve_path(path)
            self.store.put(resolved_path, data)
        except Exception as exc:
            msg = f"Failed to write bytes to {path}"
            raise StorageOperationFailedError(msg) from exc

    def read_text(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Read text using obstore."""
        data = self.read_bytes(path, **kwargs)
        return data.decode(encoding)

    def write_text(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Write text using obstore."""
        encoded_data = data.encode(encoding)
        self.write_bytes(path, encoded_data, **kwargs)

    def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:  # pyright: ignore[reportUnusedParameter]
        """List objects using obstore."""
        resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path or ""
        objects: list[str] = []

        def _get_item_path(item: Any) -> str:
            """Extract path from item, trying path attribute first, then key."""
            if hasattr(item, "path"):
                return str(item.path)
            if hasattr(item, "key"):
                return str(item.key)
            return str(item)

        if not recursive:
            objects.extend(_get_item_path(item) for item in self.store.list_with_delimiter(resolved_prefix))  # pyright: ignore
        else:
            objects.extend(_get_item_path(item) for item in self.store.list(resolved_prefix))

        return sorted(objects)

    def exists(self, path: str, **kwargs: Any) -> bool:  # pyright: ignore[reportUnusedParameter]
        """Check if object exists using obstore."""
        try:
            self.store.head(self._resolve_path(path))
        except Exception:
            return False
        return True

    def delete(self, path: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Delete object using obstore."""
        try:
            self.store.delete(self._resolve_path(path))
        except Exception as exc:
            msg = f"Failed to delete {path}"
            raise StorageOperationFailedError(msg) from exc

    def copy(self, source: str, destination: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Copy object using obstore."""
        try:
            self.store.copy(self._resolve_path(source), self._resolve_path(destination))
        except Exception as exc:
            msg = f"Failed to copy {source} to {destination}"
            raise StorageOperationFailedError(msg) from exc

    def move(self, source: str, destination: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Move object using obstore."""
        try:
            self.store.rename(self._resolve_path(source), self._resolve_path(destination))
        except Exception as exc:
            msg = f"Failed to move {source} to {destination}"
            raise StorageOperationFailedError(msg) from exc

    def glob(self, pattern: str, **kwargs: Any) -> list[str]:
        """Find objects matching pattern using obstore.

        Note: obstore does not support server-side globbing. This implementation
        lists all objects and filters them client-side, which may be inefficient
        for large buckets.
        """
        from pathlib import PurePosixPath

        # List all objects and filter by pattern
        resolved_pattern = self._resolve_path(pattern)
        all_objects = self.list_objects(recursive=True, **kwargs)

        # For complex patterns with **, use PurePosixPath
        if "**" in pattern:
            matching_objects = []

            # Special case: **/*.ext should also match *.ext in root
            if pattern.startswith("**/"):
                # Get the suffix pattern
                suffix_pattern = pattern[3:]  # Remove **/

                for obj in all_objects:
                    # Check if object ends with the suffix pattern
                    obj_path = PurePosixPath(obj)
                    # Try both the full pattern and just the suffix
                    if obj_path.match(resolved_pattern) or obj_path.match(suffix_pattern):
                        matching_objects.append(obj)
            else:
                # Standard ** pattern matching
                for obj in all_objects:
                    obj_path = PurePosixPath(obj)
                    if obj_path.match(resolved_pattern):
                        matching_objects.append(obj)

            return matching_objects
        # Use standard fnmatch for simple patterns
        return [obj for obj in all_objects if fnmatch.fnmatch(obj, resolved_pattern)]

    def get_metadata(self, path: str, **kwargs: Any) -> dict[str, Any]:  # pyright: ignore[reportUnusedParameter]
        """Get object metadata using obstore."""
        resolved_path = self._resolve_path(path)
        try:
            metadata = self.store.head(resolved_path)
            result = {"path": resolved_path, "exists": True}
            for attr in ("size", "last_modified", "e_tag", "version"):
                if hasattr(metadata, attr):
                    result[attr] = getattr(metadata, attr)

            # Include custom metadata if available
            if hasattr(metadata, "metadata"):
                custom_metadata = getattr(metadata, "metadata", None)
                if custom_metadata:
                    result["custom_metadata"] = custom_metadata
        except Exception:
            # Object doesn't exist
            return {"path": resolved_path, "exists": False}
        else:
            return result

    def is_object(self, path: str) -> bool:
        """Check if path is an object using obstore."""
        resolved_path = self._resolve_path(path)
        # An object exists and doesn't end with /
        return self.exists(path) and not resolved_path.endswith("/")

    def is_path(self, path: str) -> bool:
        """Check if path is a prefix/directory using obstore."""
        resolved_path = self._resolve_path(path)

        # A path/prefix either ends with / or has objects under it
        if resolved_path.endswith("/"):
            return True

        # Check if there are any objects with this prefix
        try:
            objects = self.list_objects(prefix=path, recursive=False)
            return len(objects) > 0
        except Exception:
            return False

    def read_arrow(self, path: str, **kwargs: Any) -> ArrowTable:
        """Read Arrow table using obstore."""
        try:
            resolved_path = self._resolve_path(path)
            # Check if the store has native Arrow support
            if hasattr(self.store, "read_arrow"):
                return self.store.read_arrow(resolved_path, **kwargs)  # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]
            # Fall back to reading as Parquet via bytes
            import io

            import pyarrow.parquet as pq

            data = self.read_bytes(path)  # read_bytes resolves the path itself; avoids double base_path prefixing
            buffer = io.BytesIO(data)
            return pq.read_table(buffer, **kwargs)
        except Exception as exc:
            msg = f"Failed to read Arrow table from {path}"
            raise StorageOperationFailedError(msg) from exc

    def write_arrow(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
        """Write Arrow table using obstore."""
        try:
            resolved_path = self._resolve_path(path)
            # Check if the store has native Arrow support
            if hasattr(self.store, "write_arrow"):
                self.store.write_arrow(resolved_path, table, **kwargs)  # pyright: ignore[reportAttributeAccessIssue]
            else:
                # Fall back to writing as Parquet via bytes
                import io

                import pyarrow as pa
                import pyarrow.parquet as pq

                buffer = io.BytesIO()

                # Check for decimal64 columns and convert to decimal128
                # PyArrow doesn't support decimal64 in Parquet files
                schema = table.schema
                needs_conversion = False
                new_fields = []

                for field in schema:
                    if str(field.type).startswith("decimal64"):
                        # Convert decimal64 to decimal128
                        import re

                        match = re.match(r"decimal64\((\d+),\s*(\d+)\)", str(field.type))
                        if match:
                            precision, scale = int(match.group(1)), int(match.group(2))
                            new_field = pa.field(field.name, pa.decimal128(precision, scale))
                            new_fields.append(new_field)
                            needs_conversion = True
                        else:
                            new_fields.append(field)
                    else:
                        new_fields.append(field)

                if needs_conversion:
                    new_schema = pa.schema(new_fields)
                    table = table.cast(new_schema)

                pq.write_table(table, buffer, **kwargs)
                buffer.seek(0)
                self.write_bytes(path, buffer.read())  # write_bytes resolves the path itself; avoids double base_path prefixing
        except Exception as exc:
            msg = f"Failed to write Arrow table to {path}"
            raise StorageOperationFailedError(msg) from exc

    def stream_arrow(self, pattern: str, **kwargs: Any) -> Iterator[ArrowRecordBatch]:
        """Stream Arrow record batches using obstore.

        Yields:
            Iterator of Arrow record batches from matching objects.
        """
        try:
            resolved_pattern = self._resolve_path(pattern)
            yield from self.store.stream_arrow(resolved_pattern, **kwargs)  # pyright: ignore[reportAttributeAccessIssue]
        except Exception as exc:
            msg = f"Failed to stream Arrow data for pattern {pattern}"
            raise StorageOperationFailedError(msg) from exc

    # Private async implementations for instrumentation support
    # These are called by the base class async methods after instrumentation

    async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:  # pyright: ignore[reportUnusedParameter]
        """Private async read bytes using native obstore async if available."""
        resolved_path = self._resolve_path(path)
        result = await self.store.get_async(resolved_path)
        return cast("bytes", result.bytes())  # pyright: ignore[reportReturnType]

    async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Private async write bytes using native obstore async."""
        resolved_path = self._resolve_path(path)
        await self.store.put_async(resolved_path, data)

    async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:  # pyright: ignore[reportUnusedParameter]
        """Private async list objects using native obstore async if available."""
        resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path or ""

        # Note: store.list_async returns an async iterator
        objects = [str(item.path) async for item in self.store.list_async(resolved_prefix)]  # pyright: ignore[reportAttributeAccessIssue]

        # Manual filtering for non-recursive if needed as obstore lacks an
        # async version of list_with_delimiter.
        if not recursive and resolved_prefix:
            base_depth = resolved_prefix.count("/")
            objects = [obj for obj in objects if obj.count("/") <= base_depth + 1]

        return sorted(objects)

    # Implement all other required abstract async methods
    # ObStore provides native async for most operations

    async def read_text_async(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Async read text using native obstore async."""
        data = await self.read_bytes_async(path, **kwargs)
        return data.decode(encoding)

    async def write_text_async(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Async write text using native obstore async."""
        encoded_data = data.encode(encoding)
        await self.write_bytes_async(path, encoded_data, **kwargs)

    async def exists_async(self, path: str, **kwargs: Any) -> bool:  # pyright: ignore[reportUnusedParameter]
        """Async check if object exists using native obstore async."""
        resolved_path = self._resolve_path(path)
        try:
            await self.store.head_async(resolved_path)
        except Exception:
            return False
        return True

    async def delete_async(self, path: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Async delete object using native obstore async."""
        resolved_path = self._resolve_path(path)
        await self.store.delete_async(resolved_path)

    async def copy_async(self, source: str, destination: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Async copy object using native obstore async."""
        source_path = self._resolve_path(source)
        dest_path = self._resolve_path(destination)
        await self.store.copy_async(source_path, dest_path)

    async def move_async(self, source: str, destination: str, **kwargs: Any) -> None:  # pyright: ignore[reportUnusedParameter]
        """Async move object using native obstore async."""
        source_path = self._resolve_path(source)
        dest_path = self._resolve_path(destination)
        await self.store.rename_async(source_path, dest_path)

    async def get_metadata_async(self, path: str, **kwargs: Any) -> dict[str, Any]:  # pyright: ignore[reportUnusedParameter]
        """Async get object metadata using native obstore async."""
        resolved_path = self._resolve_path(path)
        metadata = await self.store.head_async(resolved_path)

        # Convert obstore ObjectMeta to dict
        result = {"path": resolved_path, "exists": True}

        # Extract metadata attributes if available
        for attr in ["size", "last_modified", "e_tag", "version"]:
            if hasattr(metadata, attr):
                result[attr] = getattr(metadata, attr)

        # Include custom metadata if available
        if hasattr(metadata, "metadata"):
            custom_metadata = getattr(metadata, "metadata", None)
            if custom_metadata:
                result["custom_metadata"] = custom_metadata

        return result

    async def read_arrow_async(self, path: str, **kwargs: Any) -> ArrowTable:
        """Async read Arrow table using native obstore async."""
        resolved_path = self._resolve_path(path)
        return await self.store.read_arrow_async(resolved_path, **kwargs)  # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]

    async def write_arrow_async(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
        """Async write Arrow table using native obstore async."""
        resolved_path = self._resolve_path(path)
        # Check if the store has native async Arrow support
        if hasattr(self.store, "write_arrow_async"):
            await self.store.write_arrow_async(resolved_path, table, **kwargs)  # pyright: ignore[reportAttributeAccessIssue]
        else:
            # Fall back to writing as Parquet via bytes
            import io

            import pyarrow.parquet as pq

            buffer = io.BytesIO()
            pq.write_table(table, buffer, **kwargs)
            buffer.seek(0)
            await self.write_bytes_async(path, buffer.read())  # write_bytes_async resolves the path itself; avoids double base_path prefixing

    async def stream_arrow_async(self, pattern: str, **kwargs: Any) -> AsyncIterator[ArrowRecordBatch]:
        resolved_pattern = self._resolve_path(pattern)
        async for batch in self.store.stream_arrow_async(resolved_pattern, **kwargs):  # pyright: ignore[reportAttributeAccessIssue]
            yield batch
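For orientation, here is a minimal usage sketch of the new backend, using only methods defined in the hunk above. It assumes obstore is installed; the memory:// URI is the special case handled in __init__, and the base_path and object names are illustrative:

from sqlspec.storage.backends.obstore import ObStoreBackend

# memory:// bypasses from_url and constructs MemoryStore() directly.
backend = ObStoreBackend("memory://", base_path="reports")

# Paths are resolved against base_path, so this writes reports/2024/summary.txt.
backend.write_text("2024/summary.txt", "hello")
assert backend.read_text("2024/summary.txt") == "hello"
assert backend.exists("2024/summary.txt")

# glob() has no server-side support: it lists everything under base_path
# and filters client-side, which can be slow on large buckets.
matches = backend.glob("**/*.txt")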
sqlspec/storage/protocol.py (new file)

@@ -0,0 +1,170 @@
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterator

    from sqlspec.typing import ArrowRecordBatch, ArrowTable

__all__ = ("ObjectStoreProtocol",)


@runtime_checkable
class ObjectStoreProtocol(Protocol):
    """Unified protocol for object storage operations.

    This protocol defines the interface for all storage backends with built-in
    instrumentation support. Backends must implement both sync and async operations
    where possible, with async operations suffixed with _async.

    All methods use 'path' terminology for consistency with object store patterns.
    """

    def __init__(self, uri: str, **kwargs: Any) -> None:
        return

    # Core Operations (sync)
    def read_bytes(self, path: str, **kwargs: Any) -> bytes:
        """Read bytes from an object."""
        return b""

    def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Write bytes to an object."""
        return

    def read_text(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Read text from an object."""
        return ""

    def write_text(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Write text to an object."""
        return

    # Object Operations
    def exists(self, path: str, **kwargs: Any) -> bool:
        """Check if an object exists."""
        return False

    def delete(self, path: str, **kwargs: Any) -> None:
        """Delete an object."""
        return

    def copy(self, source: str, destination: str, **kwargs: Any) -> None:
        """Copy an object."""
        return

    def move(self, source: str, destination: str, **kwargs: Any) -> None:
        """Move an object."""
        return

    # Listing Operations
    def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """List objects with optional prefix."""
        return []

    def glob(self, pattern: str, **kwargs: Any) -> list[str]:
        """Find objects matching a glob pattern."""
        return []

    # Path Operations
    def is_object(self, path: str) -> bool:
        """Check if path points to an object."""
        return False

    def is_path(self, path: str) -> bool:
        """Check if path points to a prefix (directory-like)."""
        return False

    def get_metadata(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Get object metadata."""
        return {}

    # Arrow Operations
    def read_arrow(self, path: str, **kwargs: Any) -> "ArrowTable":
        """Read an Arrow table from storage.

        For obstore backend, this should use native arrow operations when available.
        """
        msg = "Arrow reading not implemented"
        raise NotImplementedError(msg)

    def write_arrow(self, path: str, table: "ArrowTable", **kwargs: Any) -> None:
        """Write an Arrow table to storage.

        For obstore backend, this should use native arrow operations when available.
        """
        msg = "Arrow writing not implemented"
        raise NotImplementedError(msg)

    def stream_arrow(self, pattern: str, **kwargs: Any) -> "Iterator[ArrowRecordBatch]":
        """Stream Arrow record batches from matching objects.

        For obstore backend, this should use native streaming when available.
        """
        msg = "Arrow streaming not implemented"
        raise NotImplementedError(msg)

    # Async versions
    async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:
        """Async read bytes from an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Async write bytes to an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def read_text_async(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Async read text from an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def write_text_async(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Async write text to an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def exists_async(self, path: str, **kwargs: Any) -> bool:
        """Async check if an object exists."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def delete_async(self, path: str, **kwargs: Any) -> None:
        """Async delete an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """Async list objects with optional prefix."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def copy_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Async copy an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def move_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Async move an object."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def get_metadata_async(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Async get object metadata."""
        msg = "Async operations not implemented"
        raise NotImplementedError(msg)

    async def read_arrow_async(self, path: str, **kwargs: Any) -> "ArrowTable":
        """Async read an Arrow table from storage."""
        msg = "Async arrow reading not implemented"
        raise NotImplementedError(msg)

    async def write_arrow_async(self, path: str, table: "ArrowTable", **kwargs: Any) -> None:
        """Async write an Arrow table to storage."""
        msg = "Async arrow writing not implemented"
        raise NotImplementedError(msg)

    async def stream_arrow_async(self, pattern: str, **kwargs: Any) -> "AsyncIterator[ArrowRecordBatch]":
        """Async stream Arrow record batches from matching objects."""
        msg = "Async arrow streaming not implemented"
        raise NotImplementedError(msg)
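Because ObjectStoreProtocol is decorated with @runtime_checkable, conformance can be checked structurally at runtime. A small sketch of how this fits together (the upload_config helper and path are hypothetical, not part of sqlspec):

from sqlspec.storage.backends.obstore import ObStoreBackend
from sqlspec.storage.protocol import ObjectStoreProtocol

def upload_config(store: ObjectStoreProtocol, data: bytes) -> None:
    # Any backend that structurally satisfies the protocol is accepted.
    store.write_bytes("config/settings.bin", data)

backend = ObStoreBackend("memory://")

# Note: isinstance() against a runtime_checkable Protocol only verifies
# that the named methods exist; signatures and return types are not checked.
assert isinstance(backend, ObjectStoreProtocol)
upload_config(backend, b"\x00\x01")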