sqlspec 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sqlspec might be problematic. Click here for more details.
- sqlspec/base.py +4 -4
- sqlspec/loader.py +65 -68
- sqlspec/protocols.py +3 -5
- sqlspec/storage/__init__.py +2 -12
- sqlspec/storage/backends/__init__.py +1 -0
- sqlspec/storage/backends/fsspec.py +87 -147
- sqlspec/storage/backends/local.py +310 -0
- sqlspec/storage/backends/obstore.py +210 -192
- sqlspec/storage/registry.py +101 -70
- sqlspec/utils/sync_tools.py +8 -5
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/METADATA +1 -1
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/RECORD +16 -16
- sqlspec/storage/capabilities.py +0 -102
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/WHEEL +0 -0
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/entry_points.txt +0 -0
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {sqlspec-0.21.1.dist-info → sqlspec-0.22.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -4,24 +4,19 @@ Implements the ObjectStoreProtocol using obstore for S3, GCS, Azure,
|
|
|
4
4
|
and local file storage.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
7
|
import fnmatch
|
|
10
8
|
import logging
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
from
|
|
14
|
-
|
|
15
|
-
from sqlspec.exceptions import MissingDependencyError, StorageOperationFailedError
|
|
16
|
-
from sqlspec.storage.backends.base import ObjectStoreBase
|
|
17
|
-
from sqlspec.storage.capabilities import HasStorageCapabilities, StorageCapabilities
|
|
18
|
-
from sqlspec.typing import OBSTORE_INSTALLED
|
|
9
|
+
from collections.abc import AsyncIterator, Iterator
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Final, Optional, Union, cast
|
|
11
|
+
from urllib.parse import urlparse
|
|
19
12
|
|
|
20
13
|
if TYPE_CHECKING:
|
|
21
|
-
from collections.abc import AsyncIterator, Iterator
|
|
22
14
|
from pathlib import Path
|
|
23
15
|
|
|
24
|
-
|
|
16
|
+
from mypy_extensions import mypyc_attr
|
|
17
|
+
|
|
18
|
+
from sqlspec.exceptions import MissingDependencyError, StorageOperationFailedError
|
|
19
|
+
from sqlspec.typing import OBSTORE_INSTALLED, PYARROW_INSTALLED, ArrowRecordBatch, ArrowTable
|
|
25
20
|
|
|
26
21
|
__all__ = ("ObStoreBackend",)
|
|
27
22
|
|
|
@@ -31,88 +26,122 @@ logger = logging.getLogger(__name__)
|
|
|
31
26
|
class _AsyncArrowIterator:
|
|
32
27
|
"""Helper class to work around mypyc's lack of async generator support."""
|
|
33
28
|
|
|
34
|
-
def __init__(self,
|
|
35
|
-
self.
|
|
29
|
+
def __init__(self, backend: "ObStoreBackend", pattern: str, **kwargs: Any) -> None:
|
|
30
|
+
self.backend = backend
|
|
36
31
|
self.pattern = pattern
|
|
37
32
|
self.kwargs = kwargs
|
|
38
|
-
self.
|
|
33
|
+
self._files_iterator: Optional[Iterator[str]] = None
|
|
34
|
+
self._current_file_iterator: Optional[Iterator[ArrowRecordBatch]] = None
|
|
39
35
|
|
|
40
|
-
def __aiter__(self) -> _AsyncArrowIterator:
|
|
36
|
+
def __aiter__(self) -> "_AsyncArrowIterator":
|
|
41
37
|
return self
|
|
42
38
|
|
|
43
39
|
async def __anext__(self) -> ArrowRecordBatch:
|
|
44
|
-
if self.
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
40
|
+
if self._files_iterator is None:
|
|
41
|
+
files = self.backend.glob(self.pattern, **self.kwargs)
|
|
42
|
+
self._files_iterator = iter(files)
|
|
43
|
+
|
|
44
|
+
while True:
|
|
45
|
+
if self._current_file_iterator is not None:
|
|
46
|
+
try:
|
|
47
|
+
return next(self._current_file_iterator)
|
|
48
|
+
except StopIteration:
|
|
49
|
+
self._current_file_iterator = None
|
|
50
|
+
|
|
51
|
+
try:
|
|
52
|
+
next_file = next(self._files_iterator)
|
|
53
|
+
# Stream from this file
|
|
54
|
+
file_batches = self.backend.stream_arrow(next_file)
|
|
55
|
+
self._current_file_iterator = iter(file_batches)
|
|
56
|
+
except StopIteration:
|
|
57
|
+
raise StopAsyncIteration
|
|
49
58
|
|
|
50
59
|
|
|
51
60
|
DEFAULT_OPTIONS: Final[dict[str, Any]] = {"connect_timeout": "30s", "request_timeout": "60s"}
|
|
52
61
|
|
|
53
62
|
|
|
54
63
|
@mypyc_attr(allow_interpreted_subclasses=True)
|
|
55
|
-
class ObStoreBackend
|
|
64
|
+
class ObStoreBackend:
|
|
56
65
|
"""Object storage backend using obstore.
|
|
57
66
|
|
|
58
|
-
|
|
59
|
-
Supports AWS S3, Google Cloud Storage, Azure Blob Storage,
|
|
67
|
+
Implements ObjectStoreProtocol using obstore's Rust-based implementation
|
|
68
|
+
for storage operations. Supports AWS S3, Google Cloud Storage, Azure Blob Storage,
|
|
60
69
|
local filesystem, and HTTP endpoints.
|
|
61
70
|
"""
|
|
62
71
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
supports_compression=True,
|
|
70
|
-
is_cloud_native=True,
|
|
71
|
-
has_low_latency=True,
|
|
72
|
-
)
|
|
72
|
+
__slots__ = ("_path_cache", "backend_type", "base_path", "protocol", "store", "store_options", "store_uri")
|
|
73
|
+
|
|
74
|
+
def _ensure_obstore(self) -> None:
|
|
75
|
+
"""Ensure obstore is available for operations."""
|
|
76
|
+
if not OBSTORE_INSTALLED:
|
|
77
|
+
raise MissingDependencyError(package="obstore", install_package="obstore")
|
|
73
78
|
|
|
74
|
-
|
|
79
|
+
def _ensure_pyarrow(self) -> None:
|
|
80
|
+
"""Ensure PyArrow is available for Arrow operations."""
|
|
81
|
+
if not PYARROW_INSTALLED:
|
|
82
|
+
raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
|
|
75
83
|
|
|
76
|
-
def __init__(self,
|
|
84
|
+
def __init__(self, uri: str, **kwargs: Any) -> None:
|
|
77
85
|
"""Initialize obstore backend.
|
|
78
86
|
|
|
79
87
|
Args:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
**store_options: Additional options for obstore configuration
|
|
88
|
+
uri: Storage URI (e.g., 's3://bucket', 'file:///path', 'gs://bucket')
|
|
89
|
+
**kwargs: Additional options including base_path and obstore configuration
|
|
83
90
|
"""
|
|
84
91
|
|
|
85
|
-
|
|
86
|
-
raise MissingDependencyError(package="obstore", install_package="obstore")
|
|
92
|
+
self._ensure_obstore()
|
|
87
93
|
|
|
88
94
|
try:
|
|
89
|
-
|
|
95
|
+
# Extract base_path from kwargs
|
|
96
|
+
base_path = kwargs.pop("base_path", "")
|
|
97
|
+
|
|
98
|
+
self.store_uri = uri
|
|
90
99
|
self.base_path = base_path.rstrip("/") if base_path else ""
|
|
91
|
-
self.store_options =
|
|
100
|
+
self.store_options = kwargs
|
|
92
101
|
self.store: Any
|
|
93
102
|
self._path_cache: dict[str, str] = {}
|
|
94
|
-
self.protocol =
|
|
103
|
+
self.protocol = uri.split("://", 1)[0] if "://" in uri else "file"
|
|
104
|
+
self.backend_type = "obstore"
|
|
95
105
|
|
|
96
|
-
if
|
|
106
|
+
if uri.startswith("memory://"):
|
|
97
107
|
from obstore.store import MemoryStore
|
|
98
108
|
|
|
99
109
|
self.store = MemoryStore()
|
|
100
|
-
elif
|
|
110
|
+
elif uri.startswith("file://"):
|
|
111
|
+
from pathlib import Path as PathlibPath
|
|
112
|
+
|
|
101
113
|
from obstore.store import LocalStore
|
|
102
114
|
|
|
103
|
-
|
|
115
|
+
parsed = urlparse(uri)
|
|
116
|
+
path = parsed.path or "/"
|
|
117
|
+
# Create directory if it doesn't exist (ObStore LocalStore requires it)
|
|
118
|
+
PathlibPath(path).mkdir(parents=True, exist_ok=True)
|
|
119
|
+
self.store = LocalStore(path)
|
|
104
120
|
else:
|
|
105
121
|
from obstore.store import from_url
|
|
106
122
|
|
|
107
|
-
self.store = from_url(
|
|
123
|
+
self.store = from_url(uri, **kwargs) # pyright: ignore[reportAttributeAccessIssue]
|
|
108
124
|
|
|
109
|
-
logger.debug("ObStore backend initialized for %s",
|
|
125
|
+
logger.debug("ObStore backend initialized for %s", uri)
|
|
110
126
|
|
|
111
127
|
except Exception as exc:
|
|
112
|
-
msg = f"Failed to initialize obstore backend for {
|
|
128
|
+
msg = f"Failed to initialize obstore backend for {uri}"
|
|
113
129
|
raise StorageOperationFailedError(msg) from exc
|
|
114
130
|
|
|
115
|
-
|
|
131
|
+
@classmethod
|
|
132
|
+
def from_config(cls, config: dict[str, Any]) -> "ObStoreBackend":
|
|
133
|
+
"""Create backend from configuration dictionary."""
|
|
134
|
+
store_uri = config["store_uri"]
|
|
135
|
+
base_path = config.get("base_path", "")
|
|
136
|
+
store_options = config.get("store_options", {})
|
|
137
|
+
|
|
138
|
+
kwargs = dict(store_options)
|
|
139
|
+
if base_path:
|
|
140
|
+
kwargs["base_path"] = base_path
|
|
141
|
+
|
|
142
|
+
return cls(uri=store_uri, **kwargs)
|
|
143
|
+
|
|
144
|
+
def _resolve_path(self, path: "Union[str, Path]") -> str:
|
|
116
145
|
"""Resolve path relative to base_path."""
|
|
117
146
|
path_str = str(path)
|
|
118
147
|
if path_str.startswith("file://"):
|
|
@@ -125,49 +154,33 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
125
154
|
return f"{clean_base}/{clean_path}"
|
|
126
155
|
return path_str
|
|
127
156
|
|
|
128
|
-
|
|
129
|
-
def backend_type(self) -> str:
|
|
130
|
-
"""Return backend type identifier."""
|
|
131
|
-
return "obstore"
|
|
132
|
-
|
|
133
|
-
def read_bytes(self, path: str | Path, **kwargs: Any) -> bytes: # pyright: ignore[reportUnusedParameter]
|
|
157
|
+
def read_bytes(self, path: "Union[str, Path]", **kwargs: Any) -> bytes: # pyright: ignore[reportUnusedParameter]
|
|
134
158
|
"""Read bytes using obstore."""
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
return cast("bytes", result.bytes().to_bytes())
|
|
138
|
-
except Exception as exc:
|
|
139
|
-
msg = f"Failed to read bytes from {path}"
|
|
140
|
-
raise StorageOperationFailedError(msg) from exc
|
|
159
|
+
result = self.store.get(self._resolve_path(path))
|
|
160
|
+
return cast("bytes", result.bytes().to_bytes())
|
|
141
161
|
|
|
142
|
-
def write_bytes(self, path: str
|
|
162
|
+
def write_bytes(self, path: "Union[str, Path]", data: bytes, **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
143
163
|
"""Write bytes using obstore."""
|
|
144
|
-
|
|
145
|
-
self.store.put(self._resolve_path(path), data)
|
|
146
|
-
except Exception as exc:
|
|
147
|
-
msg = f"Failed to write bytes to {path}"
|
|
148
|
-
raise StorageOperationFailedError(msg) from exc
|
|
164
|
+
self.store.put(self._resolve_path(path), data)
|
|
149
165
|
|
|
150
|
-
def read_text(self, path: str
|
|
166
|
+
def read_text(self, path: "Union[str, Path]", encoding: str = "utf-8", **kwargs: Any) -> str:
|
|
151
167
|
"""Read text using obstore."""
|
|
152
168
|
return self.read_bytes(path, **kwargs).decode(encoding)
|
|
153
169
|
|
|
154
|
-
def write_text(self, path: str
|
|
170
|
+
def write_text(self, path: "Union[str, Path]", data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
|
|
155
171
|
"""Write text using obstore."""
|
|
156
172
|
self.write_bytes(path, data.encode(encoding), **kwargs)
|
|
157
173
|
|
|
158
174
|
def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]: # pyright: ignore[reportUnusedParameter]
|
|
159
175
|
"""List objects using obstore."""
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
)
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
raise StorageOperationFailedError(msg) from exc
|
|
169
|
-
|
|
170
|
-
def exists(self, path: str | Path, **kwargs: Any) -> bool: # pyright: ignore[reportUnusedParameter]
|
|
176
|
+
resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path or ""
|
|
177
|
+
items = self.store.list_with_delimiter(resolved_prefix) if not recursive else self.store.list(resolved_prefix)
|
|
178
|
+
paths: list[str] = []
|
|
179
|
+
for batch in items:
|
|
180
|
+
paths.extend(item["path"] for item in batch)
|
|
181
|
+
return sorted(paths)
|
|
182
|
+
|
|
183
|
+
def exists(self, path: "Union[str, Path]", **kwargs: Any) -> bool: # pyright: ignore[reportUnusedParameter]
|
|
171
184
|
"""Check if object exists using obstore."""
|
|
172
185
|
try:
|
|
173
186
|
self.store.head(self._resolve_path(path))
|
|
@@ -175,29 +188,17 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
175
188
|
return False
|
|
176
189
|
return True
|
|
177
190
|
|
|
178
|
-
def delete(self, path: str
|
|
191
|
+
def delete(self, path: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
179
192
|
"""Delete object using obstore."""
|
|
180
|
-
|
|
181
|
-
self.store.delete(self._resolve_path(path))
|
|
182
|
-
except Exception as exc:
|
|
183
|
-
msg = f"Failed to delete {path}"
|
|
184
|
-
raise StorageOperationFailedError(msg) from exc
|
|
193
|
+
self.store.delete(self._resolve_path(path))
|
|
185
194
|
|
|
186
|
-
def copy(self, source: str
|
|
195
|
+
def copy(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
187
196
|
"""Copy object using obstore."""
|
|
188
|
-
|
|
189
|
-
self.store.copy(self._resolve_path(source), self._resolve_path(destination))
|
|
190
|
-
except Exception as exc:
|
|
191
|
-
msg = f"Failed to copy {source} to {destination}"
|
|
192
|
-
raise StorageOperationFailedError(msg) from exc
|
|
197
|
+
self.store.copy(self._resolve_path(source), self._resolve_path(destination))
|
|
193
198
|
|
|
194
|
-
def move(self, source: str
|
|
199
|
+
def move(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
195
200
|
"""Move object using obstore."""
|
|
196
|
-
|
|
197
|
-
self.store.rename(self._resolve_path(source), self._resolve_path(destination))
|
|
198
|
-
except Exception as exc:
|
|
199
|
-
msg = f"Failed to move {source} to {destination}"
|
|
200
|
-
raise StorageOperationFailedError(msg) from exc
|
|
201
|
+
self.store.rename(self._resolve_path(source), self._resolve_path(destination))
|
|
201
202
|
|
|
202
203
|
def glob(self, pattern: str, **kwargs: Any) -> list[str]:
|
|
203
204
|
"""Find objects matching pattern.
|
|
@@ -228,7 +229,7 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
228
229
|
return matching_objects
|
|
229
230
|
return [obj for obj in all_objects if fnmatch.fnmatch(obj, resolved_pattern)]
|
|
230
231
|
|
|
231
|
-
def get_metadata(self, path: str
|
|
232
|
+
def get_metadata(self, path: "Union[str, Path]", **kwargs: Any) -> dict[str, Any]: # pyright: ignore[reportUnusedParameter]
|
|
232
233
|
"""Get object metadata using obstore."""
|
|
233
234
|
resolved_path = self._resolve_path(path)
|
|
234
235
|
result: dict[str, Any] = {}
|
|
@@ -252,12 +253,12 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
252
253
|
else:
|
|
253
254
|
return result
|
|
254
255
|
|
|
255
|
-
def is_object(self, path: str
|
|
256
|
+
def is_object(self, path: "Union[str, Path]") -> bool:
|
|
256
257
|
"""Check if path is an object using obstore."""
|
|
257
258
|
resolved_path = self._resolve_path(path)
|
|
258
259
|
return self.exists(path) and not resolved_path.endswith("/")
|
|
259
260
|
|
|
260
|
-
def is_path(self, path: str
|
|
261
|
+
def is_path(self, path: "Union[str, Path]") -> bool:
|
|
261
262
|
"""Check if path is a prefix/directory using obstore."""
|
|
262
263
|
resolved_path = self._resolve_path(path)
|
|
263
264
|
|
|
@@ -270,61 +271,53 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
270
271
|
except Exception:
|
|
271
272
|
return False
|
|
272
273
|
|
|
273
|
-
def read_arrow(self, path: str
|
|
274
|
+
def read_arrow(self, path: "Union[str, Path]", **kwargs: Any) -> ArrowTable:
|
|
274
275
|
"""Read Arrow table using obstore."""
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
276
|
+
resolved_path = self._resolve_path(path)
|
|
277
|
+
if hasattr(self.store, "read_arrow"):
|
|
278
|
+
return self.store.read_arrow(resolved_path, **kwargs) # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]
|
|
279
|
+
|
|
280
|
+
self._ensure_pyarrow()
|
|
281
|
+
import io
|
|
282
|
+
|
|
283
|
+
import pyarrow.parquet as pq
|
|
284
|
+
|
|
285
|
+
return pq.read_table(io.BytesIO(self.read_bytes(resolved_path)), **kwargs)
|
|
279
286
|
|
|
287
|
+
def write_arrow(self, path: "Union[str, Path]", table: ArrowTable, **kwargs: Any) -> None:
|
|
288
|
+
"""Write Arrow table using obstore."""
|
|
289
|
+
resolved_path = self._resolve_path(path)
|
|
290
|
+
if hasattr(self.store, "write_arrow"):
|
|
291
|
+
self.store.write_arrow(resolved_path, table, **kwargs) # pyright: ignore[reportAttributeAccessIssue]
|
|
292
|
+
else:
|
|
293
|
+
self._ensure_pyarrow()
|
|
280
294
|
import io
|
|
281
295
|
|
|
296
|
+
import pyarrow as pa
|
|
282
297
|
import pyarrow.parquet as pq
|
|
283
298
|
|
|
284
|
-
|
|
285
|
-
buffer = io.BytesIO(data)
|
|
286
|
-
return pq.read_table(buffer, **kwargs)
|
|
287
|
-
except Exception as exc:
|
|
288
|
-
msg = f"Failed to read Arrow table from {path}"
|
|
289
|
-
raise StorageOperationFailedError(msg) from exc
|
|
299
|
+
buffer = io.BytesIO()
|
|
290
300
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
buffer = io.BytesIO()
|
|
304
|
-
|
|
305
|
-
schema = table.schema
|
|
306
|
-
if any(str(f.type).startswith("decimal64") for f in schema):
|
|
307
|
-
new_fields = []
|
|
308
|
-
for field in schema:
|
|
309
|
-
if str(field.type).startswith("decimal64"):
|
|
310
|
-
import re
|
|
311
|
-
|
|
312
|
-
match = re.match(r"decimal64\((\d+),\s*(\d+)\)", str(field.type))
|
|
313
|
-
if match:
|
|
314
|
-
precision, scale = int(match.group(1)), int(match.group(2))
|
|
315
|
-
new_fields.append(pa.field(field.name, pa.decimal128(precision, scale)))
|
|
316
|
-
else:
|
|
317
|
-
new_fields.append(field) # pragma: no cover
|
|
301
|
+
schema = table.schema
|
|
302
|
+
if any(str(f.type).startswith("decimal64") for f in schema):
|
|
303
|
+
new_fields = []
|
|
304
|
+
for field in schema:
|
|
305
|
+
if str(field.type).startswith("decimal64"):
|
|
306
|
+
import re
|
|
307
|
+
|
|
308
|
+
match = re.match(r"decimal64\((\d+),\s*(\d+)\)", str(field.type))
|
|
309
|
+
if match:
|
|
310
|
+
precision, scale = int(match.group(1)), int(match.group(2))
|
|
311
|
+
new_fields.append(pa.field(field.name, pa.decimal128(precision, scale)))
|
|
318
312
|
else:
|
|
319
|
-
new_fields.append(field)
|
|
320
|
-
|
|
313
|
+
new_fields.append(field) # pragma: no cover
|
|
314
|
+
else:
|
|
315
|
+
new_fields.append(field)
|
|
316
|
+
table = table.cast(pa.schema(new_fields))
|
|
321
317
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
except Exception as exc:
|
|
326
|
-
msg = f"Failed to write Arrow table to {path}"
|
|
327
|
-
raise StorageOperationFailedError(msg) from exc
|
|
318
|
+
pq.write_table(table, buffer, **kwargs)
|
|
319
|
+
buffer.seek(0)
|
|
320
|
+
self.write_bytes(resolved_path, buffer.read())
|
|
328
321
|
|
|
329
322
|
def stream_arrow(self, pattern: str, **kwargs: Any) -> Iterator[ArrowRecordBatch]:
|
|
330
323
|
"""Stream Arrow record batches.
|
|
@@ -332,56 +325,65 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
332
325
|
Yields:
|
|
333
326
|
Iterator of Arrow record batches from matching objects.
|
|
334
327
|
"""
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
328
|
+
self._ensure_pyarrow()
|
|
329
|
+
from io import BytesIO
|
|
330
|
+
|
|
331
|
+
import pyarrow.parquet as pq
|
|
332
|
+
|
|
333
|
+
for obj_path in self.glob(pattern, **kwargs):
|
|
334
|
+
result = self.store.get(self._resolve_path(obj_path))
|
|
335
|
+
bytes_obj = result.bytes()
|
|
336
|
+
data = bytes_obj.to_bytes()
|
|
337
|
+
buffer = BytesIO(data)
|
|
338
|
+
parquet_file = pq.ParquetFile(buffer)
|
|
339
|
+
yield from parquet_file.iter_batches()
|
|
340
|
+
|
|
341
|
+
def sign(self, path: str, expires_in: int = 3600, for_upload: bool = False) -> str:
|
|
342
|
+
"""Generate a signed URL for the object."""
|
|
343
|
+
resolved_path = self._resolve_path(path)
|
|
344
|
+
if hasattr(self.store, "sign_url") and callable(self.store.sign_url):
|
|
345
|
+
return self.store.sign_url(resolved_path, expires_in=expires_in) # type: ignore[no-any-return]
|
|
346
|
+
return f"{self.store_uri}/{resolved_path}"
|
|
341
347
|
|
|
342
|
-
async def read_bytes_async(self, path: str
|
|
348
|
+
async def read_bytes_async(self, path: "Union[str, Path]", **kwargs: Any) -> bytes: # pyright: ignore[reportUnusedParameter]
|
|
343
349
|
"""Read bytes from storage asynchronously."""
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
return bytes_obj.to_bytes() # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]
|
|
349
|
-
except Exception as exc:
|
|
350
|
-
msg = f"Failed to read bytes from {path}"
|
|
351
|
-
raise StorageOperationFailedError(msg) from exc
|
|
350
|
+
resolved_path = self._resolve_path(path)
|
|
351
|
+
result = await self.store.get_async(resolved_path)
|
|
352
|
+
bytes_obj = await result.bytes_async()
|
|
353
|
+
return bytes_obj.to_bytes() # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]
|
|
352
354
|
|
|
353
|
-
async def write_bytes_async(self, path: str
|
|
355
|
+
async def write_bytes_async(self, path: "Union[str, Path]", data: bytes, **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
354
356
|
"""Write bytes to storage asynchronously."""
|
|
355
357
|
resolved_path = self._resolve_path(path)
|
|
356
358
|
await self.store.put_async(resolved_path, data)
|
|
357
359
|
|
|
358
360
|
async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]: # pyright: ignore[reportUnusedParameter]
|
|
359
361
|
"""List objects in storage asynchronously."""
|
|
360
|
-
|
|
361
|
-
resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path or ""
|
|
362
|
+
resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path or ""
|
|
362
363
|
|
|
363
|
-
|
|
364
|
+
objects: list[str] = []
|
|
365
|
+
async for batch in self.store.list_async(resolved_prefix): # pyright: ignore[reportAttributeAccessIssue]
|
|
366
|
+
objects.extend(item["path"] for item in batch)
|
|
364
367
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
+
if not recursive and resolved_prefix:
|
|
369
|
+
base_depth = resolved_prefix.count("/")
|
|
370
|
+
objects = [obj for obj in objects if obj.count("/") <= base_depth + 1]
|
|
368
371
|
|
|
369
|
-
|
|
370
|
-
except Exception as exc:
|
|
371
|
-
msg = f"Failed to list objects with prefix '{prefix}'"
|
|
372
|
-
raise StorageOperationFailedError(msg) from exc
|
|
372
|
+
return sorted(objects)
|
|
373
373
|
|
|
374
|
-
async def read_text_async(self, path: str
|
|
374
|
+
async def read_text_async(self, path: "Union[str, Path]", encoding: str = "utf-8", **kwargs: Any) -> str:
|
|
375
375
|
"""Read text from storage asynchronously."""
|
|
376
376
|
data = await self.read_bytes_async(path, **kwargs)
|
|
377
377
|
return data.decode(encoding)
|
|
378
378
|
|
|
379
|
-
async def write_text_async(
|
|
379
|
+
async def write_text_async(
|
|
380
|
+
self, path: "Union[str, Path]", data: str, encoding: str = "utf-8", **kwargs: Any
|
|
381
|
+
) -> None: # pyright: ignore[reportUnusedParameter]
|
|
380
382
|
"""Write text to storage asynchronously."""
|
|
381
383
|
encoded_data = data.encode(encoding)
|
|
382
384
|
await self.write_bytes_async(path, encoded_data, **kwargs)
|
|
383
385
|
|
|
384
|
-
async def exists_async(self, path: str
|
|
386
|
+
async def exists_async(self, path: "Union[str, Path]", **kwargs: Any) -> bool: # pyright: ignore[reportUnusedParameter]
|
|
385
387
|
"""Check if object exists in storage asynchronously."""
|
|
386
388
|
resolved_path = self._resolve_path(path)
|
|
387
389
|
try:
|
|
@@ -390,24 +392,24 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
390
392
|
return False
|
|
391
393
|
return True
|
|
392
394
|
|
|
393
|
-
async def delete_async(self, path: str
|
|
395
|
+
async def delete_async(self, path: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
394
396
|
"""Delete object from storage asynchronously."""
|
|
395
397
|
resolved_path = self._resolve_path(path)
|
|
396
398
|
await self.store.delete_async(resolved_path)
|
|
397
399
|
|
|
398
|
-
async def copy_async(self, source: str
|
|
400
|
+
async def copy_async(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
399
401
|
"""Copy object in storage asynchronously."""
|
|
400
402
|
source_path = self._resolve_path(source)
|
|
401
403
|
dest_path = self._resolve_path(destination)
|
|
402
404
|
await self.store.copy_async(source_path, dest_path)
|
|
403
405
|
|
|
404
|
-
async def move_async(self, source: str
|
|
406
|
+
async def move_async(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None: # pyright: ignore[reportUnusedParameter]
|
|
405
407
|
"""Move object in storage asynchronously."""
|
|
406
408
|
source_path = self._resolve_path(source)
|
|
407
409
|
dest_path = self._resolve_path(destination)
|
|
408
410
|
await self.store.rename_async(source_path, dest_path)
|
|
409
411
|
|
|
410
|
-
async def get_metadata_async(self, path: str
|
|
412
|
+
async def get_metadata_async(self, path: "Union[str, Path]", **kwargs: Any) -> dict[str, Any]: # pyright: ignore[reportUnusedParameter]
|
|
411
413
|
"""Get object metadata from storage asynchronously."""
|
|
412
414
|
resolved_path = self._resolve_path(path)
|
|
413
415
|
result: dict[str, Any] = {}
|
|
@@ -417,31 +419,40 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
417
419
|
{
|
|
418
420
|
"path": resolved_path,
|
|
419
421
|
"exists": True,
|
|
420
|
-
"size": metadata.size,
|
|
421
|
-
"last_modified": metadata.last_modified,
|
|
422
|
-
"e_tag": metadata.e_tag,
|
|
423
|
-
"version": metadata.version,
|
|
422
|
+
"size": metadata.get("size"),
|
|
423
|
+
"last_modified": metadata.get("last_modified"),
|
|
424
|
+
"e_tag": metadata.get("e_tag"),
|
|
425
|
+
"version": metadata.get("version"),
|
|
424
426
|
}
|
|
425
427
|
)
|
|
426
|
-
if
|
|
427
|
-
result["custom_metadata"] = metadata
|
|
428
|
+
if metadata.get("metadata"):
|
|
429
|
+
result["custom_metadata"] = metadata["metadata"]
|
|
428
430
|
|
|
429
431
|
except Exception:
|
|
430
432
|
return {"path": resolved_path, "exists": False}
|
|
431
433
|
else:
|
|
432
434
|
return result
|
|
433
435
|
|
|
434
|
-
async def read_arrow_async(self, path: str
|
|
436
|
+
async def read_arrow_async(self, path: "Union[str, Path]", **kwargs: Any) -> ArrowTable:
|
|
435
437
|
"""Read Arrow table from storage asynchronously."""
|
|
436
438
|
resolved_path = self._resolve_path(path)
|
|
437
|
-
|
|
439
|
+
if hasattr(self.store, "read_arrow_async"):
|
|
440
|
+
return await self.store.read_arrow_async(resolved_path, **kwargs) # type: ignore[no-any-return] # pyright: ignore[reportAttributeAccessIssue]
|
|
441
|
+
|
|
442
|
+
self._ensure_pyarrow()
|
|
443
|
+
import io
|
|
438
444
|
|
|
439
|
-
|
|
445
|
+
import pyarrow.parquet as pq
|
|
446
|
+
|
|
447
|
+
return pq.read_table(io.BytesIO(await self.read_bytes_async(resolved_path)), **kwargs)
|
|
448
|
+
|
|
449
|
+
async def write_arrow_async(self, path: "Union[str, Path]", table: ArrowTable, **kwargs: Any) -> None:
|
|
440
450
|
"""Write Arrow table to storage asynchronously."""
|
|
441
451
|
resolved_path = self._resolve_path(path)
|
|
442
452
|
if hasattr(self.store, "write_arrow_async"):
|
|
443
453
|
await self.store.write_arrow_async(resolved_path, table, **kwargs) # pyright: ignore[reportAttributeAccessIssue]
|
|
444
454
|
else:
|
|
455
|
+
self._ensure_pyarrow()
|
|
445
456
|
import io
|
|
446
457
|
|
|
447
458
|
import pyarrow.parquet as pq
|
|
@@ -453,4 +464,11 @@ class ObStoreBackend(ObjectStoreBase, HasStorageCapabilities):
|
|
|
453
464
|
|
|
454
465
|
def stream_arrow_async(self, pattern: str, **kwargs: Any) -> AsyncIterator[ArrowRecordBatch]:
|
|
455
466
|
resolved_pattern = self._resolve_path(pattern)
|
|
456
|
-
return _AsyncArrowIterator(self
|
|
467
|
+
return _AsyncArrowIterator(self, resolved_pattern, **kwargs)
|
|
468
|
+
|
|
469
|
+
async def sign_async(self, path: str, expires_in: int = 3600, for_upload: bool = False) -> str:
|
|
470
|
+
"""Generate a signed URL asynchronously."""
|
|
471
|
+
resolved_path = self._resolve_path(path)
|
|
472
|
+
if hasattr(self.store, "sign_url_async") and callable(self.store.sign_url_async):
|
|
473
|
+
return await self.store.sign_url_async(resolved_path, expires_in=expires_in) # type: ignore[no-any-return]
|
|
474
|
+
return f"{self.store_uri}/{resolved_path}"
|