sqlspec 0.21.0-py3-none-any.whl → 0.22.0-py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported public registry, and is provided for informational purposes only.
Potentially problematic release: this version of sqlspec might be problematic.
- sqlspec/base.py +4 -4
- sqlspec/driver/mixins/_result_tools.py +41 -6
- sqlspec/loader.py +65 -68
- sqlspec/protocols.py +3 -5
- sqlspec/storage/__init__.py +2 -12
- sqlspec/storage/backends/__init__.py +1 -0
- sqlspec/storage/backends/fsspec.py +87 -147
- sqlspec/storage/backends/local.py +310 -0
- sqlspec/storage/backends/obstore.py +210 -192
- sqlspec/storage/registry.py +101 -70
- sqlspec/utils/data_transformation.py +120 -0
- sqlspec/utils/sync_tools.py +8 -5
- sqlspec/utils/text.py +27 -19
- sqlspec/utils/type_guards.py +74 -0
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/METADATA +1 -1
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/RECORD +20 -19
- sqlspec/storage/capabilities.py +0 -102
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/WHEEL +0 -0
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/entry_points.txt +0 -0
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/licenses/LICENSE +0 -0
- {sqlspec-0.21.0.dist-info → sqlspec-0.22.0.dist-info}/licenses/NOTICE +0 -0
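A quick orientation before the raw diff: FSSpecBackend's constructor changes from the 0.21.0 fs/base_path pair to a single URI, with remaining keyword arguments forwarded to fsspec.filesystem(), and the StorageCapabilities introspection API is removed. A hypothetical usage sketch inferred from the diff below; the bucket name and the anon flag are illustrative placeholders, not part of the diff:

    # Sketch only: construction via URI, per the 0.22.0 diff below.
    from sqlspec.storage.backends.fsspec import FSSpecBackend

    # 0.21.0: FSSpecBackend(fs=..., base_path=...)
    # 0.22.0: one URI; extra kwargs (e.g. the illustrative anon=True) reach fsspec.filesystem()
    backend = FSSpecBackend("s3://my-bucket/exports", anon=True)
    data = backend.read_bytes("report.parquet")  # resolved against the bucket and base path
    url = backend.sign("report.parquet")  # sign()/sign_async() are new in 0.22.0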
--- a/sqlspec/storage/backends/fsspec.py
+++ b/sqlspec/storage/backends/fsspec.py
@@ -1,18 +1,14 @@
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Optional, Union
 
-from sqlspec.exceptions import MissingDependencyError
-from sqlspec.storage.backends.base import ObjectStoreBase
-from sqlspec.storage.capabilities import StorageCapabilities
+from sqlspec.exceptions import MissingDependencyError
 from sqlspec.typing import FSSPEC_INSTALLED, PYARROW_INSTALLED
 from sqlspec.utils.sync_tools import async_
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator, Iterator
 
-    from fsspec import AbstractFileSystem
-
     from sqlspec.typing import ArrowRecordBatch, ArrowTable
 
 __all__ = ("FSSpecBackend",)
@@ -56,40 +52,30 @@ class _ArrowStreamer:
         raise StopAsyncIteration
 
 
-class FSSpecBackend(ObjectStoreBase):
+class FSSpecBackend:
     """Storage backend using fsspec.
 
-    Implements
+    Implements ObjectStoreProtocol using fsspec for various protocols
     including HTTP, HTTPS, FTP, and cloud storage services.
     """
 
-    _default_capabilities = StorageCapabilities(
-        supports_arrow=PYARROW_INSTALLED,
-        supports_streaming=PYARROW_INSTALLED,
-        supports_async=True,
-        supports_compression=True,
-        is_remote=True,
-        is_cloud_native=False,
-    )
-
-    def __init__(self, fs: "Union[str, AbstractFileSystem]", base_path: str = "") -> None:
-        if not FSSPEC_INSTALLED:
-            raise MissingDependencyError(package="fsspec", install_package="fsspec")
+    def __init__(self, uri: str, **kwargs: Any) -> None:
+        self._ensure_fsspec()
 
+        base_path = kwargs.pop("base_path", "")
         self.base_path = base_path.rstrip("/") if base_path else ""
 
-        if
-
-
-            self.fs = fsspec.filesystem(fs.split("://")[0])
-            self.protocol = fs.split("://")[0]
-            self._fs_uri = fs
+        if "://" in uri:
+            self.protocol = uri.split("://", maxsplit=1)[0]
+            self._fs_uri = uri
         else:
-            self.
-            self.
-
+            self.protocol = uri
+            self._fs_uri = f"{uri}://"
+
+        import fsspec
 
-        self.
+        self.fs = fsspec.filesystem(self.protocol, **kwargs)
+        self.backend_type = "fsspec"
 
         super().__init__()
 
@@ -99,11 +85,22 @@ class FSSpecBackend(ObjectStoreBase):
         fs_config = config.get("fs_config", {})
         base_path = config.get("base_path", "")
 
-
+        uri = f"{protocol}://"
+        kwargs = dict(fs_config)
+        if base_path:
+            kwargs["base_path"] = base_path
 
-
+        return cls(uri=uri, **kwargs)
 
-
+    def _ensure_fsspec(self) -> None:
+        """Ensure fsspec is available for operations."""
+        if not FSSPEC_INSTALLED:
+            raise MissingDependencyError(package="fsspec", install_package="fsspec")
+
+    def _ensure_pyarrow(self) -> None:
+        """Ensure PyArrow is available for Arrow operations."""
+        if not PYARROW_INSTALLED:
+            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
 
     def _resolve_path(self, path: Union[str, Path]) -> str:
         """Resolve path relative to base_path."""
@@ -112,70 +109,38 @@ class FSSpecBackend(ObjectStoreBase):
             clean_base = self.base_path.rstrip("/")
             clean_path = path_str.lstrip("/")
             return f"{clean_base}/{clean_path}"
+        if self.protocol == "s3" and "://" in self._fs_uri:
+            # For S3, we need to include the bucket from the URI
+            # Extract bucket and path from URI like s3://bucket/path
+            uri_parts = self._fs_uri.split("://", 1)[1] # Remove s3://
+            if "/" in uri_parts:
+                # URI has bucket and base path
+                return f"{uri_parts.rstrip('/')}/{path_str.lstrip('/')}"
+            # URI has only bucket
+            return f"{uri_parts}/{path_str.lstrip('/')}"
         return path_str
 
-    def _detect_capabilities(self) -> StorageCapabilities:
-        """Detect capabilities based on filesystem protocol."""
-        protocol = self.protocol.lower()
-
-        if protocol in {"s3", "s3a", "s3n"}:
-            return StorageCapabilities.s3_compatible()
-        if protocol in {"gcs", "gs"}:
-            return StorageCapabilities.gcs()
-        if protocol in {"abfs", "az", "azure"}:
-            return StorageCapabilities.azure_blob()
-        if protocol in {"file", "local"}:
-            return StorageCapabilities.local_filesystem()
-        return StorageCapabilities(
-            supports_arrow=PYARROW_INSTALLED,
-            supports_streaming=PYARROW_INSTALLED,
-            supports_async=True,
-            supports_compression=True,
-            is_remote=True,
-            is_cloud_native=False,
-        )
-
-    @property
-    def capabilities(self) -> StorageCapabilities:
-        """Return capabilities based on detected protocol."""
-        return getattr(self, "_instance_capabilities", self.__class__._default_capabilities)
-
-    @classmethod
-    def has_capability(cls, capability: str) -> bool:
-        """Check if backend has a specific capability."""
-        return getattr(cls._default_capabilities, capability, False)
-
-    @classmethod
-    def get_capabilities(cls) -> StorageCapabilities:
-        """Get all capabilities for this backend."""
-        return cls._default_capabilities
-
-    @property
-    def backend_type(self) -> str:
-        return "fsspec"
-
     @property
     def base_uri(self) -> str:
         return self._fs_uri
 
     def read_bytes(self, path: Union[str, Path], **kwargs: Any) -> bytes:
         """Read bytes from an object."""
-        resolved_path = self._resolve_path(path)
-        try:
-            return self.fs.cat(resolved_path, **kwargs) # type: ignore[no-any-return] # pyright: ignore
-        except Exception as exc:
-            msg = f"Failed to read bytes from {path}"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_path = self._resolve_path(path)
+        return self.fs.cat(resolved_path, **kwargs) # type: ignore[no-any-return] # pyright: ignore
 
     def write_bytes(self, path: Union[str, Path], data: bytes, **kwargs: Any) -> None:
         """Write bytes to an object."""
-
-
-
-
-
-
-
+        resolved_path = self._resolve_path(path)
+
+        # Only create directories for local file systems, not for cloud storage
+        if self.protocol == "file":
+            parent_dir = str(Path(resolved_path).parent)
+            if parent_dir and not self.fs.exists(parent_dir):
+                self.fs.makedirs(parent_dir, exist_ok=True)
+
+        with self.fs.open(resolved_path, mode="wb", **kwargs) as f:
+            f.write(data) # pyright: ignore
 
     def read_text(self, path: Union[str, Path], encoding: str = "utf-8", **kwargs: Any) -> str:
         """Read text from an object."""
@@ -193,87 +158,59 @@ class FSSpecBackend(ObjectStoreBase):
 
     def delete(self, path: Union[str, Path], **kwargs: Any) -> None:
         """Delete an object."""
-        resolved_path = self._resolve_path(path)
-        try:
-            self.fs.rm(resolved_path, **kwargs)
-        except Exception as exc:
-            msg = f"Failed to delete {path}"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_path = self._resolve_path(path)
+        self.fs.rm(resolved_path, **kwargs)
 
     def copy(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
         """Copy an object."""
-        source_path = self._resolve_path(source)
-        dest_path = self._resolve_path(destination)
-        try:
-            self.fs.copy(source_path, dest_path, **kwargs)
-        except Exception as exc:
-            msg = f"Failed to copy {source} to {destination}"
-            raise StorageOperationFailedError(msg) from exc
+        source_path = self._resolve_path(source)
+        dest_path = self._resolve_path(destination)
+        self.fs.copy(source_path, dest_path, **kwargs)
 
     def move(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
         """Move an object."""
-        source_path = self._resolve_path(source)
-        dest_path = self._resolve_path(destination)
-        try:
-            self.fs.mv(source_path, dest_path, **kwargs)
-        except Exception as exc:
-            msg = f"Failed to move {source} to {destination}"
-            raise StorageOperationFailedError(msg) from exc
+        source_path = self._resolve_path(source)
+        dest_path = self._resolve_path(destination)
+        self.fs.mv(source_path, dest_path, **kwargs)
 
     def read_arrow(self, path: Union[str, Path], **kwargs: Any) -> "ArrowTable":
         """Read an Arrow table from storage."""
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
-
-        import pyarrow.parquet as pq
+        import pyarrow.parquet as pq
 
-        resolved_path = self._resolve_path(path)
-        try:
-
-        except Exception as exc:
-            msg = f"Failed to read Arrow table from {path}"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_path = self._resolve_path(path)
+        with self.fs.open(resolved_path, mode="rb", **kwargs) as f:
+            return pq.read_table(f)
 
     def write_arrow(self, path: Union[str, Path], table: "ArrowTable", **kwargs: Any) -> None:
         """Write an Arrow table to storage."""
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
-
-        import pyarrow.parquet as pq
+        import pyarrow.parquet as pq
 
-        resolved_path = self._resolve_path(path)
-        try:
-
-        except Exception as exc:
-            msg = f"Failed to write Arrow table to {path}"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_path = self._resolve_path(path)
+        with self.fs.open(resolved_path, mode="wb") as f:
+            pq.write_table(table, f, **kwargs) # pyright: ignore
 
     def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
         """List objects with optional prefix."""
-        resolved_prefix = self._resolve_path(prefix)
-        try:
-
-
-            return sorted(self.fs.ls(resolved_prefix, detail=False, **kwargs))
-        except Exception as exc:
-            msg = f"Failed to list objects with prefix '{prefix}'"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_prefix = self._resolve_path(prefix)
+        if recursive:
+            return sorted(self.fs.find(resolved_prefix, **kwargs))
+        return sorted(self.fs.ls(resolved_prefix, detail=False, **kwargs))
 
     def glob(self, pattern: str, **kwargs: Any) -> list[str]:
         """Find objects matching a glob pattern."""
-        resolved_pattern = self._resolve_path(pattern)
-        try:
-            return sorted(self.fs.glob(resolved_pattern, **kwargs)) # pyright: ignore
-        except Exception as exc:
-            msg = f"Failed to glob with pattern '{pattern}'"
-            raise StorageOperationFailedError(msg) from exc
+        resolved_pattern = self._resolve_path(pattern)
+        return sorted(self.fs.glob(resolved_pattern, **kwargs)) # pyright: ignore
 
-    def is_object(self, path: str) -> bool:
+    def is_object(self, path: Union[str, Path]) -> bool:
         """Check if path points to an object."""
         resolved_path = self._resolve_path(path)
         return self.fs.exists(resolved_path) and not self.fs.isdir(resolved_path)
 
-    def is_path(self, path: str) -> bool:
+    def is_path(self, path: Union[str, Path]) -> bool:
         """Check if path points to a prefix (directory-like)."""
         resolved_path = self._resolve_path(path)
         return self.fs.isdir(resolved_path) # type: ignore[no-any-return]
@@ -294,9 +231,6 @@ class FSSpecBackend(ObjectStoreBase):
 
         except FileNotFoundError:
             return {"path": self._resolve_path(path), "exists": False}
-        except Exception as exc:
-            msg = f"Failed to get metadata for {path}"
-            raise StorageOperationFailedError(msg) from exc
         return {
             "path": resolved_path,
             "exists": True,
@@ -305,6 +239,11 @@ class FSSpecBackend(ObjectStoreBase):
             "type": info.type,
         }
 
+    def sign(self, path: str, expires_in: int = 3600, for_upload: bool = False) -> str:
+        """Generate a signed URL for the file."""
+        resolved_path = self._resolve_path(path)
+        return f"{self._fs_uri}{resolved_path}"
+
     def _stream_file_batches(self, obj_path: Union[str, Path]) -> "Iterator[ArrowRecordBatch]":
         import pyarrow.parquet as pq
 
@@ -313,10 +252,8 @@ class FSSpecBackend(ObjectStoreBase):
             yield from parquet_file.iter_batches()
 
     def stream_arrow(self, pattern: str, **kwargs: Any) -> "Iterator[ArrowRecordBatch]":
-
-
-        if not PYARROW_INSTALLED:
-            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+        self._ensure_fsspec()
+        self._ensure_pyarrow()
 
         for obj_path in self.glob(pattern, **kwargs):
             yield from self._stream_file_batches(obj_path)
@@ -339,8 +276,7 @@ class FSSpecBackend(ObjectStoreBase):
         Returns:
             AsyncIterator of Arrow record batches
         """
-        if not PYARROW_INSTALLED:
-            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+        self._ensure_pyarrow()
 
         return _ArrowStreamer(self, pattern, **kwargs)
 
@@ -376,6 +312,10 @@ class FSSpecBackend(ObjectStoreBase):
         """Get object metadata from storage asynchronously."""
         return await async_(self.get_metadata)(path, **kwargs)
 
+    async def sign_async(self, path: str, expires_in: int = 3600, for_upload: bool = False) -> str:
+        """Generate a signed URL asynchronously."""
+        return await async_(self.sign)(path, expires_in, for_upload)
+
     async def read_arrow_async(self, path: Union[str, Path], **kwargs: Any) -> "ArrowTable":
         """Read Arrow table from storage asynchronously."""
         return await async_(self.read_arrow)(path, **kwargs)
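The remainder of the diff adds sqlspec/storage/backends/local.py, a new zero-dependency backend built on pathlib and shutil. A short usage sketch against the API added below; the paths are illustrative:

    # Illustrative paths; mirrors the LocalStore API in the diff that follows.
    from sqlspec.storage.backends.local import LocalStore

    store = LocalStore("file:///tmp/sqlspec-demo")  # accepts a file:// URI or a plain path
    store.write_text("queries/users.sql", "SELECT 1;")  # parent directories created on demand
    print(store.list_objects("queries/"))  # ['queries/users.sql']
    print(store.sign("queries/users.sql"))  # file:///tmp/sqlspec-demo/queries/users.sql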
--- /dev/null
+++ b/sqlspec/storage/backends/local.py
@@ -0,0 +1,310 @@
+"""Local file system storage backend.
+
+A simple, zero-dependency implementation for local file operations.
+No external dependencies like fsspec or obstore required.
+"""
+
+import shutil
+from collections.abc import AsyncIterator, Iterator
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional, Union
+from urllib.parse import unquote, urlparse
+
+from sqlspec.exceptions import MissingDependencyError
+from sqlspec.typing import PYARROW_INSTALLED
+from sqlspec.utils.sync_tools import async_
+
+if TYPE_CHECKING:
+    import asyncio
+
+    from sqlspec.typing import ArrowRecordBatch, ArrowTable
+
+__all__ = ("LocalStore",)
+
+
+class LocalStore:
+    """Simple local file system storage backend.
+
+    Provides file system operations without requiring fsspec or obstore.
+    Supports file:// URIs and regular file paths.
+
+    Implements ObjectStoreProtocol for type safety.
+    """
+
+    __slots__ = ("_loop", "backend_type", "base_path", "protocol")
+
+    def __init__(self, uri: str = "", **kwargs: Any) -> None:
+        """Initialize local storage backend.
+
+        Args:
+            uri: File URI or path (e.g., "file:///path" or "/path")
+            **kwargs: Additional options (base_path for relative operations)
+        """
+        if uri.startswith("file://"):
+            parsed = urlparse(uri)
+            path = unquote(parsed.path)
+            # Handle Windows paths (file:///C:/path)
+            if path and len(path) > 2 and path[2] == ":": # noqa: PLR2004
+                path = path[1:] # Remove leading slash for Windows
+            self.base_path = Path(path).resolve()
+        elif uri:
+            self.base_path = Path(uri).resolve()
+        else:
+            self.base_path = Path.cwd()
+
+        # Allow override with explicit base_path
+        if "base_path" in kwargs:
+            self.base_path = Path(kwargs["base_path"]).resolve()
+
+        # Create base directory if it doesn't exist and it's actually a directory
+        if not self.base_path.exists():
+            self.base_path.mkdir(parents=True, exist_ok=True)
+        elif self.base_path.is_file():
+            # If base_path points to a file, use its parent as the base directory
+            self.base_path = self.base_path.parent
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+        self.protocol = "file"
+        self.backend_type = "local"
+
+    def _ensure_pyarrow(self) -> None:
+        """Ensure PyArrow is available for Arrow operations."""
+        if not PYARROW_INSTALLED:
+            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+
+    def _resolve_path(self, path: "Union[str, Path]") -> Path:
+        """Resolve path relative to base_path."""
+        p = Path(path)
+        if p.is_absolute():
+            return p
+        return self.base_path / p
+
+    def read_bytes(self, path: "Union[str, Path]", **kwargs: Any) -> bytes:
+        """Read bytes from file."""
+        resolved = self._resolve_path(path)
+        return resolved.read_bytes()
+
+    def write_bytes(self, path: "Union[str, Path]", data: bytes, **kwargs: Any) -> None:
+        """Write bytes to file."""
+        resolved = self._resolve_path(path)
+        resolved.parent.mkdir(parents=True, exist_ok=True)
+        resolved.write_bytes(data)
+
+    def read_text(self, path: "Union[str, Path]", encoding: str = "utf-8", **kwargs: Any) -> str:
+        """Read text from file."""
+        return self._resolve_path(path).read_text(encoding=encoding)
+
+    def write_text(self, path: "Union[str, Path]", data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
+        """Write text to file."""
+        resolved = self._resolve_path(path)
+        resolved.parent.mkdir(parents=True, exist_ok=True)
+        resolved.write_text(data, encoding=encoding)
+
+    def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
+        """List objects in directory."""
+        # If prefix looks like a directory path, treat as directory
+        if prefix and (prefix.endswith("/") or "/" in prefix):
+            search_path = self._resolve_path(prefix)
+            if not search_path.exists():
+                return []
+            if search_path.is_file():
+                return [str(search_path.relative_to(self.base_path))]
+        else:
+            # Treat as filename prefix filter
+            search_path = self.base_path
+
+        pattern = "**/*" if recursive else "*"
+        files = []
+        for path in search_path.glob(pattern):
+            if path.is_file():
+                try:
+                    relative = path.relative_to(self.base_path)
+                    relative_str = str(relative)
+                    # Apply prefix filter if provided
+                    if not prefix or relative_str.startswith(prefix):
+                        files.append(relative_str)
+                except ValueError:
+                    # Path is outside base_path, use absolute
+                    path_str = str(path)
+                    if not prefix or path_str.startswith(prefix):
+                        files.append(path_str)
+
+        return sorted(files)
+
+    def exists(self, path: "Union[str, Path]", **kwargs: Any) -> bool:
+        """Check if file exists."""
+        return self._resolve_path(path).exists()
+
+    def delete(self, path: "Union[str, Path]", **kwargs: Any) -> None:
+        """Delete file or directory."""
+        resolved = self._resolve_path(path)
+        if resolved.is_dir():
+            shutil.rmtree(resolved)
+        elif resolved.exists():
+            resolved.unlink()
+
+    def copy(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None:
+        """Copy file or directory."""
+        src = self._resolve_path(source)
+        dst = self._resolve_path(destination)
+        dst.parent.mkdir(parents=True, exist_ok=True)
+
+        if src.is_dir():
+            shutil.copytree(src, dst, dirs_exist_ok=True)
+        else:
+            shutil.copy2(src, dst)
+
+    def move(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None:
+        """Move file or directory."""
+        src = self._resolve_path(source)
+        dst = self._resolve_path(destination)
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(src), str(dst))
+
+    def glob(self, pattern: str, **kwargs: Any) -> list[str]:
+        """Find files matching pattern."""
+        # Handle both relative and absolute patterns
+        if Path(pattern).is_absolute():
+            base_path = Path(pattern).parent
+            pattern_name = Path(pattern).name
+            matches = base_path.rglob(pattern_name) if "**" in pattern else base_path.glob(pattern_name)
+        else:
+            matches = self.base_path.rglob(pattern) if "**" in pattern else self.base_path.glob(pattern)
+
+        results = []
+        for match in matches:
+            if match.is_file():
+                try:
+                    relative = match.relative_to(self.base_path)
+                    results.append(str(relative))
+                except ValueError:
+                    results.append(str(match))
+
+        return sorted(results)
+
+    def get_metadata(self, path: "Union[str, Path]", **kwargs: Any) -> dict[str, Any]:
+        """Get file metadata."""
+        resolved = self._resolve_path(path)
+        if not resolved.exists():
+            return {}
+
+        stat = resolved.stat()
+        return {
+            "size": stat.st_size,
+            "modified": stat.st_mtime,
+            "created": stat.st_ctime,
+            "is_file": resolved.is_file(),
+            "is_dir": resolved.is_dir(),
+            "path": str(resolved),
+        }
+
+    def is_object(self, path: "Union[str, Path]") -> bool:
+        """Check if path points to a file."""
+        return self._resolve_path(path).is_file()
+
+    def is_path(self, path: "Union[str, Path]") -> bool:
+        """Check if path points to a directory."""
+        return self._resolve_path(path).is_dir()
+
+    def read_arrow(self, path: "Union[str, Path]", **kwargs: Any) -> "ArrowTable":
+        """Read Arrow table from file."""
+        self._ensure_pyarrow()
+        import pyarrow.parquet as pq
+
+        return pq.read_table(str(self._resolve_path(path)))
+
+    def write_arrow(self, path: "Union[str, Path]", table: "ArrowTable", **kwargs: Any) -> None:
+        """Write Arrow table to file."""
+        self._ensure_pyarrow()
+        import pyarrow.parquet as pq
+
+        resolved = self._resolve_path(path)
+        resolved.parent.mkdir(parents=True, exist_ok=True)
+        pq.write_table(table, str(resolved))
+
+    def stream_arrow(self, pattern: str, **kwargs: Any) -> Iterator["ArrowRecordBatch"]:
+        """Stream Arrow record batches from files matching pattern.
+
+        Yields:
+            Arrow record batches from matching files.
+        """
+        if not PYARROW_INSTALLED:
+            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+        import pyarrow.parquet as pq
+
+        files = self.glob(pattern)
+        for file_path in files:
+            resolved = self._resolve_path(file_path)
+            parquet_file = pq.ParquetFile(str(resolved))
+            yield from parquet_file.iter_batches()
+
+    def sign(self, path: "Union[str, Path]", expires_in: int = 3600, for_upload: bool = False) -> str:
+        """Generate a signed URL (returns file:// URI for local files)."""
+        # For local files, just return a file:// URI
+        # No actual signing needed for local files
+        return self._resolve_path(path).as_uri()
+
+    # Async methods using sync_tools.async_
+    async def read_bytes_async(self, path: "Union[str, Path]", **kwargs: Any) -> bytes:
+        """Read bytes from file asynchronously."""
+        return await async_(self.read_bytes)(path, **kwargs)
+
+    async def write_bytes_async(self, path: "Union[str, Path]", data: bytes, **kwargs: Any) -> None:
+        """Write bytes to file asynchronously."""
+        await async_(self.write_bytes)(path, data, **kwargs)
+
+    async def read_text_async(self, path: "Union[str, Path]", encoding: str = "utf-8", **kwargs: Any) -> str:
+        """Read text from file asynchronously."""
+        return await async_(self.read_text)(path, encoding, **kwargs)
+
+    async def write_text_async(
+        self, path: "Union[str, Path]", data: str, encoding: str = "utf-8", **kwargs: Any
+    ) -> None:
+        """Write text to file asynchronously."""
+        await async_(self.write_text)(path, data, encoding, **kwargs)
+
+    async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
+        """List objects asynchronously."""
+        return await async_(self.list_objects)(prefix, recursive, **kwargs)
+
+    async def exists_async(self, path: "Union[str, Path]", **kwargs: Any) -> bool:
+        """Check if file exists asynchronously."""
+        return await async_(self.exists)(path, **kwargs)
+
+    async def delete_async(self, path: "Union[str, Path]", **kwargs: Any) -> None:
+        """Delete file asynchronously."""
+        await async_(self.delete)(path, **kwargs)
+
+    async def copy_async(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None:
+        """Copy file asynchronously."""
+        await async_(self.copy)(source, destination, **kwargs)
+
+    async def move_async(self, source: "Union[str, Path]", destination: "Union[str, Path]", **kwargs: Any) -> None:
+        """Move file asynchronously."""
+        await async_(self.move)(source, destination, **kwargs)
+
+    async def get_metadata_async(self, path: "Union[str, Path]", **kwargs: Any) -> dict[str, Any]:
+        """Get file metadata asynchronously."""
+        return await async_(self.get_metadata)(path, **kwargs)
+
+    async def read_arrow_async(self, path: "Union[str, Path]", **kwargs: Any) -> "ArrowTable":
+        """Read Arrow table asynchronously."""
+        return await async_(self.read_arrow)(path, **kwargs)
+
+    async def write_arrow_async(self, path: "Union[str, Path]", table: "ArrowTable", **kwargs: Any) -> None:
+        """Write Arrow table asynchronously."""
+        await async_(self.write_arrow)(path, table, **kwargs)
+
+    def stream_arrow_async(self, pattern: str, **kwargs: Any) -> AsyncIterator["ArrowRecordBatch"]:
+        """Stream Arrow record batches asynchronously."""
+
+        # Convert sync iterator to async
+        async def _stream() -> AsyncIterator["ArrowRecordBatch"]:
+            for batch in self.stream_arrow(pattern, **kwargs):
+                yield batch
+
+        return _stream()
+
+    async def sign_async(self, path: "Union[str, Path]", expires_in: int = 3600, for_upload: bool = False) -> str:
+        """Generate a signed URL asynchronously (returns file:// URI for local files)."""
+        return await async_(self.sign)(path, expires_in, for_upload)