sqlspec 0.14.1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Potentially problematic release. This version of sqlspec might be problematic.
- sqlspec/__init__.py +50 -25
- sqlspec/__main__.py +1 -1
- sqlspec/__metadata__.py +1 -3
- sqlspec/_serialization.py +1 -2
- sqlspec/_sql.py +256 -120
- sqlspec/_typing.py +278 -142
- sqlspec/adapters/adbc/__init__.py +4 -3
- sqlspec/adapters/adbc/_types.py +12 -0
- sqlspec/adapters/adbc/config.py +115 -260
- sqlspec/adapters/adbc/driver.py +462 -367
- sqlspec/adapters/aiosqlite/__init__.py +18 -3
- sqlspec/adapters/aiosqlite/_types.py +13 -0
- sqlspec/adapters/aiosqlite/config.py +199 -129
- sqlspec/adapters/aiosqlite/driver.py +230 -269
- sqlspec/adapters/asyncmy/__init__.py +18 -3
- sqlspec/adapters/asyncmy/_types.py +12 -0
- sqlspec/adapters/asyncmy/config.py +80 -168
- sqlspec/adapters/asyncmy/driver.py +260 -225
- sqlspec/adapters/asyncpg/__init__.py +19 -4
- sqlspec/adapters/asyncpg/_types.py +17 -0
- sqlspec/adapters/asyncpg/config.py +82 -181
- sqlspec/adapters/asyncpg/driver.py +285 -383
- sqlspec/adapters/bigquery/__init__.py +17 -3
- sqlspec/adapters/bigquery/_types.py +12 -0
- sqlspec/adapters/bigquery/config.py +191 -258
- sqlspec/adapters/bigquery/driver.py +474 -646
- sqlspec/adapters/duckdb/__init__.py +14 -3
- sqlspec/adapters/duckdb/_types.py +12 -0
- sqlspec/adapters/duckdb/config.py +415 -351
- sqlspec/adapters/duckdb/driver.py +343 -413
- sqlspec/adapters/oracledb/__init__.py +19 -5
- sqlspec/adapters/oracledb/_types.py +14 -0
- sqlspec/adapters/oracledb/config.py +123 -379
- sqlspec/adapters/oracledb/driver.py +507 -560
- sqlspec/adapters/psqlpy/__init__.py +13 -3
- sqlspec/adapters/psqlpy/_types.py +11 -0
- sqlspec/adapters/psqlpy/config.py +93 -254
- sqlspec/adapters/psqlpy/driver.py +505 -234
- sqlspec/adapters/psycopg/__init__.py +19 -5
- sqlspec/adapters/psycopg/_types.py +17 -0
- sqlspec/adapters/psycopg/config.py +143 -403
- sqlspec/adapters/psycopg/driver.py +706 -872
- sqlspec/adapters/sqlite/__init__.py +14 -3
- sqlspec/adapters/sqlite/_types.py +11 -0
- sqlspec/adapters/sqlite/config.py +202 -118
- sqlspec/adapters/sqlite/driver.py +264 -303
- sqlspec/base.py +105 -9
- sqlspec/{statement/builder → builder}/__init__.py +12 -14
- sqlspec/{statement/builder → builder}/_base.py +120 -55
- sqlspec/{statement/builder → builder}/_column.py +17 -6
- sqlspec/{statement/builder → builder}/_ddl.py +46 -79
- sqlspec/{statement/builder → builder}/_ddl_utils.py +5 -10
- sqlspec/{statement/builder → builder}/_delete.py +6 -25
- sqlspec/{statement/builder → builder}/_insert.py +6 -64
- sqlspec/builder/_merge.py +56 -0
- sqlspec/{statement/builder → builder}/_parsing_utils.py +3 -10
- sqlspec/{statement/builder → builder}/_select.py +11 -56
- sqlspec/{statement/builder → builder}/_update.py +12 -18
- sqlspec/{statement/builder → builder}/mixins/__init__.py +10 -14
- sqlspec/{statement/builder → builder}/mixins/_cte_and_set_ops.py +48 -59
- sqlspec/{statement/builder → builder}/mixins/_insert_operations.py +22 -16
- sqlspec/{statement/builder → builder}/mixins/_join_operations.py +1 -3
- sqlspec/{statement/builder → builder}/mixins/_merge_operations.py +3 -5
- sqlspec/{statement/builder → builder}/mixins/_order_limit_operations.py +3 -3
- sqlspec/{statement/builder → builder}/mixins/_pivot_operations.py +4 -8
- sqlspec/{statement/builder → builder}/mixins/_select_operations.py +21 -36
- sqlspec/{statement/builder → builder}/mixins/_update_operations.py +3 -14
- sqlspec/{statement/builder → builder}/mixins/_where_clause.py +52 -79
- sqlspec/cli.py +4 -5
- sqlspec/config.py +180 -133
- sqlspec/core/__init__.py +63 -0
- sqlspec/core/cache.py +873 -0
- sqlspec/core/compiler.py +396 -0
- sqlspec/core/filters.py +828 -0
- sqlspec/core/hashing.py +310 -0
- sqlspec/core/parameters.py +1209 -0
- sqlspec/core/result.py +664 -0
- sqlspec/{statement → core}/splitter.py +321 -191
- sqlspec/core/statement.py +651 -0
- sqlspec/driver/__init__.py +7 -10
- sqlspec/driver/_async.py +387 -176
- sqlspec/driver/_common.py +527 -289
- sqlspec/driver/_sync.py +390 -172
- sqlspec/driver/mixins/__init__.py +2 -19
- sqlspec/driver/mixins/_result_tools.py +168 -0
- sqlspec/driver/mixins/_sql_translator.py +6 -3
- sqlspec/exceptions.py +5 -252
- sqlspec/extensions/aiosql/adapter.py +93 -96
- sqlspec/extensions/litestar/config.py +0 -1
- sqlspec/extensions/litestar/handlers.py +15 -26
- sqlspec/extensions/litestar/plugin.py +16 -14
- sqlspec/extensions/litestar/providers.py +17 -52
- sqlspec/loader.py +424 -105
- sqlspec/migrations/__init__.py +12 -0
- sqlspec/migrations/base.py +92 -68
- sqlspec/migrations/commands.py +24 -106
- sqlspec/migrations/loaders.py +402 -0
- sqlspec/migrations/runner.py +49 -51
- sqlspec/migrations/tracker.py +31 -44
- sqlspec/migrations/utils.py +64 -24
- sqlspec/protocols.py +7 -183
- sqlspec/storage/__init__.py +1 -1
- sqlspec/storage/backends/base.py +37 -40
- sqlspec/storage/backends/fsspec.py +136 -112
- sqlspec/storage/backends/obstore.py +138 -160
- sqlspec/storage/capabilities.py +5 -4
- sqlspec/storage/registry.py +57 -106
- sqlspec/typing.py +136 -115
- sqlspec/utils/__init__.py +2 -3
- sqlspec/utils/correlation.py +0 -3
- sqlspec/utils/deprecation.py +6 -6
- sqlspec/utils/fixtures.py +6 -6
- sqlspec/utils/logging.py +0 -2
- sqlspec/utils/module_loader.py +7 -12
- sqlspec/utils/singleton.py +0 -1
- sqlspec/utils/sync_tools.py +16 -37
- sqlspec/utils/text.py +12 -51
- sqlspec/utils/type_guards.py +443 -232
- {sqlspec-0.14.1.dist-info → sqlspec-0.15.0.dist-info}/METADATA +7 -2
- sqlspec-0.15.0.dist-info/RECORD +134 -0
- sqlspec/adapters/adbc/transformers.py +0 -108
- sqlspec/driver/connection.py +0 -207
- sqlspec/driver/mixins/_cache.py +0 -114
- sqlspec/driver/mixins/_csv_writer.py +0 -91
- sqlspec/driver/mixins/_pipeline.py +0 -508
- sqlspec/driver/mixins/_query_tools.py +0 -796
- sqlspec/driver/mixins/_result_utils.py +0 -138
- sqlspec/driver/mixins/_storage.py +0 -912
- sqlspec/driver/mixins/_type_coercion.py +0 -128
- sqlspec/driver/parameters.py +0 -138
- sqlspec/statement/__init__.py +0 -21
- sqlspec/statement/builder/_merge.py +0 -95
- sqlspec/statement/cache.py +0 -50
- sqlspec/statement/filters.py +0 -625
- sqlspec/statement/parameters.py +0 -956
- sqlspec/statement/pipelines/__init__.py +0 -210
- sqlspec/statement/pipelines/analyzers/__init__.py +0 -9
- sqlspec/statement/pipelines/analyzers/_analyzer.py +0 -646
- sqlspec/statement/pipelines/context.py +0 -109
- sqlspec/statement/pipelines/transformers/__init__.py +0 -7
- sqlspec/statement/pipelines/transformers/_expression_simplifier.py +0 -88
- sqlspec/statement/pipelines/transformers/_literal_parameterizer.py +0 -1247
- sqlspec/statement/pipelines/transformers/_remove_comments_and_hints.py +0 -76
- sqlspec/statement/pipelines/validators/__init__.py +0 -23
- sqlspec/statement/pipelines/validators/_dml_safety.py +0 -290
- sqlspec/statement/pipelines/validators/_parameter_style.py +0 -370
- sqlspec/statement/pipelines/validators/_performance.py +0 -714
- sqlspec/statement/pipelines/validators/_security.py +0 -967
- sqlspec/statement/result.py +0 -435
- sqlspec/statement/sql.py +0 -1774
- sqlspec/utils/cached_property.py +0 -25
- sqlspec/utils/statement_hashing.py +0 -203
- sqlspec-0.14.1.dist-info/RECORD +0 -145
- /sqlspec/{statement/builder → builder}/mixins/_delete_operations.py +0 -0
- {sqlspec-0.14.1.dist-info → sqlspec-0.15.0.dist-info}/WHEEL +0 -0
- {sqlspec-0.14.1.dist-info → sqlspec-0.15.0.dist-info}/entry_points.txt +0 -0
- {sqlspec-0.14.1.dist-info → sqlspec-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {sqlspec-0.14.1.dist-info → sqlspec-0.15.0.dist-info}/licenses/NOTICE +0 -0
sqlspec/storage/backends/base.py
CHANGED

@@ -1,166 +1,163 @@
 """Base class for storage backends."""

-from __future__ import annotations
-
 from abc import ABC, abstractmethod
-from …
+from collections.abc import AsyncIterator, Iterator
+from typing import Any

-…
-from collections.abc import AsyncIterator, Iterator
+from mypy_extensions import mypyc_attr

-…
+from sqlspec.typing import ArrowRecordBatch, ArrowTable

 __all__ = ("ObjectStoreBase",)


+@mypyc_attr(allow_interpreted_subclasses=True)
 class ObjectStoreBase(ABC):
-    """Base class for …
+    """Base class for storage backends."""
+
+    __slots__ = ()

-    # Sync Operations
     @abstractmethod
     def read_bytes(self, path: str, **kwargs: Any) -> bytes:
-        """…
+        """Read bytes from storage."""
         raise NotImplementedError

     @abstractmethod
     def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:
-        """…
+        """Write bytes to storage."""
         raise NotImplementedError

     @abstractmethod
     def read_text(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
-        """…
+        """Read text from storage."""
         raise NotImplementedError

     @abstractmethod
     def write_text(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
-        """…
+        """Write text to storage."""
         raise NotImplementedError

     @abstractmethod
     def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
-        """…
+        """List objects in storage."""
         raise NotImplementedError

     @abstractmethod
     def exists(self, path: str, **kwargs: Any) -> bool:
-        """…
+        """Check if object exists in storage."""
         raise NotImplementedError

     @abstractmethod
     def delete(self, path: str, **kwargs: Any) -> None:
-        """…
+        """Delete object from storage."""
         raise NotImplementedError

     @abstractmethod
     def copy(self, source: str, destination: str, **kwargs: Any) -> None:
-        """…
+        """Copy object within storage."""
         raise NotImplementedError

     @abstractmethod
     def move(self, source: str, destination: str, **kwargs: Any) -> None:
-        """…
+        """Move object within storage."""
         raise NotImplementedError

     @abstractmethod
     def glob(self, pattern: str, **kwargs: Any) -> list[str]:
-        """…
+        """Find objects matching pattern."""
         raise NotImplementedError

     @abstractmethod
     def get_metadata(self, path: str, **kwargs: Any) -> dict[str, Any]:
-        """…
+        """Get object metadata from storage."""
         raise NotImplementedError

     @abstractmethod
     def is_object(self, path: str) -> bool:
-        """…
+        """Check if path points to an object."""
         raise NotImplementedError

     @abstractmethod
     def is_path(self, path: str) -> bool:
-        """…
+        """Check if path points to a directory."""
         raise NotImplementedError

     @abstractmethod
     def read_arrow(self, path: str, **kwargs: Any) -> ArrowTable:
-        """…
+        """Read Arrow table from storage."""
         raise NotImplementedError

     @abstractmethod
     def write_arrow(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
-        """…
+        """Write Arrow table to storage."""
         raise NotImplementedError

     @abstractmethod
     def stream_arrow(self, pattern: str, **kwargs: Any) -> Iterator[ArrowRecordBatch]:
-        """…
+        """Stream Arrow record batches from storage."""
         raise NotImplementedError

-    # Abstract async methods that subclasses must implement
-    # Backends can either provide native async implementations or wrap sync methods
-
     @abstractmethod
     async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:
-        """…
+        """Read bytes from storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:
-        """…
+        """Write bytes to storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def read_text_async(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
-        """…
+        """Read text from storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def write_text_async(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
-        """…
+        """Write text to storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
-        """…
+        """List objects in storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def exists_async(self, path: str, **kwargs: Any) -> bool:
-        """…
+        """Check if object exists in storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def delete_async(self, path: str, **kwargs: Any) -> None:
-        """…
+        """Delete object from storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def copy_async(self, source: str, destination: str, **kwargs: Any) -> None:
-        """…
+        """Copy object within storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def move_async(self, source: str, destination: str, **kwargs: Any) -> None:
-        """…
+        """Move object within storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def get_metadata_async(self, path: str, **kwargs: Any) -> dict[str, Any]:
-        """…
+        """Get object metadata from storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def read_arrow_async(self, path: str, **kwargs: Any) -> ArrowTable:
-        """…
+        """Read Arrow table from storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     async def write_arrow_async(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
-        """…
+        """Write Arrow table to storage asynchronously."""
         raise NotImplementedError

     @abstractmethod
     def stream_arrow_async(self, pattern: str, **kwargs: Any) -> AsyncIterator[ArrowRecordBatch]:
-        """…
+        """Stream Arrow record batches from storage asynchronously."""
         raise NotImplementedError
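The rewritten base class pairs every sync operation with an *_async counterpart that subclasses must implement as well; backends without native async support can delegate each async method to its sync twin on a worker thread, which is the pattern FSSpecBackend uses below via the async_ helper. A minimal sketch of that delegation, with asyncio.to_thread standing in for the helper and a hypothetical InMemoryBackend covering just two of the abstract method pairs:

import asyncio
from typing import Any

class InMemoryBackend:
    """Hypothetical backend sketch; a real subclass would derive from
    ObjectStoreBase and implement all of its abstract methods."""

    def __init__(self) -> None:
        self._objects: dict[str, bytes] = {}

    def read_bytes(self, path: str, **kwargs: Any) -> bytes:
        return self._objects[path]

    def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:
        self._objects[path] = data

    async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:
        # Delegate the (potentially blocking) sync call to a worker thread.
        return await asyncio.to_thread(self.read_bytes, path, **kwargs)

    async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:
        await asyncio.to_thread(self.write_bytes, path, data, **kwargs)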
sqlspec/storage/backends/fsspec.py
CHANGED

@@ -1,10 +1,8 @@
-# pyright: ignore=reportUnknownVariableType
 import logging
-from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union

-from sqlspec.exceptions import MissingDependencyError
+from sqlspec.exceptions import MissingDependencyError, StorageOperationFailedError
 from sqlspec.storage.backends.base import ObjectStoreBase
 from sqlspec.storage.capabilities import StorageCapabilities
 from sqlspec.typing import FSSPEC_INSTALLED, PYARROW_INSTALLED
@@ -21,34 +19,51 @@ __all__ = ("FSSpecBackend",)

 logger = logging.getLogger(__name__)

-# Constants for URI validation
-URI_PARTS_MIN_COUNT = 2
-"""Minimum number of parts in a valid cloud storage URI (bucket/path)."""

-…
-""…
+class _ArrowStreamer:
+    def __init__(self, backend: "FSSpecBackend", pattern: str, **kwargs: Any) -> None:
+        self.backend = backend
+        self.pattern = pattern
+        self.kwargs = kwargs
+        self.paths_iterator: Optional[Iterator[str]] = None
+        self.batch_iterator: Optional[Iterator[ArrowRecordBatch]] = None

-…
-…
+    def __aiter__(self) -> "_ArrowStreamer":
+        return self

+    async def _initialize(self) -> None:
+        """Initialize the paths iterator."""
+        if self.paths_iterator is None:
+            paths = await async_(self.backend.glob)(self.pattern, **self.kwargs)
+            self.paths_iterator = iter(paths)

-    def …
-    …
-    …
-    …
-    …
-    …
+    async def __anext__(self) -> "ArrowRecordBatch":
+        await self._initialize()
+
+        if self.batch_iterator:
+            try:
+                return next(self.batch_iterator)
+            except StopIteration:
+                self.batch_iterator = None
+
+        if self.paths_iterator:
+            try:
+                path = next(self.paths_iterator)
+                self.batch_iterator = await async_(self.backend._stream_file_batches)(path)
+                return await self.__anext__()
+            except StopIteration:
+                raise StopAsyncIteration
+        raise StopAsyncIteration


 class FSSpecBackend(ObjectStoreBase):
-    """…
+    """Storage backend using fsspec.

-    …
-    providing support for …
-    and …
+    Implements the ObjectStoreProtocol using fsspec,
+    providing support for various protocols including HTTP, HTTPS, FTP,
+    and cloud storage services.
     """

-    # FSSpec supports most operations but varies by underlying filesystem
     _default_capabilities: ClassVar[StorageCapabilities] = StorageCapabilities(
         supports_arrow=PYARROW_INSTALLED,
         supports_streaming=PYARROW_INSTALLED,
@@ -75,7 +90,6 @@ class FSSpecBackend(ObjectStoreBase):
         self.protocol = getattr(fs, "protocol", "unknown")
         self._fs_uri = f"{self.protocol}://"

-        # Set instance-level capabilities based on detected protocol
         self._instance_capabilities = self._detect_capabilities()

         super().__init__()
@@ -145,17 +159,24 @@ class FSSpecBackend(ObjectStoreBase):
     def base_uri(self) -> str:
         return self._fs_uri

-    # Core Operations (sync)
     def read_bytes(self, path: Union[str, Path], **kwargs: Any) -> bytes:
         """Read bytes from an object."""
-        …
-        …
+        try:
+            resolved_path = self._resolve_path(path)
+            return self.fs.cat(resolved_path, **kwargs)  # type: ignore[no-any-return]  # pyright: ignore
+        except Exception as exc:
+            msg = f"Failed to read bytes from {path}"
+            raise StorageOperationFailedError(msg) from exc

     def write_bytes(self, path: Union[str, Path], data: bytes, **kwargs: Any) -> None:
         """Write bytes to an object."""
-        …
-        …
-        …
+        try:
+            resolved_path = self._resolve_path(path)
+            with self.fs.open(resolved_path, mode="wb", **kwargs) as f:
+                f.write(data)  # pyright: ignore
+        except Exception as exc:
+            msg = f"Failed to write bytes to {path}"
+            raise StorageOperationFailedError(msg) from exc

     def read_text(self, path: Union[str, Path], encoding: str = "utf-8", **kwargs: Any) -> str:
         """Read text from an object."""
@@ -166,7 +187,6 @@ class FSSpecBackend(ObjectStoreBase):
         """Write text to an object."""
         self.write_bytes(path, data.encode(encoding), **kwargs)

-    # Object Operations
     def exists(self, path: Union[str, Path], **kwargs: Any) -> bool:
         """Check if an object exists."""
         resolved_path = self._resolve_path(path)
@@ -174,66 +194,81 @@ class FSSpecBackend(ObjectStoreBase):

     def delete(self, path: Union[str, Path], **kwargs: Any) -> None:
         """Delete an object."""
-        …
-        …
+        try:
+            resolved_path = self._resolve_path(path)
+            self.fs.rm(resolved_path, **kwargs)
+        except Exception as exc:
+            msg = f"Failed to delete {path}"
+            raise StorageOperationFailedError(msg) from exc

     def copy(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
         """Copy an object."""
-        …
-        …
-        …
+        try:
+            source_path = self._resolve_path(source)
+            dest_path = self._resolve_path(destination)
+            self.fs.copy(source_path, dest_path, **kwargs)
+        except Exception as exc:
+            msg = f"Failed to copy {source} to {destination}"
+            raise StorageOperationFailedError(msg) from exc

     def move(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
         """Move an object."""
-        …
-        …
-        …
+        try:
+            source_path = self._resolve_path(source)
+            dest_path = self._resolve_path(destination)
+            self.fs.mv(source_path, dest_path, **kwargs)
+        except Exception as exc:
+            msg = f"Failed to move {source} to {destination}"
+            raise StorageOperationFailedError(msg) from exc

-    # Arrow Operations
     def read_arrow(self, path: Union[str, Path], **kwargs: Any) -> "ArrowTable":
         """Read an Arrow table from storage."""
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+        try:
+            import pyarrow.parquet as pq

-        …
-        …
-        …
-        …
-        …
+            resolved_path = self._resolve_path(path)
+            with self.fs.open(resolved_path, mode="rb", **kwargs) as f:
+                return pq.read_table(f)
+        except Exception as exc:
+            msg = f"Failed to read Arrow table from {path}"
+            raise StorageOperationFailedError(msg) from exc

     def write_arrow(self, path: Union[str, Path], table: "ArrowTable", **kwargs: Any) -> None:
         """Write an Arrow table to storage."""
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")
+        try:
+            import pyarrow.parquet as pq

-        …
-        …
-        …
-        …
-        …
+            resolved_path = self._resolve_path(path)
+            with self.fs.open(resolved_path, mode="wb") as f:
+                pq.write_table(table, f, **kwargs)  # pyright: ignore
+        except Exception as exc:
+            msg = f"Failed to write Arrow table to {path}"
+            raise StorageOperationFailedError(msg) from exc

-    # Listing Operations
     def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
         """List objects with optional prefix."""
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        paths = [str(path) for path in self.fs.glob(pattern, **kwargs) if not self.fs.isdir(path)]
-        return sorted(paths)
+        try:
+            resolved_prefix = self._resolve_path(prefix)
+            if recursive:
+                return sorted(self.fs.find(resolved_prefix, **kwargs))
+            return sorted(self.fs.ls(resolved_prefix, detail=False, **kwargs))
+        except Exception as exc:
+            msg = f"Failed to list objects with prefix '{prefix}'"
+            raise StorageOperationFailedError(msg) from exc

     def glob(self, pattern: str, **kwargs: Any) -> list[str]:
         """Find objects matching a glob pattern."""
-        …
-        …
-        …
-        …
+        try:
+            resolved_pattern = self._resolve_path(pattern)
+            return sorted(self.fs.glob(resolved_pattern, **kwargs))  # pyright: ignore
+        except Exception as exc:
+            msg = f"Failed to glob with pattern '{pattern}'"
+            raise StorageOperationFailedError(msg) from exc

-    # Path Operations
     def is_object(self, path: str) -> bool:
         """Check if path points to an object."""
         resolved_path = self._resolve_path(path)
@@ -246,23 +281,29 @@ class FSSpecBackend(ObjectStoreBase):

     def get_metadata(self, path: Union[str, Path], **kwargs: Any) -> dict[str, Any]:
         """Get object metadata."""
-        info = self.fs.info(self._resolve_path(path), **kwargs)
-
-        if isinstance(info, dict):
-            return info
-
-        # Try to get dict representation
         try:
-            …
-            …
-            …
-            …
-            …
+            resolved_path = self._resolve_path(path)
+            info = self.fs.info(resolved_path, **kwargs)
+            if isinstance(info, dict):
+                return {
+                    "path": resolved_path,
+                    "exists": True,
+                    "size": info.get("size"),
+                    "last_modified": info.get("mtime"),
+                    "type": info.get("type", "file"),
+                }
+
+        except FileNotFoundError:
+            return {"path": self._resolve_path(path), "exists": False}
+        except Exception as exc:
+            msg = f"Failed to get metadata for {path}"
+            raise StorageOperationFailedError(msg) from exc
         return {
             "path": resolved_path,
-            "exists": …
-            "size": …
-            "…
+            "exists": True,
+            "size": info.size,
+            "last_modified": info.mtime,
+            "type": info.type,
         }

     def _stream_file_batches(self, obj_path: Union[str, Path]) -> "Iterator[ArrowRecordBatch]":
@@ -278,85 +319,68 @@ class FSSpecBackend(ObjectStoreBase):
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

-        # Stream each file as record batches
         for obj_path in self.glob(pattern, **kwargs):
             yield from self._stream_file_batches(obj_path)

     async def read_bytes_async(self, path: Union[str, Path], **kwargs: Any) -> bytes:
-        """…
+        """Read bytes from storage asynchronously."""
         return await async_(self.read_bytes)(path, **kwargs)

     async def write_bytes_async(self, path: Union[str, Path], data: bytes, **kwargs: Any) -> None:
-        """…
+        """Write bytes to storage asynchronously."""
         return await async_(self.write_bytes)(path, data, **kwargs)

-    …
-    …
-    …
-        data = await self.read_bytes_async(obj_path)
-        parquet_file = pq.ParquetFile(BytesIO(data))
-        for batch in parquet_file.iter_batches():
-            yield batch
-
-    async def stream_arrow_async(self, pattern: str, **kwargs: Any) -> "AsyncIterator[ArrowRecordBatch]":
-        """Async stream Arrow record batches.
-
-        This implementation provides file-level async streaming. Each file is
-        read into memory before its batches are processed.
+    def stream_arrow_async(self, pattern: str, **kwargs: Any) -> "AsyncIterator[ArrowRecordBatch]":
+        """Stream Arrow record batches from storage asynchronously.

         Args:
             pattern: The glob pattern to match.
             **kwargs: Additional arguments to pass to the glob method.

-        …
+        Returns:
             AsyncIterator of Arrow record batches
         """
         if not PYARROW_INSTALLED:
             raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

-        …
-        …
-        # Stream batches from each path
-        for path in paths:
-            async for batch in self._stream_file_batches_async(path):
-                yield batch
+        return _ArrowStreamer(self, pattern, **kwargs)

     async def read_text_async(self, path: Union[str, Path], encoding: str = "utf-8", **kwargs: Any) -> str:
-        """…
+        """Read text from storage asynchronously."""
         return await async_(self.read_text)(path, encoding, **kwargs)

     async def write_text_async(self, path: Union[str, Path], data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
-        """…
+        """Write text to storage asynchronously."""
         await async_(self.write_text)(path, data, encoding, **kwargs)

     async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
-        """…
+        """List objects in storage asynchronously."""
         return await async_(self.list_objects)(prefix, recursive, **kwargs)

     async def exists_async(self, path: Union[str, Path], **kwargs: Any) -> bool:
-        """…
+        """Check if object exists in storage asynchronously."""
         return await async_(self.exists)(path, **kwargs)

     async def delete_async(self, path: Union[str, Path], **kwargs: Any) -> None:
-        """…
+        """Delete object from storage asynchronously."""
         await async_(self.delete)(path, **kwargs)

     async def copy_async(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
-        """…
+        """Copy object in storage asynchronously."""
         await async_(self.copy)(source, destination, **kwargs)

     async def move_async(self, source: Union[str, Path], destination: Union[str, Path], **kwargs: Any) -> None:
-        """…
+        """Move object in storage asynchronously."""
         await async_(self.move)(source, destination, **kwargs)

     async def get_metadata_async(self, path: Union[str, Path], **kwargs: Any) -> dict[str, Any]:
-        """…
+        """Get object metadata from storage asynchronously."""
         return await async_(self.get_metadata)(path, **kwargs)

     async def read_arrow_async(self, path: Union[str, Path], **kwargs: Any) -> "ArrowTable":
-        """…
+        """Read Arrow table from storage asynchronously."""
         return await async_(self.read_arrow)(path, **kwargs)

     async def write_arrow_async(self, path: Union[str, Path], table: "ArrowTable", **kwargs: Any) -> None:
-        """…
+        """Write Arrow table to storage asynchronously."""
         await async_(self.write_arrow)(path, table, **kwargs)
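Note that stream_arrow_async is now a plain def returning an async-iterable _ArrowStreamer rather than an async generator, so callers iterate the result directly without awaiting the call itself; underlying filesystem failures still surface as StorageOperationFailedError through the wrapped glob and read calls. A usage sketch, with the backend object and pattern purely illustrative:

import asyncio

async def count_rows(backend, pattern: str) -> int:
    """Hypothetical consumer of the streaming API shown above."""
    total = 0
    # The call returns an async iterator immediately; record batches are
    # produced lazily, one matching parquet file at a time.
    async for batch in backend.stream_arrow_async(pattern):
        total += batch.num_rows  # each item is a pyarrow RecordBatch
    return total

# e.g. asyncio.run(count_rows(backend, "data/*.parquet"))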