sqlspec 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sqlspec might be problematic; see the registry's advisory page for more details.
- sqlspec/__init__.py +16 -3
- sqlspec/_serialization.py +3 -10
- sqlspec/_sql.py +1147 -0
- sqlspec/_typing.py +343 -41
- sqlspec/adapters/adbc/__init__.py +2 -6
- sqlspec/adapters/adbc/config.py +474 -149
- sqlspec/adapters/adbc/driver.py +330 -621
- sqlspec/adapters/aiosqlite/__init__.py +2 -6
- sqlspec/adapters/aiosqlite/config.py +143 -57
- sqlspec/adapters/aiosqlite/driver.py +269 -431
- sqlspec/adapters/asyncmy/__init__.py +3 -8
- sqlspec/adapters/asyncmy/config.py +247 -202
- sqlspec/adapters/asyncmy/driver.py +218 -436
- sqlspec/adapters/asyncpg/__init__.py +4 -7
- sqlspec/adapters/asyncpg/config.py +329 -176
- sqlspec/adapters/asyncpg/driver.py +417 -487
- sqlspec/adapters/bigquery/__init__.py +2 -2
- sqlspec/adapters/bigquery/config.py +407 -0
- sqlspec/adapters/bigquery/driver.py +600 -553
- sqlspec/adapters/duckdb/__init__.py +4 -1
- sqlspec/adapters/duckdb/config.py +432 -321
- sqlspec/adapters/duckdb/driver.py +392 -406
- sqlspec/adapters/oracledb/__init__.py +3 -8
- sqlspec/adapters/oracledb/config.py +625 -0
- sqlspec/adapters/oracledb/driver.py +548 -921
- sqlspec/adapters/psqlpy/__init__.py +4 -7
- sqlspec/adapters/psqlpy/config.py +372 -203
- sqlspec/adapters/psqlpy/driver.py +197 -533
- sqlspec/adapters/psycopg/__init__.py +3 -8
- sqlspec/adapters/psycopg/config.py +741 -0
- sqlspec/adapters/psycopg/driver.py +734 -694
- sqlspec/adapters/sqlite/__init__.py +2 -6
- sqlspec/adapters/sqlite/config.py +146 -81
- sqlspec/adapters/sqlite/driver.py +242 -405
- sqlspec/base.py +220 -784
- sqlspec/config.py +354 -0
- sqlspec/driver/__init__.py +22 -0
- sqlspec/driver/_async.py +252 -0
- sqlspec/driver/_common.py +338 -0
- sqlspec/driver/_sync.py +261 -0
- sqlspec/driver/mixins/__init__.py +17 -0
- sqlspec/driver/mixins/_pipeline.py +523 -0
- sqlspec/driver/mixins/_result_utils.py +122 -0
- sqlspec/driver/mixins/_sql_translator.py +35 -0
- sqlspec/driver/mixins/_storage.py +993 -0
- sqlspec/driver/mixins/_type_coercion.py +131 -0
- sqlspec/exceptions.py +299 -7
- sqlspec/extensions/aiosql/__init__.py +10 -0
- sqlspec/extensions/aiosql/adapter.py +474 -0
- sqlspec/extensions/litestar/__init__.py +1 -6
- sqlspec/extensions/litestar/_utils.py +1 -5
- sqlspec/extensions/litestar/config.py +5 -6
- sqlspec/extensions/litestar/handlers.py +13 -12
- sqlspec/extensions/litestar/plugin.py +22 -24
- sqlspec/extensions/litestar/providers.py +37 -55
- sqlspec/loader.py +528 -0
- sqlspec/service/__init__.py +3 -0
- sqlspec/service/base.py +24 -0
- sqlspec/service/pagination.py +26 -0
- sqlspec/statement/__init__.py +21 -0
- sqlspec/statement/builder/__init__.py +54 -0
- sqlspec/statement/builder/_ddl_utils.py +119 -0
- sqlspec/statement/builder/_parsing_utils.py +135 -0
- sqlspec/statement/builder/base.py +328 -0
- sqlspec/statement/builder/ddl.py +1379 -0
- sqlspec/statement/builder/delete.py +80 -0
- sqlspec/statement/builder/insert.py +274 -0
- sqlspec/statement/builder/merge.py +95 -0
- sqlspec/statement/builder/mixins/__init__.py +65 -0
- sqlspec/statement/builder/mixins/_aggregate_functions.py +151 -0
- sqlspec/statement/builder/mixins/_case_builder.py +91 -0
- sqlspec/statement/builder/mixins/_common_table_expr.py +91 -0
- sqlspec/statement/builder/mixins/_delete_from.py +34 -0
- sqlspec/statement/builder/mixins/_from.py +61 -0
- sqlspec/statement/builder/mixins/_group_by.py +119 -0
- sqlspec/statement/builder/mixins/_having.py +35 -0
- sqlspec/statement/builder/mixins/_insert_from_select.py +48 -0
- sqlspec/statement/builder/mixins/_insert_into.py +36 -0
- sqlspec/statement/builder/mixins/_insert_values.py +69 -0
- sqlspec/statement/builder/mixins/_join.py +110 -0
- sqlspec/statement/builder/mixins/_limit_offset.py +53 -0
- sqlspec/statement/builder/mixins/_merge_clauses.py +405 -0
- sqlspec/statement/builder/mixins/_order_by.py +46 -0
- sqlspec/statement/builder/mixins/_pivot.py +82 -0
- sqlspec/statement/builder/mixins/_returning.py +37 -0
- sqlspec/statement/builder/mixins/_select_columns.py +60 -0
- sqlspec/statement/builder/mixins/_set_ops.py +122 -0
- sqlspec/statement/builder/mixins/_unpivot.py +80 -0
- sqlspec/statement/builder/mixins/_update_from.py +54 -0
- sqlspec/statement/builder/mixins/_update_set.py +91 -0
- sqlspec/statement/builder/mixins/_update_table.py +29 -0
- sqlspec/statement/builder/mixins/_where.py +374 -0
- sqlspec/statement/builder/mixins/_window_functions.py +86 -0
- sqlspec/statement/builder/protocols.py +20 -0
- sqlspec/statement/builder/select.py +206 -0
- sqlspec/statement/builder/update.py +178 -0
- sqlspec/statement/filters.py +571 -0
- sqlspec/statement/parameters.py +736 -0
- sqlspec/statement/pipelines/__init__.py +67 -0
- sqlspec/statement/pipelines/analyzers/__init__.py +9 -0
- sqlspec/statement/pipelines/analyzers/_analyzer.py +649 -0
- sqlspec/statement/pipelines/base.py +315 -0
- sqlspec/statement/pipelines/context.py +119 -0
- sqlspec/statement/pipelines/result_types.py +41 -0
- sqlspec/statement/pipelines/transformers/__init__.py +8 -0
- sqlspec/statement/pipelines/transformers/_expression_simplifier.py +256 -0
- sqlspec/statement/pipelines/transformers/_literal_parameterizer.py +623 -0
- sqlspec/statement/pipelines/transformers/_remove_comments.py +66 -0
- sqlspec/statement/pipelines/transformers/_remove_hints.py +81 -0
- sqlspec/statement/pipelines/validators/__init__.py +23 -0
- sqlspec/statement/pipelines/validators/_dml_safety.py +275 -0
- sqlspec/statement/pipelines/validators/_parameter_style.py +297 -0
- sqlspec/statement/pipelines/validators/_performance.py +703 -0
- sqlspec/statement/pipelines/validators/_security.py +990 -0
- sqlspec/statement/pipelines/validators/base.py +67 -0
- sqlspec/statement/result.py +527 -0
- sqlspec/statement/splitter.py +701 -0
- sqlspec/statement/sql.py +1198 -0
- sqlspec/storage/__init__.py +15 -0
- sqlspec/storage/backends/__init__.py +0 -0
- sqlspec/storage/backends/base.py +166 -0
- sqlspec/storage/backends/fsspec.py +315 -0
- sqlspec/storage/backends/obstore.py +464 -0
- sqlspec/storage/protocol.py +170 -0
- sqlspec/storage/registry.py +315 -0
- sqlspec/typing.py +157 -36
- sqlspec/utils/correlation.py +155 -0
- sqlspec/utils/deprecation.py +3 -6
- sqlspec/utils/fixtures.py +6 -11
- sqlspec/utils/logging.py +135 -0
- sqlspec/utils/module_loader.py +45 -43
- sqlspec/utils/serializers.py +4 -0
- sqlspec/utils/singleton.py +6 -8
- sqlspec/utils/sync_tools.py +15 -27
- sqlspec/utils/text.py +58 -26
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/METADATA +97 -26
- sqlspec-0.12.0.dist-info/RECORD +145 -0
- sqlspec/adapters/bigquery/config/__init__.py +0 -3
- sqlspec/adapters/bigquery/config/_common.py +0 -40
- sqlspec/adapters/bigquery/config/_sync.py +0 -87
- sqlspec/adapters/oracledb/config/__init__.py +0 -9
- sqlspec/adapters/oracledb/config/_asyncio.py +0 -186
- sqlspec/adapters/oracledb/config/_common.py +0 -131
- sqlspec/adapters/oracledb/config/_sync.py +0 -186
- sqlspec/adapters/psycopg/config/__init__.py +0 -19
- sqlspec/adapters/psycopg/config/_async.py +0 -169
- sqlspec/adapters/psycopg/config/_common.py +0 -56
- sqlspec/adapters/psycopg/config/_sync.py +0 -168
- sqlspec/filters.py +0 -331
- sqlspec/mixins.py +0 -305
- sqlspec/statement.py +0 -378
- sqlspec-0.11.1.dist-info/RECORD +0 -69
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/WHEEL +0 -0
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {sqlspec-0.11.1.dist-info → sqlspec-0.12.0.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Storage abstraction layer for SQLSpec.
|
|
2
|
+
|
|
3
|
+
This module provides a flexible storage system with:
|
|
4
|
+
- Multiple backend support (local, fsspec, obstore)
|
|
5
|
+
- Lazy loading and configuration-based registration
|
|
6
|
+
- URI scheme-based automatic backend resolution
|
|
7
|
+
- Key-based named storage configurations
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from sqlspec.storage.protocol import ObjectStoreProtocol
|
|
11
|
+
from sqlspec.storage.registry import StorageRegistry
|
|
12
|
+
|
|
13
|
+
storage_registry = StorageRegistry()
|
|
14
|
+
|
|
15
|
+
__all__ = ("ObjectStoreProtocol", "StorageRegistry", "storage_registry")
|
|
File without changes
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Base class for storage backends."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from collections.abc import AsyncIterator, Iterator
|
|
10
|
+
|
|
11
|
+
from sqlspec.typing import ArrowRecordBatch, ArrowTable
|
|
12
|
+
|
|
13
|
+
__all__ = ("ObjectStoreBase",)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ObjectStoreBase(ABC):
    """Base class for instrumented storage backends.

    Declares the full storage contract a backend must provide: byte/text/Arrow
    reads and writes, object management (exists/delete/copy/move), listing and
    globbing — each with both a synchronous method and an ``*_async``
    counterpart. Backends can either provide native async implementations or
    wrap their sync methods.
    """

    # Sync Operations
    @abstractmethod
    def read_bytes(self, path: str, **kwargs: Any) -> bytes:
        """Actual implementation of read_bytes in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Actual implementation of write_bytes in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def read_text(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Actual implementation of read_text in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def write_text(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Actual implementation of write_text in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """Actual implementation of list_objects in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def exists(self, path: str, **kwargs: Any) -> bool:
        """Actual implementation of exists in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def delete(self, path: str, **kwargs: Any) -> None:
        """Actual implementation of delete in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def copy(self, source: str, destination: str, **kwargs: Any) -> None:
        """Actual implementation of copy in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def move(self, source: str, destination: str, **kwargs: Any) -> None:
        """Actual implementation of move in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def glob(self, pattern: str, **kwargs: Any) -> list[str]:
        """Actual implementation of glob in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def get_metadata(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Actual implementation of get_metadata in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def is_object(self, path: str) -> bool:
        """Actual implementation of is_object in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def is_path(self, path: str) -> bool:
        """Actual implementation of is_path in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def read_arrow(self, path: str, **kwargs: Any) -> ArrowTable:
        """Actual implementation of read_arrow in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def write_arrow(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
        """Actual implementation of write_arrow in subclasses."""
        raise NotImplementedError

    @abstractmethod
    def stream_arrow(self, pattern: str, **kwargs: Any) -> Iterator[ArrowRecordBatch]:
        """Actual implementation of stream_arrow in subclasses."""
        raise NotImplementedError

    # Abstract async methods that subclasses must implement
    # Backends can either provide native async implementations or wrap sync methods

    @abstractmethod
    async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:
        """Actual async implementation of read_bytes in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Actual async implementation of write_bytes in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def read_text_async(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Actual async implementation of read_text in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def write_text_async(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Actual async implementation of write_text in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """Actual async implementation of list_objects in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def exists_async(self, path: str, **kwargs: Any) -> bool:
        """Actual async implementation of exists in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def delete_async(self, path: str, **kwargs: Any) -> None:
        """Actual async implementation of delete in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def copy_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Actual async implementation of copy in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def move_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Actual async implementation of move in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def get_metadata_async(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Actual async implementation of get_metadata in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def read_arrow_async(self, path: str, **kwargs: Any) -> ArrowTable:
        """Actual async implementation of read_arrow in subclasses."""
        raise NotImplementedError

    @abstractmethod
    async def write_arrow_async(self, path: str, table: ArrowTable, **kwargs: Any) -> None:
        """Actual async implementation of write_arrow in subclasses."""
        raise NotImplementedError

    # Declared as a plain ``def`` (not ``async def``): implementations are
    # expected to return an AsyncIterator (e.g. an async generator) directly.
    @abstractmethod
    def stream_arrow_async(self, pattern: str, **kwargs: Any) -> AsyncIterator[ArrowRecordBatch]:
        """Actual async implementation of stream_arrow in subclasses."""
        raise NotImplementedError
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# pyright: reportUnknownVariableType=false
|
|
2
|
+
import logging
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Union
|
|
5
|
+
|
|
6
|
+
from sqlspec.exceptions import MissingDependencyError
|
|
7
|
+
from sqlspec.storage.backends.base import ObjectStoreBase
|
|
8
|
+
from sqlspec.typing import FSSPEC_INSTALLED, PYARROW_INSTALLED
|
|
9
|
+
from sqlspec.utils.sync_tools import async_
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import AsyncIterator, Iterator
|
|
13
|
+
|
|
14
|
+
from fsspec import AbstractFileSystem
|
|
15
|
+
|
|
16
|
+
from sqlspec.typing import ArrowRecordBatch, ArrowTable
|
|
17
|
+
|
|
18
|
+
__all__ = ("FSSpecBackend",)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Constants for URI validation
# NOTE(review): none of these constants are referenced in this module's
# visible code — confirm they are used elsewhere before removing.
URI_PARTS_MIN_COUNT = 2
"""Minimum number of parts in a valid cloud storage URI (bucket/path)."""

AZURE_URI_PARTS_MIN_COUNT = 2
"""Minimum number of parts in an Azure URI (account/container)."""

AZURE_URI_BLOB_INDEX = 2
"""Index of blob name in Azure URI parts."""
32
|
+
|
|
33
|
+
def _join_path(prefix: str, path: str) -> str:
|
|
34
|
+
if not prefix:
|
|
35
|
+
return path
|
|
36
|
+
prefix = prefix.rstrip("/")
|
|
37
|
+
path = path.lstrip("/")
|
|
38
|
+
return f"{prefix}/{path}"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FSSpecBackend(ObjectStoreBase):
    """Extended protocol support via fsspec.

    This backend implements the ObjectStoreProtocol using fsspec,
    providing support for extended protocols not covered by obstore
    and offering fallback capabilities.
    """

    def __init__(self, fs: "Union[str, AbstractFileSystem]", base_path: str = "") -> None:
        """Create a backend around an fsspec filesystem.

        Args:
            fs: Either an ``AbstractFileSystem`` instance, or a URI string
                whose scheme (the part before ``://``) selects the filesystem.
            base_path: Optional prefix prepended to every path by
                ``_resolve_path``; a trailing slash is stripped.

        Raises:
            MissingDependencyError: If fsspec is not installed.
        """
        if not FSSPEC_INSTALLED:
            raise MissingDependencyError(package="fsspec", install_package="fsspec")

        self.base_path = base_path.rstrip("/") if base_path else ""

        if isinstance(fs, str):
            import fsspec

            # Only the scheme matters for instantiation; the full URI is kept
            # so base_uri can report what the backend was created from.
            self.fs = fsspec.filesystem(fs.split("://")[0])
            self.protocol = fs.split("://")[0]
            self._fs_uri = fs
        else:
            self.fs = fs
            self.protocol = getattr(fs, "protocol", "unknown")
            self._fs_uri = f"{self.protocol}://"
        super().__init__()

    @classmethod
    def from_config(cls, config: "dict[str, Any]") -> "FSSpecBackend":
        """Build a backend from a config dict.

        Expects a required ``protocol`` key plus optional ``fs_config``
        (kwargs for ``fsspec.filesystem``) and ``base_path`` keys.
        """
        protocol = config["protocol"]
        fs_config = config.get("fs_config", {})
        base_path = config.get("base_path", "")

        # Create filesystem instance from protocol
        import fsspec

        fs_instance = fsspec.filesystem(protocol, **fs_config)

        return cls(fs=fs_instance, base_path=base_path)

    def _resolve_path(self, path: str) -> str:
        """Resolve path relative to base_path."""
        if self.base_path:
            # Ensure no double slashes
            clean_base = self.base_path.rstrip("/")
            clean_path = path.lstrip("/")
            return f"{clean_base}/{clean_path}"
        return path

    @property
    def backend_type(self) -> str:
        """Identifier for this backend implementation."""
        return "fsspec"

    @property
    def base_uri(self) -> str:
        """URI (or ``protocol://`` stub) this backend was constructed from."""
        return self._fs_uri

    # Core Operations (sync)
    def read_bytes(self, path: str, **kwargs: Any) -> bytes:
        """Read bytes from an object."""
        resolved_path = self._resolve_path(path)
        return self.fs.cat(resolved_path, **kwargs)  # type: ignore[no-any-return] # pyright: ignore

    def write_bytes(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Write bytes to an object."""
        resolved_path = self._resolve_path(path)
        with self.fs.open(resolved_path, mode="wb", **kwargs) as f:
            f.write(data)  # pyright: ignore

    def read_text(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Read text from an object."""
        data = self.read_bytes(path, **kwargs)
        return data.decode(encoding)

    def write_text(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Write text to an object."""
        self.write_bytes(path, data.encode(encoding), **kwargs)

    # Object Operations
    def exists(self, path: str, **kwargs: Any) -> bool:
        """Check if an object exists."""
        resolved_path = self._resolve_path(path)
        return self.fs.exists(resolved_path, **kwargs)  # type: ignore[no-any-return]

    def delete(self, path: str, **kwargs: Any) -> None:
        """Delete an object."""
        resolved_path = self._resolve_path(path)
        self.fs.rm(resolved_path, **kwargs)

    def copy(self, source: str, destination: str, **kwargs: Any) -> None:
        """Copy an object."""
        source_path = self._resolve_path(source)
        dest_path = self._resolve_path(destination)
        self.fs.copy(source_path, dest_path, **kwargs)

    def move(self, source: str, destination: str, **kwargs: Any) -> None:
        """Move an object."""
        source_path = self._resolve_path(source)
        dest_path = self._resolve_path(destination)
        self.fs.mv(source_path, dest_path, **kwargs)

    # Arrow Operations
    def read_arrow(self, path: str, **kwargs: Any) -> "ArrowTable":
        """Read an Arrow table from storage."""
        if not PYARROW_INSTALLED:
            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

        import pyarrow.parquet as pq

        resolved_path = self._resolve_path(path)
        with self.fs.open(resolved_path, mode="rb", **kwargs) as f:
            return pq.read_table(f)

    def write_arrow(self, path: str, table: "ArrowTable", **kwargs: Any) -> None:
        """Write an Arrow table to storage."""
        if not PYARROW_INSTALLED:
            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

        import pyarrow.parquet as pq

        resolved_path = self._resolve_path(path)
        # kwargs are forwarded to pq.write_table here (not to fs.open, unlike
        # read_arrow, which forwards them to fs.open).
        with self.fs.open(resolved_path, mode="wb") as f:
            pq.write_table(table, f, **kwargs)  # pyright: ignore

    # Listing Operations
    def list_objects(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """List objects with optional prefix."""
        resolved_prefix = self._resolve_path(prefix) if prefix else self.base_path

        # Use fs.glob for listing files
        if recursive:
            pattern = f"{resolved_prefix}/**" if resolved_prefix else "**"
        else:
            pattern = f"{resolved_prefix}/*" if resolved_prefix else "*"

        # Get all files (not directories)
        paths = [str(path) for path in self.fs.glob(pattern, **kwargs) if not self.fs.isdir(path)]
        return sorted(paths)

    def glob(self, pattern: str, **kwargs: Any) -> list[str]:
        """Find objects matching a glob pattern."""
        resolved_pattern = self._resolve_path(pattern)
        # Use fsspec's native glob
        paths = [str(path) for path in self.fs.glob(resolved_pattern, **kwargs) if not self.fs.isdir(path)]
        return sorted(paths)

    # Path Operations
    def is_object(self, path: str) -> bool:
        """Check if path points to an object."""
        resolved_path = self._resolve_path(path)
        return self.fs.exists(resolved_path) and not self.fs.isdir(resolved_path)

    def is_path(self, path: str) -> bool:
        """Check if path points to a prefix (directory-like)."""
        resolved_path = self._resolve_path(path)
        return self.fs.isdir(resolved_path)  # type: ignore[no-any-return]

    def get_metadata(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Get object metadata."""
        info = self.fs.info(self._resolve_path(path), **kwargs)

        # Convert fsspec info to dict
        if isinstance(info, dict):
            return info

        # Try to get dict representation
        # NOTE(review): vars() raises TypeError (not AttributeError) for
        # objects without __dict__ — confirm the intended fallback path.
        try:
            return vars(info)  # type: ignore[no-any-return]
        except AttributeError:
            pass

        # Fallback to basic metadata with safe attribute access
        resolved_path = self._resolve_path(path)
        return {
            "path": resolved_path,
            "exists": self.fs.exists(resolved_path),
            "size": getattr(info, "size", None),
            "type": getattr(info, "type", "file"),
        }

    def _stream_file_batches(self, obj_path: str) -> "Iterator[ArrowRecordBatch]":
        """Yield the record batches of a single Parquet file at *obj_path*."""
        import pyarrow.parquet as pq

        with self.fs.open(obj_path, mode="rb") as f:
            parquet_file = pq.ParquetFile(f)  # pyright: ignore[reportArgumentType]
            yield from parquet_file.iter_batches()

    def stream_arrow(self, pattern: str, **kwargs: Any) -> "Iterator[ArrowRecordBatch]":
        """Stream Arrow record batches from every file matching *pattern*."""
        # NOTE(review): FSSPEC_INSTALLED was already enforced in __init__;
        # this re-check looks redundant.
        if not FSSPEC_INSTALLED:
            raise MissingDependencyError(package="fsspec", install_package="fsspec")
        if not PYARROW_INSTALLED:
            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

        # Stream each file as record batches
        for obj_path in self.glob(pattern, **kwargs):
            yield from self._stream_file_batches(obj_path)

    async def read_bytes_async(self, path: str, **kwargs: Any) -> bytes:
        """Async read bytes. Wraps the sync implementation."""
        return await async_(self.read_bytes)(path, **kwargs)

    async def write_bytes_async(self, path: str, data: bytes, **kwargs: Any) -> None:
        """Async write bytes. Wraps the sync implementation."""
        return await async_(self.write_bytes)(path, data, **kwargs)

    async def _stream_file_batches_async(self, obj_path: str) -> "AsyncIterator[ArrowRecordBatch]":
        """Yield record batches from one Parquet file, reading it fully first."""
        import pyarrow.parquet as pq

        # The whole file is buffered in memory before batch iteration begins.
        data = await self.read_bytes_async(obj_path)
        parquet_file = pq.ParquetFile(BytesIO(data))
        for batch in parquet_file.iter_batches():
            yield batch

    async def stream_arrow_async(self, pattern: str, **kwargs: Any) -> "AsyncIterator[ArrowRecordBatch]":
        """Async stream Arrow record batches.

        This implementation provides file-level async streaming. Each file is
        read into memory before its batches are processed.

        Args:
            pattern: The glob pattern to match.
            **kwargs: Additional arguments to pass to the glob method.

        Yields:
            AsyncIterator of Arrow record batches
        """
        if not PYARROW_INSTALLED:
            raise MissingDependencyError(package="pyarrow", install_package="pyarrow")

        # Get paths asynchronously
        paths = await async_(self.glob)(pattern, **kwargs)

        # Stream batches from each path
        for path in paths:
            async for batch in self._stream_file_batches_async(path):
                yield batch

    async def read_text_async(self, path: str, encoding: str = "utf-8", **kwargs: Any) -> str:
        """Async read text. Wraps the sync implementation."""
        return await async_(self.read_text)(path, encoding, **kwargs)

    async def write_text_async(self, path: str, data: str, encoding: str = "utf-8", **kwargs: Any) -> None:
        """Async write text. Wraps the sync implementation."""
        await async_(self.write_text)(path, data, encoding, **kwargs)

    async def list_objects_async(self, prefix: str = "", recursive: bool = True, **kwargs: Any) -> list[str]:
        """Async list objects. Wraps the sync implementation."""
        return await async_(self.list_objects)(prefix, recursive, **kwargs)

    async def exists_async(self, path: str, **kwargs: Any) -> bool:
        """Async exists check. Wraps the sync implementation."""
        return await async_(self.exists)(path, **kwargs)

    async def delete_async(self, path: str, **kwargs: Any) -> None:
        """Async delete. Wraps the sync implementation."""
        await async_(self.delete)(path, **kwargs)

    async def copy_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Async copy. Wraps the sync implementation."""
        await async_(self.copy)(source, destination, **kwargs)

    async def move_async(self, source: str, destination: str, **kwargs: Any) -> None:
        """Async move. Wraps the sync implementation."""
        await async_(self.move)(source, destination, **kwargs)

    async def get_metadata_async(self, path: str, **kwargs: Any) -> dict[str, Any]:
        """Async get metadata. Wraps the sync implementation."""
        return await async_(self.get_metadata)(path, **kwargs)

    async def read_arrow_async(self, path: str, **kwargs: Any) -> "ArrowTable":
        """Async read Arrow. Wraps the sync implementation."""
        return await async_(self.read_arrow)(path, **kwargs)

    async def write_arrow_async(self, path: str, table: "ArrowTable", **kwargs: Any) -> None:
        """Async write Arrow. Wraps the sync implementation."""
        await async_(self.write_arrow)(path, table, **kwargs)