fsspec 2025.9.0__py3-none-any.whl → 2025.12.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.
- fsspec/_version.py +2 -2
- fsspec/asyn.py +7 -1
- fsspec/caching.py +52 -45
- fsspec/conftest.py +75 -5
- fsspec/core.py +21 -4
- fsspec/generic.py +2 -0
- fsspec/implementations/arrow.py +13 -7
- fsspec/implementations/asyn_wrapper.py +3 -1
- fsspec/implementations/cache_metadata.py +1 -3
- fsspec/implementations/cached.py +9 -4
- fsspec/implementations/chained.py +23 -0
- fsspec/implementations/data.py +1 -2
- fsspec/implementations/dirfs.py +2 -1
- fsspec/implementations/gist.py +25 -16
- fsspec/implementations/http.py +8 -1
- fsspec/implementations/http_sync.py +7 -1
- fsspec/implementations/jupyter.py +7 -2
- fsspec/implementations/libarchive.py +1 -1
- fsspec/implementations/memory.py +4 -4
- fsspec/implementations/reference.py +15 -9
- fsspec/implementations/sftp.py +7 -0
- fsspec/implementations/webhdfs.py +1 -1
- fsspec/json.py +7 -12
- fsspec/parquet.py +100 -61
- fsspec/registry.py +3 -0
- fsspec/spec.py +17 -6
- fsspec/utils.py +11 -10
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/METADATA +4 -4
- fsspec-2025.12.0.dist-info/RECORD +61 -0
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/WHEEL +1 -1
- fsspec-2025.9.0.dist-info/RECORD +0 -60
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/licenses/LICENSE +0 -0
fsspec/implementations/jupyter.py
CHANGED

@@ -42,7 +42,7 @@ class JupyterFileSystem(fsspec.AbstractFileSystem):
         path = self._strip_protocol(path)
         r = self.session.get(f"{self.url}/{path}")
         if r.status_code == 404:
-            return FileNotFoundError(path)
+            raise FileNotFoundError(path)
         r.raise_for_status()
         out = r.json()

@@ -63,7 +63,7 @@ class JupyterFileSystem(fsspec.AbstractFileSystem):
         path = self._strip_protocol(path)
         r = self.session.get(f"{self.url}/{path}")
         if r.status_code == 404:
-            return FileNotFoundError(path)
+            raise FileNotFoundError(path)
         r.raise_for_status()
         out = r.json()
         if out["format"] == "text":
@@ -98,6 +98,11 @@ class JupyterFileSystem(fsspec.AbstractFileSystem):
         }
         self.session.put(f"{self.url}/{path}", json=json)

+    def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
+        if path1 == path2:
+            return
+        self.session.patch(f"{self.url}/{path1}", json={"path": path2})
+
     def _rm(self, path):
         path = self._strip_protocol(path)
         self.session.delete(f"{self.url}/{path}")
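The new `mv` maps a rename onto the Jupyter Contents API's PATCH endpoint, and missing paths now raise `FileNotFoundError` instead of returning the exception. A minimal sketch of how this might be used; the server URL and token are placeholders, and the token keyword name is an assumption:

```python
import fsspec

# Placeholder server URL and token; the `tok` keyword name is assumed here.
fs = fsspec.filesystem("jupyter", url="http://localhost:8888", tok="...")

fs.mv("notes/old.txt", "notes/new.txt")   # issues PATCH {"path": "notes/new.txt"}
fs.ls("notes")                            # missing paths now raise FileNotFoundError
```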
fsspec/implementations/memory.py
CHANGED
@@ -187,10 +187,10 @@ class MemoryFileSystem(AbstractFileSystem):
             parent = self._parent(parent)
             if self.isfile(parent):
                 raise FileExistsError(parent)
-        if mode in ["rb", "ab", "r+b"]:
+        if mode in ["rb", "ab", "r+b", "a+b"]:
             if path in self.store:
                 f = self.store[path]
-                if mode == "ab":
+                if "a" in mode:
                     # position at the end of file
                     f.seek(0, 2)
                 else:
@@ -199,8 +199,8 @@ class MemoryFileSystem(AbstractFileSystem):
                 return f
             else:
                 raise FileNotFoundError(path)
-        elif mode in {"wb", "xb"}:
-            if mode == "xb" and self.exists(path):
+        elif mode in {"wb", "w+b", "xb", "x+b"}:
+            if "x" in mode and self.exists(path):
                 raise FileExistsError
             m = MemoryFile(self, path, kwargs.get("data"))
             if not self._intrans:
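The widened mode lists mean the in-memory filesystem now routes the read/write variants of append, write and exclusive-create modes through the existing branches. A small sketch of what the new modes allow:

```python
import fsspec

fs = fsspec.filesystem("memory")
with fs.open("/demo.bin", "wb") as f:
    f.write(b"abc")

# "a+b" now hits the existing-file branch and is positioned at end-of-file
with fs.open("/demo.bin", "a+b") as f:
    f.write(b"def")

# "x+b" behaves like "xb": creating an existing path raises FileExistsError
try:
    fs.open("/demo.bin", "x+b")
except FileExistsError:
    pass
```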
fsspec/implementations/reference.py
CHANGED

@@ -22,7 +22,11 @@ from fsspec.asyn import AsyncFileSystem
 from fsspec.callbacks import DEFAULT_CALLBACK
 from fsspec.core import filesystem, open, split_protocol
 from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
-from fsspec.utils import
+from fsspec.utils import (
+    isfilelike,
+    merge_offset_ranges,
+    other_paths,
+)

 logger = logging.getLogger("fsspec.reference")

@@ -215,7 +219,7 @@ class LazyReferenceMapper(collections.abc.MutableMapping):
         fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
         return LazyReferenceMapper(root, fs, **kwargs)

-    @lru_cache
+    @lru_cache
     def listdir(self):
         """List top-level directories"""
         dirs = (p.rsplit("/", 1)[0] for p in self.zmetadata if not p.startswith(".z"))
@@ -698,13 +702,9 @@ class ReferenceFileSystem(AsyncFileSystem):
                 **(ref_storage_args or target_options or {}), protocol=target_protocol
             )
             ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
-            if ref_fs.isfile(fo2):
-                # text JSON
-                with fsspec.open(fo, "rb", **dic) as f:
-                    logger.info("Read reference from URL %s", fo)
-                    text = json.load(f)
-                self._process_references(text, template_overrides)
-            else:
+            if ".json" not in fo2 and (
+                fo.endswith(("parq", "parquet", "/")) or ref_fs.isdir(fo2)
+            ):
                 # Lazy parquet refs
                 logger.info("Open lazy reference dict from URL %s", fo)
                 self.references = LazyReferenceMapper(
@@ -712,6 +712,12 @@ class ReferenceFileSystem(AsyncFileSystem):
                     fs=ref_fs,
                     cache_size=cache_size,
                 )
+            else:
+                # text JSON
+                with fsspec.open(fo, "rb", **dic) as f:
+                    logger.info("Read reference from URL %s", fo)
+                    text = json.load(f)
+                self._process_references(text, template_overrides)
         else:
             # dictionaries
             self._process_references(fo, template_overrides)
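The reworked branch decides between lazy parquet references and plain JSON from the shape of the reference URL itself rather than an `isfile` probe. A hedged sketch of the two call shapes; the URLs are placeholders and real use would also need credentials and the usual `remote_protocol`/`target_options` settings:

```python
import fsspec

# JSON references: the resolved reference path contains ".json"
fs_json = fsspec.filesystem("reference", fo="s3://bucket/refs.json")      # placeholder URL

# Lazy parquet references: a directory, or a path ending in "parq"/"parquet"/"/"
fs_parq = fsspec.filesystem("reference", fo="s3://bucket/refs.parquet/")  # placeholder URL
```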
fsspec/implementations/sftp.py
CHANGED
@@ -66,6 +66,7 @@ class SFTPFileSystem(AbstractFileSystem):
         return out

     def mkdir(self, path, create_parents=True, mode=511):
+        path = self._strip_protocol(path)
         logger.debug("Creating folder %s", path)
         if self.exists(path):
             raise FileExistsError(f"File exists: {path}")
@@ -89,10 +90,12 @@ class SFTPFileSystem(AbstractFileSystem):
             self.ftp.mkdir(new_path, mode)

     def rmdir(self, path):
+        path = self._strip_protocol(path)
         logger.debug("Removing folder %s", path)
         self.ftp.rmdir(path)

     def info(self, path):
+        path = self._strip_protocol(path)
         stat = self._decode_stat(self.ftp.stat(path))
         stat["name"] = path
         return stat
@@ -123,6 +126,7 @@ class SFTPFileSystem(AbstractFileSystem):
         return out

     def ls(self, path, detail=False):
+        path = self._strip_protocol(path)
         logger.debug("Listing folder %s", path)
         stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
         if detail:
@@ -132,6 +136,7 @@ class SFTPFileSystem(AbstractFileSystem):
         return sorted(paths)

     def put(self, lpath, rpath, callback=None, **kwargs):
+        rpath = self._strip_protocol(rpath)
         logger.debug("Put file %s into %s", lpath, rpath)
         self.ftp.put(lpath, rpath)

@@ -168,6 +173,8 @@ class SFTPFileSystem(AbstractFileSystem):
         self.ftp.remove(path)

     def mv(self, old, new):
+        new = self._strip_protocol(new)
+        old = self._strip_protocol(old)
         logger.debug("Renaming %s into %s", old, new)
         self.ftp.posix_rename(old, new)

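With `_strip_protocol` applied inside these methods, fully-qualified `sftp://` URLs can be passed to them directly. A sketch with placeholder host and credentials (extra constructor kwargs go to paramiko's `connect()`):

```python
import fsspec

# Placeholder host/credentials
fs = fsspec.filesystem("sftp", host="example.com", username="user")

# URLs with the protocol prefix are now stripped to plain paths before use
fs.ls("sftp://example.com/home/user")
fs.mv("sftp://example.com/home/user/a.txt", "sftp://example.com/home/user/b.txt")
```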
fsspec/implementations/webhdfs.py
CHANGED

@@ -268,7 +268,7 @@
         info["name"] = path
         return self._process_info(info)

-    def ls(self, path, detail=False):
+    def ls(self, path, detail=False, **kwargs):
         out = self._call("LISTSTATUS", path=path)
         infos = out.json()["FileStatuses"]["FileStatus"]
         for info in infos:
fsspec/json.py
CHANGED
@@ -1,13 +1,8 @@
 import json
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from contextlib import suppress
 from pathlib import PurePath
-from typing import (
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-)
+from typing import Any, ClassVar

 from .registry import _import_class, get_filesystem_class
 from .spec import AbstractFileSystem
@@ -45,12 +40,12 @@ class FilesystemJSONDecoder(json.JSONDecoder):
     def __init__(
         self,
         *,
-        object_hook: Optional[Callable[[dict[str, Any]], Any]] = None,
-        parse_float: Optional[Callable[[str], Any]] = None,
-        parse_int: Optional[Callable[[str], Any]] = None,
-        parse_constant: Optional[Callable[[str], Any]] = None,
+        object_hook: Callable[[dict[str, Any]], Any] | None = None,
+        parse_float: Callable[[str], Any] | None = None,
+        parse_int: Callable[[str], Any] | None = None,
+        parse_constant: Callable[[str], Any] | None = None,
         strict: bool = True,
-        object_pairs_hook: Optional[Callable[[list[tuple[str, Any]]], Any]] = None,
+        object_pairs_hook: Callable[[list[tuple[str, Any]]], Any] | None = None,
     ) -> None:
         self.original_object_hook = object_hook

fsspec/parquet.py
CHANGED
@@ -1,8 +1,12 @@
 import io
 import json
 import warnings
+from typing import Literal
+
+import fsspec

 from .core import url_to_fs
+from .spec import AbstractBufferedFile
 from .utils import merge_offset_ranges

 # Parquet-Specific Utilities for fsspec
@@ -14,19 +18,24 @@ from .utils import merge_offset_ranges
 # on remote file systems.


-def open_parquet_file(
-    path,
-    mode="rb",
-    fs=None,
+class AlreadyBufferedFile(AbstractBufferedFile):
+    def _fetch_range(self, start, end):
+        raise NotImplementedError
+
+
+def open_parquet_files(
+    path: list[str],
+    mode: Literal["rb"] = "rb",
+    fs: None | fsspec.AbstractFileSystem = None,
     metadata=None,
-    columns=None,
-    row_groups=None,
-    storage_options=None,
-    strict=False,
-    engine="auto",
-    max_gap=64_000,
-    max_block=256_000_000,
-    footer_sample_size=1_000_000,
+    columns: None | list[str] = None,
+    row_groups: None | list[int] = None,
+    storage_options: None | dict = None,
+    engine: str = "auto",
+    max_gap: int = 64_000,
+    max_block: int = 256_000_000,
+    footer_sample_size: int = 1_000_000,
+    filters: None | list[list[list[str]]] = None,
     **kwargs,
 ):
     """
@@ -72,12 +81,6 @@ def open_parquet_file(
     storage_options : dict, optional
         Used to generate an `AbstractFileSystem` object if `fs` was
         not specified.
-    strict : bool, optional
-        Whether the resulting `KnownPartsOfAFile` cache should
-        fetch reads that go beyond a known byte-range boundary.
-        If `False` (the default), any read that ends outside a
-        known part will be zero padded. Note that using
-        `strict=True` may be useful for debugging.
     max_gap : int, optional
         Neighboring byte ranges will only be merged when their
         inter-range gap is <= `max_gap`. Default is 64KB.
@@ -89,6 +92,10 @@
         for the footer metadata. If the sampled bytes do not contain
         the footer, a second read request will be required, and
         performance will suffer. Default is 1MB.
+    filters : list[list], optional
+        List of filters to apply to prevent reading row groups, of the
+        same format as accepted by the loading engines. Ignored if
+        ``row_groups`` is specified.
     **kwargs :
         Optional key-word arguments to pass to `fs.open`
     """
@@ -96,20 +103,36 @@
     # Make sure we have an `AbstractFileSystem` object
     # to work with
     if fs is None:
-        fs, path = url_to_fs(path, **(storage_options or {}))
+        path0 = path
+        if isinstance(path, (list, tuple)):
+            path = path[0]
+        fs, path = url_to_fs(path, **(storage_options or {}))
+    else:
+        path0 = path

-    # For now, `columns == []` not supported
-    #
+    # For now, `columns == []` not supported, is the same
+    # as all columns
     if columns is not None and len(columns) == 0:
-
+        columns = None

     # Set the engine
     engine = _set_engine(engine)

-
-
+    if isinstance(path0, (list, tuple)):
+        paths = path0
+    elif "*" in path:
+        paths = fs.glob(path)
+    elif path0.endswith("/"):  # or fs.isdir(path):
+        paths = [
+            _
+            for _ in fs.find(path, withdirs=False, detail=False)
+            if _.endswith((".parquet", ".parq"))
+        ]
+    else:
+        paths = [path]
+
     data = _get_parquet_byte_ranges(
-        [path],
+        paths,
         fs,
         metadata=metadata,
         columns=columns,
@@ -118,24 +141,37 @@
         max_gap=max_gap,
         max_block=max_block,
         footer_sample_size=footer_sample_size,
+        filters=filters,
     )

-    # Extract file name from `data`
-    fn = next(iter(data)) if data else path
-
     # Call self.open with "parts" caching
     options = kwargs.pop("cache_options", {}).copy()
-    return fs.open(
-        fn,
-        mode=mode,
-        cache_type="parts",
-        cache_options={
-            **options,
-            "data": data.get(fn, {}),
-            "strict": strict,
-        },
-        **kwargs,
-    )
+    return [
+        AlreadyBufferedFile(
+            fs=None,
+            path=fn,
+            mode=mode,
+            cache_type="parts",
+            cache_options={
+                **options,
+                "data": data.get(fn, {}),
+            },
+            size=max(_[1] for _ in data.get(fn, {})),
+            **kwargs,
+        )
+        for fn in data
+    ]
+
+
+def open_parquet_file(*args, **kwargs):
+    """Create files tailed to reading specific parts of parquet files
+
+    Please see ``open_parquet_files`` for details of the arguments. The
+    difference is, this function always returns a single ``AleadyBufferedFile``,
+    whereas `open_parquet_files`` always returns a list of files, even if
+    there are one or zero matching parquet files.
+    """
+    return open_parquet_files(*args, **kwargs)[0]


 def _get_parquet_byte_ranges(
@@ -148,6 +184,7 @@ def _get_parquet_byte_ranges(
     max_block=256_000_000,
     footer_sample_size=1_000_000,
     engine="auto",
+    filters=None,
 ):
     """Get a dictionary of the known byte ranges needed
     to read a specific column/row-group selection from a
@@ -172,6 +209,7 @@ def _get_parquet_byte_ranges(
             row_groups=row_groups,
             max_gap=max_gap,
             max_block=max_block,
+            filters=filters,
         )

     # Get file sizes asynchronously
@@ -183,17 +221,16 @@ def _get_parquet_byte_ranges(
     data_starts = []
     data_ends = []
     add_header_magic = True
-    if columns is None and row_groups is None:
+    if columns is None and row_groups is None and filters is None:
         # We are NOT selecting specific columns or row-groups.
         #
         # We can avoid sampling the footers, and just transfer
         # all file data with cat_ranges
         for i, path in enumerate(paths):
             result[path] = {}
-
-
-
-            data_ends.append(min(b + max_block, file_sizes[i]))
+            data_paths.append(path)
+            data_starts.append(0)
+            data_ends.append(file_sizes[i])
         add_header_magic = False  # "Magic" should already be included
     else:
         # We ARE selecting specific columns or row-groups.
@@ -235,29 +272,21 @@ def _get_parquet_byte_ranges(

         # Calculate required byte ranges for each path
         for i, path in enumerate(paths):
-            # Deal with small-file case.
-            # Just include all remaining bytes of the file
-            # in a single range.
-            if file_sizes[i] < max_block:
-                if footer_starts[i] > 0:
-                    # Only need to transfer the data if the
-                    # footer sample isn't already the whole file
-                    data_paths.append(path)
-                    data_starts.append(0)
-                    data_ends.append(footer_starts[i])
-                continue
-
             # Use "engine" to collect data byte ranges
             path_data_starts, path_data_ends = engine._parquet_byte_ranges(
                 columns,
                 row_groups=row_groups,
                 footer=footer_samples[i],
                 footer_start=footer_starts[i],
+                filters=filters,
             )

             data_paths += [path] * len(path_data_starts)
             data_starts += path_data_starts
             data_ends += path_data_ends
+            result.setdefault(path, {})[(footer_starts[i], file_sizes[i])] = (
+                footer_samples[i]
+            )

         # Merge adjacent offset ranges
         data_paths, data_starts, data_ends = merge_offset_ranges(
@@ -291,6 +320,7 @@ def _get_parquet_byte_ranges_from_metadata(
     row_groups=None,
     max_gap=64_000,
     max_block=256_000_000,
+    filters=None,
 ):
     """Simplified version of `_get_parquet_byte_ranges` for
     the case that an engine-specific `metadata` object is
@@ -300,9 +330,7 @@ def _get_parquet_byte_ranges_from_metadata(

     # Use "engine" to collect data byte ranges
     data_paths, data_starts, data_ends = engine._parquet_byte_ranges(
-        columns,
-        row_groups=row_groups,
-        metadata=metadata,
+        columns, row_groups=row_groups, metadata=metadata, filters=filters
     )

     # Merge adjacent offset ranges
@@ -401,16 +429,19 @@ class FastparquetEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         # Initialize offset ranges and define ParqetFile metadata
         pf = metadata
         data_paths, data_starts, data_ends = [], [], []
+        if filters and row_groups:
+            raise ValueError("filters and row_groups cannot be used together")
         if pf is None:
             pf = self.fp.ParquetFile(io.BytesIO(footer))

         # Convert columns to a set and add any index columns
         # specified in the pandas metadata (just in case)
-        column_set = None if columns is None else
+        column_set = None if columns is None else {c.split(".", 1)[0] for c in columns}
         if column_set is not None and hasattr(pf, "pandas_metadata"):
             md_index = [
                 ind
@@ -422,7 +453,12 @@ class FastparquetEngine:

         # Check if row_groups is a list of integers
         # or a list of row-group metadata
-        if row_groups and not isinstance(row_groups[0], int):
+        if filters:
+            from fastparquet.api import filter_row_groups
+
+            row_group_indices = None
+            row_groups = filter_row_groups(pf, filters)
+        elif row_groups and not isinstance(row_groups[0], int):
             # Input row_groups contains row-group metadata
             row_group_indices = None
         else:
@@ -486,9 +522,12 @@ class PyarrowEngine:
         metadata=None,
         footer=None,
         footer_start=None,
+        filters=None,
     ):
         if metadata is not None:
             raise ValueError("metadata input not supported for PyarrowEngine")
+        if filters:
+            raise NotImplementedError

         data_starts, data_ends = [], []
         md = self.pq.ParquetFile(io.BytesIO(footer)).metadata
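The new plural entry point returns one pre-buffered file per matching parquet file, so a glob or directory can be opened in a single call, and `filters` is forwarded to the engine (fastparquet only, per the diff). A hedged sketch; the dataset URL and storage options are placeholders:

```python
from fsspec.parquet import open_parquet_files

files = open_parquet_files(
    "s3://bucket/dataset/",          # placeholder; trailing "/" selects *.parquet below it
    columns=["a", "b"],              # only these columns' byte ranges are fetched
    filters=[[("a", ">", 0)]],       # fastparquet-style row-group filter
    engine="fastparquet",
    storage_options={"anon": True},  # illustrative only
)
for f in files:
    # each item is a buffered file holding just the byte ranges needed
    ...
```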
fsspec/registry.py
CHANGED
@@ -72,6 +72,9 @@ known_implementations = {
         "class": "fsspec.implementations.arrow.HadoopFileSystem",
         "err": "pyarrow and local java libraries required for HDFS",
     },
+    "async_wrapper": {
+        "class": "fsspec.implementations.asyn_wrapper.AsyncFileSystemWrapper",
+    },
     "asynclocal": {
         "class": "morefs.asyn_local.AsyncLocalFileSystem",
         "err": "Install 'morefs[asynclocalfs]' to use AsyncLocalFileSystem",
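Registering the wrapper under a protocol name means it can be obtained through `fsspec.filesystem` as well as by direct import. A sketch; the `fs=` keyword for the wrapped filesystem is my assumption about the wrapper's constructor:

```python
import fsspec
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

sync_fs = fsspec.filesystem("memory")

# Direct construction (existing behaviour)
async_fs = AsyncFileSystemWrapper(sync_fs)

# Via the newly registered protocol name; the kwarg name is assumed here
async_fs2 = fsspec.filesystem("async_wrapper", fs=sync_fs)
```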
fsspec/spec.py
CHANGED
@@ -67,6 +67,9 @@ class _Cached(type):
         extra_tokens = tuple(
             getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
         )
+        strip_tokenize_options = {
+            k: kwargs.pop(k) for k in cls._strip_tokenize_options if k in kwargs
+        }
         token = tokenize(
             cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
         )
@@ -78,7 +81,7 @@ class _Cached(type):
             cls._latest = token
             return cls._cache[token]
         else:
-            obj = super().__call__(*args, **kwargs)
+            obj = super().__call__(*args, **kwargs, **strip_tokenize_options)
             # Setting _fs_token here causes some static linters to complain.
             obj._fs_token_ = token
             obj.storage_args = args
@@ -115,6 +118,8 @@ class AbstractFileSystem(metaclass=_Cached):

     #: Extra *class attributes* that should be considered when hashing.
     _extra_tokenize_attributes = ()
+    #: *storage options* that should not be considered when hashing.
+    _strip_tokenize_options = ()

     # Set by _Cached metaclass
     storage_args: tuple[Any, ...]
@@ -892,7 +897,7 @@ class AbstractFileSystem(metaclass=_Cached):
         dict of {path: contents} if there are multiple paths
         or the path has been otherwise expanded
         """
-        paths = self.expand_path(path, recursive=recursive)
+        paths = self.expand_path(path, recursive=recursive, **kwargs)
         if (
             len(paths) > 1
             or isinstance(path, list)
@@ -972,7 +977,9 @@ class AbstractFileSystem(metaclass=_Cached):
             )

             source_is_str = isinstance(rpath, str)
-            rpaths = self.expand_path(
+            rpaths = self.expand_path(
+                rpath, recursive=recursive, maxdepth=maxdepth, **kwargs
+            )
             if source_is_str and (not recursive or maxdepth is not None):
                 # Non-recursive glob does not copy directories
                 rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
@@ -1060,7 +1067,9 @@ class AbstractFileSystem(metaclass=_Cached):
             if source_is_str:
                 lpath = make_path_posix(lpath)
             fs = LocalFileSystem()
-            lpaths = fs.expand_path(
+            lpaths = fs.expand_path(
+                lpath, recursive=recursive, maxdepth=maxdepth, **kwargs
+            )
             if source_is_str and (not recursive or maxdepth is not None):
                 # Non-recursive glob does not copy directories
                 lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
@@ -1131,7 +1140,9 @@ class AbstractFileSystem(metaclass=_Cached):
         from .implementations.local import trailing_sep

         source_is_str = isinstance(path1, str)
-        paths1 = self.expand_path(
+        paths1 = self.expand_path(
+            path1, recursive=recursive, maxdepth=maxdepth, **kwargs
+        )
         if source_is_str and (not recursive or maxdepth is not None):
             # Non-recursive glob does not copy directories
             paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
@@ -1172,7 +1183,7 @@ class AbstractFileSystem(metaclass=_Cached):
             raise ValueError("maxdepth must be at least 1")

         if isinstance(path, (str, os.PathLike)):
-            out = self.expand_path([path], recursive, maxdepth)
+            out = self.expand_path([path], recursive, maxdepth, **kwargs)
         else:
             out = set()
             path = [self._strip_protocol(p) for p in path]
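The new `_strip_tokenize_options` hook lets a filesystem declare constructor options that should not distinguish cached instances, while still being forwarded to `__init__`. A hypothetical subclass sketch (the class and option names are invented for illustration):

```python
from fsspec.spec import AbstractFileSystem

class DemoFileSystem(AbstractFileSystem):   # hypothetical subclass
    protocol = "demofs"
    # Passed through to __init__, but excluded from the instance-cache token,
    # so instances differing only in `log_level` share one cached object.
    _strip_tokenize_options = ("log_level",)

    def __init__(self, *args, log_level="INFO", **kwargs):
        super().__init__(*args, **kwargs)
        self.log_level = log_level

# With instance caching enabled (the default), these resolve to the same object:
assert DemoFileSystem(log_level="DEBUG") is DemoFileSystem(log_level="INFO")
```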
fsspec/utils.py
CHANGED
@@ -7,23 +7,16 @@ import os
 import re
 import sys
 import tempfile
-from collections.abc import Iterable, Iterator, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
 from functools import partial
 from hashlib import md5
 from importlib.metadata import version
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, TypeVar
 from urllib.parse import urlsplit

 if TYPE_CHECKING:
     import pathlib
-
-    from typing_extensions import TypeGuard
+    from typing import TypeGuard

     from fsspec.spec import AbstractFileSystem

@@ -438,6 +431,14 @@ def get_protocol(url: str) -> str:
     return "file"


+def get_file_extension(url: str) -> str:
+    url = stringify_path(url)
+    ext_parts = url.rsplit(".", 1)
+    if len(ext_parts) > 1:
+        return ext_parts[-1]
+    return ""
+
+
 def can_be_local(path: str) -> bool:
     """Can the given URL be used with open_local?"""
     from fsspec import get_filesystem_class
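The new helper simply splits the stringified path on its final dot; there is no special-casing of query strings or compound suffixes. For illustration:

```python
from fsspec.utils import get_file_extension

get_file_extension("s3://bucket/data/part-0.parquet")  # "parquet"
get_file_extension("archive.tar.gz")                   # "gz" (only the last suffix)
get_file_extension("README")                           # "" (no dot present)
```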
{fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fsspec
-Version: 2025.9.0
+Version: 2025.12.0
 Summary: File-system specification
 Project-URL: Changelog, https://filesystem-spec.readthedocs.io/en/latest/changelog.html
 Project-URL: Documentation, https://filesystem-spec.readthedocs.io/en/latest/
@@ -12,12 +12,12 @@ Keywords: file
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.9
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.10
 Provides-Extra: abfs
 Requires-Dist: adlfs; extra == 'abfs'
 Provides-Extra: adl
@@ -197,7 +197,7 @@ CI runtime. For local use, pick a version suitable for you.

 ```bash
 # For a new environment (mamba / conda).
-mamba create -n fsspec -c conda-forge python=3.
+mamba create -n fsspec -c conda-forge python=3.10 -y
 conda activate fsspec

 # Standard dev install with docs and tests.