fsspec 2025.9.0__py3-none-any.whl → 2025.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fsspec/_version.py +2 -2
- fsspec/asyn.py +7 -1
- fsspec/caching.py +52 -45
- fsspec/conftest.py +75 -5
- fsspec/core.py +21 -4
- fsspec/generic.py +2 -0
- fsspec/implementations/arrow.py +13 -7
- fsspec/implementations/asyn_wrapper.py +3 -1
- fsspec/implementations/cache_metadata.py +1 -3
- fsspec/implementations/cached.py +9 -4
- fsspec/implementations/chained.py +23 -0
- fsspec/implementations/data.py +1 -2
- fsspec/implementations/dirfs.py +2 -1
- fsspec/implementations/gist.py +25 -16
- fsspec/implementations/http.py +8 -1
- fsspec/implementations/http_sync.py +7 -1
- fsspec/implementations/jupyter.py +7 -2
- fsspec/implementations/libarchive.py +1 -1
- fsspec/implementations/memory.py +4 -4
- fsspec/implementations/reference.py +15 -9
- fsspec/implementations/sftp.py +7 -0
- fsspec/implementations/webhdfs.py +1 -1
- fsspec/json.py +7 -12
- fsspec/parquet.py +100 -61
- fsspec/registry.py +3 -0
- fsspec/spec.py +17 -6
- fsspec/utils.py +11 -10
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/METADATA +4 -4
- fsspec-2025.12.0.dist-info/RECORD +61 -0
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/WHEEL +1 -1
- fsspec-2025.9.0.dist-info/RECORD +0 -60
- {fsspec-2025.9.0.dist-info → fsspec-2025.12.0.dist-info}/licenses/LICENSE +0 -0
fsspec/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '2025.9.0'
-__version_tuple__ = version_tuple = (2025, 9, 0)
+__version__ = version = '2025.12.0'
+__version_tuple__ = version_tuple = (2025, 12, 0)

 __commit_id__ = commit_id = None
fsspec/asyn.py
CHANGED
@@ -328,6 +328,11 @@ class AsyncFileSystem(AbstractFileSystem):
         return self._loop

     async def _rm_file(self, path, **kwargs):
+        if (
+            inspect.iscoroutinefunction(self._rm)
+            and type(self)._rm is not AsyncFileSystem._rm
+        ):
+            return await self._rm(path, recursive=False, batch_size=1, **kwargs)
         raise NotImplementedError

     async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
@@ -776,6 +781,7 @@ class AsyncFileSystem(AbstractFileSystem):
         min_idx = min(idx_star, idx_qmark, idx_brace)

         detail = kwargs.pop("detail", False)
+        withdirs = kwargs.pop("withdirs", True)

         if not has_magic(path):
             if await self._exists(path, **kwargs):
@@ -805,7 +811,7 @@ class AsyncFileSystem(AbstractFileSystem):
         depth = None

         allpaths = await self._find(
-            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+            root, maxdepth=depth, withdirs=withdirs, detail=True, **kwargs
         )

         pattern = glob_translate(path + ("/" if ends_with_sep else ""))
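With the new fallback, an async implementation that overrides only the bulk `_rm` coroutine gets single-file deletion for free instead of hitting `NotImplementedError`. A minimal sketch, using a hypothetical subclass (not part of this diff):

    import asyncio
    from fsspec.asyn import AsyncFileSystem

    class DemoAsyncFS(AsyncFileSystem):
        async def _rm(self, path, recursive=False, batch_size=None, **kwargs):
            print(f"deleting {path}")  # stand-in for a real delete call

    async def main():
        fs = DemoAsyncFS(asynchronous=True)
        # 2025.9.0 raised NotImplementedError here; 2025.12.0 awaits _rm
        await fs._rm_file("/tmp/example")

    asyncio.run(main())

The `withdirs` change in `_glob` similarly forwards a caller-supplied `withdirs=False` to `_find` instead of always matching directories.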
fsspec/caching.py
CHANGED
@@ -6,20 +6,12 @@ import logging
 import math
 import os
 import threading
-import warnings
 from collections import OrderedDict
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from itertools import groupby
 from operator import itemgetter
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    ClassVar,
-    Generic,
-    NamedTuple,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, NamedTuple, TypeVar

 if TYPE_CHECKING:
     import mmap
@@ -629,7 +621,7 @@ class KnownPartsOfAFile(BaseCache):
         fetcher: Fetcher,
         size: int,
         data: dict[tuple[int, int], bytes] | None = None,
-        strict: bool = True,
+        strict: bool = False,
         **_: Any,
     ):
         super().__init__(blocksize, fetcher, size)
@@ -653,50 +645,65 @@ class KnownPartsOfAFile(BaseCache):
         else:
             self.data = {}

+    @property
+    def size(self):
+        return sum(_[1] - _[0] for _ in self.data)
+
+    @size.setter
+    def size(self, value):
+        pass
+
+    @property
+    def nblocks(self):
+        return len(self.data)
+
+    @nblocks.setter
+    def nblocks(self, value):
+        pass
+
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
             start = 0
         if stop is None:
             stop = self.size
+        self.total_requested_bytes += stop - start

         out = b""
-        for (loc0, loc1), data in self.data.items():
-            # If self.strict=False, use zero-padded data
-            # for reads beyond the end of a "known" buffer
+        started = False
+        loc_old = 0
+        for loc0, loc1 in sorted(self.data):
+            if (loc0 <= start < loc1) and (loc0 <= stop <= loc1):
+                # entirely within the block
+                off = start - loc0
+                self.hit_count += 1
+                return self.data[(loc0, loc1)][off : off + stop - start]
+            if stop <= loc0:
+                break
+            if started and loc0 > loc_old:
+                # a gap where we need data
+                self.miss_count += 1
+                if self.strict:
+                    raise ValueError
+                out += b"\x00" * (loc0 - loc_old)
             if loc0 <= start < loc1:
+                # found the start
+                self.hit_count += 1
                 off = start - loc0
-                out = data[off : off + stop - start]
-                if not self.strict or loc0 <= stop <= loc1:
-                    # The request is within a known range, or
-                    # it begins within a known range, and we
-                    # are allowed to pad reads beyond the
-                    # buffer with zero
-                    out += b"\x00" * (stop - start - len(out))
-                    self.hit_count += 1
-                    return out
-                else:
-                    # The request ends outside a known range,
-                    # and we are being "strict" about reads
-                    # beyond the buffer
-                    start = loc1
-                    break
-
-        # We only get here if there is a request outside the
-        # known parts of the file. In an ideal world, this
-        # should never happen
-        if self.fetcher is None:
-            # We cannot fetch the data, so raise an error
-            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
-        # We can fetch the data, but should warn the user
-        # that this may be slow
-        warnings.warn(
-            f"Read is outside the known file parts: {(start, stop)}. "
-            f"IO/caching performance may be poor!"
-        )
-        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
-        self.total_requested_bytes += stop - start
+                out = self.data[(loc0, loc1)][off : off + stop - start]
+                started = True
+            elif start < loc0 and stop > loc1:
+                # the whole block
+                self.hit_count += 1
+                out += self.data[(loc0, loc1)]
+            elif loc0 <= stop <= loc1:
+                # end block
+                self.hit_count += 1
+                return out + self.data[(loc0, loc1)][: stop - loc0]
+            loc_old = loc1
         self.miss_count += 1
-        return out + super()._fetch(start, stop)
+        if started and not self.strict:
+            return out + b"\x00" * (stop - loc_old)
+        raise ValueError


 class UpdatableLRU(Generic[P, T]):
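The rewritten `_fetch` walks the known byte ranges in sorted order, tracks hits and misses, and, with `strict` now defaulting to False, zero-pads gaps between known ranges rather than warning and falling back to the fetcher. A minimal sketch of the new behaviour, calling the internal `_fetch` directly with made-up ranges:

    from fsspec.caching import KnownPartsOfAFile

    # two known ranges with a 5-byte hole at [5, 10); no fetcher required
    cache = KnownPartsOfAFile(
        blocksize=0,
        fetcher=None,
        size=15,
        data={(0, 5): b"abcde", (10, 15): b"fghij"},
    )
    print(cache._fetch(0, 5))   # b"abcde" (entirely within one block)
    print(cache._fetch(3, 12))  # b"de" + 5 zero bytes + b"fg" (gap padded)

With `strict=True`, the second read would raise ValueError at the gap instead of padding.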
fsspec/conftest.py
CHANGED
@@ -3,11 +3,12 @@ import shutil
 import subprocess
 import sys
 import time
+from collections import deque
+from collections.abc import Generator, Sequence

 import pytest

 import fsspec
-from fsspec.implementations.cached import CachingFileSystem


 @pytest.fixture()
@@ -27,16 +28,85 @@ def m():
     m.pseudo_dirs.append("")


-@pytest.fixture()
+class InstanceCacheInspector:
+    """
+    Helper class to inspect instance caches of filesystem classes in tests.
+    """
+
+    def clear(self) -> None:
+        """
+        Clear instance caches of all currently imported filesystem classes.
+        """
+        classes = deque([fsspec.spec.AbstractFileSystem])
+        while classes:
+            cls = classes.popleft()
+            cls.clear_instance_cache()
+            classes.extend(cls.__subclasses__())
+
+    def gather_counts(self, *, omit_zero: bool = True) -> dict[str, int]:
+        """
+        Gather counts of filesystem instances in the instance caches
+        of all currently imported filesystem classes.
+
+        Parameters
+        ----------
+        omit_zero:
+            Whether to omit instance types with no cached instances.
+        """
+        out: dict[str, int] = {}
+        classes = deque([fsspec.spec.AbstractFileSystem])
+        while classes:
+            cls = classes.popleft()
+            count = len(cls._cache)  # there is no public interface for the cache
+            # note: skip intermediate AbstractFileSystem subclasses
+            # if they proxy the protocol attribute via a property.
+            if isinstance(cls.protocol, (Sequence, str)):
+                key = cls.protocol if isinstance(cls.protocol, str) else cls.protocol[0]
+                if count or not omit_zero:
+                    out[key] = count
+            classes.extend(cls.__subclasses__())
+        return out
+
+
+@pytest.fixture(scope="function", autouse=True)
+def instance_caches() -> Generator[InstanceCacheInspector, None, None]:
+    """
+    Fixture to ensure empty filesystem instance caches before and after a test.
+
+    Used by default for all tests.
+    Clears caches of all imported filesystem classes.
+    Can be used to write test assertions about instance caches.
+
+    Usage:
+
+        def test_something(instance_caches):
+            # Test code here
+            fsspec.open("file://abc")
+            fsspec.open("memory://foo/bar")
+
+            # Test assertion
+            assert instance_caches.gather_counts() == {"file": 1, "memory": 1}
+
+    Returns
+    -------
+    instance_caches: An instance cache inspector for clearing and inspecting caches.
+    """
+    ic = InstanceCacheInspector()
+
+    ic.clear()
+    try:
+        yield ic
+    finally:
+        ic.clear()
+
+
+@pytest.fixture(scope="function")
 def ftp_writable(tmpdir):
     """
     Fixture providing a writable FTP filesystem.
     """
     pytest.importorskip("pyftpdlib")
-    from fsspec.implementations.ftp import FTPFileSystem

-    FTPFileSystem.clear_instance_cache()  # remove lingering connections
-    CachingFileSystem.clear_instance_cache()
     d = str(tmpdir)
     with open(os.path.join(d, "out"), "wb") as f:
         f.write(b"hello" * 10000)
fsspec/core.py
CHANGED
@@ -18,7 +18,7 @@ from fsspec.caching import (  # noqa: F401
 )
 from fsspec.compression import compr
 from fsspec.config import conf
-from fsspec.registry import filesystem, get_filesystem_class
+from fsspec.registry import available_protocols, filesystem, get_filesystem_class
 from fsspec.utils import (
     _unstrip_protocol,
     build_name_function,
@@ -330,38 +330,55 @@ def open_files(

 def _un_chain(path, kwargs):
     # Avoid a circular import
-    from fsspec.implementations.cached import CachingFileSystem
+    from fsspec.implementations.chained import ChainedFileSystem

     if "::" in path:
         x = re.compile(".*[^a-z]+.*")  # test for non protocol-like single word
+        known_protocols = set(available_protocols())
         bits = []
+
+        # split on '::', then ensure each bit has a protocol
         for p in path.split("::"):
-            if "://" in p or x.match(p):
+            if p in known_protocols:
+                bits.append(p + "://")
+            elif "://" in p or x.match(p):
                 bits.append(p)
             else:
                 bits.append(p + "://")
     else:
         bits = [path]
+
     # [[url, protocol, kwargs], ...]
     out = []
     previous_bit = None
     kwargs = kwargs.copy()
+
     for bit in reversed(bits):
         protocol = kwargs.pop("protocol", None) or split_protocol(bit)[0] or "file"
         cls = get_filesystem_class(protocol)
         extra_kwargs = cls._get_kwargs_from_urls(bit)
         kws = kwargs.pop(protocol, {})
+
         if bit is bits[0]:
             kws.update(kwargs)
+
         kw = dict(
             **{k: v for k, v in extra_kwargs.items() if k not in kws or v != kws[k]},
             **kws,
         )
         bit = cls._strip_protocol(bit)
-        if "target_protocol" not in kw and issubclass(cls, CachingFileSystem):
+
+        if (
+            "target_protocol" not in kw
+            and issubclass(cls, ChainedFileSystem)
+            and not bit
+        ):
+            # replace bit if we are chaining and no path given
             bit = previous_bit
+
         out.append((bit, protocol, kw))
         previous_bit = bit
+
     out.reverse()
     return out
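The registry lookup means a bare `::`-separated segment that exactly names a registered protocol (such as `simplecache`) is normalized to `simplecache://`, and the `ChainedFileSystem` test replaces the old hard-coded `CachingFileSystem` check when deciding whether a layer with no path of its own should inherit the inner one. A rough illustration using only built-in filesystems:

    from fsspec.core import url_to_fs

    # "simplecache" carries no path, so it receives the inner
    # "/data.csv" bit from the memory layer below it
    fs, path = url_to_fs("simplecache::memory://data.csv")
    print(type(fs).__name__, path)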
fsspec/generic.py
CHANGED
@@ -118,6 +118,8 @@ def rsync(
         if otherfile in otherfiles:
             if update_cond == "always":
                 allfiles[k] = otherfile
+            elif update_cond == "never":
+                allfiles.pop(k)
             elif update_cond == "different":
                 inf1 = source_field(v) if callable(source_field) else v[source_field]
                 v2 = otherfiles[otherfile]
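A short sketch of the new `update_cond="never"` branch, which drops files that already exist on the destination from the copy list (paths illustrative):

    import fsspec
    from fsspec.generic import rsync

    m = fsspec.filesystem("memory")
    m.pipe("/src/a.txt", b"new")
    m.pipe("/dest/a.txt", b"old")

    # "never": existing destination files are left untouched
    rsync("memory://src", "memory://dest", update_cond="never")
    print(m.cat("/dest/a.txt"))  # b"old"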
fsspec/implementations/arrow.py
CHANGED
@@ -75,10 +75,13 @@ class ArrowFSWrapper(AbstractFileSystem):
         path = self._strip_protocol(path)
         from pyarrow.fs import FileSelector

-        entries = [
-            self._make_entry(entry)
-            for entry in self.fs.get_file_info(FileSelector(path))
-        ]
+        try:
+            entries = [
+                self._make_entry(entry)
+                for entry in self.fs.get_file_info(FileSelector(path))
+            ]
+        except (FileNotFoundError, NotADirectoryError):
+            entries = [self.info(path, **kwargs)]
         if detail:
             return entries
         else:
@@ -202,11 +205,11 @@ class ArrowFSWrapper(AbstractFileSystem):
         return self.fs.get_file_info(path).mtime

     def cat_file(self, path, start=None, end=None, **kwargs):
-        kwargs["seekable"] = start not in [None, 0]
+        kwargs.setdefault("seekable", start not in [None, 0])
         return super().cat_file(path, start=None, end=None, **kwargs)

     def get_file(self, rpath, lpath, **kwargs):
-        kwargs["seekable"] = False
+        kwargs.setdefault("seekable", False)
         super().get_file(rpath, lpath, **kwargs)

@@ -220,7 +223,6 @@ class ArrowFSWrapper(AbstractFileSystem):
         "readable",
         "writable",
         "close",
-        "size",
         "seekable",
     ],
 )
@@ -238,6 +240,10 @@ class ArrowFile(io.IOBase):
     def __enter__(self):
         return self

+    @property
+    def size(self):
+        return self.stream.size()
+
     def __exit__(self, *args):
         return self.close()
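`size` moves off the mirrored-attribute list and becomes a property computed from the underlying pyarrow stream. A small sketch (requires pyarrow; the local path is illustrative):

    import pyarrow.fs
    from fsspec.implementations.arrow import ArrowFSWrapper

    fs = ArrowFSWrapper(pyarrow.fs.LocalFileSystem())
    with fs.open("/tmp/arrow-demo.bin", "wb") as f:
        f.write(b"hello")
    with fs.open("/tmp/arrow-demo.bin", "rb") as f:
        print(f.size)  # 5, via the new ArrowFile.size property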
fsspec/implementations/asyn_wrapper.py
CHANGED
@@ -5,6 +5,8 @@ import inspect
 import fsspec
 from fsspec.asyn import AsyncFileSystem, running_async

+from .chained import ChainedFileSystem
+

 def async_wrapper(func, obj=None, semaphore=None):
     """
@@ -35,7 +37,7 @@ def async_wrapper(func, obj=None, semaphore=None):
     return wrapper


-class AsyncFileSystemWrapper(AsyncFileSystem):
+class AsyncFileSystemWrapper(AsyncFileSystem, ChainedFileSystem):
     """
     A wrapper class to convert a synchronous filesystem into an asynchronous one.
fsspec/implementations/cache_metadata.py
CHANGED
@@ -15,9 +15,7 @@ except ImportError:

 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from typing import Any, Literal
-
-    from typing_extensions import TypeAlias
+    from typing import Any, Literal, TypeAlias

 from .cached import CachingFileSystem
fsspec/implementations/cached.py
CHANGED
@@ -6,16 +6,18 @@ import os
 import tempfile
 import time
 import weakref
+from collections.abc import Callable
 from shutil import rmtree
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar

-from fsspec import AbstractFileSystem, filesystem
+from fsspec import filesystem
 from fsspec.callbacks import DEFAULT_CALLBACK
 from fsspec.compression import compr
 from fsspec.core import BaseCache, MMapCache
 from fsspec.exceptions import BlocksizeMismatchError
 from fsspec.implementations.cache_mapper import create_cache_mapper
 from fsspec.implementations.cache_metadata import CacheMetadata
+from fsspec.implementations.chained import ChainedFileSystem
 from fsspec.implementations.local import LocalFileSystem
 from fsspec.spec import AbstractBufferedFile
 from fsspec.transaction import Transaction
@@ -39,7 +41,7 @@ class WriteCachedTransaction(Transaction):
         self.fs = None  # break cycle


-class CachingFileSystem(AbstractFileSystem):
+class CachingFileSystem(ChainedFileSystem):
     """Locally caching filesystem, layer over any other FS

     This class implements chunk-wise local storage of remote files, for quick
@@ -60,6 +62,7 @@ class CachingFileSystem(AbstractFileSystem):
     """

     protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
+    _strip_tokenize_options = ("fo",)

     def __init__(
         self,
@@ -984,7 +987,9 @@ class LocalTempFile:
         os.remove(self.fn)

     def commit(self):
-        self.fs.put(self.fn, self.path, **self.kwargs)
+        # calling put() with list arguments avoids path expansion and additional operations
+        # like isdir()
+        self.fs.put([self.fn], [self.path], **self.kwargs)
         # we do not delete the local copy, it's still in the cache.

     @property
fsspec/implementations/chained.py
ADDED
@@ -0,0 +1,23 @@
+from typing import ClassVar
+
+from fsspec import AbstractFileSystem
+
+__all__ = ("ChainedFileSystem",)
+
+
+class ChainedFileSystem(AbstractFileSystem):
+    """Chained filesystem base class.
+
+    A chained filesystem is designed to be layered over another FS.
+    This is useful to implement things like caching.
+
+    This base class does very little on its own, but is used as a marker
+    that the class is designed for chaining.
+
+    Right now this is only used in `url_to_fs` to provide the path argument
+    (`fo`) to the chained filesystem from the underlying filesystem.
+
+    Additional functionality may be added in the future.
+    """
+
+    protocol: ClassVar[str] = "chained"
fsspec/implementations/data.py
CHANGED
@@ -1,6 +1,5 @@
 import base64
 import io
-from typing import Optional
 from urllib.parse import unquote

 from fsspec import AbstractFileSystem
@@ -50,7 +49,7 @@ class DataFileSystem(AbstractFileSystem):
         return io.BytesIO(self.cat_file(path))

     @staticmethod
-    def encode(data: bytes, mime: Optional[str] = None):
+    def encode(data: bytes, mime: str | None = None):
         """Format the given data into data-URL syntax

         This version always base64 encodes, even when the data is ascii/url-safe.
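For reference, `encode` (only its annotation changed here) produces standard data-URL syntax, for example:

    from fsspec.implementations.data import DataFileSystem

    print(DataFileSystem.encode(b"hello", mime="text/plain"))
    # data:text/plain;base64,aGVsbG8=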
fsspec/implementations/dirfs.py
CHANGED
@@ -1,8 +1,9 @@
 from .. import filesystem
 from ..asyn import AsyncFileSystem
+from .chained import ChainedFileSystem


-class DirFileSystem(AsyncFileSystem):
+class DirFileSystem(AsyncFileSystem, ChainedFileSystem):
     """Directory prefix filesystem

     The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
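Usage is unchanged; a minimal sketch of the wrapper that is now marked as chained:

    import fsspec
    from fsspec.implementations.dirfs import DirFileSystem

    m = fsspec.filesystem("memory")
    m.pipe("/root/data/a.txt", b"abc")

    dirfs = DirFileSystem(path="/root/data", fs=m)
    print(dirfs.cat("a.txt"))  # b"abc"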
fsspec/implementations/gist.py
CHANGED
@@ -14,21 +14,21 @@ class GistFileSystem(AbstractFileSystem):

     Parameters
     ----------
-    gist_id
+    gist_id: str
         The ID of the gist you want to access (the long hex value from the URL).
-    filenames
+    filenames: list[str] (optional)
         If provided, only make a file system representing these files, and do not fetch
         the list of all files for this gist.
-    sha
+    sha: str (optional)
         If provided, fetch a particular revision of the gist. If omitted,
         the latest revision is used.
-    username
-        GitHub username for authentication
-    token
-        GitHub personal access token (required if username is given).
-    timeout
+    username: str (optional)
+        GitHub username for authentication.
+    token: str (optional)
+        GitHub personal access token (required if username is given), or.
+    timeout: (float, float) or float, optional
         Connect and read timeouts for requests (default 60s each).
-    kwargs
+    kwargs: dict
         Stored on `self.request_kw` and passed to `requests.get` when fetching Gist
         metadata or reading ("opening") a file.
     """
@@ -51,10 +51,8 @@ class GistFileSystem(AbstractFileSystem):
         self.gist_id = gist_id
         self.filenames = filenames
         self.sha = sha  # revision of the gist (optional)
-        if (username is None) != (token is None):
-            # must provide both username and token, or neither
-            if username or token:
-                raise ValueError("Auth requires both username and token, or neither.")
+        if username is not None and token is None:
+            raise ValueError("User auth requires a token")
         self.username = username
         self.token = token
         self.request_kw = kwargs
@@ -67,9 +65,18 @@ class GistFileSystem(AbstractFileSystem):
     @property
     def kw(self):
         """Auth parameters passed to 'requests' if we have username/token."""
-        if self.username:
-            return {"auth": (self.username, self.token), **self.request_kw}
-        return self.request_kw
+        kw = {
+            "headers": {
+                "Accept": "application/vnd.github+json",
+                "X-GitHub-Api-Version": "2022-11-28",
+            }
+        }
+        kw.update(self.request_kw)
+        if self.username and self.token:
+            kw["auth"] = (self.username, self.token)
+        elif self.token:
+            kw["headers"]["Authorization"] = f"Bearer {self.token}"
+        return kw

     def _fetch_gist_metadata(self):
         """
@@ -229,4 +236,6 @@ class GistFileSystem(AbstractFileSystem):
                 pass  # skip
             else:
                 out[p] = e
+        if len(paths) == 1 and paths[0] == path:
+            return out[path]
         return out
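With the relaxed check, a token alone is now accepted and sent as a Bearer header by the rebuilt `kw` property; a username without a token is the only rejected combination. A sketch with placeholder credentials (listing a gist requires network access):

    from fsspec.implementations.gist import GistFileSystem

    # token-only auth adds "Authorization: Bearer <token>" to requests
    fs = GistFileSystem("<gist-id>", filenames=["data.csv"], token="<token>")

    # a username without a token now raises ValueError("User auth requires a token")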
fsspec/implementations/http.py
CHANGED
@@ -43,6 +43,7 @@ class HTTPFileSystem(AsyncFileSystem):
     HTML href tags will be used.
     """

+    protocol = ("http", "https")
     sep = "/"

     def __init__(
@@ -326,7 +327,7 @@ class HTTPFileSystem(AsyncFileSystem):
         async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
             self._raise_not_found_for_status(resp, rpath)

-    async def _exists(self, path, **kwargs):
+    async def _exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
@@ -334,8 +335,14 @@ class HTTPFileSystem(AsyncFileSystem):
             session = await self.set_session()
             r = await session.get(self.encode_url(path), **kw)
             async with r:
+                if strict:
+                    self._raise_not_found_for_status(r, path)
                 return r.status < 400
+        except FileNotFoundError:
+            return False
         except aiohttp.ClientError:
+            if strict:
+                raise
             return False

     async def _isfile(self, path, **kwargs):
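The `strict` flag turns `exists` from swallowing every failure into raising on unexpected ones: a 404 still returns False, but other errors propagate (the synchronous variant in http_sync.py below gains the same behaviour). A sketch with an illustrative URL:

    import fsspec

    fs = fsspec.filesystem("http")
    fs.exists("https://example.com/missing")               # any failure -> False
    fs.exists("https://example.com/missing", strict=True)  # non-404 errors raise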
fsspec/implementations/http_sync.py
CHANGED
@@ -463,14 +463,20 @@ class HTTPFileSystem(AbstractFileSystem):
         end -= 1  # bytes range is inclusive
         return f"bytes={start}-{end}"

-    def exists(self, path, **kwargs):
+    def exists(self, path, strict=False, **kwargs):
         kw = self.kwargs.copy()
         kw.update(kwargs)
         try:
             logger.debug(path)
             r = self.session.get(self.encode_url(path), **kw)
+            if strict:
+                self._raise_not_found_for_status(r, path)
             return r.status_code < 400
+        except FileNotFoundError:
+            return False
         except Exception:
+            if strict:
+                raise
             return False

     def isfile(self, path, **kwargs):