fsspec 2023.6.0__py3-none-any.whl → 2023.9.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- fsspec/_version.py +3 -3
- fsspec/asyn.py +154 -92
- fsspec/caching.py +1 -1
- fsspec/compression.py +7 -2
- fsspec/core.py +16 -8
- fsspec/generic.py +111 -17
- fsspec/gui.py +4 -2
- fsspec/implementations/cache_mapper.py +80 -0
- fsspec/implementations/cache_metadata.py +232 -0
- fsspec/implementations/cached.py +74 -157
- fsspec/implementations/dirfs.py +3 -1
- fsspec/implementations/http.py +36 -19
- fsspec/implementations/local.py +4 -21
- fsspec/implementations/memory.py +8 -9
- fsspec/implementations/reference.py +8 -8
- fsspec/implementations/sftp.py +6 -2
- fsspec/implementations/smb.py +39 -23
- fsspec/mapping.py +8 -0
- fsspec/registry.py +22 -0
- fsspec/spec.py +164 -96
- fsspec/tests/abstract/__init__.py +147 -0
- fsspec/tests/abstract/common.py +175 -0
- fsspec/tests/abstract/copy.py +250 -56
- fsspec/tests/abstract/get.py +248 -38
- fsspec/tests/abstract/put.py +246 -66
- fsspec/utils.py +25 -8
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/METADATA +1 -1
- fsspec-2023.9.1.dist-info/RECORD +54 -0
- fsspec-2023.6.0.dist-info/RECORD +0 -51
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/LICENSE +0 -0
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/WHEEL +0 -0
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/top_level.txt +0 -0
fsspec/implementations/cached.py
CHANGED
@@ -1,24 +1,27 @@
 from __future__ import annotations
 
-import contextlib
-import hashlib
 import inspect
 import logging
 import os
-import pickle
 import tempfile
 import time
+import weakref
 from shutil import rmtree
-from typing import ClassVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
 
 from fsspec import AbstractFileSystem, filesystem
 from fsspec.callbacks import _DEFAULT_CALLBACK
 from fsspec.compression import compr
 from fsspec.core import BaseCache, MMapCache
 from fsspec.exceptions import BlocksizeMismatchError
+from fsspec.implementations.cache_mapper import create_cache_mapper
+from fsspec.implementations.cache_metadata import CacheMetadata
 from fsspec.spec import AbstractBufferedFile
 from fsspec.utils import infer_compression
 
+if TYPE_CHECKING:
+    from fsspec.implementations.cache_mapper import AbstractCacheMapper
+
 logger = logging.getLogger("fsspec.cached")
 
 
@@ -53,8 +56,9 @@ class CachingFileSystem(AbstractFileSystem):
         expiry_time=604800,
         target_options=None,
         fs=None,
-        same_names=False,
+        same_names: bool | None = None,
         compression=None,
+        cache_mapper: AbstractCacheMapper | None = None,
         **kwargs,
     ):
         """
@@ -84,13 +88,19 @@ class CachingFileSystem(AbstractFileSystem):
         fs: filesystem instance
             The target filesystem to run against. Provide this or ``protocol``.
         same_names: bool (optional)
-            By default, target URLs are hashed, so that files from different backends
-            with the same basename do not conflict. If this is true, the original
-            basename is used.
+            By default, target URLs are hashed using a ``HashCacheMapper`` so
+            that files from different backends with the same basename do not
+            conflict. If this argument is ``true``, a ``BasenameCacheMapper``
+            is used instead. Other cache mapper options are available by using
+            the ``cache_mapper`` keyword argument. Only one of this and
+            ``cache_mapper`` should be specified.
         compression: str (optional)
             To decompress on download. Can be 'infer' (guess from the URL name),
             one of the entries in ``fsspec.compression.compr``, or None for no
             decompression.
+        cache_mapper: AbstractCacheMapper (optional)
+            The object use to map from original filenames to cached filenames.
+            Only one of this and ``same_names`` should be specified.
         """
         super().__init__(**kwargs)
         if fs is None and target_protocol is None:
@@ -102,7 +112,9 @@ class CachingFileSystem(AbstractFileSystem):
                 "Both filesystems (fs) and target_protocol may not be both given."
             )
         if cache_storage == "TMP":
-            storage = [tempfile.mkdtemp()]
+            tempdir = tempfile.mkdtemp()
+            storage = [tempdir]
+            weakref.finalize(self, self._remove_tempdir, tempdir)
         else:
             if isinstance(cache_storage, str):
                 storage = [cache_storage]
@@ -115,14 +127,25 @@ class CachingFileSystem(AbstractFileSystem):
         self.check_files = check_files
         self.expiry = expiry_time
         self.compression = compression
-        # TODO: same_names should allow for variable prefix, not only
-        #  to keep the basename
-        self.same_names = same_names
+
+        if same_names is not None and cache_mapper is not None:
+            raise ValueError(
+                "Cannot specify both same_names and cache_mapper in "
+                "CachingFileSystem.__init__"
+            )
+        if cache_mapper is not None:
+            self._mapper = cache_mapper
+        else:
+            self._mapper = create_cache_mapper(
+                same_names if same_names is not None else False
+            )
+
         self.target_protocol = (
             target_protocol
             if isinstance(target_protocol, str)
             else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
         )
+        self._metadata = CacheMetadata(self.storage)
         self.load_cache()
         self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
 
@@ -130,68 +153,28 @@ class CachingFileSystem(AbstractFileSystem):
             # acts as a method, since each instance has a difference target
            return self.fs._strip_protocol(type(self)._strip_protocol(path))
 
-        self._strip_protocol = _strip_protocol
+        self._strip_protocol: Callable = _strip_protocol
+
+    @staticmethod
+    def _remove_tempdir(tempdir):
+        try:
+            rmtree(tempdir)
+        except Exception:
+            pass
 
     def _mkcache(self):
         os.makedirs(self.storage[-1], exist_ok=True)
 
     def load_cache(self):
         """Read set of stored blocks from file"""
-        cached_files = []
-        for storage in self.storage:
-            fn = os.path.join(storage, "cache")
-            if os.path.exists(fn):
-                with open(fn, "rb") as f:
-                    # TODO: consolidate blocks here
-                    loaded_cached_files = pickle.load(f)
-                    for c in loaded_cached_files.values():
-                        if isinstance(c["blocks"], list):
-                            c["blocks"] = set(c["blocks"])
-                    cached_files.append(loaded_cached_files)
-            else:
-                cached_files.append({})
+        self._metadata.load()
         self._mkcache()
-        self.cached_files = cached_files or [{}]
         self.last_cache = time.time()
 
     def save_cache(self):
         """Save set of stored blocks from file"""
-        fn = os.path.join(self.storage[-1], "cache")
-        # TODO: a file lock could be used to ensure file does not change
-        # between re-read and write; but occasional duplicated reads ok.
-        cache = self.cached_files[-1]
-        if os.path.exists(fn):
-            with open(fn, "rb") as f:
-                cached_files = pickle.load(f)
-            for k, c in cached_files.items():
-                if k in cache:
-                    if c["blocks"] is True or cache[k]["blocks"] is True:
-                        c["blocks"] = True
-                    else:
-                        # self.cached_files[*][*]["blocks"] must continue to
-                        # point to the same set object so that updates
-                        # performed by MMapCache are propagated back to
-                        # self.cached_files.
-                        blocks = cache[k]["blocks"]
-                        blocks.update(c["blocks"])
-                        c["blocks"] = blocks
-                    c["time"] = max(c["time"], cache[k]["time"])
-                    c["uid"] = cache[k]["uid"]
-
-            # Files can be added to cache after it was written once
-            for k, c in cache.items():
-                if k not in cached_files:
-                    cached_files[k] = c
-        else:
-            cached_files = cache
-        cache = {k: v.copy() for k, v in cached_files.items()}
-        for c in cache.values():
-            if isinstance(c["blocks"], set):
-                c["blocks"] = list(c["blocks"])
         self._mkcache()
-        with atomic_write(fn) as f:
-            pickle.dump(cache, f)
-        self.cached_files[-1] = cached_files
+        self._metadata.save()
         self.last_cache = time.time()
 
     def _check_cache(self):
@@ -208,25 +191,11 @@ class CachingFileSystem(AbstractFileSystem):
     def _check_file(self, path):
         """Is path in cache and still valid"""
         path = self._strip_protocol(path)
-
         self._check_cache()
-        for storage, cache in zip(self.storage, self.cached_files):
-            if path not in cache:
-                continue
-            detail = cache[path].copy()
-            if self.check_files:
-                if detail["uid"] != self.fs.ukey(path):
-                    continue
-            if self.expiry:
-                if time.time() - detail["time"] > self.expiry:
-                    continue
-            fn = os.path.join(storage, detail["fn"])
-            if os.path.exists(fn):
-                return detail, fn
-        return False
+        return self._metadata.check_file(path, self)
 
     def clear_cache(self):
-        """Remove all files and metadat from the cache
+        """Remove all files and metadata from the cache
 
         In the case of multiple cache locations, this clears only the last one,
         which is assumed to be the read/write one.
@@ -253,22 +222,12 @@ class CachingFileSystem(AbstractFileSystem):
 
         self._check_cache()
 
-        for path, detail in self.cached_files[-1].copy().items():
-            if time.time() - detail["time"] > expiry_time:
-                if self.same_names:
-                    basename = os.path.basename(detail["original"])
-                    fn = os.path.join(self.storage[-1], basename)
-                else:
-                    fn = os.path.join(self.storage[-1], detail["fn"])
-                if os.path.exists(fn):
-                    os.remove(fn)
-                self.cached_files[-1].pop(path)
-
-        if self.cached_files[-1]:
-            cache_path = os.path.join(self.storage[-1], "cache")
-            with atomic_write(cache_path) as fc:
-                pickle.dump(self.cached_files[-1], fc)
-        else:
+        expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
+        for fn in expired_files:
+            if os.path.exists(fn):
+                os.remove(fn)
+
+        if writable_cache_empty:
             rmtree(self.storage[-1])
             self.load_cache()
 
@@ -280,19 +239,9 @@ class CachingFileSystem(AbstractFileSystem):
         raises PermissionError
         """
         path = self._strip_protocol(path)
-        details = self._check_file(path)
-        if not details:
-            return
-        _, fn = details
-        if fn.startswith(self.storage[-1]):
-            # is in in writable cache
+        fn = self._metadata.pop_file(path)
+        if fn is not None:
             os.remove(fn)
-            self.cached_files[-1].pop(path)
-            self.save_cache()
-        else:
-            raise PermissionError(
-                "Can only delete cached file in last, writable cache location"
-            )
 
     def _open(
         self,
@@ -339,7 +288,7 @@ class CachingFileSystem(AbstractFileSystem):
             # TODO: action where partial file exists in read-only cache
             logger.debug("Opening partially cached copy of %s" % path)
         else:
-            hash = self.hash_name(path, self.same_names)
+            hash = self._mapper(path)
             fn = os.path.join(self.storage[-1], hash)
             blocks = set()
             detail = {
@@ -349,7 +298,7 @@ class CachingFileSystem(AbstractFileSystem):
                 "time": time.time(),
                 "uid": self.fs.ukey(path),
             }
-            self.cached_files[-1][path] = detail
+            self._metadata.update_file(path, detail)
             logger.debug("Creating local sparse file for %s" % path)
 
         # call target filesystems open
@@ -385,18 +334,17 @@ class CachingFileSystem(AbstractFileSystem):
             self.save_cache()
         return f
 
-    def hash_name(self, path, same_name):
-        return hash_name(path, same_name=same_name)
+    def hash_name(self, path: str, *args: Any) -> str:
+        # Kept for backward compatibility with downstream libraries.
+        # Ignores extra arguments, previously same_name boolean.
+        return self._mapper(path)
 
     def close_and_update(self, f, close):
         """Called when a file is closing, so store the set of blocks"""
         if f.closed:
             return
         path = self._strip_protocol(f.path)
-
-        c = self.cached_files[-1][path]
-        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
-            c["blocks"] = True
+        self._metadata.on_close_cached_file(f, path)
         try:
             logger.debug("going to save")
             self.save_cache()
@@ -488,7 +436,7 @@ class CachingFileSystem(AbstractFileSystem):
            and self.check_files == other.check_files
            and self.expiry == other.expiry
            and self.compression == other.compression
-           and self.same_names == other.same_names
+           and self._mapper == other._mapper
            and self.target_protocol == other.target_protocol
        )
 
@@ -501,7 +449,7 @@ class CachingFileSystem(AbstractFileSystem):
            ^ hash(self.check_files)
            ^ hash(self.expiry)
            ^ hash(self.compression)
-           ^ hash(self.same_names)
+           ^ hash(self._mapper)
            ^ hash(self.target_protocol)
        )
 
@@ -546,7 +494,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
         details = [self._check_file(sp) for sp in paths]
         downpath = [p for p, d in zip(paths, details) if not d]
         downfn0 = [
-            os.path.join(self.storage[-1], self.hash_name(p, self.same_names))
+            os.path.join(self.storage[-1], self._mapper(p))
            for p, d in zip(paths, details)
        ]  # keep these path names for opening later
        downfn = [fn for fn, d in zip(downfn0, details) if not d]
@@ -558,16 +506,15 @@ class WholeFileCacheFileSystem(CachingFileSystem):
             newdetail = [
                 {
                     "original": path,
-                    "fn": self.hash_name(path, self.same_names),
+                    "fn": self._mapper(path),
                     "blocks": True,
                     "time": time.time(),
                     "uid": self.fs.ukey(path),
                 }
                 for path in downpath
             ]
-            self.cached_files[-1].update(
-                {path: detail for path, detail in zip(downpath, newdetail)}
-            )
+            for path, detail in zip(downpath, newdetail):
+                self._metadata.update_file(path, detail)
             self.save_cache()
 
         def firstpart(fn):
@@ -590,7 +537,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
             pass
 
     def _make_local_details(self, path):
-        hash = self.hash_name(path, self.same_names)
+        hash = self._mapper(path)
         fn = os.path.join(self.storage[-1], hash)
         detail = {
             "original": path,
@@ -599,7 +546,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
             "time": time.time(),
             "uid": self.fs.ukey(path),
         }
-        self.cached_files[-1][path] = detail
+        self._metadata.update_file(path, detail)
         logger.debug("Copying %s to local cache" % path)
         return fn
 
@@ -695,7 +642,7 @@ class WholeFileCacheFileSystem(CachingFileSystem):
                     data = f.read(block)
                     f2.write(data)
         else:
-            self.fs.get(path, fn)
+            self.fs.get_file(path, fn)
         self.save_cache()
         return self._open(path, mode)
 
@@ -727,11 +674,10 @@ class SimpleCacheFileSystem(WholeFileCacheFileSystem):
         for storage in self.storage:
             if not os.path.exists(storage):
                 os.makedirs(storage, exist_ok=True)
-        self.cached_files = [{}]
 
     def _check_file(self, path):
         self._check_cache()
-        sha = self.hash_name(path, self.same_names)
+        sha = self._mapper(path)
         for storage in self.storage:
             fn = os.path.join(storage, sha)
             if os.path.exists(fn):
@@ -752,7 +698,7 @@ class SimpleCacheFileSystem(WholeFileCacheFileSystem):
         if fn:
             return open(fn, mode)
 
-        sha = self.hash_name(path, self.same_names)
+        sha = self._mapper(path)
         fn = os.path.join(self.storage[-1], sha)
         logger.debug("Copying %s to local cache" % path)
         kwargs["mode"] = mode
@@ -775,7 +721,7 @@ class SimpleCacheFileSystem(WholeFileCacheFileSystem):
                     data = f.read(block)
                     f2.write(data)
         else:
-            self.fs.get(path, fn)
+            self.fs.get_file(path, fn)
         return self._open(path, mode)
 
 
@@ -836,32 +782,3 @@ class LocalTempFile:
 
     def __getattr__(self, item):
         return getattr(self.fh, item)
-
-
-def hash_name(path, same_name):
-    if same_name:
-        hash = os.path.basename(path)
-    else:
-        hash = hashlib.sha256(path.encode()).hexdigest()
-    return hash
-
-
-@contextlib.contextmanager
-def atomic_write(path, mode="wb"):
-    """
-    A context manager that opens a temporary file next to `path` and, on exit,
-    replaces `path` with the temporary file, thereby updating `path`
-    atomically.
-    """
-    fd, fn = tempfile.mkstemp(
-        dir=os.path.dirname(path), prefix=os.path.basename(path) + "-"
-    )
-    try:
-        with open(fd, mode) as fp:
-            yield fp
-    except BaseException:
-        with contextlib.suppress(FileNotFoundError):
-            os.unlink(fn)
-        raise
-    else:
-        os.replace(fn, path)
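
Taken together, this refactor moves cache-file naming out of the module-level hash_name/atomic_write helpers into two new modules: fsspec/implementations/cache_mapper.py (how a remote URL maps to a local filename) and fsspec/implementations/cache_metadata.py (persistence of the former cached_files pickle). A minimal sketch of the new keyword, assuming the BasenameCacheMapper class added in cache_mapper.py in this release and any writable cache directory:

    import fsspec
    from fsspec.implementations.cache_mapper import BasenameCacheMapper

    # Equivalent to the old same_names=True, expressed via the new keyword:
    fs = fsspec.filesystem(
        "filecache",
        target_protocol="memory",                # illustrative target
        cache_storage="/tmp/fsspec-demo-cache",  # assumption: any writable dir
        cache_mapper=BasenameCacheMapper(),      # cached files keep their basename
    )

    # Passing both spellings is now rejected by the new __init__ check:
    try:
        fsspec.filesystem(
            "filecache",
            target_protocol="memory",
            cache_storage="/tmp/fsspec-demo-cache2",
            same_names=True,
            cache_mapper=BasenameCacheMapper(),
        )
    except ValueError as e:
        print(e)  # Cannot specify both same_names and cache_mapper ...
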
fsspec/implementations/dirfs.py
CHANGED
@@ -10,6 +10,8 @@ class DirFileSystem(AsyncFileSystem):
     delegates everything to the wrapped filesystem.
     """
 
+    protocol = "dir"
+
     def __init__(
         self,
         path=None,
@@ -53,7 +55,7 @@ class DirFileSystem(AsyncFileSystem):
                 return path
             if not path:
                 return self.path
-            return self.fs.sep.join((self.path, path))
+            return self.fs.sep.join((self.path, self._strip_protocol(path)))
         return [self._join(_path) for _path in path]
 
     def _relpath(self, path):
fsspec/implementations/http.py
CHANGED
@@ -192,10 +192,9 @@ class HTTPFileSystem(AsyncFileSystem):
                 for u in out
             ]
         else:
-            return list(sorted(out))
+            return sorted(out)
 
     async def _ls(self, url, detail=True, **kwargs):
-
         if self.use_listings_cache and url in self.dircache:
             out = self.dircache[url]
         else:
@@ -432,7 +431,7 @@ class HTTPFileSystem(AsyncFileSystem):
 
         return {"name": url, "size": None, **info, "type": "file"}
 
-    async def _glob(self, path, **kwargs):
+    async def _glob(self, path, maxdepth=None, **kwargs):
         """
         Find files by glob-matching.
 
@@ -440,23 +439,21 @@ class HTTPFileSystem(AsyncFileSystem):
         but "?" is not considered as a character for globbing, because it is
         so common in URLs, often identifying the "query" part.
         """
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
         import re
 
         ends = path.endswith("/")
         path = self._strip_protocol(path)
-        indstar = path.find("*") if path.find("*") >= 0 else len(path)
-        indbrace = path.find("[") if path.find("[") >= 0 else len(path)
+        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
 
-        ind = min(indstar, indbrace)
+        min_idx = min(idx_star, idx_brace)
 
         detail = kwargs.pop("detail", False)
 
         if not has_magic(path):
-            root = path
-            depth = 1
-            if ends:
-                path += "/*"
-            elif await self._exists(path):
+            if await self._exists(path):
                 if not detail:
                     return [path]
                 else:
@@ -466,13 +463,21 @@ class HTTPFileSystem(AsyncFileSystem):
                 return []  # glob of non-existent returns empty
             else:
                 return {}
-        elif "/" in path[:ind]:
-            ind = path[:ind].rindex("/")
-            root = path[: ind + 1]
-            depth = None if "**" in path else path[ind + 1 :].count("/") + 1
+        elif "/" in path[:min_idx]:
+            min_idx = path[:min_idx].rindex("/")
+            root = path[: min_idx + 1]
+            depth = path[min_idx + 1 :].count("/") + 1
         else:
             root = ""
-            depth = None if "**" in path else path[ind + 1 :].count("/") + 1
+            depth = path[min_idx + 1 :].count("/") + 1
+
+        if "**" in path:
+            if maxdepth is not None:
+                idx_double_stars = path.find("**")
+                depth_double_stars = path[idx_double_stars:].count("/") + 1
+                depth = depth - depth_double_stars + maxdepth
+            else:
+                depth = None
 
         allpaths = await self._find(
             root, maxdepth=depth, withdirs=True, detail=True, **kwargs
@@ -499,14 +504,23 @@ class HTTPFileSystem(AsyncFileSystem):
             )
             + "$"
         )
-        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
+        pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
+        pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
         pattern = re.sub("[*]", "[^/]*", pattern)
-        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
+        pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
+        pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
+        pattern = re.compile(pattern)
         out = {
             p: allpaths[p]
             for p in sorted(allpaths)
             if pattern.match(p.replace("//", "/").rstrip("/"))
         }
+
+        # Return directories only when the glob end by a slash
+        # This is needed for posix glob compliance
+        if ends:
+            out = {k: v for k, v in out.items() if v["type"] == "directory"}
+
         if detail:
             return out
         else:
@@ -841,7 +855,10 @@ async def _file_info(url, session, size_policy="head", **kwargs):
         # or 'Accept-Ranges': 'none' (not 'bytes')
         # to mean streaming only, no random access => return None
         if "Content-Length" in r.headers:
-            info["size"] = int(r.headers["Content-Length"])
+            # Some servers may choose to ignore Accept-Encoding and return
+            # compressed content, in which case the returned size is unreliable.
+            if r.headers.get("Content-Encoding", "identity") == "identity":
+                info["size"] = int(r.headers["Content-Length"])
         elif "Content-Range" in r.headers:
             info["size"] = int(r.headers["Content-Range"].split("/")[1])
 
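
The maxdepth handling turns a recursive "**" pattern into a bounded _find call: the fixed part of the pattern contributes its own depth, and the "**" segment is swapped for exactly maxdepth levels. The arithmetic can be checked in isolation; this standalone helper simply mirrors the lines added above (the function itself is hypothetical, not part of fsspec):

    from typing import Optional

    def glob_depth(path: str, min_idx: int, maxdepth: Optional[int]) -> Optional[int]:
        """Mirror the depth computation added to HTTPFileSystem._glob."""
        depth = path[min_idx + 1 :].count("/") + 1
        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                # swap the unbounded "**" levels for exactly maxdepth levels
                depth = depth - depth_double_stars + maxdepth
            else:
                depth = None  # unbounded recursive search
        return depth

    # For "data/**/*.csv" the fixed prefix ends at min_idx=4 ("data"), the
    # "**" segment contributes depth_double_stars=2, so the result is simply
    # maxdepth:
    assert glob_depth("data/**/*.csv", 4, 3) == 3
    assert glob_depth("data/**/*.csv", 4, None) is None
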
fsspec/implementations/local.py
CHANGED
@@ -65,10 +65,6 @@ class LocalFileSystem(AbstractFileSystem):
         else:
             return [posixpath.join(path, f) for f in os.listdir(path)]
 
-    def glob(self, path, **kwargs):
-        path = self._strip_protocol(path)
-        return super().glob(path, **kwargs)
-
     def info(self, path, **kwargs):
         if isinstance(path, os.DirEntry):
             # scandir DirEntry
@@ -196,11 +192,13 @@ class LocalFileSystem(AbstractFileSystem):
 
     def created(self, path):
         info = self.info(path=path)
-        return datetime.datetime.utcfromtimestamp(info["created"])
+        return datetime.datetime.fromtimestamp(
+            info["created"], tz=datetime.timezone.utc
+        )
 
     def modified(self, path):
         info = self.info(path=path)
-        return datetime.datetime.utcfromtimestamp(info["mtime"])
+        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
 
     @classmethod
     def _parent(cls, path):
@@ -285,21 +283,6 @@ def trailing_sep(path):
     return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
 
 
-def trailing_sep_maybe_asterisk(path):
-    """Return True if the path ends with a path separator and optionally an
-    asterisk.
-
-    A forward slash is always considered a path separator, even on Operating
-    Systems that normally use a backslash.
-    """
-    # TODO: if all incoming paths were posix-compliant then separator would
-    # always be a forward slash, simplifying this function.
-    # See https://github.com/fsspec/filesystem_spec/pull/1250
-    return path.endswith((os.sep, os.sep + "*")) or (
-        os.altsep is not None and path.endswith((os.altsep, os.altsep + "*"))
-    )
-
-
 class LocalFileOpener(io.IOBase):
     def __init__(
         self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
fsspec/implementations/memory.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import absolute_import, annotations, division, print_function
 
 import logging
-from datetime import datetime
+from datetime import datetime, timezone
 from errno import ENOTEMPTY
 from io import BytesIO
 from typing import Any, ClassVar
@@ -137,10 +137,6 @@ class MemoryFileSystem(AbstractFileSystem):
         else:
             raise FileNotFoundError(path)
 
-    def exists(self, path, **kwargs):
-        path = self._strip_protocol(path)
-        return path in self.store or path in self.pseudo_dirs
-
     def info(self, path, **kwargs):
         path = self._strip_protocol(path)
         if path in self.pseudo_dirs or any(
@@ -191,11 +187,14 @@ class MemoryFileSystem(AbstractFileSystem):
                 return f
             else:
                 raise FileNotFoundError(path)
-        if mode == "wb":
+        elif mode == "wb":
             m = MemoryFile(self, path, kwargs.get("data"))
             if not self._intrans:
                 m.commit()
             return m
+        else:
+            name = self.__class__.__name__
+            raise ValueError(f"unsupported file mode for {name}: {mode!r}")
 
     def cp_file(self, path1, path2, **kwargs):
         path1 = self._strip_protocol(path1)
@@ -269,8 +268,8 @@ class MemoryFile(BytesIO):
         logger.debug("open file %s", path)
         self.fs = fs
         self.path = path
-        self.created = datetime.utcnow()
-        self.modified = datetime.utcnow()
+        self.created = datetime.now(tz=timezone.utc)
+        self.modified = datetime.now(tz=timezone.utc)
         if data:
             super().__init__(data)
             self.seek(0)
@@ -290,4 +289,4 @@ class MemoryFile(BytesIO):
 
     def commit(self):
         self.fs.store[self.path] = self
-        self.modified = datetime.utcnow()
+        self.modified = datetime.now(tz=timezone.utc)