fsspec 2024.3.1__py3-none-any.whl → 2024.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fsspec/__init__.py +2 -3
- fsspec/_version.py +14 -19
- fsspec/caching.py +86 -16
- fsspec/compression.py +2 -1
- fsspec/core.py +32 -8
- fsspec/exceptions.py +1 -0
- fsspec/generic.py +1 -1
- fsspec/gui.py +1 -1
- fsspec/implementations/arrow.py +0 -2
- fsspec/implementations/cache_mapper.py +1 -2
- fsspec/implementations/cache_metadata.py +7 -7
- fsspec/implementations/cached.py +1 -13
- fsspec/implementations/dirfs.py +2 -2
- fsspec/implementations/github.py +12 -0
- fsspec/implementations/http.py +9 -9
- fsspec/implementations/local.py +78 -45
- fsspec/implementations/memory.py +9 -0
- fsspec/implementations/reference.py +6 -0
- fsspec/implementations/smb.py +13 -1
- fsspec/implementations/webhdfs.py +1 -3
- fsspec/json.py +81 -0
- fsspec/parquet.py +0 -8
- fsspec/registry.py +28 -18
- fsspec/spec.py +97 -38
- fsspec/tests/abstract/mv.py +57 -0
- fsspec/utils.py +1 -3
- fsspec-2024.6.0.dist-info/METADATA +279 -0
- fsspec-2024.6.0.dist-info/RECORD +55 -0
- {fsspec-2024.3.1.dist-info → fsspec-2024.6.0.dist-info}/WHEEL +1 -2
- fsspec-2024.3.1.dist-info/METADATA +0 -167
- fsspec-2024.3.1.dist-info/RECORD +0 -54
- fsspec-2024.3.1.dist-info/top_level.txt +0 -1
- {fsspec-2024.3.1.dist-info → fsspec-2024.6.0.dist-info/licenses}/LICENSE +0 -0
fsspec/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from importlib.metadata import entry_points
 
-from . import _version, caching
+from . import caching
+from ._version import __version__  # noqa: F401
 from .callbacks import Callback
 from .compression import available_compressions
 from .core import get_fs_token_paths, open, open_files, open_local, url_to_fs
@@ -15,8 +16,6 @@ from .registry import (
 )
 from .spec import AbstractFileSystem
 
-__version__ = _version.get_versions()["version"]
-
 __all__ = [
     "AbstractFileSystem",
     "FSTimeoutError",
fsspec/_version.py
CHANGED
@@ -1,21 +1,16 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
 
-
-
-
-
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
 
-import json
-
-version_json = '''
-{
- "date": "2024-03-18T15:33:58-0400",
- "dirty": false,
- "error": null,
- "full-revisionid": "47b445ae4c284a82dd15e0287b1ffc410e8fc470",
- "version": "2024.3.1"
-}
-'''  # END VERSION_JSON
-
-
-def get_versions():
-    return json.loads(version_json)
+__version__ = version = '2024.6.0'
+__version_tuple__ = version_tuple = (2024, 6, 0)
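The replacement file is the static template that setuptools_scm generates, so the version can be read without any JSON parsing at import time. A quick check (illustrative only; the values are the ones in the diff above):

    import fsspec
    from fsspec._version import __version_tuple__

    print(fsspec.__version__)  # '2024.6.0', re-exported by fsspec/__init__.py
    print(__version_tuple__)   # (2024, 6, 0)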
fsspec/caching.py
CHANGED
@@ -15,6 +15,7 @@ from typing import (
     ClassVar,
     Generic,
     NamedTuple,
+    Optional,
     OrderedDict,
     TypeVar,
 )
@@ -56,8 +57,13 @@ class BaseCache:
 
     def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
         self.blocksize = blocksize
+        self.nblocks = 0
         self.fetcher = fetcher
         self.size = size
+        self.hit_count = 0
+        self.miss_count = 0
+        # the bytes that we actually requested
+        self.total_requested_bytes = 0
 
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
@@ -68,6 +74,36 @@ class BaseCache:
             return b""
         return self.fetcher(start, stop)
 
+    def _reset_stats(self) -> None:
+        """Reset hit and miss counts for a more ganular report e.g. by file."""
+        self.hit_count = 0
+        self.miss_count = 0
+        self.total_requested_bytes = 0
+
+    def _log_stats(self) -> str:
+        """Return a formatted string of the cache statistics."""
+        if self.hit_count == 0 and self.miss_count == 0:
+            # a cache that does nothing, this is for logs only
+            return ""
+        return " , %s: %d hits, %d misses, %d total requested bytes" % (
+            self.name,
+            self.hit_count,
+            self.miss_count,
+            self.total_requested_bytes,
+        )
+
+    def __repr__(self) -> str:
+        # TODO: use rich for better formatting
+        return f"""
+        <{self.__class__.__name__}:
+            block size  :   {self.blocksize}
+            block count :   {self.nblocks}
+            file size   :   {self.size}
+            cache hits  :   {self.hit_count}
+            cache misses:   {self.miss_count}
+            total requested bytes: {self.total_requested_bytes}>
+        """
+
 
 class MMapCache(BaseCache):
     """memory-mapped sparse file cache
@@ -126,13 +162,18 @@ class MMapCache(BaseCache):
         start_block = start // self.blocksize
         end_block = end // self.blocksize
         need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
+        hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
+        self.miss_count += len(need)
+        self.hit_count += len(hits)
         while need:
             # TODO: not a for loop so we can consolidate blocks later to
             # make fewer fetch calls; this could be parallel
             i = need.pop(0)
+
             sstart = i * self.blocksize
             send = min(sstart + self.blocksize, self.size)
-
+            self.total_requested_bytes += send - sstart
+            logger.debug(f"MMap get block #{i} ({sstart}-{send})")
             self.cache[sstart:send] = self.fetcher(sstart, send)
             self.blocks.add(i)
 
@@ -176,16 +217,20 @@ class ReadAheadCache(BaseCache):
         l = end - start
         if start >= self.start and end <= self.end:
             # cache hit
+            self.hit_count += 1
             return self.cache[start - self.start : end - self.start]
         elif self.start <= start < self.end:
             # partial hit
+            self.miss_count += 1
             part = self.cache[start - self.start :]
             l -= len(part)
             start = self.end
         else:
             # miss
+            self.miss_count += 1
             part = b""
             end = min(self.size, end + self.blocksize)
+        self.total_requested_bytes += end - start
         self.cache = self.fetcher(start, end)  # new block replaces old
         self.start = start
         self.end = self.start + len(self.cache)
@@ -202,24 +247,39 @@ class FirstChunkCache(BaseCache):
     name = "first"
 
     def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
+        if blocksize > size:
+            # this will buffer the whole thing
+            blocksize = size
         super().__init__(blocksize, fetcher, size)
         self.cache: bytes | None = None
 
     def _fetch(self, start: int | None, end: int | None) -> bytes:
         start = start or 0
-        end = end or self.size
+        if start > self.size:
+            logger.debug("FirstChunkCache: requested start > file size")
+            return b""
+
+        end = min(end, self.size)
+
         if start < self.blocksize:
             if self.cache is None:
+                self.miss_count += 1
                 if end > self.blocksize:
+                    self.total_requested_bytes += end
                     data = self.fetcher(0, end)
                     self.cache = data[: self.blocksize]
                     return data[start:]
                 self.cache = self.fetcher(0, self.blocksize)
+                self.total_requested_bytes += self.blocksize
             part = self.cache[start:end]
             if end > self.blocksize:
+                self.total_requested_bytes += end - self.blocksize
                 part += self.fetcher(self.blocksize, end)
+            self.hit_count += 1
             return part
         else:
+            self.miss_count += 1
+            self.total_requested_bytes += end - start
             return self.fetcher(start, end)
 
 
@@ -256,12 +316,6 @@ class BlockCache(BaseCache):
         self.maxblocks = maxblocks
         self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)
 
-    def __repr__(self) -> str:
-        return (
-            f"<BlockCache blocksize={self.blocksize}, "
-            f"size={self.size}, nblocks={self.nblocks}>"
-        )
-
     def cache_info(self):
         """
         The statistics on the block cache.
@@ -319,6 +373,8 @@
 
         start = block_number * self.blocksize
         end = start + self.blocksize
+        self.total_requested_bytes += end - start
+        self.miss_count += 1
         logger.info("BlockCache fetching block %d", block_number)
         block_contents = super()._fetch(start, end)
         return block_contents
@@ -339,6 +395,7 @@
         start_pos = start % self.blocksize
         end_pos = end % self.blocksize
 
+        self.hit_count += 1
         if start_block_number == end_block_number:
             block: bytes = self._fetch_block_cached(start_block_number)
             return block[start_pos:end_pos]
@@ -404,6 +461,7 @@ class BytesCache(BaseCache):
         ):
             # cache hit: we have all the required data
             offset = start - self.start
+            self.hit_count += 1
             return self.cache[offset : offset + end - start]
 
         if self.blocksize:
@@ -418,17 +476,22 @@
             self.end is None or end > self.end
         ):
             # First read, or extending both before and after
+            self.total_requested_bytes += bend - start
+            self.miss_count += 1
             self.cache = self.fetcher(start, bend)
             self.start = start
         else:
             assert self.start is not None
             assert self.end is not None
+            self.miss_count += 1
 
             if start < self.start:
                 if self.end is None or self.end - end > self.blocksize:
+                    self.total_requested_bytes += bend - start
                     self.cache = self.fetcher(start, bend)
                     self.start = start
                 else:
+                    self.total_requested_bytes += self.start - start
                     new = self.fetcher(start, self.start)
                     self.start = start
                     self.cache = new + self.cache
@@ -436,9 +499,11 @@
             if self.end > self.size:
                 pass
             elif end - self.end > self.blocksize:
+                self.total_requested_bytes += bend - start
                 self.cache = self.fetcher(start, bend)
                 self.start = start
             else:
+                self.total_requested_bytes += bend - self.end
                 new = self.fetcher(self.end, bend)
                 self.cache = self.cache + new
 
@@ -470,10 +535,13 @@ class AllBytes(BaseCache):
     ) -> None:
         super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
         if data is None:
+            self.miss_count += 1
+            self.total_requested_bytes += self.size
             data = self.fetcher(0, self.size)
         self.data = data
 
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
+        self.hit_count += 1
         return self.data[start:stop]
 
 
@@ -507,7 +575,7 @@ class KnownPartsOfAFile(BaseCache):
         blocksize: int,
         fetcher: Fetcher,
         size: int,
-        data: dict[tuple[int, int], bytes] = {},
+        data: Optional[dict[tuple[int, int], bytes]] = None,
         strict: bool = True,
         **_: Any,
     ):
@@ -530,7 +598,7 @@ class KnownPartsOfAFile(BaseCache):
 
             self.data = dict(zip(offsets, blocks))
         else:
-            self.data = data
+            self.data = {}
 
     def _fetch(self, start: int | None, stop: int | None) -> bytes:
         if start is None:
@@ -551,6 +619,7 @@ class KnownPartsOfAFile(BaseCache):
                 # are allowed to pad reads beyond the
                 # buffer with zero
                 out += b"\x00" * (stop - start - len(out))
+                self.hit_count += 1
                 return out
             else:
                 # The request ends outside a known range,
@@ -572,6 +641,8 @@ class KnownPartsOfAFile(BaseCache):
                 f"IO/caching performance may be poor!"
             )
         logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
+        self.total_requested_bytes += stop - start
+        self.miss_count += 1
         return out + super()._fetch(start, stop)
 
 
@@ -676,12 +747,6 @@ class BackgroundBlockCache(BaseCache):
         self._fetch_future: Future[bytes] | None = None
         self._fetch_future_lock = threading.Lock()
 
-    def __repr__(self) -> str:
-        return (
-            f"<BackgroundBlockCache blocksize={self.blocksize}, "
-            f"size={self.size}, nblocks={self.nblocks}>"
-        )
-
     def cache_info(self) -> UpdatableLRU.CacheInfo:
         """
         The statistics on the block cache.
@@ -799,6 +864,8 @@
         start = block_number * self.blocksize
         end = start + self.blocksize
         logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
+        self.total_requested_bytes += end - start
+        self.miss_count += 1
         block_contents = super()._fetch(start, end)
         return block_contents
 
@@ -818,6 +885,9 @@
         start_pos = start % self.blocksize
         end_pos = end % self.blocksize
 
+        # kind of pointless to count this as a hit, but it is
+        self.hit_count += 1
+
         if start_block_number == end_block_number:
             block = self._fetch_block_cached(start_block_number)
             return block[start_pos:end_pos]
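Taken together, these hunks move cache statistics into BaseCache — hit_count, miss_count and total_requested_bytes, plus _reset_stats() and _log_stats() — and replace the per-class __repr__ of BlockCache and BackgroundBlockCache with one shared implementation. A minimal sketch driving ReadAheadCache with an in-memory fetcher (it calls the internal _fetch method purely for illustration):

    from fsspec.caching import ReadAheadCache

    data = b"x" * 1_000_000

    def fetcher(start, end):
        # stands in for a remote range request
        return data[start:end]

    cache = ReadAheadCache(blocksize=64 * 1024, fetcher=fetcher, size=len(data))
    cache._fetch(0, 1024)   # miss: fetches and reads ahead a full block
    cache._fetch(100, 200)  # hit: served from the cached block
    print(cache.hit_count, cache.miss_count, cache.total_requested_bytes)
    print(cache)            # the new shared BaseCache.__repr__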
fsspec/compression.py
CHANGED
@@ -1,4 +1,5 @@
 """Helper functions for a standard streaming compression API"""
+
 from zipfile import ZipFile
 
 import fsspec.utils
@@ -138,7 +139,7 @@ class SnappyFile(AbstractBufferedFile):
 try:
     import snappy
 
-    snappy.compress
+    snappy.compress(b"")
     # Snappy may use the .sz file extension, but this is not part of the
     # standard implementation.
     register_compression("snappy", SnappyFile, [])
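Calling snappy.compress(b"") instead of merely touching the attribute means a package that is importable as snappy but cannot actually compress fails the probe and is never registered. Which codecs made it in can be checked like this (a sketch; output depends on the installed packages):

    from fsspec.compression import available_compressions

    # "snappy" appears only if python-snappy imported and compress(b"") worked
    print(available_compressions())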
fsspec/core.py
CHANGED
@@ -8,7 +8,7 @@ from glob import has_magic
 from pathlib import Path
 
 # for backwards compat, we export cache things from here too
-from .caching import (  # noqa: F401
+from fsspec.caching import (  # noqa: F401
     BaseCache,
     BlockCache,
     BytesCache,
@@ -16,9 +16,10 @@ from .caching import (  # noqa: F401
     ReadAheadCache,
     caches,
 )
-from .compression import compr
-from .registry import filesystem, get_filesystem_class
-from .utils import (
+from fsspec.compression import compr
+from fsspec.config import conf
+from fsspec.registry import filesystem, get_filesystem_class
+from fsspec.utils import (
     _unstrip_protocol,
     build_name_function,
     infer_compression,
@@ -100,7 +101,18 @@ class OpenFile:
     def __enter__(self):
         mode = self.mode.replace("t", "").replace("b", "") + "b"
 
-        f = self.fs.open(self.path, mode=mode)
+        try:
+            f = self.fs.open(self.path, mode=mode)
+        except FileNotFoundError as e:
+            if has_magic(self.path):
+                raise FileNotFoundError(
+                    "%s not found. The URL contains glob characters: you maybe needed\n"
+                    "to pass expand=True in fsspec.open() or the storage_options of \n"
+                    "your library. You can also set the config value 'open_expand'\n"
+                    "before import, or fsspec.core.DEFAULT_EXPAND at runtime, to True.",
+                    self.path,
+                ) from e
+            raise
 
         self.fobjects = [f]
 
@@ -367,6 +379,7 @@ def url_to_fs(url, **kwargs):
     urlpath : str
         The file-systems-specific URL for ``url``.
     """
+    url = stringify_path(url)
     # non-FS arguments that appear in fsspec.open()
    # inspect could keep this in sync with open()'s signature
     known_kwargs = {
@@ -396,6 +409,9 @@ def url_to_fs(url, **kwargs):
     return fs, urlpath
 
 
+DEFAULT_EXPAND = conf.get("open_expand", False)
+
+
 def open(
     urlpath,
     mode="rb",
@@ -404,6 +420,7 @@ def open(
     errors=None,
     protocol=None,
     newline=None,
+    expand=None,
     **kwargs,
 ):
     """Given a path or paths, return one ``OpenFile`` object.
@@ -428,6 +445,13 @@ def open(
     newline: bytes or None
         Used for line terminator in text mode. If None, uses system default;
         if blank, uses no translation.
+    expand: bool or Nonw
+        Whether to regard file paths containing special glob characters as needing
+        expansion (finding the first match) or absolute. Setting False allows using
+        paths which do embed such characters. If None (default), this argument
+        takes its value from the DEFAULT_EXPAND module variable, which takes
+        its initial value from the "open_expand" config value at startup, which will
+        be False if not set.
     **kwargs: dict
         Extra options that make sense to a particular storage connection, e.g.
         host, port, username, password, etc.
@@ -456,8 +480,7 @@ def open(
     - For implementations in separate packages see
       https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
     """
-
-    kw.update(kwargs)
+    expand = DEFAULT_EXPAND if expand is None else expand
     out = open_files(
         urlpath=[urlpath],
         mode=mode,
@@ -466,7 +489,8 @@ def open(
         errors=errors,
         protocol=protocol,
         newline=newline,
-        **kw,
+        expand=expand,
+        **kwargs,
     )
     if not out:
         raise FileNotFoundError(urlpath)
fsspec/exceptions.py
CHANGED
fsspec/generic.py
CHANGED
@@ -139,7 +139,7 @@ def rsync(
     source_files, target_files = zip(*allfiles.items())
     fs.cp(source_files, target_files, **kwargs)
     logger.debug(f"{len(to_delete)} files to delete")
-    if delete_missing:
+    if delete_missing and to_delete:
         fs.rm(to_delete)
     return allfiles
 
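The added guard means rsync with delete_missing=True no longer calls fs.rm with an empty list when source and destination already agree, which could raise on some backends. A sketch against the memory filesystem (paths are illustrative):

    import fsspec
    from fsspec.generic import rsync

    mem = fsspec.filesystem("memory")
    mem.pipe("/src/a.txt", b"hello")

    # nothing to delete on a first sync, so fs.rm is skipped entirely
    rsync("memory://src", "memory://dst", delete_missing=True)
    print(mem.cat("/dst/a.txt"))  # b'hello'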
fsspec/gui.py
CHANGED
@@ -94,7 +94,7 @@ class SigSlot:
         try:
             return self.panel._repr_mimebundle_(*args, **kwargs)
         except (ValueError, AttributeError):
-            raise NotImplementedError("Panel does not seem to be set up")
+            raise NotImplementedError("Panel does not seem to be set up properly")
 
     def connect(self, signal, slot):
         """Associate call back with given event
fsspec/implementations/cache_mapper.py
CHANGED
@@ -12,8 +12,7 @@ class AbstractCacheMapper(abc.ABC):
     """
 
     @abc.abstractmethod
-    def __call__(self, path: str) -> str:
-        ...
+    def __call__(self, path: str) -> str: ...
 
     def __eq__(self, other: object) -> bool:
         # Identity only depends on class. When derived classes have attributes
fsspec/implementations/cache_metadata.py
CHANGED
@@ -57,10 +57,14 @@ class CacheMetadata:
         """Low-level function to load metadata from specific file"""
         try:
             with open(fn, "r") as f:
-                return json.load(f)
+                loaded = json.load(f)
         except ValueError:
             with open(fn, "rb") as f:
-                return pickle.load(f)
+                loaded = pickle.load(f)
+        for c in loaded.values():
+            if isinstance(c.get("blocks"), list):
+                c["blocks"] = set(c["blocks"])
+        return loaded
 
     def _save(self, metadata_to_save: Detail, fn: str) -> None:
         """Low-level function to save metadata to specific file"""
@@ -152,11 +156,7 @@ class CacheMetadata:
         for fn, _, _ in self._scan_locations():
             if os.path.exists(fn):
                 # TODO: consolidate blocks here
-                loaded_cached_files = self._load(fn)
-                for c in loaded_cached_files.values():
-                    if isinstance(c["blocks"], list):
-                        c["blocks"] = set(c["blocks"])
-                cached_files.append(loaded_cached_files)
+                cached_files.append(self._load(fn))
             else:
                 cached_files.append({})
         self.cached_files = cached_files or [{}]
fsspec/implementations/cached.py
CHANGED
@@ -425,7 +425,6 @@ class CachingFileSystem(AbstractFileSystem):
             "clear_cache",
             "clear_expired_cache",
             "pop_from_cache",
-            "_mkcache",
             "local_file",
             "_paths_from_path",
             "get_mapper",
@@ -435,12 +434,10 @@ class CachingFileSystem(AbstractFileSystem):
             "__hash__",
             "__eq__",
             "to_json",
+            "to_dict",
             "cache_size",
             "pipe_file",
             "pipe",
-            "isdir",
-            "isfile",
-            "exists",
             "start_transaction",
             "end_transaction",
         }:
@@ -510,15 +507,6 @@ class CachingFileSystem(AbstractFileSystem):
             ^ hash(self.target_protocol)
         )
 
-    def to_json(self):
-        """Calculate JSON representation.
-
-        Not implemented yet for CachingFileSystem.
-        """
-        raise NotImplementedError(
-            "CachingFileSystem JSON representation not implemented"
-        )
-
 
 class WholeFileCacheFileSystem(CachingFileSystem):
     """Caches whole remote files on first access
fsspec/implementations/dirfs.py
CHANGED
@@ -329,8 +329,8 @@ class DirFileSystem(AsyncFileSystem):
     def rmdir(self, path):
         return self.fs.rmdir(self._join(path))
 
-    def mv_file(self, path1, path2, **kwargs):
-        return self.fs.mv_file(
+    def mv(self, path1, path2, **kwargs):
+        return self.fs.mv(
             self._join(path1),
             self._join(path2),
             **kwargs,
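With the rename, moves go through the wrapped filesystem's generic mv rather than mv_file. A small sketch using a dir filesystem layered over memory (names hypothetical):

    import fsspec

    fs = fsspec.filesystem("dir", path="root", fs=fsspec.filesystem("memory"))
    fs.pipe_file("a.txt", b"payload")
    fs.mv("a.txt", "b.txt")  # forwarded as memory.mv("root/a.txt", "root/b.txt")
    print(fs.cat("b.txt"))   # b'payload'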
fsspec/implementations/github.py
CHANGED
@@ -1,5 +1,7 @@
 import requests
 
+import fsspec
+
 from ..spec import AbstractFileSystem
 from ..utils import infer_storage_options
 from .memory import MemoryFile
@@ -225,3 +227,13 @@ class GithubFileSystem(AbstractFileSystem):
             raise FileNotFoundError(path)
         r.raise_for_status()
         return MemoryFile(None, None, r.content)
+
+    def cat(self, path, recursive=False, on_error="raise", **kwargs):
+        paths = self.expand_path(path, recursive=recursive)
+        urls = [
+            self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
+            for u, sh in paths
+        ]
+        fs = fsspec.filesystem("http")
+        data = fs.cat(urls, on_error="return")
+        return {u: v for ((k, v), u) in zip(data.items(), urls)}
fsspec/implementations/http.py
CHANGED
@@ -451,7 +451,7 @@ class HTTPFileSystem(AsyncFileSystem):
 
         ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
         path = self._strip_protocol(path)
-        append_slash_to_dirname = ends_with_slash or path.endswith("/**")
+        append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
         idx_star = path.find("*") if path.find("*") >= 0 else len(path)
         idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
 
@@ -494,15 +494,15 @@ class HTTPFileSystem(AsyncFileSystem):
         pattern = re.compile(pattern)
 
         out = {
-            p: info
+            (
+                p.rstrip("/")
+                if not append_slash_to_dirname
+                and info["type"] == "directory"
+                and p.endswith("/")
+                else p
+            ): info
             for p, info in sorted(allpaths.items())
-            if pattern.match(
-                (
-                    p + "/"
-                    if append_slash_to_dirname and info["type"] == "directory"
-                    else p
-                )
-            )
+            if pattern.match(p.rstrip("/"))
         }
 
         if detail: