fsspec 2023.6.0__py3-none-any.whl → 2023.9.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry (PyPI). It is provided for informational purposes only.
- fsspec/_version.py +3 -3
- fsspec/asyn.py +154 -92
- fsspec/caching.py +1 -1
- fsspec/compression.py +7 -2
- fsspec/core.py +16 -8
- fsspec/generic.py +111 -17
- fsspec/gui.py +4 -2
- fsspec/implementations/cache_mapper.py +80 -0
- fsspec/implementations/cache_metadata.py +232 -0
- fsspec/implementations/cached.py +74 -157
- fsspec/implementations/dirfs.py +3 -1
- fsspec/implementations/http.py +36 -19
- fsspec/implementations/local.py +4 -21
- fsspec/implementations/memory.py +8 -9
- fsspec/implementations/reference.py +8 -8
- fsspec/implementations/sftp.py +6 -2
- fsspec/implementations/smb.py +39 -23
- fsspec/mapping.py +8 -0
- fsspec/registry.py +22 -0
- fsspec/spec.py +164 -96
- fsspec/tests/abstract/__init__.py +147 -0
- fsspec/tests/abstract/common.py +175 -0
- fsspec/tests/abstract/copy.py +250 -56
- fsspec/tests/abstract/get.py +248 -38
- fsspec/tests/abstract/put.py +246 -66
- fsspec/utils.py +25 -8
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/METADATA +1 -1
- fsspec-2023.9.1.dist-info/RECORD +54 -0
- fsspec-2023.6.0.dist-info/RECORD +0 -51
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/LICENSE +0 -0
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/WHEEL +0 -0
- {fsspec-2023.6.0.dist-info → fsspec-2023.9.1.dist-info}/top_level.txt +0 -0
fsspec/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2023-…",
+ "date": "2023-09-15T16:17:21-0400",
  "dirty": false,
  "error": null,
- "full-revisionid": "…",
- "version": "2023.6.0"
+ "full-revisionid": "247b249a008990c584d2619f030bd42916a82e4a",
+ "version": "2023.9.1"
 }
 '''  # END VERSION_JSON
 
fsspec/asyn.py
CHANGED
@@ -13,12 +13,7 @@ from typing import TYPE_CHECKING, Iterable
 
 from .callbacks import _DEFAULT_CALLBACK
 from .exceptions import FSTimeoutError
-from .implementations.local import (
-    LocalFileSystem,
-    make_path_posix,
-    trailing_sep,
-    trailing_sep_maybe_asterisk,
-)
+from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
 from .spec import AbstractBufferedFile, AbstractFileSystem
 from .utils import is_exception, other_paths
 
@@ -83,6 +78,8 @@ def sync(loop, func, *args, timeout=None, **kwargs):
         loop0 = asyncio.events.get_running_loop()
         if loop0 is loop:
             raise NotImplementedError("Calling sync() from within a running loop")
+    except NotImplementedError:
+        raise
     except RuntimeError:
         pass
     coro = func(*args, **kwargs)
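Note: the two added lines matter because NotImplementedError is a subclass of RuntimeError in Python, so the pre-existing "except RuntimeError: pass" clause was silently swallowing the running-loop guard. A standalone sketch of the ordering fix (not fsspec code):

    assert issubclass(NotImplementedError, RuntimeError)

    def guard():
        try:
            raise NotImplementedError("Calling sync() from within a running loop")
        except NotImplementedError:
            raise  # re-raise so the caller sees the error
        except RuntimeError:
            pass   # without the clause above, this would swallow it too

    try:
        guard()
    except NotImplementedError:
        print("error propagated, as the fix intends")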
@@ -347,26 +344,42 @@ class AsyncFileSystem(AbstractFileSystem):
         elif on_error is None:
             on_error = "raise"
 
-        source_is_str = isinstance(path1, str)
-        paths = await self._expand_path(path1, maxdepth=maxdepth, recursive=recursive)
-        if source_is_str and (not recursive or maxdepth is not None):
-            # Non-recursive glob does not copy directories
-            paths = [p for p in paths if not (trailing_sep(p) or await self._isdir(p))]
-            if not paths:
-                return
+        if isinstance(path1, list) and isinstance(path2, list):
+            # No need to expand paths when both source and destination
+            # are provided as lists
+            paths1 = path1
+            paths2 = path2
+        else:
+            source_is_str = isinstance(path1, str)
+            paths1 = await self._expand_path(
+                path1, maxdepth=maxdepth, recursive=recursive
+            )
+            if source_is_str and (not recursive or maxdepth is not None):
+                # Non-recursive glob does not copy directories
+                paths1 = [
+                    p for p in paths1 if not (trailing_sep(p) or await self._isdir(p))
+                ]
+                if not paths1:
+                    return
+
+            source_is_file = len(paths1) == 1
+            dest_is_dir = isinstance(path2, str) and (
+                trailing_sep(path2) or await self._isdir(path2)
+            )
+
+            exists = source_is_str and (
+                (has_magic(path1) and source_is_file)
+                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
+            )
+            paths2 = other_paths(
+                paths1,
+                path2,
+                exists=exists,
+                flatten=not source_is_str,
+            )
 
-        isdir = isinstance(path2, str) and (
-            trailing_sep(path2) or await self._isdir(path2)
-        )
-        path2 = other_paths(
-            paths,
-            path2,
-            exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(path1),
-            is_dir=isdir,
-            flatten=not source_is_str,
-        )
         batch_size = batch_size or self.batch_size
-        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths, path2)]
+        coros = [self._cp_file(p1, p2, **kwargs) for p1, p2 in zip(paths1, paths2)]
         result = await _run_coros_in_chunks(
             coros, batch_size=batch_size, return_exceptions=True, nofiles=True
         )
@@ -501,28 +514,39 @@ class AsyncFileSystem(AbstractFileSystem):
         constructor, or for all instances by setting the "gather_batch_size" key
         in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
         """
-        if isinstance(lpath, str):
-            lpath = make_path_posix(lpath)
-        fs = LocalFileSystem()
-        lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
-        source_is_str = isinstance(lpath, str)
-        if source_is_str and (not recursive or maxdepth is not None):
-            # Non-recursive glob does not copy directories
-            lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
-            if not lpaths:
-                return
+        if isinstance(lpath, list) and isinstance(rpath, list):
+            # No need to expand paths when both source and destination
+            # are provided as lists
+            rpaths = rpath
+            lpaths = lpath
+        else:
+            source_is_str = isinstance(lpath, str)
+            if source_is_str:
+                lpath = make_path_posix(lpath)
+            fs = LocalFileSystem()
+            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
+            if source_is_str and (not recursive or maxdepth is not None):
+                # Non-recursive glob does not copy directories
+                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
+                if not lpaths:
+                    return
+
+            source_is_file = len(lpaths) == 1
+            dest_is_dir = isinstance(rpath, str) and (
+                trailing_sep(rpath) or await self._isdir(rpath)
+            )
 
-        rpath = self._strip_protocol(rpath)
-        isdir = isinstance(rpath, str) and (
-            trailing_sep(rpath) or await self._isdir(rpath)
-        )
-        rpaths = other_paths(
-            lpaths,
-            rpath,
-            exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(lpath),
-            is_dir=isdir,
-            flatten=not source_is_str,
-        )
+            rpath = self._strip_protocol(rpath)
+            exists = source_is_str and (
+                (has_magic(lpath) and source_is_file)
+                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
+            )
+            rpaths = other_paths(
+                lpaths,
+                rpath,
+                exists=exists,
+                flatten=not source_is_str,
+            )
 
         is_dir = {l: os.path.isdir(l) for l in lpaths}
         rdirs = [r for l, r in zip(lpaths, rpaths) if is_dir[l]]
@@ -567,32 +591,44 @@ class AsyncFileSystem(AbstractFileSystem):
         constructor, or for all instances by setting the "gather_batch_size" key
         in ``fsspec.config.conf``, falling back to 1/8th of the system limit .
         """
-        source_is_str = isinstance(rpath, str)
-        # First check for rpath trailing slash as _strip_protocol removes it.
-        source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
-        rpath = self._strip_protocol(rpath)
-        rpaths = await self._expand_path(
-            rpath, recursive=recursive, maxdepth=maxdepth
-        )
-        if source_is_str and (not recursive or maxdepth is not None):
-            # Non-recursive glob does not copy directories
-            rpaths = [
-                p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
-            ]
-            if not rpaths:
-                return
+        if isinstance(lpath, list) and isinstance(rpath, list):
+            # No need to expand paths when both source and destination
+            # are provided as lists
+            rpaths = rpath
+            lpaths = lpath
+        else:
+            source_is_str = isinstance(rpath, str)
+            # First check for rpath trailing slash as _strip_protocol removes it.
+            source_not_trailing_sep = source_is_str and not trailing_sep(rpath)
+            rpath = self._strip_protocol(rpath)
+            rpaths = await self._expand_path(
+                rpath, recursive=recursive, maxdepth=maxdepth
+            )
+            if source_is_str and (not recursive or maxdepth is not None):
+                # Non-recursive glob does not copy directories
+                rpaths = [
+                    p for p in rpaths if not (trailing_sep(p) or await self._isdir(p))
+                ]
+                if not rpaths:
+                    return
+
+            lpath = make_path_posix(lpath)
+            source_is_file = len(rpaths) == 1
+            dest_is_dir = isinstance(lpath, str) and (
+                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
+            )
+
+            exists = source_is_str and (
+                (has_magic(rpath) and source_is_file)
+                or (not has_magic(rpath) and dest_is_dir and source_not_trailing_sep)
+            )
+            lpaths = other_paths(
+                rpaths,
+                lpath,
+                exists=exists,
+                flatten=not source_is_str,
+            )
 
-        lpath = make_path_posix(lpath)
-        isdir = isinstance(lpath, str) and (
-            trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
-        )
-        lpaths = other_paths(
-            rpaths,
-            lpath,
-            exists=isdir and source_not_trailing_sep,
-            is_dir=isdir,
-            flatten=not source_is_str,
-        )
         [os.makedirs(os.path.dirname(lp), exist_ok=True) for lp in lpaths]
         batch_size = kwargs.pop("batch_size", self.batch_size)
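Note: the three bulk-transfer hunks above (_copy, _put, _get) gain the same fast path: when both arguments are already lists, they are taken as an explicit one-to-one pairing and no glob or recursive expansion is attempted. A sketch of the observable behaviour using the in-memory filesystem, assuming the synchronous AbstractFileSystem receives the matching change (spec.py in the file list above):

    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe({"/src/a.txt": b"A", "/src/b.txt": b"B"})

    # explicit pairwise copy: no expansion, paths map 1:1 by position
    fs.copy(["/src/a.txt", "/src/b.txt"], ["/dst/a.txt", "/dst/b.txt"])
    assert fs.cat("/dst/b.txt") == b"B"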
@@ -639,7 +675,7 @@ class AsyncFileSystem(AbstractFileSystem):
     async def _ls(self, path, detail=True, **kwargs):
         raise NotImplementedError
 
-    async def _walk(self, path, maxdepth=None, **kwargs):
+    async def _walk(self, path, maxdepth=None, on_error="omit", **kwargs):
         if maxdepth is not None and maxdepth < 1:
             raise ValueError("maxdepth must be at least 1")
 
@@ -651,7 +687,11 @@ class AsyncFileSystem(AbstractFileSystem):
         detail = kwargs.pop("detail", False)
         try:
             listing = await self._ls(path, detail=True, **kwargs)
-        except (FileNotFoundError, OSError):
+        except (FileNotFoundError, OSError) as e:
+            if on_error == "raise":
+                raise
+            elif callable(on_error):
+                on_error(e)
             if detail:
                 yield path, {}, {}
             else:
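Note: the new on_error parameter accepts "omit" (the old silent behaviour, and the default), "raise", or a callable receiving the exception. A sketch of the synchronous mirror, assuming spec.py carries the equivalent change in this release:

    import fsspec

    fs = fsspec.filesystem("memory")
    fs.pipe("/tree/leaf.txt", b"x")

    # propagate listing errors instead of yielding an empty level:
    for root, dirs, files in fs.walk("/tree", on_error="raise"):
        print(root, dirs, files)

    # or collect them via a callable:
    errors = []
    for _ in fs.walk("/tree", on_error=errors.append):
        pass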
@@ -689,25 +729,24 @@ class AsyncFileSystem(AbstractFileSystem):
         ):
             yield _
 
-    async def _glob(self, path, **kwargs):
+    async def _glob(self, path, maxdepth=None, **kwargs):
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
+
         import re
 
         ends = path.endswith("/")
         path = self._strip_protocol(path)
-        indstar = path.find("*") if path.find("*") >= 0 else len(path)
-        indques = path.find("?") if path.find("?") >= 0 else len(path)
-        indbrace = path.find("[") if path.find("[") >= 0 else len(path)
+        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
+        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
 
-        ind = min(indstar, indques, indbrace)
+        min_idx = min(idx_star, idx_qmark, idx_brace)
 
         detail = kwargs.pop("detail", False)
 
         if not has_magic(path):
-            root = path
-            depth = 1
-            if ends:
-                path += "/*"
-            elif await self._exists(path):
+            if await self._exists(path):
                 if not detail:
                     return [path]
                 else:
@@ -717,13 +756,21 @@ class AsyncFileSystem(AbstractFileSystem):
                 return []  # glob of non-existent returns empty
             else:
                 return {}
-        elif "/" in path[:ind]:
-            ind2 = path[:ind].rindex("/")
-            root = path[: ind2 + 1]
-            depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
+        elif "/" in path[:min_idx]:
+            min_idx = path[:min_idx].rindex("/")
+            root = path[: min_idx + 1]
+            depth = path[min_idx + 1 :].count("/") + 1
         else:
             root = ""
-            depth = None if "**" in path else path[ind + 1 :].count("/") + 1
+            depth = path[min_idx + 1 :].count("/") + 1
+
+        if "**" in path:
+            if maxdepth is not None:
+                idx_double_stars = path.find("**")
+                depth_double_stars = path[idx_double_stars:].count("/") + 1
+                depth = depth - depth_double_stars + maxdepth
+            else:
+                depth = None
 
         allpaths = await self._find(
             root, maxdepth=depth, withdirs=True, detail=True, **kwargs
@@ -751,14 +798,23 @@ class AsyncFileSystem(AbstractFileSystem):
             )
             + "$"
         )
-        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
+        pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
+        pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
         pattern = re.sub("[*]", "[^/]*", pattern)
-        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
+        pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
+        pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
+        pattern = re.compile(pattern)
         out = {
             p: allpaths[p]
             for p in sorted(allpaths)
             if pattern.match(p.replace("//", "/").rstrip("/"))
         }
+
+        # Return directories only when the glob end by a slash
+        # This is needed for posix glob compliance
+        if ends:
+            out = {k: v for k, v in out.items() if v["type"] == "directory"}
+
         if detail:
             return out
         else:
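Note: the placeholder dance above distinguishes "/**" (which must also match zero directories, hence "(|/.*)") from a bare "**" (plain ".*"). A standalone repro of just this rewriting step on one pattern; it skips the character escaping the real code applies first:

    import re

    pat = "data/**/*.csv"
    pat = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pat)
    pat = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pat)
    pat = re.sub("[*]", "[^/]*", pat)
    pat = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pat)
    pat = re.sub("=DOUBLE_STARS=", ".*", pat)
    rx = re.compile(pat + "$")

    assert rx.match("data/a/b/x.csv")   # "**" spans directories
    assert rx.match("data/x.csv")       # and may match none at all
    assert not rx.match("other/x.csv")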
@@ -777,8 +833,14 @@ class AsyncFileSystem(AbstractFileSystem):
 
     async def _find(self, path, maxdepth=None, withdirs=False, **kwargs):
         path = self._strip_protocol(path)
-        out = dict()
+        out = {}
         detail = kwargs.pop("detail", False)
+
+        # Add the root directory if withdirs is requested
+        # This is needed for posix glob compliance
+        if withdirs and path != "" and await self._isdir(path):
+            out[path] = await self._info(path)
+
         # async for?
         async for _, dirs, files in self._walk(path, maxdepth, detail=True, **kwargs):
             if withdirs:
@@ -805,7 +867,7 @@ class AsyncFileSystem(AbstractFileSystem):
         path = [self._strip_protocol(p) for p in path]
         for p in path:  # can gather here
             if has_magic(p):
-                bit = set(await self._glob(p))
+                bit = set(await self._glob(p, maxdepth=maxdepth))
                 out |= bit
                 if recursive:
                     # glob call above expanded one depth so if maxdepth is defined
@@ -829,7 +891,7 @@ class AsyncFileSystem(AbstractFileSystem):
                 out.add(p)
         if not out:
             raise FileNotFoundError(path)
-        return list(sorted(out))
+        return sorted(out)
 
     async def _mkdir(self, path, create_parents=True, **kwargs):
         pass  # not necessary to implement, may not have directories
fsspec/caching.py
CHANGED
@@ -456,7 +456,7 @@ class KnownPartsOfAFile(BaseCache):
 
         # simple consolidation of contiguous blocks
         if data:
-            old_offsets = sorted(…)
+            old_offsets = sorted(data.keys())
             offsets = [old_offsets[0]]
             blocks = [data.pop(old_offsets[0])]
             for start, stop in old_offsets[1:]:
fsspec/compression.py
CHANGED
@@ -1,5 +1,4 @@
 """Helper functions for a standard streaming compression API"""
-from bz2 import BZ2File
 from zipfile import ZipFile
 
 import fsspec.utils
@@ -68,7 +67,13 @@ def unzip(infile, mode="rb", filename=None, **kwargs):
 
 
 register_compression("zip", unzip, "zip")
-register_compression("bz2", BZ2File, "bz2")
+
+try:
+    from bz2 import BZ2File
+except ImportError:
+    pass
+else:
+    register_compression("bz2", BZ2File, "bz2")
 
 try:  # pragma: no cover
     from isal import igzip
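Note: moving the bz2 registration inside try/except makes fsspec tolerant of Python builds compiled without the _bz2 module; the codec is simply unavailable rather than breaking the import of fsspec itself. fsspec registers its other optional codecs with the same pattern; a sketch with another stdlib codec (xz is in fact already registered by fsspec, so this is purely an illustration):

    from fsspec.compression import register_compression

    try:
        from lzma import LZMAFile
    except ImportError:
        pass  # interpreter built without liblzma: codec unavailable
    else:
        register_compression("xz", LZMAFile, "xz", force=True)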
fsspec/core.py
CHANGED
@@ -210,7 +210,6 @@ def open_files(
     num=1,
     protocol=None,
     newline=None,
-    auto_mkdir=True,
     expand=True,
     **kwargs,
 ):
@@ -249,9 +248,6 @@ def open_files(
     newline: bytes or None
         Used for line terminator in text mode. If None, uses system default;
         if blank, uses no translation.
-    auto_mkdir: bool (True)
-        If in write mode, this will ensure the target directory exists before
-        writing, by calling ``fs.mkdirs(exist_ok=True)``.
     expand: bool
     **kwargs: dict
         Extra options that make sense to a particular storage connection, e.g.
@@ -288,9 +284,6 @@ def open_files(
         protocol=protocol,
         expand=expand,
     )
-    if "r" not in mode and auto_mkdir:
-        parents = {fs._parent(path) for path in paths}
-        [fs.makedirs(parent, exist_ok=True) for parent in parents]
     return OpenFiles(
         [
             OpenFile(
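Note: with auto_mkdir gone from open_files, parent directories are no longer created implicitly for write modes. A sketch of what a caller relying on that behaviour can do instead (memory:// used for illustration):

    import fsspec

    fs, path = fsspec.core.url_to_fs("memory://deep/nested/out.bin")
    fs.makedirs(fs._parent(path), exist_ok=True)  # what auto_mkdir used to do
    with fs.open(path, "wb") as f:
        f.write(b"data")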
@@ -360,6 +353,19 @@ def url_to_fs(url, **kwargs):
     urlpath : str
         The file-systems-specific URL for ``url``.
     """
+    # non-FS arguments that appear in fsspec.open()
+    # inspect could keep this in sync with open()'s signature
+    known_kwargs = {
+        "compression",
+        "encoding",
+        "errors",
+        "expand",
+        "mode",
+        "name_function",
+        "newline",
+        "num",
+    }
+    kwargs = {k: v for k, v in kwargs.items() if k not in known_kwargs}
     chain = _un_chain(url, kwargs)
     inkwargs = {}
     # Reverse iterate the chain, creating a nested target_* structure
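Note: the filter lets callers pass fsspec.open()-style keyword arguments to url_to_fs without them leaking into the filesystem constructor. A sketch:

    from fsspec.core import url_to_fs

    # "mode" and "compression" belong to fsspec.open(), not to the
    # filesystem class, so they are now dropped rather than forwarded
    # as storage options.
    fs, path = url_to_fs("memory://data/file.bin", mode="rb", compression="infer")
    print(type(fs).__name__, path)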
@@ -611,7 +617,7 @@ def get_fs_token_paths(
         pchains = [
             _un_chain(stringify_path(u), storage_options or {})[0] for u in urlpath
         ]
-        if len(…) > 1:
+        if len({pc[1] for pc in pchains}) > 1:
             raise ValueError("Protocol mismatch getting fs from %s", urlpath)
         paths = [pc[0] for pc in pchains]
     else:
@@ -621,6 +627,8 @@ def get_fs_token_paths(
     else:
         if "w" in mode and expand:
             paths = _expand_paths(paths, name_function, num)
+        elif "x" in mode and expand:
+            paths = _expand_paths(paths, name_function, num)
         elif "*" in paths:
             paths = [f for f in sorted(fs.glob(paths)) if not fs.isdir(f)]
         else:
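Note: the added branch makes exclusive-creation ("x") mode behave like write mode for name expansion. A sketch, assuming the default name_function numbers the files:

    import fsspec

    # "*" in the urlpath is expanded to num paths in "x" mode too, as in "w"
    files = fsspec.open_files("memory://out-*.bin", mode="xb", num=3)
    print([f.path for f in files])  # three numbered paths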
fsspec/generic.py
CHANGED
@@ -1,9 +1,15 @@
+from __future__ import annotations
+
 import inspect
 import logging
+import os
+import shutil
+import uuid
+from typing import Optional
 
-from .asyn import AsyncFileSystem
+from .asyn import AsyncFileSystem, _run_coros_in_chunks, sync_wrapper
 from .callbacks import _DEFAULT_CALLBACK
-from .core import filesystem, get_filesystem_class, split_protocol
+from .core import filesystem, get_filesystem_class, split_protocol, url_to_fs
 
 _generic_fs = {}
 logger = logging.getLogger("fsspec.generic")
@@ -29,7 +35,8 @@ def _resolve_fs(url, method=None, protocol=None, storage_options=None):
         cls = get_filesystem_class(protocol)
         return cls.current()
     if method == "options":
-        …
+        fs, _ = url_to_fs(url, **storage_options.get(protocol, {}))
+        return fs
     raise ValueError(f"Unknown FS resolution method: {method}")
 
 
@@ -51,19 +58,23 @@ def rsync(
     Parameters
     ----------
     source: str
-        Root of the directory tree to take files from.
+        Root of the directory tree to take files from. This must be a directory, but
+        do not include any terminating "/" character
     destination: str
         Root path to copy into. The contents of this location should be
-        identical to the contents of ``source`` when done.
+        identical to the contents of ``source`` when done. This will be made a
+        directory, and the terminal "/" should not be included.
     delete_missing: bool
         If there are paths in the destination that don't exist in the
         source and this is True, delete them. Otherwise, leave them alone.
-    source_field: str
+    source_field: str | callable
         If ``update_field`` is "different", this is the key in the info
-        of source files to consider for difference.
-    dest_field: str
+        of source files to consider for difference. Maybe a function of the
+        info dict.
+    dest_field: str | callable
         If ``update_field`` is "different", this is the key in the info
-        of destination files to consider for difference.
+        of destination files to consider for difference. May be a function of
+        the info dict.
     update_cond: "different"|"always"|"never"
         If "always", every file is copied, regardless of whether it exists in
         the destination. If "never", files that exist in the destination are
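Note: per the docstring change, source_field and dest_field may now be callables over the info dict (the implementation follows a couple of hunks below). A sketch using the in-memory filesystem:

    import fsspec
    from fsspec.generic import rsync

    mem = fsspec.filesystem("memory")
    mem.pipe("/src/a.txt", b"hello")

    rsync(
        "memory://src",
        "memory://dst",
        source_field=lambda info: info["size"],  # callable instead of a key
        dest_field=lambda info: info["size"],
    )
    assert mem.cat("/dst/a.txt") == b"hello"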
@@ -90,9 +101,10 @@ def rsync(
         if v["type"] == "directory" and a.replace(source, destination) not in otherfiles
     ]
     logger.debug(f"{len(dirs)} directories to create")
-    …
+    if dirs:
+        fs.make_many_dirs(
+            [dirn.replace(source, destination) for dirn in dirs], exist_ok=True
+        )
     allfiles = {a: v for a, v in allfiles.items() if v["type"] == "file"}
     logger.debug(f"{len(allfiles)} files to consider for copy")
     to_delete = [
@@ -106,7 +118,10 @@ def rsync(
             if update_cond == "always":
                 allfiles[k] = otherfile
             elif update_cond == "different":
-                if v[source_field] != otherfiles[otherfile][dest_field]:
+                inf1 = source_field(v) if callable(source_field) else v[source_field]
+                v2 = otherfiles[otherfile]
+                inf2 = dest_field(v2) if callable(dest_field) else v2[dest_field]
+                if inf1 != inf2:
                     # details mismatch, make copy
                     allfiles[k] = otherfile
             else:
@@ -115,12 +130,12 @@ def rsync(
         else:
             # file not in target yet
             allfiles[k] = otherfile
+    logger.debug(f"{len(allfiles)} files to copy")
     if allfiles:
         source_files, target_files = zip(*allfiles.items())
-        logger.debug(f"{len(source_files)} files to copy")
         fs.cp(source_files, target_files, **kwargs)
+    logger.debug(f"{len(to_delete)} files to delete")
    if delete_missing:
-        logger.debug(f"{len(to_delete)} files to delete")
         fs.rm(to_delete)
 
 
@@ -165,11 +180,11 @@ class GenericFileSystem(AsyncFileSystem):
         fs = _resolve_fs(path, self.method)
         if fs.async_impl:
             out = await fs._find(
-                path, maxdepth=maxdepth, withdirs=withdirs, detail=…, **kwargs
+                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
             )
         else:
             out = fs.find(
-                path, maxdepth=maxdepth, withdirs=withdirs, detail=…, **kwargs
+                path, maxdepth=maxdepth, withdirs=withdirs, detail=True, **kwargs
             )
         result = {}
         for k, v in out.items():
@@ -238,6 +253,7 @@ class GenericFileSystem(AsyncFileSystem):
             fs.rm(url, **kwargs)
 
     async def _makedirs(self, path, exist_ok=False):
+        logger.debug("Make dir %s", path)
         fs = _resolve_fs(path, self.method)
         if fs.async_impl:
             await fs._makedirs(path, exist_ok=exist_ok)
@@ -294,6 +310,84 @@ class GenericFileSystem(AsyncFileSystem):
             # fail while opening f1 or f2
             pass
 
+    async def _make_many_dirs(self, urls, exist_ok=True):
+        fs = _resolve_fs(urls[0], self.method)
+        if fs.async_impl:
+            coros = [fs._makedirs(u, exist_ok=exist_ok) for u in urls]
+            await _run_coros_in_chunks(coros)
+        else:
+            for u in urls:
+                fs.makedirs(u, exist_ok=exist_ok)
+
+    make_many_dirs = sync_wrapper(_make_many_dirs)
+
+    async def _copy(
+        self,
+        path1: list[str],
+        path2: list[str],
+        recursive: bool = False,
+        on_error: str = "ignore",
+        maxdepth: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        tempdir: Optional[str] = None,
+        **kwargs,
+    ):
+        if recursive:
+            raise NotImplementedError
+        fs = _resolve_fs(path1[0], self.method)
+        fs2 = _resolve_fs(path2[0], self.method)
+        # not expanding paths atm., assume call is from rsync()
+        if fs is fs2:
+            # pure remote
+            if fs.async_impl:
+                return await fs._copy(path1, path2, **kwargs)
+            else:
+                return fs.copy(path1, path2, **kwargs)
+        await copy_file_op(
+            fs, path1, fs2, path2, tempdir, batch_size, on_error=on_error
+        )
+
+
+async def copy_file_op(
+    fs1, url1, fs2, url2, tempdir=None, batch_size=20, on_error="ignore"
+):
+    import tempfile
+
+    tempdir = tempdir or tempfile.mkdtemp()
+    try:
+        coros = [
+            _copy_file_op(
+                fs1,
+                u1,
+                fs2,
+                u2,
+                os.path.join(tempdir, uuid.uuid4().hex),
+                on_error=on_error,
+            )
+            for u1, u2 in zip(url1, url2)
+        ]
+        await _run_coros_in_chunks(coros, batch_size=batch_size)
+    finally:
+        shutil.rmtree(tempdir)
+
+
+async def _copy_file_op(fs1, url1, fs2, url2, local, on_error="ignore"):
+    ex = () if on_error == "raise" else Exception
+    logger.debug("Copy %s -> %s", url1, url2)
+    try:
+        if fs1.async_impl:
+            await fs1._get_file(url1, local)
+        else:
+            fs1.get_file(url1, local)
+        if fs2.async_impl:
+            await fs2._put_file(local, url2)
+        else:
+            fs2.put_file(local, url2)
+        os.unlink(local)
+        logger.debug("Copy %s -> %s; done", url1, url2)
+    except ex as e:
+        logger.debug("ignoring cp exception for %s: %s", url1, e)
+
 
 async def maybe_await(cor):
     if inspect.iscoroutine(cor):
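Note: when source and destination resolve to different filesystems, the new GenericFileSystem._copy falls back to copy_file_op, bouncing each file through a unique name in a local temporary directory (downloaded with _get_file, uploaded with _put_file, then unlinked). A usage sketch, memory:// to file:// chosen purely for illustration:

    import tempfile

    import fsspec
    from fsspec.generic import GenericFileSystem

    mem = fsspec.filesystem("memory")
    mem.pipe("/bounce/x.bin", b"payload")

    gfs = GenericFileSystem()
    dst = tempfile.mkdtemp()
    # memory -> local: different filesystems, so each file is staged locally
    gfs.copy(["memory://bounce/x.bin"], [f"file://{dst}/x.bin"])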
|