fsspec 2023.6.0__py3-none-any.whl → 2023.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fsspec/spec.py CHANGED
@@ -372,7 +372,7 @@ class AbstractFileSystem(metaclass=_Cached):
372
372
  except KeyError:
373
373
  pass
374
374
 
375
- def walk(self, path, maxdepth=None, topdown=True, **kwargs):
375
+ def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
376
376
  """Return all files below path
377
377
 
378
378
  List all files, recursing into subdirectories; output is iterator-style,
@@ -399,6 +399,10 @@ class AbstractFileSystem(metaclass=_Cached):
399
399
  topdown: bool (True)
400
400
  Whether to walk the directory tree from the top downwards or from
401
401
  the bottom upwards.
402
+ on_error: "omit", "raise", or a callable
403
+ if "omit" (default), a path whose listing raises an exception is simply returned empty;
404
+ if "raise", the underlying exception will be raised;
405
+ if callable, it will be called with a single OSError instance as argument
402
406
  kwargs: passed to ``ls``
403
407
  """
404
408
  if maxdepth is not None and maxdepth < 1:
@@ -412,7 +416,11 @@ class AbstractFileSystem(metaclass=_Cached):
412
416
  detail = kwargs.pop("detail", False)
413
417
  try:
414
418
  listing = self.ls(path, detail=True, **kwargs)
415
- except (FileNotFoundError, OSError):
419
+ except (FileNotFoundError, OSError) as e:
420
+ if on_error == "raise":
421
+ raise
422
+ elif callable(on_error):
423
+ on_error(e)
416
424
  if detail:
417
425
  return path, {}, {}
418
426
  return path, [], []
@@ -477,7 +485,13 @@ class AbstractFileSystem(metaclass=_Cached):
477
485
  """
478
486
  # TODO: allow equivalent of -name parameter
479
487
  path = self._strip_protocol(path)
480
- out = dict()
488
+ out = {}
489
+
490
+ # Add the root directory if withdirs is requested
491
+ # This is needed for posix glob compliance
492
+ if withdirs and path != "" and self.isdir(path):
493
+ out[path] = self.info(path)
494
+
481
495
  for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
482
496
  if withdirs:
483
497
  files.update(dirs)
@@ -526,40 +540,40 @@ class AbstractFileSystem(metaclass=_Cached):
526
540
  else:
527
541
  return sizes
528
542
 
529
- def glob(self, path, **kwargs):
543
+ def glob(self, path, maxdepth=None, **kwargs):
530
544
  """
531
545
  Find files by glob-matching.
532
546
 
533
- If the path ends with '/' and does not contain "*", it is essentially
534
- the same as ``ls(path)``, returning only files.
547
+ If the path ends with '/', only folders are returned.
535
548
 
536
549
  We support ``"**"``,
537
550
  ``"?"`` and ``"[..]"``. We do not support ^ for pattern negation.
538
551
 
552
+ The `maxdepth` option is applied on the first `**` found in the path.
553
+
539
554
  Search path names that contain embedded characters special to this
540
555
  implementation of glob may not produce expected results;
541
556
  e.g., 'foo/bar/*starredfilename*'.
542
557
 
543
558
  kwargs are passed to ``ls``.
544
559
  """
560
+ if maxdepth is not None and maxdepth < 1:
561
+ raise ValueError("maxdepth must be at least 1")
562
+
545
563
  import re
546
564
 
547
565
  ends = path.endswith("/")
548
566
  path = self._strip_protocol(path)
549
- indstar = path.find("*") if path.find("*") >= 0 else len(path)
550
- indques = path.find("?") if path.find("?") >= 0 else len(path)
551
- indbrace = path.find("[") if path.find("[") >= 0 else len(path)
567
+ idx_star = path.find("*") if path.find("*") >= 0 else len(path)
568
+ idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
569
+ idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
552
570
 
553
- ind = min(indstar, indques, indbrace)
571
+ min_idx = min(idx_star, idx_qmark, idx_brace)
554
572
 
555
573
  detail = kwargs.pop("detail", False)
556
574
 
557
575
  if not has_magic(path):
558
- root = path
559
- depth = 1
560
- if ends:
561
- path += "/*"
562
- elif self.exists(path):
576
+ if self.exists(path):
563
577
  if not detail:
564
578
  return [path]
565
579
  else:
@@ -569,13 +583,21 @@ class AbstractFileSystem(metaclass=_Cached):
569
583
  return [] # glob of non-existent returns empty
570
584
  else:
571
585
  return {}
572
- elif "/" in path[:ind]:
573
- ind2 = path[:ind].rindex("/")
574
- root = path[: ind2 + 1]
575
- depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
586
+ elif "/" in path[:min_idx]:
587
+ min_idx = path[:min_idx].rindex("/")
588
+ root = path[: min_idx + 1]
589
+ depth = path[min_idx + 1 :].count("/") + 1
576
590
  else:
577
591
  root = ""
578
- depth = None if "**" in path else path[ind + 1 :].count("/") + 1
592
+ depth = path[min_idx + 1 :].count("/") + 1
593
+
594
+ if "**" in path:
595
+ if maxdepth is not None:
596
+ idx_double_stars = path.find("**")
597
+ depth_double_stars = path[idx_double_stars:].count("/") + 1
598
+ depth = depth - depth_double_stars + maxdepth
599
+ else:
600
+ depth = None
579
601
 
580
602
  allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
581
603
  # Escape characters special to python regex, leaving our supported
@@ -601,14 +623,24 @@ class AbstractFileSystem(metaclass=_Cached):
601
623
  )
602
624
  + "$"
603
625
  )
604
- pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
626
+ pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
627
+ pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
605
628
  pattern = re.sub("[*]", "[^/]*", pattern)
606
- pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
629
+ pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
630
+ pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
631
+ pattern = re.compile(pattern)
632
+
607
633
  out = {
608
634
  p: allpaths[p]
609
635
  for p in sorted(allpaths)
610
636
  if pattern.match(p.replace("//", "/").rstrip("/"))
611
637
  }
638
+
639
+ # Return directories only when the glob end by a slash
640
+ # This is needed for posix glob compliance
641
+ if ends:
642
+ out = {k: v for k, v in out.items() if v["type"] == "directory"}
643
+
612
644
  if detail:
613
645
  return out
614
646
  else:
@@ -906,33 +938,44 @@ class AbstractFileSystem(metaclass=_Cached):
906
938
 
907
939
  Calls get_file for each source.
908
940
  """
909
- from .implementations.local import (
910
- LocalFileSystem,
911
- make_path_posix,
912
- trailing_sep,
913
- trailing_sep_maybe_asterisk,
914
- )
941
+ if isinstance(lpath, list) and isinstance(rpath, list):
942
+ # No need to expand paths when both source and destination
943
+ # are provided as lists
944
+ rpaths = rpath
945
+ lpaths = lpath
946
+ else:
947
+ from .implementations.local import (
948
+ LocalFileSystem,
949
+ make_path_posix,
950
+ trailing_sep,
951
+ )
915
952
 
916
- source_is_str = isinstance(rpath, str)
917
- rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
918
- if source_is_str and (not recursive or maxdepth is not None):
919
- # Non-recursive glob does not copy directories
920
- rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
921
- if not rpaths:
922
- return
953
+ source_is_str = isinstance(rpath, str)
954
+ rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
955
+ if source_is_str and (not recursive or maxdepth is not None):
956
+ # Non-recursive glob does not copy directories
957
+ rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
958
+ if not rpaths:
959
+ return
923
960
 
924
- if isinstance(lpath, str):
925
- lpath = make_path_posix(lpath)
926
- isdir = isinstance(lpath, str) and (
927
- trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
928
- )
929
- lpaths = other_paths(
930
- rpaths,
931
- lpath,
932
- exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(rpath),
933
- is_dir=isdir,
934
- flatten=not source_is_str,
935
- )
961
+ if isinstance(lpath, str):
962
+ lpath = make_path_posix(lpath)
963
+
964
+ source_is_file = len(rpaths) == 1
965
+ dest_is_dir = isinstance(lpath, str) and (
966
+ trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
967
+ )
968
+
969
+ exists = source_is_str and (
970
+ (has_magic(rpath) and source_is_file)
971
+ or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
972
+ )
973
+ lpaths = other_paths(
974
+ rpaths,
975
+ lpath,
976
+ exists=exists,
977
+ flatten=not source_is_str,
978
+ )
936
979
 
937
980
  callback.set_size(len(lpaths))
938
981
  for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
@@ -976,37 +1019,49 @@ class AbstractFileSystem(metaclass=_Cached):
976
1019
 
977
1020
  Calls put_file for each source.
978
1021
  """
979
- from .implementations.local import (
980
- LocalFileSystem,
981
- make_path_posix,
982
- trailing_sep,
983
- trailing_sep_maybe_asterisk,
984
- )
1022
+ if isinstance(lpath, list) and isinstance(rpath, list):
1023
+ # No need to expand paths when both source and destination
1024
+ # are provided as lists
1025
+ rpaths = rpath
1026
+ lpaths = lpath
1027
+ else:
1028
+ from .implementations.local import (
1029
+ LocalFileSystem,
1030
+ make_path_posix,
1031
+ trailing_sep,
1032
+ )
985
1033
 
986
- source_is_str = isinstance(lpath, str)
987
- if source_is_str:
988
- lpath = make_path_posix(lpath)
989
- fs = LocalFileSystem()
990
- lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
991
- if source_is_str and (not recursive or maxdepth is not None):
992
- # Non-recursive glob does not copy directories
993
- lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
994
- if not lpaths:
995
- return
1034
+ source_is_str = isinstance(lpath, str)
1035
+ if source_is_str:
1036
+ lpath = make_path_posix(lpath)
1037
+ fs = LocalFileSystem()
1038
+ lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
1039
+ if source_is_str and (not recursive or maxdepth is not None):
1040
+ # Non-recursive glob does not copy directories
1041
+ lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
1042
+ if not lpaths:
1043
+ return
1044
+
1045
+ source_is_file = len(lpaths) == 1
1046
+ dest_is_dir = isinstance(rpath, str) and (
1047
+ trailing_sep(rpath) or self.isdir(rpath)
1048
+ )
996
1049
 
997
- isdir = isinstance(rpath, str) and (trailing_sep(rpath) or self.isdir(rpath))
998
- rpath = (
999
- self._strip_protocol(rpath)
1000
- if isinstance(rpath, str)
1001
- else [self._strip_protocol(p) for p in rpath]
1002
- )
1003
- rpaths = other_paths(
1004
- lpaths,
1005
- rpath,
1006
- exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(lpath),
1007
- is_dir=isdir,
1008
- flatten=not source_is_str,
1009
- )
1050
+ rpath = (
1051
+ self._strip_protocol(rpath)
1052
+ if isinstance(rpath, str)
1053
+ else [self._strip_protocol(p) for p in rpath]
1054
+ )
1055
+ exists = source_is_str and (
1056
+ (has_magic(lpath) and source_is_file)
1057
+ or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
1058
+ )
1059
+ rpaths = other_paths(
1060
+ lpaths,
1061
+ rpath,
1062
+ exists=exists,
1063
+ flatten=not source_is_str,
1064
+ )
1010
1065
 
1011
1066
  callback.set_size(len(rpaths))
1012
1067
  for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
@@ -1037,31 +1092,44 @@ class AbstractFileSystem(metaclass=_Cached):
1037
1092
  not-found exceptions will cause the path to be skipped; defaults to
1038
1093
  raise unless recursive is true, where the default is ignore
1039
1094
  """
1040
- from .implementations.local import trailing_sep, trailing_sep_maybe_asterisk
1041
-
1042
1095
  if on_error is None and recursive:
1043
1096
  on_error = "ignore"
1044
1097
  elif on_error is None:
1045
1098
  on_error = "raise"
1046
1099
 
1047
- source_is_str = isinstance(path1, str)
1048
- paths = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
1049
- if source_is_str and (not recursive or maxdepth is not None):
1050
- # Non-recursive glob does not copy directories
1051
- paths = [p for p in paths if not (trailing_sep(p) or self.isdir(p))]
1052
- if not paths:
1053
- return
1100
+ if isinstance(path1, list) and isinstance(path2, list):
1101
+ # No need to expand paths when both source and destination
1102
+ # are provided as lists
1103
+ paths1 = path1
1104
+ paths2 = path2
1105
+ else:
1106
+ from .implementations.local import trailing_sep
1107
+
1108
+ source_is_str = isinstance(path1, str)
1109
+ paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
1110
+ if source_is_str and (not recursive or maxdepth is not None):
1111
+ # Non-recursive glob does not copy directories
1112
+ paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
1113
+ if not paths1:
1114
+ return
1115
+
1116
+ source_is_file = len(paths1) == 1
1117
+ dest_is_dir = isinstance(path2, str) and (
1118
+ trailing_sep(path2) or self.isdir(path2)
1119
+ )
1054
1120
 
1055
- isdir = isinstance(path2, str) and (trailing_sep(path2) or self.isdir(path2))
1056
- path2 = other_paths(
1057
- paths,
1058
- path2,
1059
- exists=isdir and source_is_str and not trailing_sep_maybe_asterisk(path1),
1060
- is_dir=isdir,
1061
- flatten=not source_is_str,
1062
- )
1121
+ exists = source_is_str and (
1122
+ (has_magic(path1) and source_is_file)
1123
+ or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
1124
+ )
1125
+ paths2 = other_paths(
1126
+ paths1,
1127
+ path2,
1128
+ exists=exists,
1129
+ flatten=not source_is_str,
1130
+ )
1063
1131
 
1064
- for p1, p2 in zip(paths, path2):
1132
+ for p1, p2 in zip(paths1, paths2):
1065
1133
  try:
1066
1134
  self.cp_file(p1, p2, **kwargs)
1067
1135
  except FileNotFoundError:
@@ -1085,7 +1153,7 @@ class AbstractFileSystem(metaclass=_Cached):
1085
1153
  path = [self._strip_protocol(p) for p in path]
1086
1154
  for p in path:
1087
1155
  if has_magic(p):
1088
- bit = set(self.glob(p, **kwargs))
1156
+ bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
1089
1157
  out |= bit
1090
1158
  if recursive:
1091
1159
  # glob call above expanded one depth so if maxdepth is defined
@@ -1114,7 +1182,7 @@ class AbstractFileSystem(metaclass=_Cached):
1114
1182
  out.add(p)
1115
1183
  if not out:
1116
1184
  raise FileNotFoundError(path)
1117
- return list(sorted(out))
1185
+ return sorted(out)
1118
1186
 
1119
1187
  def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
1120
1188
  """Move file(s) from one location to another"""
@@ -1,4 +1,5 @@
1
1
  import os
2
+ from hashlib import md5
2
3
 
3
4
  import pytest
4
5
 
@@ -26,6 +27,41 @@ class BaseAbstractFixtures:
26
27
  yield source
27
28
  fs.rm(source, recursive=True)
28
29
 
30
+ @pytest.fixture
31
+ def fs_glob_edge_cases_files(self, fs, fs_join, fs_path):
32
+ """
33
+ Scenario on remote filesystem that is used for glob edge cases cp/get/put tests.
34
+
35
+ Cleans up at the end of each test in which it is used.
36
+ """
37
+ source = self._glob_edge_cases_files(fs, fs_join, fs_path)
38
+ yield source
39
+ fs.rm(source, recursive=True)
40
+
41
+ @pytest.fixture
42
+ def fs_dir_and_file_with_same_name_prefix(self, fs, fs_join, fs_path):
43
+ """
44
+ Scenario on remote filesystem that is used to check cp/get/put on directory
45
+ and file with the same name prefixes.
46
+
47
+ Cleans up at the end of each test in which it is used.
48
+ """
49
+ source = self._dir_and_file_with_same_name_prefix(fs, fs_join, fs_path)
50
+ yield source
51
+ fs.rm(source, recursive=True)
52
+
53
+ @pytest.fixture
54
+ def fs_10_files_with_hashed_names(self, fs, fs_join, fs_path):
55
+ """
56
+ Scenario on remote filesystem that is used to check cp/get/put files order
57
+ when source and destination are lists.
58
+
59
+ Cleans up at the end of each test in which it is used.
60
+ """
61
+ source = self._10_files_with_hashed_names(fs, fs_join, fs_path)
62
+ yield source
63
+ fs.rm(source, recursive=True)
64
+
29
65
  @pytest.fixture
30
66
  def fs_target(self, fs, fs_join, fs_path):
31
67
  """
@@ -49,6 +85,45 @@ class BaseAbstractFixtures:
49
85
  yield source
50
86
  local_fs.rm(source, recursive=True)
51
87
 
88
+ @pytest.fixture
89
+ def local_glob_edge_cases_files(self, local_fs, local_join, local_path):
90
+ """
91
+ Scenario on local filesystem that is used for glob edge cases cp/get/put tests.
92
+
93
+ Cleans up at the end of each test in which it is used.
94
+ """
95
+ source = self._glob_edge_cases_files(local_fs, local_join, local_path)
96
+ yield source
97
+ local_fs.rm(source, recursive=True)
98
+
99
+ @pytest.fixture
100
+ def local_dir_and_file_with_same_name_prefix(
101
+ self, local_fs, local_join, local_path
102
+ ):
103
+ """
104
+ Scenario on local filesystem that is used to check cp/get/put on directory
105
+ and file with the same name prefixes.
106
+
107
+ Cleans up at the end of each test in which it is used.
108
+ """
109
+ source = self._dir_and_file_with_same_name_prefix(
110
+ local_fs, local_join, local_path
111
+ )
112
+ yield source
113
+ local_fs.rm(source, recursive=True)
114
+
115
+ @pytest.fixture
116
+ def local_10_files_with_hashed_names(self, local_fs, local_join, local_path):
117
+ """
118
+ Scenario on local filesystem that is used to check cp/get/put files order
119
+ when source and destination are lists.
120
+
121
+ Cleans up at the end of each test in which it is used.
122
+ """
123
+ source = self._10_files_with_hashed_names(local_fs, local_join, local_path)
124
+ yield source
125
+ local_fs.rm(source, recursive=True)
126
+
52
127
  @pytest.fixture
53
128
  def local_target(self, local_fs, local_join, local_path):
54
129
  """
@@ -61,6 +136,39 @@ class BaseAbstractFixtures:
61
136
  if local_fs.exists(target):
62
137
  local_fs.rm(target, recursive=True)
63
138
 
139
+ def _glob_edge_cases_files(self, some_fs, some_join, some_path):
140
+ """
141
+ Scenario that is used for glob edge cases cp/get/put tests.
142
+ Creates the following directory and file structure:
143
+
144
+ 📁 source
145
+ ├── 📄 file1
146
+ ├── 📄 file2
147
+ ├── 📁 subdir0
148
+ │ ├── 📄 subfile1
149
+ │ ├── 📄 subfile2
150
+ │ └── 📁 nesteddir
151
+ │ └── 📄 nestedfile
152
+ └── 📁 subdir1
153
+ ├── 📄 subfile1
154
+ ├── 📄 subfile2
155
+ └── 📁 nesteddir
156
+ └── 📄 nestedfile
157
+ """
158
+ source = some_join(some_path, "source")
159
+ some_fs.touch(some_join(source, "file1"))
160
+ some_fs.touch(some_join(source, "file2"))
161
+
162
+ for subdir_idx in range(2):
163
+ subdir = some_join(source, f"subdir{subdir_idx}")
164
+ nesteddir = some_join(subdir, "nesteddir")
165
+ some_fs.makedirs(nesteddir)
166
+ some_fs.touch(some_join(subdir, "subfile1"))
167
+ some_fs.touch(some_join(subdir, "subfile2"))
168
+ some_fs.touch(some_join(nesteddir, "nestedfile"))
169
+
170
+ return source
171
+
64
172
  def _bulk_operations_scenario_0(self, some_fs, some_join, some_path):
65
173
  """
66
174
  Scenario that is used for many cp/get/put tests. Creates the following
@@ -86,6 +194,40 @@ class BaseAbstractFixtures:
86
194
  some_fs.touch(some_join(nesteddir, "nestedfile"))
87
195
  return source
88
196
 
197
+ def _dir_and_file_with_same_name_prefix(self, some_fs, some_join, some_path):
198
+ """
199
+ Scenario that is used to check cp/get/put on directory and file with
200
+ the same name prefixes. Creates the following directory and file structure:
201
+
202
+ 📁 source
203
+ ├── 📄 subdir.txt
204
+ └── 📁 subdir
205
+ └── 📄 subfile.txt
206
+ """
207
+ source = some_join(some_path, "source")
208
+ subdir = some_join(source, "subdir")
209
+ file = some_join(source, "subdir.txt")
210
+ subfile = some_join(subdir, "subfile.txt")
211
+ some_fs.makedirs(subdir)
212
+ some_fs.touch(file)
213
+ some_fs.touch(subfile)
214
+ return source
215
+
216
+ def _10_files_with_hashed_names(self, some_fs, some_join, some_path):
217
+ """
218
+ Scenario that is used to check cp/get/put files order when source and
219
+ destination are lists. Creates the following directory and file structure:
220
+
221
+ 📁 source
222
+ └── 📄 {hashed([0-9])}.txt
223
+ """
224
+ source = some_join(some_path, "source")
225
+ for i in range(10):
226
+ hashed_i = md5(str(i).encode("utf-8")).hexdigest()
227
+ path = some_join(source, f"{hashed_i}.txt")
228
+ some_fs.pipe(path=path, value=f"{i}".encode("utf-8"))
229
+ return source
230
+
89
231
 
90
232
  class AbstractFixtures(BaseAbstractFixtures):
91
233
  """
@@ -133,8 +275,13 @@ class AbstractFixtures(BaseAbstractFixtures):
133
275
  def local_path(self, tmpdir):
134
276
  return tmpdir
135
277
 
278
+ @pytest.fixture
136
279
  def supports_empty_directories(self):
137
280
  """
138
281
  Return whether this implementation supports empty directories.
139
282
  """
140
283
  return True
284
+
285
+ @pytest.fixture
286
+ def fs_sanitize_path(self):
287
+ return lambda x: x