megfile 3.1.1__py3-none-any.whl → 3.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. docs/conf.py +2 -4
  2. megfile/__init__.py +394 -203
  3. megfile/cli.py +258 -238
  4. megfile/config.py +25 -21
  5. megfile/errors.py +124 -114
  6. megfile/fs.py +174 -140
  7. megfile/fs_path.py +462 -354
  8. megfile/hdfs.py +133 -101
  9. megfile/hdfs_path.py +290 -236
  10. megfile/http.py +15 -14
  11. megfile/http_path.py +111 -107
  12. megfile/interfaces.py +70 -65
  13. megfile/lib/base_prefetch_reader.py +84 -65
  14. megfile/lib/combine_reader.py +12 -12
  15. megfile/lib/compare.py +17 -13
  16. megfile/lib/compat.py +1 -5
  17. megfile/lib/fnmatch.py +29 -30
  18. megfile/lib/glob.py +46 -54
  19. megfile/lib/hdfs_prefetch_reader.py +40 -25
  20. megfile/lib/hdfs_tools.py +1 -3
  21. megfile/lib/http_prefetch_reader.py +69 -46
  22. megfile/lib/joinpath.py +5 -5
  23. megfile/lib/lazy_handler.py +7 -3
  24. megfile/lib/s3_buffered_writer.py +58 -51
  25. megfile/lib/s3_cached_handler.py +13 -14
  26. megfile/lib/s3_limited_seekable_writer.py +37 -28
  27. megfile/lib/s3_memory_handler.py +34 -30
  28. megfile/lib/s3_pipe_handler.py +24 -25
  29. megfile/lib/s3_prefetch_reader.py +71 -52
  30. megfile/lib/s3_share_cache_reader.py +37 -24
  31. megfile/lib/shadow_handler.py +7 -3
  32. megfile/lib/stdio_handler.py +9 -8
  33. megfile/lib/url.py +3 -3
  34. megfile/pathlike.py +259 -228
  35. megfile/s3.py +220 -153
  36. megfile/s3_path.py +977 -802
  37. megfile/sftp.py +190 -156
  38. megfile/sftp_path.py +540 -450
  39. megfile/smart.py +397 -330
  40. megfile/smart_path.py +100 -105
  41. megfile/stdio.py +10 -9
  42. megfile/stdio_path.py +32 -35
  43. megfile/utils/__init__.py +73 -54
  44. megfile/utils/mutex.py +11 -14
  45. megfile/version.py +1 -1
  46. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
  47. megfile-3.1.2.dist-info/RECORD +55 -0
  48. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
  49. scripts/convert_results_to_sarif.py +45 -78
  50. scripts/generate_file.py +140 -64
  51. megfile-3.1.1.dist-info/RECORD +0 -55
  52. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
  53. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
  54. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
  55. {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/hdfs_path.py CHANGED
@@ -13,17 +13,16 @@ from megfile.lib.glob import FSFunc, iglob
  from megfile.lib.hdfs_prefetch_reader import HdfsPrefetchReader
  from megfile.lib.hdfs_tools import hdfs_api
  from megfile.lib.url import get_url_scheme
- from megfile.pathlike import PathLike, URIPath
  from megfile.smart_path import SmartPath
  from megfile.utils import _is_pickle

  __all__ = [
- 'HdfsPath',
- 'is_hdfs',
- 'hdfs_glob',
- 'hdfs_glob_stat',
- 'hdfs_iglob',
- 'hdfs_makedirs',
+ "HdfsPath",
+ "is_hdfs",
+ "hdfs_glob",
+ "hdfs_glob_stat",
+ "hdfs_iglob",
+ "hdfs_makedirs",
  ]

  HDFS_USER = "HDFS_USER"
@@ -37,39 +36,39 @@ DEFAULT_HDFS_TIMEOUT = 10


  def is_hdfs(path: PathLike) -> bool:
- '''Test if a path is sftp path
+ """Test if a path is sftp path

  :param path: Path to be tested
  :returns: True of a path is sftp path, else False
- '''
+ """
  return fspath(path).startswith("hdfs://")


  def get_hdfs_config(profile_name: Optional[str] = None):
  env_profile = f"{profile_name.upper()}__" if profile_name else ""
  config = {
- 'user': os.getenv(f"{env_profile}{HDFS_USER}"),
- 'url': os.getenv(f"{env_profile}{HDFS_URL}"),
- 'root': os.getenv(f"{env_profile}{HDFS_ROOT}"),
- 'timeout': DEFAULT_HDFS_TIMEOUT,
- 'token': os.getenv(f"{env_profile}{HDFS_TOKEN}"),
+ "user": os.getenv(f"{env_profile}{HDFS_USER}"),
+ "url": os.getenv(f"{env_profile}{HDFS_URL}"),
+ "root": os.getenv(f"{env_profile}{HDFS_ROOT}"),
+ "timeout": DEFAULT_HDFS_TIMEOUT,
+ "token": os.getenv(f"{env_profile}{HDFS_TOKEN}"),
  }
  timeout_env = f"{env_profile}{HDFS_TIMEOUT}"
  if os.getenv(timeout_env):
- config['timeout'] = int(os.environ[timeout_env])
+ config["timeout"] = int(os.environ[timeout_env])

- config_path = os.getenv(HDFS_CONFIG_PATH) or os.path.expanduser(
- '~/.hdfscli.cfg')
+ config_path = os.getenv(HDFS_CONFIG_PATH) or os.path.expanduser("~/.hdfscli.cfg")
  if os.path.exists(config_path):
  all_config = hdfs_api.config.Config(path=config_path)
  if not profile_name:
- if (all_config.has_section(all_config.global_section) and
- all_config.has_option(all_config.global_section,
- 'default.alias')):
+ if all_config.has_section(
+ all_config.global_section
+ ) and all_config.has_option(all_config.global_section, "default.alias"):
  profile_name = all_config.get(
- all_config.global_section, 'default.alias')
- for suffix in ('.alias', '_alias'):
- section = '{}{}'.format(profile_name, suffix)
+ all_config.global_section, "default.alias"
+ )
+ for suffix in (".alias", "_alias"):
+ section = "{}{}".format(profile_name, suffix)
  if all_config.has_section(section):
  options = dict(all_config.items(section))
  for key, value in config.items():
@@ -77,7 +76,7 @@ def get_hdfs_config(profile_name: Optional[str] = None):
  config[key] = options[key]
  break

- if config['url']:
+ if config["url"]:
  return config

  raise hdfs_api.HdfsError(
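
Editorial note: the hunks above keep get_hdfs_config's resolution order intact — profile-prefixed environment variables are read first, a {PROFILE}__HDFS_TIMEOUT value overrides DEFAULT_HDFS_TIMEOUT, and the hdfscli config file (HDFS_CONFIG_PATH or ~/.hdfscli.cfg) fills in anything still missing. A minimal sketch of driving it through the environment, assuming a reachable WebHDFS endpoint; the URLs and user below are placeholders, not part of the diff:

    # Illustrative only; endpoint URLs and user names are made up.
    import os
    from megfile.hdfs_path import get_hdfs_config

    os.environ["HDFS_URL"] = "http://namenode:50070"           # default profile
    os.environ["HDFS_USER"] = "data-user"
    os.environ["HDFS_TIMEOUT"] = "30"                           # overrides DEFAULT_HDFS_TIMEOUT
    os.environ["BACKUP__HDFS_URL"] = "http://backup-nn:50070"   # "backup" profile

    config = get_hdfs_config()           # dict with "user", "url", "root", "timeout", "token"
    backup = get_hdfs_config("backup")   # profile name is upper-cased to build the env prefix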
@@ -88,69 +87,78 @@ def get_hdfs_config(profile_name: Optional[str] = None):
  @lru_cache()
  def get_hdfs_client(profile_name: Optional[str] = None):
  if not hdfs_api: # pragma: no cover
- raise ImportError(
- "hdfs not found, please `pip install 'megfile[hdfs]'`")
+ raise ImportError("hdfs not found, please `pip install 'megfile[hdfs]'`")

  config = get_hdfs_config(profile_name)
- if config['token']:
- config.pop('user', None)
+ if config["token"]:
+ config.pop("user", None)
  return hdfs_api.TokenClient(**config)
- config.pop('token', None)
+ config.pop("token", None)
  return hdfs_api.InsecureClient(**config)


  def hdfs_glob(
- path: PathLike,
- recursive: bool = True,
- missing_ok: bool = True,
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
  ) -> List[str]:
- '''Return hdfs path list in ascending alphabetical order, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+ """Return hdfs path list in ascending alphabetical order,
+ in which path matches glob pattern
+
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError

  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
  :returns: A list contains paths match `path`
- '''
+ """
  return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))


  def hdfs_glob_stat(
- path: PathLike,
- recursive: bool = True,
- missing_ok: bool = True) -> Iterator[FileEntry]:
- '''Return a generator contains tuples of path and file stat, in ascending alphabetical order, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
+ ) -> Iterator[FileEntry]:
+ """Return a generator contains tuples of path and file stat,
+ in ascending alphabetical order, in which path matches glob pattern
+
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError

  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
- :returns: A generator contains tuples of path and file stat, in which paths match `path`
- '''
+ :returns: A generator contains tuples of path and file stat,
+ in which paths match `path`
+ """
  return HdfsPath(path).glob_stat(
- pattern="", recursive=recursive, missing_ok=missing_ok)
+ pattern="", recursive=recursive, missing_ok=missing_ok
+ )


  def hdfs_iglob(
- path: PathLike,
- recursive: bool = True,
- missing_ok: bool = True,
+ path: PathLike, recursive: bool = True, missing_ok: bool = True
  ) -> Iterator[str]:
- '''Return hdfs path iterator in ascending alphabetical order, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
+ """Return hdfs path iterator in ascending alphabetical order,
+ in which path matches glob pattern
+
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError

  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
  :returns: An iterator contains paths match `path`
- '''
- for path_obj in HdfsPath(path).iglob(pattern="", recursive=recursive,
- missing_ok=missing_ok):
+ """
+ for path_obj in HdfsPath(path).iglob(
+ pattern="", recursive=recursive, missing_ok=missing_ok
+ ):
  yield path_obj.path_with_protocol


  def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
- '''
+ """
  Create an hdfs directory.
  Purely creating directory is invalid because it's unavailable on OSS.
  This function is to test the target bucket have WRITE access.
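
Editorial note: the module-level helpers above wrap HdfsPath's glob machinery — hdfs_glob returns a list of path strings, hdfs_iglob yields them lazily, and hdfs_glob_stat yields FileEntry objects carrying a StatResult. A hedged usage sketch; the hdfs:// paths are placeholders, not part of the diff:

    # Illustrative only; the hdfs:// paths are made up.
    from megfile.hdfs_path import hdfs_glob, hdfs_glob_stat

    # Matching paths as a list of strings, in ascending alphabetical order.
    paths = hdfs_glob("hdfs://data/logs/2024-*/**/*.json")

    # missing_ok=False raises FileNotFoundError when nothing matches.
    for entry in hdfs_glob_stat("hdfs://data/logs/*.json", missing_ok=False):
        print(entry.path, entry.stat.size)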
@@ -158,7 +166,7 @@ def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
  :param path: Given path
  :param exist_ok: If False and target directory exists, raise S3FileExistsError
  :raises: FileExistsError
- '''
+ """
  return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)


@@ -171,7 +179,7 @@ class HdfsPath(URIPath):
  protocol = get_url_scheme(self.path)
  self._protocol_with_profile = self.protocol
  self._profile_name = None
- if protocol.startswith('hdfs+'):
+ if protocol.startswith("hdfs+"):
  self._protocol_with_profile = protocol
  self._profile_name = protocol[5:]

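
Editorial note: the constructor logic above is what makes profile-qualified URLs work — a path such as hdfs+backup://... keeps "hdfs+backup" as its protocol and records "backup" as the profile name used when the client is built. A short sketch, assuming a "backup" profile is configured; paths are placeholders, not part of the diff:

    # Illustrative only.
    from megfile.hdfs_path import HdfsPath, is_hdfs

    is_hdfs("hdfs://data/file.txt")               # True; checks the "hdfs://" prefix only
    p = HdfsPath("hdfs+backup://data/file.txt")
    print(p.path_with_protocol)                    # "hdfs+backup://data/file.txt"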
@@ -181,140 +189,155 @@ class HdfsPath(URIPath):

  @cached_property
  def path_with_protocol(self) -> str:
- '''Return path with protocol, like hdfs://path'''
+ """Return path with protocol, like hdfs://path"""
  path = self.path
  protocol_prefix = self._protocol_with_profile + "://"
  if path.startswith(protocol_prefix):
  return path
- return protocol_prefix + path.lstrip('/')
+ return protocol_prefix + path.lstrip("/")

  @cached_property
  def path_without_protocol(self) -> str:
- '''Return path without protocol, example: if path is hdfs://path, return path'''
+ """Return path without protocol, example: if path is hdfs://path, return path"""
  path = self.path
  protocol_prefix = self._protocol_with_profile + "://"
  if path.startswith(protocol_prefix):
- path = path[len(protocol_prefix):]
+ path = path[len(protocol_prefix) :]
  return path

  @cached_property
  def parts(self) -> Tuple[str, ...]:
- '''A tuple giving access to the path’s various components'''
+ """A tuple giving access to the path’s various components"""
  parts = [f"{self._protocol_with_profile}://"]
  path = self.path_without_protocol
- path = path.lstrip('/')
- if path != '':
- parts.extend(path.split('/'))
+ path = path.lstrip("/")
+ if path != "":
+ parts.extend(path.split("/"))
  return tuple(parts)

  def exists(self, followlinks: bool = False) -> bool:
- '''
+ """
  Test if path exists

  If the bucket of path are not permitted to read, return False

  :returns: True if path exists, else False
- '''
- return bool(
- self._client.status(self.path_without_protocol, strict=False))
+ """
+ return bool(self._client.status(self.path_without_protocol, strict=False))

  def stat(self, follow_symlinks=True) -> StatResult:
- '''
- Get StatResult of path file, including file size and mtime, referring to hdfs_getsize and hdfs_getmtime
+ """
+ Get StatResult of path file, including file size and mtime,
+ referring to hdfs_getsize and hdfs_getmtime
+
+ If path is not an existent path, which means hdfs_exist(path) returns False,
+ then raise FileNotFoundError

- If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
- If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://', raise BucketNotFoundError
+ If attempt to get StatResult of complete hdfs, such as hdfs_dir_url == 'hdfs://',
+ raise BucketNotFoundError

  :returns: StatResult
  :raises: FileNotFoundError
- '''
+ """
  with raise_hdfs_error(self.path_with_protocol):
  stat_data = self._client.status(self.path_without_protocol)
  return StatResult(
- size=stat_data['length'],
- mtime=stat_data['modificationTime'] / 1000,
- isdir=stat_data['type'] == 'DIRECTORY',
+ size=stat_data["length"],
+ mtime=stat_data["modificationTime"] / 1000,
+ isdir=stat_data["type"] == "DIRECTORY",
  islnk=False,
  extra=stat_data,
  )

  def getmtime(self, follow_symlinks: bool = False) -> float:
- '''
- Get last-modified time of the file on the given path path (in Unix timestamp format).
- If the path is an existent directory, return the latest modified time of all file in it. The mtime of empty directory is 1970-01-01 00:00:00
+ """
+ Get last-modified time of the file on the given path path (in Unix timestamp
+ format).
+ If the path is an existent directory, return the latest modified time of all
+ file in it. The mtime of empty directory is 1970-01-01 00:00:00

- If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
+ If path is not an existent path, which means hdfs_exist(path) returns False,
+ then raise FileNotFoundError

  :returns: Last-modified time
  :raises: FileNotFoundError
- '''
+ """
  return self.stat(follow_symlinks=follow_symlinks).mtime

  def getsize(self, follow_symlinks: bool = False) -> int:
- '''
+ """
  Get file size on the given path path (in bytes).
- If the path in a directory, return the sum of all file size in it, including file in subdirectories (if exist).
- The result excludes the size of directory itself. In other words, return 0 Byte on an empty directory path.
+ If the path in a directory, return the sum of all file size in it,
+ including file in subdirectories (if exist).

- If path is not an existent path, which means hdfs_exist(path) returns False, then raise FileNotFoundError
+ The result excludes the size of directory itself. In other words,
+ return 0 Byte on an empty directory path.
+
+ If path is not an existent path, which means hdfs_exist(path) returns False,
+ then raise FileNotFoundError

  :returns: File size
  :raises: FileNotFoundError
- '''
+ """
  return self.stat(follow_symlinks=follow_symlinks).size

  def glob(
- self,
- pattern,
- recursive: bool = True,
- missing_ok: bool = True,
- ) -> List['HdfsPath']:
- '''Return hdfs path list, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
-
- :param pattern: Glob the given relative pattern in the directory represented by this path
+ self, pattern, recursive: bool = True, missing_ok: bool = True
+ ) -> List["HdfsPath"]:
+ """Return hdfs path list, in which path matches glob pattern
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError
+
+ :param pattern: Glob the given relative pattern in the directory represented
+ by this path
  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
  :returns: A list contains paths match `hdfs_pathname`
- '''
+ """
  return list(
- self.iglob(
- pattern=pattern, recursive=recursive, missing_ok=missing_ok))
+ self.iglob(pattern=pattern, recursive=recursive, missing_ok=missing_ok)
+ )

  def glob_stat(
- self,
- pattern,
- recursive: bool = True,
- missing_ok: bool = True) -> Iterator[FileEntry]:
- '''Return a generator contains tuples of path and file stat, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
-
- :param pattern: Glob the given relative pattern in the directory represented by this path
+ self, pattern, recursive: bool = True, missing_ok: bool = True
+ ) -> Iterator[FileEntry]:
+ """Return a generator contains tuples of path and file stat,
+ in which path matches glob pattern
+
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError
+
+ :param pattern: Glob the given relative pattern in the directory represented
+ by this path
  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
- :returns: A generator contains tuples of path and file stat, in which paths match `hdfs_pathname`
- '''
- for path_obj in self.iglob(pattern=pattern, recursive=recursive,
- missing_ok=missing_ok):
+ :returns: A generator contains tuples of path and file stat,
+ in which paths match `hdfs_pathname`
+ """
+ for path_obj in self.iglob(
+ pattern=pattern, recursive=recursive, missing_ok=missing_ok
+ ):
  yield FileEntry(path_obj.name, path_obj.path, path_obj.stat())

  def iglob(
- self,
- pattern,
- recursive: bool = True,
- missing_ok: bool = True,
- ) -> Iterator['HdfsPath']:
- '''Return hdfs path iterator, in which path matches glob pattern
- Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
-
- :param pattern: Glob the given relative pattern in the directory represented by this path
+ self, pattern, recursive: bool = True, missing_ok: bool = True
+ ) -> Iterator["HdfsPath"]:
+ """Return hdfs path iterator, in which path matches glob pattern
+ Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+ raise UnsupportedError
+
+ :param pattern: Glob the given relative pattern in the directory represented
+ by this path
  :param recursive: If False, `**` will not search directory recursively
- :param missing_ok: If False and target path doesn't match any file, raise FileNotFoundError
+ :param missing_ok: If False and target path doesn't match any file,
+ raise FileNotFoundError
  :raises: UnsupportedError, when bucket part contains wildcard characters
  :returns: An iterator contains paths match `hdfs_pathname`
- '''
+ """
  glob_path = self.path_with_protocol
  if pattern:
  glob_path = self.joinpath(pattern).path_with_protocol
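
Editorial note: with the metadata methods above, size, mtime and type all come from a single WebHDFS status call wrapped into a StatResult, and getsize()/getmtime() are thin wrappers over stat(). A hedged sketch of reading metadata; the path is a placeholder, not part of the diff:

    # Illustrative only.
    from megfile.hdfs_path import HdfsPath

    p = HdfsPath("hdfs://data/reports/2024.csv")
    if p.exists():
        st = p.stat()                       # StatResult(size=..., mtime=..., isdir=...)
        print(st.size, st.mtime, st.is_dir())
        print(p.getsize(), p.getmtime())    # convenience wrappers over stat()
        print(p.parts)                      # ('hdfs://', 'data', 'reports', '2024.csv')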
@@ -331,91 +354,93 @@ class HdfsPath(URIPath):

  fs_func = FSFunc(_exist, _is_dir, _scandir)
  for real_path in _create_missing_ok_generator(
- iglob(fspath(glob_path), recursive=recursive,
- fs=fs_func), missing_ok,
- FileNotFoundError('No match any file: %r' % glob_path)):
+ iglob(fspath(glob_path), recursive=recursive, fs=fs_func),
+ missing_ok,
+ FileNotFoundError("No match any file: %r" % glob_path),
+ ):
  yield self.from_path(real_path)

  def is_dir(self, followlinks: bool = False) -> bool:
- '''
+ """
  Test if an hdfs url is directory
  Specific procedures are as follows:
  If there exists a suffix, of which ``os.path.join(path, suffix)`` is a file
  If the url is empty bucket or hdfs://

- :param followlinks: whether followlinks is True or False, result is the same. Because hdfs symlink not support dir.
+ :param followlinks: whether followlinks is True or False, result is the same.
+ Because hdfs symlink not support dir.
  :returns: True if path is hdfs directory, else False
- '''
+ """
  return self.stat().is_dir()

  def is_file(self, followlinks: bool = False) -> bool:
- '''
+ """
  Test if an path is file

  :returns: True if path is hdfs file, else False
- '''
+ """
  return self.stat().is_file()

  def listdir(self, followlinks: bool = False) -> List[str]:
- '''
+ """
  Get all contents of given path.

  :returns: All contents have prefix of path.
  :raises: FileNotFoundError, NotADirectoryError
- '''
+ """
  if not self.is_dir():
- raise NotADirectoryError('Not a directory: %r' % self.path)
+ raise NotADirectoryError("Not a directory: %r" % self.path)
  with raise_hdfs_error(self.path_with_protocol):
  return self._client.list(self.path_without_protocol)

- def iterdir(self, followlinks: bool = False) -> Iterator['HdfsPath']:
- '''
+ def iterdir(self, followlinks: bool = False) -> Iterator["HdfsPath"]:
+ """
  Get all contents of given path.

  :returns: All contents have prefix of path.
  :raises: FileNotFoundError, NotADirectoryError
- '''
+ """
  for filename in self.listdir(followlinks=followlinks):
  yield self.joinpath(filename)

  def load(self, followlinks: bool = False) -> BinaryIO:
- '''Read all content in binary on specified path and write into memory
+ """Read all content in binary on specified path and write into memory

  User should close the BinaryIO manually

  :returns: BinaryIO
- '''
+ """

  buffer = io.BytesIO()
- with self.open('rb') as f:
+ with self.open("rb") as f:
  buffer.write(f.read())
  buffer.seek(0)
  return buffer

  def mkdir(self, mode=0o777, parents: bool = False, exist_ok: bool = False):
- '''
+ """
  Create an hdfs directory.
  Purely creating directory is invalid because it's unavailable on OSS.
  This function is to test the target bucket have WRITE access.

- :param mode: Octal permission to set on the newly created directory.
+ :param mode: Octal permission to set on the newly created directory.
  These permissions will only be set on directories that do not already exist.
  :param parents: parents is ignored, only be compatible with pathlib.Path
  :param exist_ok: If False and target directory exists, raise FileExistsError
  :raises: BucketNotFoundError, FileExistsError
- '''
+ """
  if not exist_ok and self.exists():
- raise FileExistsError('File exists: %r' % self.path)
+ raise FileExistsError("File exists: %r" % self.path)
  with raise_hdfs_error(self.path_with_protocol):
  self._client.makedirs(self.path_without_protocol, permission=mode)

- def rename(self, dst_path: PathLike, overwrite: bool = True) -> 'HdfsPath':
- '''
+ def rename(self, dst_path: PathLike, overwrite: bool = True) -> "HdfsPath":
+ """
  Move hdfs file path from src_path to dst_path

  :param dst_path: Given destination path
  :param overwrite: whether or not overwrite file when exists
- '''
+ """
  dst_path = self.from_path(dst_path)
  if self.is_dir():
  for filename in self.iterdir():
@@ -426,26 +451,28 @@ class HdfsPath(URIPath):
  if overwrite or not dst_path.exists():
  with raise_hdfs_error(self.path_with_protocol):
  self._client.rename(
- self.path_without_protocol,
- dst_path.path_without_protocol)
+ self.path_without_protocol, dst_path.path_without_protocol
+ )
  self.remove(missing_ok=True)
  return dst_path

  def move(self, dst_path: PathLike, overwrite: bool = True) -> None:
- '''
+ """
  Move file/directory path from src_path to dst_path

  :param dst_path: Given destination path
- '''
+ """
  self.rename(dst_path=dst_path, overwrite=overwrite)

  def remove(self, missing_ok: bool = False) -> None:
- '''
- Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not permitted to remove
+ """
+ Remove the file or directory on hdfs, `hdfs://` and `hdfs://bucket` are not
+ permitted to remove

- :param missing_ok: if False and target file/directory not exists, raise FileNotFoundError
+ :param missing_ok: if False and target file/directory not exists,
+ raise FileNotFoundError
  :raises: FileNotFoundError, UnsupportedError
- '''
+ """
  try:
  with raise_hdfs_error(self.path_with_protocol):
  self._client.delete(self.path_without_protocol, recursive=True)
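
Editorial note: the hunks above reflow mkdir(), rename(), move() and remove() without changing behaviour — rename() returns the destination as a new HdfsPath, move() simply delegates to it, and remove() deletes files or directories recursively. A hedged sketch; paths are placeholders, not part of the diff:

    # Illustrative only.
    from megfile.hdfs_path import HdfsPath

    out_dir = HdfsPath("hdfs://data/output")
    out_dir.mkdir(parents=True, exist_ok=True)            # parents accepted for pathlib compatibility

    tmp = HdfsPath("hdfs://data/output/tmp.txt")
    final = tmp.rename("hdfs://data/output/final.txt")    # returns the destination HdfsPath
    final.remove(missing_ok=True)                         # no error if already gone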
@@ -453,10 +480,8 @@ class HdfsPath(URIPath):
  if not missing_ok or not isinstance(e, FileNotFoundError):
  raise

- def scan(self,
- missing_ok: bool = True,
- followlinks: bool = False) -> Iterator[str]:
- '''
+ def scan(self, missing_ok: bool = True, followlinks: bool = False) -> Iterator[str]:
+ """
  Iteratively traverse only files in given hdfs directory.
  Every iteration on generator yields a path string.

@@ -464,84 +489,94 @@ class HdfsPath(URIPath):
  If path is a non-existent path, return an empty generator
  If path is a bucket path, return all file paths in the bucket
  If path is an empty bucket, return an empty generator
- If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile
+ If path doesn't contain any bucket, which is path == 'hdfs://',
+ raise UnsupportedError. walk() on complete hdfs is not supported in megfile

- :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+ :param missing_ok: If False and there's no file in the directory,
+ raise FileNotFoundError
  :raises: UnsupportedError
  :returns: A file path generator
- '''
- for file_entry in self.scan_stat(missing_ok=missing_ok,
- followlinks=followlinks):
+ """
+ for file_entry in self.scan_stat(
+ missing_ok=missing_ok, followlinks=followlinks
+ ):
  yield file_entry.path

- def scan_stat(self,
- missing_ok: bool = True,
- followlinks: bool = False) -> Iterator[FileEntry]:
- '''
+ def scan_stat(
+ self, missing_ok: bool = True, followlinks: bool = False
+ ) -> Iterator[FileEntry]:
+ """
  Iteratively traverse only files in given directory.
  Every iteration on generator yields a tuple of path string and file stat

- :param missing_ok: If False and there's no file in the directory, raise FileNotFoundError
+ :param missing_ok: If False and there's no file in the directory,
+ raise FileNotFoundError
  :raises: UnsupportedError
  :returns: A file path generator
- '''
+ """
  with raise_hdfs_error(self.path_with_protocol):
- for (root,
- _root_status), _dir_infos, file_infos in self._client.walk(
- self.path_without_protocol, status=True,
- ignore_missing=missing_ok):
+ for (root, _root_status), _dir_infos, file_infos in self._client.walk(
+ self.path_without_protocol, status=True, ignore_missing=missing_ok
+ ):
  for filename, stat_data in file_infos:
  yield FileEntry(
  name=filename,
  path=self.from_path(
  f"{self._protocol_with_profile}://{root.lstrip('/')}"
- ).joinpath(filename).path_with_protocol,
+ )
+ .joinpath(filename)
+ .path_with_protocol,
  stat=StatResult(
- size=stat_data['length'],
- mtime=stat_data['modificationTime'] / 1000,
+ size=stat_data["length"],
+ mtime=stat_data["modificationTime"] / 1000,
  isdir=False,
  islnk=False,
  extra=stat_data,
- ))
+ ),
+ )

  def scandir(self, followlinks: bool = False) -> Iterator[FileEntry]:
- '''
+ """
  Get all contents of given path, the order of result is not guaranteed.

  :returns: All contents have prefix of path
  :raises: FileNotFoundError, NotADirectoryError
- '''
+ """
  with raise_hdfs_error(self.path_with_protocol):
  for filename, stat_data in self._client.list(
- self.path_without_protocol, status=True):
+ self.path_without_protocol, status=True
+ ):
  yield FileEntry(
  name=filename,
  path=self.joinpath(filename).path_with_protocol,
  stat=StatResult(
- size=stat_data['length'],
- mtime=stat_data['modificationTime'] / 1000,
- isdir=stat_data['type'] == 'DIRECTORY',
+ size=stat_data["length"],
+ mtime=stat_data["modificationTime"] / 1000,
+ isdir=stat_data["type"] == "DIRECTORY",
  islnk=False,
  extra=stat_data,
- ))
+ ),
+ )

  def unlink(self, missing_ok: bool = False) -> None:
- '''
+ """
  Remove the file on hdfs

  :param missing_ok: if False and target file not exists, raise FileNotFoundError
  :raises: FileNotFoundError, IsADirectoryError
- '''
+ """
  if self.is_dir():
- raise IsADirectoryError('Path is a directory: %r' % self.path)
+ raise IsADirectoryError("Path is a directory: %r" % self.path)
  self.remove(missing_ok=missing_ok)

  def walk(
- self,
- followlinks: bool = False
+ self, followlinks: bool = False
  ) -> Iterator[Tuple[str, List[str], List[str]]]:
- '''
- Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories.
+ """
+ Iteratively traverse the given hdfs directory, in top-bottom order.
+ In other words, firstly traverse parent directory, if subdirectories exist,
+ traverse the subdirectories.
+

  Every iteration on generator yields a 3-tuple: (root, dirs, files)
  - root: Current hdfs path;
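
Editorial note: scan() and scan_stat() above traverse the tree through the client's walk() with status=True, so each yielded FileEntry carries the stat data returned by the listing rather than issuing a separate status request per file. A hedged sketch of the traversal helpers; the path is a placeholder, not part of the diff:

    # Illustrative only.
    from megfile.hdfs_path import HdfsPath

    root = HdfsPath("hdfs://data/raw")

    for file_path in root.scan():           # recursive, files only
        print(file_path)

    for entry in root.scandir():            # direct children, order not guaranteed
        print(entry.name, entry.stat.is_dir())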
@@ -549,68 +584,81 @@ class HdfsPath(URIPath):
  - files: Name list of files in current directory.

  If path is a file path, return an empty generator
+
  If path is a non-existent path, return an empty generator
- If path is a bucket path, bucket will be the top directory, and will be returned at first iteration of generator
- If path is an empty bucket, only yield one 3-tuple (notes: hdfs doesn't have empty directory)
- If path doesn't contain any bucket, which is path == 'hdfs://', raise UnsupportedError. walk() on complete hdfs is not supported in megfile

- :param followlinks: whether followlinks is True or False, result is the same. Because hdfs not support symlink.
+ If path is a bucket path, bucket will be the top directory,
+ and will be returned at first iteration of generator
+
+ If path is an empty bucket, only yield one 3-tuple
+ (notes: hdfs doesn't have empty directory)
+
+ If path doesn't contain any bucket, which is path == 'hdfs://',
+ raise UnsupportedError. walk() on complete hdfs is not supported in megfile
+
+ :param followlinks: whether followlinks is True or False, result is the same.
+ Because hdfs not support symlink.
  :returns: A 3-tuple generator
- '''
+ """
  with raise_hdfs_error(self.path_with_protocol):
  for path, dirs, files in self._client.walk(
- self.path_without_protocol, ignore_missing=True,
- allow_dir_changes=True):
+ self.path_without_protocol, ignore_missing=True, allow_dir_changes=True
+ ):
  yield f"{self._protocol_with_profile}://{path.lstrip('/')}", dirs, files

  def md5(self, recalculate: bool = False, followlinks: bool = False) -> str:
- '''
+ """
  Get checksum of the file or dir.

  :param recalculate: Ignore this parameter, just for compatibility
  :param followlinks: Ignore this parameter, just for compatibility
  :returns: checksum
- '''
+ """
  if self.is_dir(followlinks=followlinks):
  hash_md5 = hashlib.md5() # nosec
  for file_name in self.listdir():
- chunk = self.joinpath(file_name).md5(
- recalculate=recalculate).encode()
+ chunk = self.joinpath(file_name).md5(recalculate=recalculate).encode()
  hash_md5.update(chunk)
  return hash_md5.hexdigest()
  with raise_hdfs_error(self.path_with_protocol):
- return self._client.checksum(self.path_without_protocol)['bytes']
+ return self._client.checksum(self.path_without_protocol)["bytes"]

  def save(self, file_object: BinaryIO):
- '''Write the opened binary stream to specified path, but the stream won't be closed
+ """Write the opened binary stream to specified path,
+ but the stream won't be closed

  :param file_object: Stream to be read
- '''
+ """
  with raise_hdfs_error(self.path_with_protocol):
  self._client.write(
- self.path_without_protocol, overwrite=True, data=file_object)
+ self.path_without_protocol, overwrite=True, data=file_object
+ )

  def open(
- self,
- mode: str = 'r',
- *,
- buffering: Optional[int] = None,
- encoding: Optional[str] = None,
- errors: Optional[str] = None,
- **kwargs) -> IO:
- if '+' in mode:
- raise ValueError('unacceptable mode: %r' % mode)
-
- if 'b' in mode:
+ self,
+ mode: str = "r",
+ *,
+ buffering: Optional[int] = None,
+ encoding: Optional[str] = None,
+ errors: Optional[str] = None,
+ **kwargs,
+ ) -> IO:
+ if "+" in mode:
+ raise ValueError("unacceptable mode: %r" % mode)
+
+ if "b" in mode:
  encoding = None
  elif not encoding:
  encoding = sys.getdefaultencoding()

  with raise_hdfs_error(self.path_with_protocol):
- if mode in ('r', 'rb'):
+ if mode in ("r", "rb"):
  keys = [
- 'block_size', 'block_capacity', 'block_forward',
- 'max_retries', 'max_workers'
+ "block_size",
+ "block_capacity",
+ "block_forward",
+ "max_retries",
+ "max_workers",
  ]
  input_kwargs = {}
  for key in keys:
@@ -620,33 +668,39 @@ class HdfsPath(URIPath):
  hdfs_path=self.path_without_protocol,
  client=self._client,
  profile_name=self._profile_name,
- **input_kwargs)
+ **input_kwargs,
+ )
  if _is_pickle(file_obj):
  file_obj = io.BufferedReader(file_obj) # type: ignore
- if 'b' not in mode:
+ if "b" not in mode:
  file_obj = io.TextIOWrapper(
- file_obj, encoding=encoding, errors=errors)
+ file_obj, encoding=encoding, errors=errors
+ )
  file_obj.mode = mode # pyre-ignore[41]
  return file_obj
- elif mode in ('w', 'wb'):
+ elif mode in ("w", "wb"):
  return self._client.write(
  self.path_without_protocol,
  overwrite=True,
  buffersize=buffering,
- encoding=encoding)
- elif mode in ('a', 'ab'):
+ encoding=encoding,
+ )
+ elif mode in ("a", "ab"):
  return self._client.write(
  self.path_without_protocol,
  append=True,
  buffersize=buffering,
- encoding=encoding)
- raise ValueError('unacceptable mode: %r' % mode)
-
- def absolute(self) -> 'HdfsPath':
- '''
- Make the path absolute, without normalization or resolving symlinks. Returns a new path object
- '''
+ encoding=encoding,
+ )
+ raise ValueError("unacceptable mode: %r" % mode)
+
+ def absolute(self) -> "HdfsPath":
+ """
+ Make the path absolute, without normalization or resolving symlinks.
+ Returns a new path object
+ """
  with raise_hdfs_error(self.path_with_protocol):
  real_path = self._client.resolve(self.path_without_protocol)
  return self.from_path(
- f"{self._protocol_with_profile}:///{real_path.lstrip('/')}")
+ f"{self._protocol_with_profile}:///{real_path.lstrip('/')}"
+ )
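
Editorial note: open() above dispatches on mode — "r"/"rb" appear to go through the prefetch reader imported at the top of the module (optionally wrapped in BufferedReader/TextIOWrapper), "w"/"wb" and "a"/"ab" hand off to the client's write(), and "+" modes raise ValueError. A hedged sketch of round-tripping a small text file, assuming the client's write() is used as a context manager as in the hdfs library; the path is a placeholder, not part of the diff:

    # Illustrative only.
    from megfile.hdfs_path import HdfsPath

    p = HdfsPath("hdfs://data/notes.txt")

    with p.open("w") as f:          # truncating write; "a" would append
        f.write("hello hdfs\n")

    with p.open("r") as f:          # prefetching reader wrapped for text mode
        print(f.read())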