megfile 3.0.6.post1__py3-none-any.whl → 3.1.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +67 -0
- megfile/cli.py +16 -16
- megfile/config.py +37 -6
- megfile/errors.py +26 -20
- megfile/fs.py +13 -8
- megfile/fs_path.py +69 -49
- megfile/hdfs.py +13 -8
- megfile/hdfs_path.py +49 -41
- megfile/http.py +1 -1
- megfile/http_path.py +35 -28
- megfile/interfaces.py +119 -48
- megfile/lib/base_prefetch_reader.py +9 -8
- megfile/lib/combine_reader.py +7 -7
- megfile/lib/fnmatch.py +2 -2
- megfile/lib/glob.py +3 -3
- megfile/lib/hdfs_prefetch_reader.py +2 -1
- megfile/lib/http_prefetch_reader.py +3 -2
- megfile/lib/lazy_handler.py +6 -5
- megfile/lib/s3_buffered_writer.py +8 -7
- megfile/lib/s3_cached_handler.py +3 -4
- megfile/lib/s3_limited_seekable_writer.py +5 -3
- megfile/lib/s3_memory_handler.py +10 -6
- megfile/lib/s3_pipe_handler.py +1 -1
- megfile/lib/s3_prefetch_reader.py +7 -5
- megfile/lib/s3_share_cache_reader.py +2 -2
- megfile/lib/shadow_handler.py +5 -5
- megfile/lib/stdio_handler.py +3 -3
- megfile/pathlike.py +156 -170
- megfile/s3.py +19 -13
- megfile/s3_path.py +98 -83
- megfile/sftp.py +25 -16
- megfile/sftp_path.py +109 -94
- megfile/smart.py +38 -28
- megfile/smart_path.py +6 -6
- megfile/stdio.py +3 -3
- megfile/stdio_path.py +5 -5
- megfile/utils/__init__.py +8 -27
- megfile/version.py +1 -1
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/METADATA +4 -5
- megfile-3.1.0.post1.dist-info/RECORD +55 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/WHEEL +1 -1
- megfile-3.1.0.post1.dist-info/top_level.txt +7 -0
- scripts/convert_results_to_sarif.py +124 -0
- scripts/generate_file.py +268 -0
- megfile-3.0.6.post1.dist-info/RECORD +0 -52
- megfile-3.0.6.post1.dist-info/top_level.txt +0 -1
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/entry_points.txt +0 -0
megfile/fs_path.py
CHANGED
|
@@ -3,16 +3,17 @@ import io
|
|
|
3
3
|
import os
|
|
4
4
|
import pathlib
|
|
5
5
|
import shutil
|
|
6
|
+
from functools import cached_property
|
|
6
7
|
from stat import S_ISDIR as stat_isdir
|
|
7
8
|
from stat import S_ISLNK as stat_islnk
|
|
8
|
-
from typing import IO,
|
|
9
|
+
from typing import IO, BinaryIO, Callable, Iterator, List, Optional, Tuple, Union
|
|
9
10
|
|
|
10
11
|
from megfile.errors import _create_missing_ok_generator
|
|
11
12
|
from megfile.interfaces import Access, ContextIterator, FileEntry, PathLike, StatResult
|
|
12
13
|
from megfile.lib.compare import is_same_file
|
|
13
14
|
from megfile.lib.glob import iglob
|
|
14
15
|
from megfile.lib.url import get_url_scheme
|
|
15
|
-
from megfile.utils import
|
|
16
|
+
from megfile.utils import calculate_md5
|
|
16
17
|
|
|
17
18
|
from .interfaces import PathLike, URIPath
|
|
18
19
|
from .lib.compat import fspath
|
|
@@ -91,7 +92,8 @@ def fs_home():
|
|
|
91
92
|
return os.path.expanduser('~')
|
|
92
93
|
|
|
93
94
|
|
|
94
|
-
def fs_iglob(path: PathLike,
|
|
95
|
+
def fs_iglob(path: PathLike,
|
|
96
|
+
recursive: bool = True,
|
|
95
97
|
missing_ok: bool = True) -> Iterator[str]:
|
|
96
98
|
'''Return path iterator in ascending alphabetical order, in which path matches glob pattern
|
|
97
99
|
|
|
@@ -101,7 +103,7 @@ def fs_iglob(path: PathLike, recursive: bool = True,
|
|
|
101
103
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
102
104
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
103
105
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
104
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
106
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
105
107
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
106
108
|
|
|
107
109
|
:param recursive: If False, `**` will not search directory recursively
|
|
@@ -114,7 +116,8 @@ def fs_iglob(path: PathLike, recursive: bool = True,
|
|
|
114
116
|
yield path
|
|
115
117
|
|
|
116
118
|
|
|
117
|
-
def fs_glob(path: PathLike,
|
|
119
|
+
def fs_glob(path: PathLike,
|
|
120
|
+
recursive: bool = True,
|
|
118
121
|
missing_ok: bool = True) -> List[str]:
|
|
119
122
|
'''Return path list in ascending alphabetical order, in which path matches glob pattern
|
|
120
123
|
|
|
@@ -124,7 +127,7 @@ def fs_glob(path: PathLike, recursive: bool = True,
|
|
|
124
127
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
125
128
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
126
129
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
127
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
130
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
128
131
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
129
132
|
|
|
130
133
|
:param recursive: If False, `**` will not search directory recursively
|
|
@@ -135,7 +138,8 @@ def fs_glob(path: PathLike, recursive: bool = True,
|
|
|
135
138
|
|
|
136
139
|
|
|
137
140
|
def fs_glob_stat(
|
|
138
|
-
path: PathLike,
|
|
141
|
+
path: PathLike,
|
|
142
|
+
recursive: bool = True,
|
|
139
143
|
missing_ok: bool = True) -> Iterator[FileEntry]:
|
|
140
144
|
'''Return a list contains tuples of path and file stat, in ascending alphabetical order, in which path matches glob pattern
|
|
141
145
|
|
|
@@ -145,7 +149,7 @@ def fs_glob_stat(
|
|
|
145
149
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
146
150
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
147
151
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
148
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
152
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
149
153
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
150
154
|
|
|
151
155
|
:param recursive: If False, `**` will not search directory recursively
|
|
@@ -262,7 +266,7 @@ class FSPath(URIPath):
|
|
|
262
266
|
|
|
263
267
|
protocol = "file"
|
|
264
268
|
|
|
265
|
-
def __init__(self, path: Union[
|
|
269
|
+
def __init__(self, path: Union[PathLike, int], *other_paths: PathLike):
|
|
266
270
|
if not isinstance(path, int):
|
|
267
271
|
if len(other_paths) > 0:
|
|
268
272
|
path = self.from_path(path).joinpath(*other_paths)
|
|
@@ -272,20 +276,20 @@ class FSPath(URIPath):
|
|
|
272
276
|
def __fspath__(self) -> str:
|
|
273
277
|
return os.path.normpath(self.path_without_protocol)
|
|
274
278
|
|
|
275
|
-
@
|
|
279
|
+
@cached_property
|
|
276
280
|
def root(self) -> str:
|
|
277
281
|
return pathlib.Path(self.path_without_protocol).root
|
|
278
282
|
|
|
279
|
-
@
|
|
283
|
+
@cached_property
|
|
280
284
|
def anchor(self) -> str:
|
|
281
285
|
return pathlib.Path(self.path_without_protocol).anchor
|
|
282
286
|
|
|
283
|
-
@
|
|
287
|
+
@cached_property
|
|
284
288
|
def drive(self) -> str:
|
|
285
289
|
return pathlib.Path(self.path_without_protocol).drive
|
|
286
290
|
|
|
287
291
|
@classmethod
|
|
288
|
-
def from_uri(cls, path:
|
|
292
|
+
def from_uri(cls, path: PathLike) -> "FSPath":
|
|
289
293
|
return cls.from_path(path)
|
|
290
294
|
|
|
291
295
|
@property
|
|
@@ -293,9 +297,9 @@ class FSPath(URIPath):
|
|
|
293
297
|
if isinstance(self.path, int):
|
|
294
298
|
return self.path
|
|
295
299
|
protocol_prefix = self.protocol + "://"
|
|
296
|
-
if self.path.startswith(protocol_prefix):
|
|
297
|
-
return self.path
|
|
298
|
-
return protocol_prefix + self.path
|
|
300
|
+
if self.path.startswith(protocol_prefix): # pyre-ignore[16]
|
|
301
|
+
return self.path # pyre-ignore[7]
|
|
302
|
+
return protocol_prefix + self.path # pyre-ignore[58]
|
|
299
303
|
|
|
300
304
|
def is_absolute(self) -> bool:
|
|
301
305
|
'''Test whether a path is absolute
|
|
@@ -319,14 +323,14 @@ class FSPath(URIPath):
|
|
|
319
323
|
:param mode: access mode
|
|
320
324
|
:returns: Access: Enum, the read/write access that path has.
|
|
321
325
|
'''
|
|
322
|
-
if not isinstance(mode, Access):
|
|
323
|
-
raise TypeError(
|
|
324
|
-
'Unsupported mode: {} -- Mode should use one of the enums belonging to: {}'
|
|
325
|
-
.format(mode, ', '.join([str(a) for a in Access])))
|
|
326
326
|
if mode == Access.READ:
|
|
327
327
|
return os.access(self.path_without_protocol, os.R_OK)
|
|
328
|
-
|
|
328
|
+
elif mode == Access.WRITE:
|
|
329
329
|
return os.access(self.path_without_protocol, os.W_OK)
|
|
330
|
+
else:
|
|
331
|
+
raise TypeError(
|
|
332
|
+
'Unsupported mode: {} -- Mode should use one of the enums belonging to: {}'
|
|
333
|
+
.format(mode, ', '.join([str(a) for a in Access])))
|
|
330
334
|
|
|
331
335
|
def exists(self, followlinks: bool = False) -> bool:
|
|
332
336
|
'''
|
|
@@ -365,7 +369,9 @@ class FSPath(URIPath):
|
|
|
365
369
|
'''
|
|
366
370
|
return self.stat(follow_symlinks=follow_symlinks).size
|
|
367
371
|
|
|
368
|
-
def glob(self,
|
|
372
|
+
def glob(self,
|
|
373
|
+
pattern,
|
|
374
|
+
recursive: bool = True,
|
|
369
375
|
missing_ok: bool = True) -> List['FSPath']:
|
|
370
376
|
'''Return path list in ascending alphabetical order, in which path matches glob pattern
|
|
371
377
|
|
|
@@ -375,7 +381,7 @@ class FSPath(URIPath):
|
|
|
375
381
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
376
382
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
377
383
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
378
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
384
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
379
385
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
380
386
|
|
|
381
387
|
:param pattern: Glob the given relative pattern in the directory represented by this path
|
|
@@ -388,7 +394,9 @@ class FSPath(URIPath):
|
|
|
388
394
|
pattern=pattern, recursive=recursive, missing_ok=missing_ok))
|
|
389
395
|
|
|
390
396
|
def glob_stat(
|
|
391
|
-
self,
|
|
397
|
+
self,
|
|
398
|
+
pattern,
|
|
399
|
+
recursive: bool = True,
|
|
392
400
|
missing_ok: bool = True) -> Iterator[FileEntry]:
|
|
393
401
|
'''Return a list contains tuples of path and file stat, in ascending alphabetical order, in which path matches glob pattern
|
|
394
402
|
|
|
@@ -398,7 +406,7 @@ class FSPath(URIPath):
|
|
|
398
406
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
399
407
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
400
408
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
401
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
409
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
402
410
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
403
411
|
|
|
404
412
|
:param pattern: Glob the given relative pattern in the directory represented by this path
|
|
@@ -409,8 +417,9 @@ class FSPath(URIPath):
|
|
|
409
417
|
for path_obj in self.iglob(pattern=pattern, recursive=recursive,
|
|
410
418
|
missing_ok=missing_ok):
|
|
411
419
|
yield FileEntry(
|
|
412
|
-
path_obj.name,
|
|
413
|
-
|
|
420
|
+
path_obj.name,
|
|
421
|
+
path_obj.path, # pyre-ignore[6]
|
|
422
|
+
_make_stat(os.lstat(path_obj.path))) # pyre-ignore[6]
|
|
414
423
|
|
|
415
424
|
def expanduser(self):
|
|
416
425
|
'''Expand ~ and ~user constructions. If user or $HOME is unknown,
|
|
@@ -418,7 +427,9 @@ class FSPath(URIPath):
|
|
|
418
427
|
'''
|
|
419
428
|
return os.path.expanduser(self.path_without_protocol)
|
|
420
429
|
|
|
421
|
-
def iglob(self,
|
|
430
|
+
def iglob(self,
|
|
431
|
+
pattern,
|
|
432
|
+
recursive: bool = True,
|
|
422
433
|
missing_ok: bool = True) -> Iterator['FSPath']:
|
|
423
434
|
'''Return path iterator in ascending alphabetical order, in which path matches glob pattern
|
|
424
435
|
|
|
@@ -428,7 +439,7 @@ class FSPath(URIPath):
|
|
|
428
439
|
Assume there exists a path `/a/b/c/b/d.txt`
|
|
429
440
|
use path pattern like `/**/b/**/*.txt` to glob, the path above will be returned twice
|
|
430
441
|
3. `**` will match any matched file, directory, symlink and '' by default, when recursive is `True`
|
|
431
|
-
4. fs_glob returns same as glob.glob(pathname, recursive=True) in
|
|
442
|
+
4. fs_glob returns same as glob.glob(pathname, recursive=True) in ascending alphabetical order.
|
|
432
443
|
5. Hidden files (filename stars with '.') will not be found in the result
|
|
433
444
|
|
|
434
445
|
:param pattern: Glob the given relative pattern in the directory represented by this path
|
|
@@ -477,20 +488,20 @@ class FSPath(URIPath):
|
|
|
477
488
|
|
|
478
489
|
def listdir(self) -> List[str]:
|
|
479
490
|
'''
|
|
480
|
-
Get all contents of given fs path. The result is in
|
|
491
|
+
Get all contents of given fs path. The result is in ascending alphabetical order.
|
|
481
492
|
|
|
482
|
-
:returns: All contents have in the path in
|
|
493
|
+
:returns: All contents have in the path in ascending alphabetical order
|
|
483
494
|
'''
|
|
484
495
|
return sorted(os.listdir(self.path_without_protocol))
|
|
485
496
|
|
|
486
497
|
def iterdir(self) -> Iterator['FSPath']:
|
|
487
498
|
'''
|
|
488
|
-
Get all contents of given fs path. The result is in
|
|
499
|
+
Get all contents of given fs path. The result is in ascending alphabetical order.
|
|
489
500
|
|
|
490
|
-
:returns: All contents have in the path in
|
|
501
|
+
:returns: All contents have in the path in ascending alphabetical order
|
|
491
502
|
'''
|
|
492
503
|
for path in self.listdir():
|
|
493
|
-
yield self.joinpath(path)
|
|
504
|
+
yield self.joinpath(path)
|
|
494
505
|
|
|
495
506
|
def load(self) -> BinaryIO:
|
|
496
507
|
'''Read all content on specified path and write into memory
|
|
@@ -505,14 +516,14 @@ class FSPath(URIPath):
|
|
|
505
516
|
|
|
506
517
|
def mkdir(self, mode=0o777, parents: bool = False, exist_ok: bool = False):
|
|
507
518
|
'''
|
|
508
|
-
make a directory on fs, including parent directory
|
|
509
|
-
|
|
519
|
+
make a directory on fs, including parent directory.
|
|
510
520
|
If there exists a file on the path, raise FileExistsError
|
|
511
521
|
|
|
512
522
|
:param mode: If mode is given, it is combined with the process’ umask value to determine the file mode and access flags.
|
|
513
523
|
:param parents: If parents is true, any missing parents of this path are created as needed;
|
|
514
|
-
|
|
524
|
+
If parents is false (the default), a missing parent raises FileNotFoundError.
|
|
515
525
|
:param exist_ok: If False and target directory exists, raise FileExistsError
|
|
526
|
+
|
|
516
527
|
:raises: FileExistsError
|
|
517
528
|
'''
|
|
518
529
|
if exist_ok and self.path_without_protocol == '':
|
|
@@ -567,7 +578,8 @@ class FSPath(URIPath):
|
|
|
567
578
|
else:
|
|
568
579
|
os.remove(self.path_without_protocol)
|
|
569
580
|
|
|
570
|
-
def _scan(self,
|
|
581
|
+
def _scan(self,
|
|
582
|
+
missing_ok: bool = True,
|
|
571
583
|
followlinks: bool = False) -> Iterator[str]:
|
|
572
584
|
if self.is_file(followlinks=followlinks):
|
|
573
585
|
path = fspath(self.path_without_protocol)
|
|
@@ -577,7 +589,8 @@ class FSPath(URIPath):
|
|
|
577
589
|
for filename in files:
|
|
578
590
|
yield os.path.join(root, filename)
|
|
579
591
|
|
|
580
|
-
def scan(self,
|
|
592
|
+
def scan(self,
|
|
593
|
+
missing_ok: bool = True,
|
|
581
594
|
followlinks: bool = False) -> Iterator[str]:
|
|
582
595
|
'''
|
|
583
596
|
Iteratively traverse only files in given directory, in alphabetical order.
|
|
@@ -594,7 +607,8 @@ class FSPath(URIPath):
|
|
|
594
607
|
self._scan(followlinks=followlinks), missing_ok,
|
|
595
608
|
FileNotFoundError('No match any file in: %r' % self.path))
|
|
596
609
|
|
|
597
|
-
def scan_stat(self,
|
|
610
|
+
def scan_stat(self,
|
|
611
|
+
missing_ok: bool = True,
|
|
598
612
|
followlinks: bool = False) -> Iterator[FileEntry]:
|
|
599
613
|
'''
|
|
600
614
|
Iteratively traverse only files in given directory, in alphabetical order.
|
|
@@ -668,8 +682,10 @@ class FSPath(URIPath):
|
|
|
668
682
|
return
|
|
669
683
|
os.unlink(self.path_without_protocol)
|
|
670
684
|
|
|
671
|
-
def walk(
|
|
672
|
-
|
|
685
|
+
def walk(
|
|
686
|
+
self,
|
|
687
|
+
followlinks: bool = False
|
|
688
|
+
) -> Iterator[Tuple[str, List[str], List[str]]]:
|
|
673
689
|
'''
|
|
674
690
|
Generate the file names in a directory tree by walking the tree top-down.
|
|
675
691
|
For each directory in the tree rooted at directory path (including path itself),
|
|
@@ -734,6 +750,7 @@ class FSPath(URIPath):
|
|
|
734
750
|
|
|
735
751
|
:param recalculate: Ignore this parameter, just for compatibility
|
|
736
752
|
:param followlinks: Ignore this parameter, just for compatibility
|
|
753
|
+
|
|
737
754
|
returns: md5 of file
|
|
738
755
|
'''
|
|
739
756
|
if os.path.isdir(self.path_without_protocol):
|
|
@@ -743,7 +760,7 @@ class FSPath(URIPath):
|
|
|
743
760
|
recalculate=recalculate, followlinks=followlinks).encode()
|
|
744
761
|
hash_md5.update(chunk)
|
|
745
762
|
return hash_md5.hexdigest()
|
|
746
|
-
with open(self.path_without_protocol, 'rb') as src:
|
|
763
|
+
with open(self.path_without_protocol, 'rb') as src:
|
|
747
764
|
md5 = calculate_md5(src)
|
|
748
765
|
return md5
|
|
749
766
|
|
|
@@ -754,7 +771,9 @@ class FSPath(URIPath):
|
|
|
754
771
|
followlinks: bool = False):
|
|
755
772
|
|
|
756
773
|
shutil.copy2(
|
|
757
|
-
self.path_without_protocol,
|
|
774
|
+
self.path_without_protocol,
|
|
775
|
+
fspath(dst_path),
|
|
776
|
+
follow_symlinks=followlinks)
|
|
758
777
|
|
|
759
778
|
# After python3.8, patch `shutil.copyfile` is not a good way, because `shutil.copy2` will not call it in some cases.
|
|
760
779
|
if callback:
|
|
@@ -786,6 +805,7 @@ class FSPath(URIPath):
|
|
|
786
805
|
:param followlinks: False if regard symlink as file, else True
|
|
787
806
|
:param overwrite: whether or not overwrite file when exists, default is True
|
|
788
807
|
'''
|
|
808
|
+
dst_path = fspath(dst_path)
|
|
789
809
|
if not overwrite and os.path.exists((dst_path)):
|
|
790
810
|
return
|
|
791
811
|
|
|
@@ -811,7 +831,7 @@ class FSPath(URIPath):
|
|
|
811
831
|
|
|
812
832
|
:param dst_path: Target file path
|
|
813
833
|
:param followlinks: False if regard symlink as file, else True
|
|
814
|
-
:param force: Sync file
|
|
834
|
+
:param force: Sync file forcible, do not ignore same files, priority is higher than 'overwrite', default is False
|
|
815
835
|
:param overwrite: whether or not overwrite file when exists, default is True
|
|
816
836
|
'''
|
|
817
837
|
if self.is_dir(followlinks=followlinks):
|
|
@@ -841,7 +861,7 @@ class FSPath(URIPath):
|
|
|
841
861
|
'''
|
|
842
862
|
Create a symbolic link pointing to src_path named dst_path.
|
|
843
863
|
|
|
844
|
-
:param dst_path:
|
|
864
|
+
:param dst_path: Destination path
|
|
845
865
|
'''
|
|
846
866
|
return os.symlink(self.path_without_protocol, dst_path)
|
|
847
867
|
|
|
@@ -906,7 +926,7 @@ class FSPath(URIPath):
|
|
|
906
926
|
errors=None,
|
|
907
927
|
newline=None,
|
|
908
928
|
closefd=True,
|
|
909
|
-
**kwargs) -> IO
|
|
929
|
+
**kwargs) -> IO:
|
|
910
930
|
if not isinstance(self.path_without_protocol, int) and ('w' in mode or
|
|
911
931
|
'x' in mode or
|
|
912
932
|
'a' in mode):
|
|
@@ -921,8 +941,8 @@ class FSPath(URIPath):
|
|
|
921
941
|
newline=newline,
|
|
922
942
|
closefd=closefd)
|
|
923
943
|
|
|
924
|
-
@
|
|
925
|
-
def parts(self) -> Tuple[str]:
|
|
944
|
+
@cached_property
|
|
945
|
+
def parts(self) -> Tuple[str, ...]:
|
|
926
946
|
'''
|
|
927
947
|
A tuple giving access to the path’s various components
|
|
928
948
|
'''
|
megfile/hdfs.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import IO,
|
|
1
|
+
from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
from megfile.hdfs_path import HdfsPath, hdfs_glob, hdfs_glob_stat, hdfs_iglob, hdfs_makedirs, is_hdfs
|
|
4
4
|
from megfile.interfaces import FileEntry, PathLike, StatResult
|
|
@@ -37,7 +37,7 @@ def hdfs_exists(path: PathLike, followlinks: bool = False) -> bool:
|
|
|
37
37
|
If the bucket of path are not permitted to read, return False
|
|
38
38
|
|
|
39
39
|
:param path: Given path
|
|
40
|
-
:returns: True if path
|
|
40
|
+
:returns: True if path exists, else False
|
|
41
41
|
'''
|
|
42
42
|
return HdfsPath(path).exists(followlinks)
|
|
43
43
|
|
|
@@ -154,7 +154,8 @@ def hdfs_remove(path: PathLike, missing_ok: bool = False) -> None:
|
|
|
154
154
|
|
|
155
155
|
|
|
156
156
|
def hdfs_scan(
|
|
157
|
-
path: PathLike,
|
|
157
|
+
path: PathLike,
|
|
158
|
+
missing_ok: bool = True,
|
|
158
159
|
followlinks: bool = False) -> Iterator[str]:
|
|
159
160
|
'''
|
|
160
161
|
Iteratively traverse only files in given hdfs directory.
|
|
@@ -175,7 +176,8 @@ def hdfs_scan(
|
|
|
175
176
|
|
|
176
177
|
|
|
177
178
|
def hdfs_scan_stat(
|
|
178
|
-
path: PathLike,
|
|
179
|
+
path: PathLike,
|
|
180
|
+
missing_ok: bool = True,
|
|
179
181
|
followlinks: bool = False) -> Iterator[FileEntry]:
|
|
180
182
|
'''
|
|
181
183
|
Iteratively traverse only files in given directory.
|
|
@@ -212,8 +214,10 @@ def hdfs_unlink(path: PathLike, missing_ok: bool = False) -> None:
|
|
|
212
214
|
return HdfsPath(path).unlink(missing_ok)
|
|
213
215
|
|
|
214
216
|
|
|
215
|
-
def hdfs_walk(
|
|
216
|
-
|
|
217
|
+
def hdfs_walk(
|
|
218
|
+
path: PathLike,
|
|
219
|
+
followlinks: bool = False
|
|
220
|
+
) -> Iterator[Tuple[str, List[str], List[str]]]:
|
|
217
221
|
'''
|
|
218
222
|
Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories.
|
|
219
223
|
Every iteration on generator yields a 3-tuple: (root, dirs, files)
|
|
@@ -236,7 +240,8 @@ def hdfs_walk(path: PathLike, followlinks: bool = False
|
|
|
236
240
|
|
|
237
241
|
|
|
238
242
|
def hdfs_getmd5(
|
|
239
|
-
path: PathLike,
|
|
243
|
+
path: PathLike,
|
|
244
|
+
recalculate: bool = False,
|
|
240
245
|
followlinks: bool = False) -> str:
|
|
241
246
|
'''
|
|
242
247
|
Get checksum of the file or dir.
|
|
@@ -265,6 +270,6 @@ def hdfs_open(
|
|
|
265
270
|
buffering: Optional[int] = None,
|
|
266
271
|
encoding: Optional[str] = None,
|
|
267
272
|
errors: Optional[str] = None,
|
|
268
|
-
**kwargs) -> IO
|
|
273
|
+
**kwargs) -> IO:
|
|
269
274
|
return HdfsPath(path).open(
|
|
270
275
|
mode, buffering=buffering, encoding=encoding, errors=errors)
|
megfile/hdfs_path.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
# pyre-ignore-all-errors[16]
|
|
1
2
|
import hashlib
|
|
2
3
|
import io
|
|
3
4
|
import os
|
|
4
5
|
import sys
|
|
5
|
-
from functools import lru_cache
|
|
6
|
-
from typing import IO,
|
|
6
|
+
from functools import cached_property, lru_cache
|
|
7
|
+
from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
|
|
7
8
|
|
|
8
9
|
from megfile.errors import _create_missing_ok_generator, raise_hdfs_error
|
|
9
10
|
from megfile.interfaces import FileEntry, PathLike, StatResult, URIPath
|
|
@@ -14,7 +15,7 @@ from megfile.lib.hdfs_tools import hdfs_api
|
|
|
14
15
|
from megfile.lib.url import get_url_scheme
|
|
15
16
|
from megfile.pathlike import PathLike, URIPath
|
|
16
17
|
from megfile.smart_path import SmartPath
|
|
17
|
-
from megfile.utils import _is_pickle
|
|
18
|
+
from megfile.utils import _is_pickle
|
|
18
19
|
|
|
19
20
|
__all__ = [
|
|
20
21
|
'HdfsPath',
|
|
@@ -35,7 +36,7 @@ MAX_RETRIES = 10
|
|
|
35
36
|
DEFAULT_HDFS_TIMEOUT = 10
|
|
36
37
|
|
|
37
38
|
|
|
38
|
-
def is_hdfs(path: PathLike) -> bool:
|
|
39
|
+
def is_hdfs(path: PathLike) -> bool:
|
|
39
40
|
'''Test if a path is sftp path
|
|
40
41
|
|
|
41
42
|
:param path: Path to be tested
|
|
@@ -55,7 +56,7 @@ def get_hdfs_config(profile_name: Optional[str] = None):
|
|
|
55
56
|
}
|
|
56
57
|
timeout_env = f"{env_profile}{HDFS_TIMEOUT}"
|
|
57
58
|
if os.getenv(timeout_env):
|
|
58
|
-
config['timeout'] = int(os.
|
|
59
|
+
config['timeout'] = int(os.environ[timeout_env])
|
|
59
60
|
|
|
60
61
|
config_path = os.getenv(HDFS_CONFIG_PATH) or os.path.expanduser(
|
|
61
62
|
'~/.hdfscli.cfg')
|
|
@@ -99,9 +100,9 @@ def get_hdfs_client(profile_name: Optional[str] = None):
|
|
|
99
100
|
|
|
100
101
|
|
|
101
102
|
def hdfs_glob(
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
103
|
+
path: PathLike,
|
|
104
|
+
recursive: bool = True,
|
|
105
|
+
missing_ok: bool = True,
|
|
105
106
|
) -> List[str]:
|
|
106
107
|
'''Return hdfs path list in ascending alphabetical order, in which path matches glob pattern
|
|
107
108
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -115,7 +116,8 @@ def hdfs_glob(
|
|
|
115
116
|
|
|
116
117
|
|
|
117
118
|
def hdfs_glob_stat(
|
|
118
|
-
path: PathLike,
|
|
119
|
+
path: PathLike,
|
|
120
|
+
recursive: bool = True,
|
|
119
121
|
missing_ok: bool = True) -> Iterator[FileEntry]:
|
|
120
122
|
'''Return a generator contains tuples of path and file stat, in ascending alphabetical order, in which path matches glob pattern
|
|
121
123
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -130,9 +132,9 @@ def hdfs_glob_stat(
|
|
|
130
132
|
|
|
131
133
|
|
|
132
134
|
def hdfs_iglob(
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
135
|
+
path: PathLike,
|
|
136
|
+
recursive: bool = True,
|
|
137
|
+
missing_ok: bool = True,
|
|
136
138
|
) -> Iterator[str]:
|
|
137
139
|
'''Return hdfs path iterator in ascending alphabetical order, in which path matches glob pattern
|
|
138
140
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -177,7 +179,7 @@ class HdfsPath(URIPath):
|
|
|
177
179
|
def _client(self):
|
|
178
180
|
return get_hdfs_client(profile_name=self._profile_name)
|
|
179
181
|
|
|
180
|
-
@
|
|
182
|
+
@cached_property
|
|
181
183
|
def path_with_protocol(self) -> str:
|
|
182
184
|
'''Return path with protocol, like hdfs://path'''
|
|
183
185
|
path = self.path
|
|
@@ -186,7 +188,7 @@ class HdfsPath(URIPath):
|
|
|
186
188
|
return path
|
|
187
189
|
return protocol_prefix + path.lstrip('/')
|
|
188
190
|
|
|
189
|
-
@
|
|
191
|
+
@cached_property
|
|
190
192
|
def path_without_protocol(self) -> str:
|
|
191
193
|
'''Return path without protocol, example: if path is hdfs://path, return path'''
|
|
192
194
|
path = self.path
|
|
@@ -195,8 +197,8 @@ class HdfsPath(URIPath):
|
|
|
195
197
|
path = path[len(protocol_prefix):]
|
|
196
198
|
return path
|
|
197
199
|
|
|
198
|
-
@
|
|
199
|
-
def parts(self) -> Tuple[str]:
|
|
200
|
+
@cached_property
|
|
201
|
+
def parts(self) -> Tuple[str, ...]:
|
|
200
202
|
'''A tuple giving access to the path’s various components'''
|
|
201
203
|
parts = [f"{self._protocol_with_profile}://"]
|
|
202
204
|
path = self.path_without_protocol
|
|
@@ -211,7 +213,7 @@ class HdfsPath(URIPath):
|
|
|
211
213
|
|
|
212
214
|
If the bucket of path are not permitted to read, return False
|
|
213
215
|
|
|
214
|
-
:returns: True if path
|
|
216
|
+
:returns: True if path exists, else False
|
|
215
217
|
'''
|
|
216
218
|
return bool(
|
|
217
219
|
self._client.status(self.path_without_protocol, strict=False))
|
|
@@ -262,10 +264,10 @@ class HdfsPath(URIPath):
|
|
|
262
264
|
return self.stat(follow_symlinks=follow_symlinks).size
|
|
263
265
|
|
|
264
266
|
def glob(
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
267
|
+
self,
|
|
268
|
+
pattern,
|
|
269
|
+
recursive: bool = True,
|
|
270
|
+
missing_ok: bool = True,
|
|
269
271
|
) -> List['HdfsPath']:
|
|
270
272
|
'''Return hdfs path list, in which path matches glob pattern
|
|
271
273
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -281,7 +283,9 @@ class HdfsPath(URIPath):
|
|
|
281
283
|
pattern=pattern, recursive=recursive, missing_ok=missing_ok))
|
|
282
284
|
|
|
283
285
|
def glob_stat(
|
|
284
|
-
self,
|
|
286
|
+
self,
|
|
287
|
+
pattern,
|
|
288
|
+
recursive: bool = True,
|
|
285
289
|
missing_ok: bool = True) -> Iterator[FileEntry]:
|
|
286
290
|
'''Return a generator contains tuples of path and file stat, in which path matches glob pattern
|
|
287
291
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -297,10 +301,10 @@ class HdfsPath(URIPath):
|
|
|
297
301
|
yield FileEntry(path_obj.name, path_obj.path, path_obj.stat())
|
|
298
302
|
|
|
299
303
|
def iglob(
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
+
self,
|
|
305
|
+
pattern,
|
|
306
|
+
recursive: bool = True,
|
|
307
|
+
missing_ok: bool = True,
|
|
304
308
|
) -> Iterator['HdfsPath']:
|
|
305
309
|
'''Return hdfs path iterator, in which path matches glob pattern
|
|
306
310
|
Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
|
|
@@ -372,7 +376,7 @@ class HdfsPath(URIPath):
|
|
|
372
376
|
:raises: FileNotFoundError, NotADirectoryError
|
|
373
377
|
'''
|
|
374
378
|
for filename in self.listdir(followlinks=followlinks):
|
|
375
|
-
yield self.joinpath(filename)
|
|
379
|
+
yield self.joinpath(filename)
|
|
376
380
|
|
|
377
381
|
def load(self, followlinks: bool = False) -> BinaryIO:
|
|
378
382
|
'''Read all content in binary on specified path and write into memory
|
|
@@ -415,7 +419,7 @@ class HdfsPath(URIPath):
|
|
|
415
419
|
dst_path = self.from_path(dst_path)
|
|
416
420
|
if self.is_dir():
|
|
417
421
|
for filename in self.iterdir():
|
|
418
|
-
self.joinpath(filename).rename(dst_path.joinpath(filename))
|
|
422
|
+
self.joinpath(filename).rename(dst_path.joinpath(filename))
|
|
419
423
|
else:
|
|
420
424
|
if overwrite:
|
|
421
425
|
dst_path.remove(missing_ok=True)
|
|
@@ -449,7 +453,8 @@ class HdfsPath(URIPath):
|
|
|
449
453
|
if not missing_ok or not isinstance(e, FileNotFoundError):
|
|
450
454
|
raise
|
|
451
455
|
|
|
452
|
-
def scan(self,
|
|
456
|
+
def scan(self,
|
|
457
|
+
missing_ok: bool = True,
|
|
453
458
|
followlinks: bool = False) -> Iterator[str]:
|
|
454
459
|
'''
|
|
455
460
|
Iteratively traverse only files in given hdfs directory.
|
|
@@ -469,7 +474,8 @@ class HdfsPath(URIPath):
|
|
|
469
474
|
followlinks=followlinks):
|
|
470
475
|
yield file_entry.path
|
|
471
476
|
|
|
472
|
-
def scan_stat(self,
|
|
477
|
+
def scan_stat(self,
|
|
478
|
+
missing_ok: bool = True,
|
|
473
479
|
followlinks: bool = False) -> Iterator[FileEntry]:
|
|
474
480
|
'''
|
|
475
481
|
Iteratively traverse only files in given directory.
|
|
@@ -530,8 +536,10 @@ class HdfsPath(URIPath):
|
|
|
530
536
|
raise IsADirectoryError('Path is a directory: %r' % self.path)
|
|
531
537
|
self.remove(missing_ok=missing_ok)
|
|
532
538
|
|
|
533
|
-
def walk(
|
|
534
|
-
|
|
539
|
+
def walk(
|
|
540
|
+
self,
|
|
541
|
+
followlinks: bool = False
|
|
542
|
+
) -> Iterator[Tuple[str, List[str], List[str]]]:
|
|
535
543
|
'''
|
|
536
544
|
Iteratively traverse the given hdfs directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories.
|
|
537
545
|
Every iteration on generator yields a 3-tuple: (root, dirs, files)
|
|
@@ -566,7 +574,7 @@ class HdfsPath(URIPath):
|
|
|
566
574
|
if self.is_dir(followlinks=followlinks):
|
|
567
575
|
hash_md5 = hashlib.md5() # nosec
|
|
568
576
|
for file_name in self.listdir():
|
|
569
|
-
chunk = self.joinpath(file_name).md5(
|
|
577
|
+
chunk = self.joinpath(file_name).md5(
|
|
570
578
|
recalculate=recalculate).encode()
|
|
571
579
|
hash_md5.update(chunk)
|
|
572
580
|
return hash_md5.hexdigest()
|
|
@@ -589,7 +597,7 @@ class HdfsPath(URIPath):
|
|
|
589
597
|
buffering: Optional[int] = None,
|
|
590
598
|
encoding: Optional[str] = None,
|
|
591
599
|
errors: Optional[str] = None,
|
|
592
|
-
**kwargs) -> IO
|
|
600
|
+
**kwargs) -> IO:
|
|
593
601
|
if '+' in mode:
|
|
594
602
|
raise ValueError('unacceptable mode: %r' % mode)
|
|
595
603
|
|
|
@@ -613,21 +621,21 @@ class HdfsPath(URIPath):
|
|
|
613
621
|
client=self._client,
|
|
614
622
|
profile_name=self._profile_name,
|
|
615
623
|
**input_kwargs)
|
|
616
|
-
if _is_pickle(file_obj):
|
|
617
|
-
file_obj = io.BufferedReader(file_obj) #
|
|
624
|
+
if _is_pickle(file_obj):
|
|
625
|
+
file_obj = io.BufferedReader(file_obj) # type: ignore
|
|
618
626
|
if 'b' not in mode:
|
|
619
627
|
file_obj = io.TextIOWrapper(
|
|
620
|
-
file_obj, encoding=encoding, errors=errors)
|
|
621
|
-
file_obj.mode = mode
|
|
622
|
-
return file_obj
|
|
628
|
+
file_obj, encoding=encoding, errors=errors)
|
|
629
|
+
file_obj.mode = mode # pyre-ignore[41]
|
|
630
|
+
return file_obj
|
|
623
631
|
elif mode in ('w', 'wb'):
|
|
624
|
-
return self._client.write(
|
|
632
|
+
return self._client.write(
|
|
625
633
|
self.path_without_protocol,
|
|
626
634
|
overwrite=True,
|
|
627
635
|
buffersize=buffering,
|
|
628
636
|
encoding=encoding)
|
|
629
637
|
elif mode in ('a', 'ab'):
|
|
630
|
-
return self._client.write(
|
|
638
|
+
return self._client.write(
|
|
631
639
|
self.path_without_protocol,
|
|
632
640
|
append=True,
|
|
633
641
|
buffersize=buffering,
|