megfile 3.1.0.post2__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +94 -69
- megfile/lib/combine_reader.py +13 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +54 -55
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +61 -52
- megfile/lib/s3_cached_handler.py +14 -13
- megfile/lib/s3_limited_seekable_writer.py +38 -28
- megfile/lib/s3_memory_handler.py +35 -29
- megfile/lib/s3_pipe_handler.py +25 -24
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +8 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +75 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.0.post2.dist-info/RECORD +0 -55
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/lib/combine_reader.py
CHANGED
|
@@ -5,11 +5,10 @@ from typing import IO, AnyStr, List, Optional, Union
|
|
|
5
5
|
from megfile.interfaces import Readable, Seekable
|
|
6
6
|
from megfile.utils import get_content_size, get_mode, get_name, is_readable
|
|
7
7
|
|
|
8
|
-
NEWLINE = ord(
|
|
8
|
+
NEWLINE = ord("\n")
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class CombineReader(Readable, Seekable):
|
|
12
|
-
|
|
13
12
|
def __init__(self, file_objects: List[IO], name: str):
|
|
14
13
|
self._file_objects = file_objects
|
|
15
14
|
self._blocks_sizes = []
|
|
@@ -19,14 +18,15 @@ class CombineReader(Readable, Seekable):
|
|
|
19
18
|
self._mode = None
|
|
20
19
|
for file_object in self._file_objects:
|
|
21
20
|
if not is_readable(file_object):
|
|
22
|
-
raise IOError(
|
|
21
|
+
raise IOError("not readable: %r" % get_name(file_object))
|
|
23
22
|
mode = get_mode(file_object)
|
|
24
23
|
if self._mode is None:
|
|
25
24
|
self._mode = mode
|
|
26
25
|
if self._mode != mode:
|
|
27
26
|
raise IOError(
|
|
28
|
-
|
|
29
|
-
(get_name(file_object), self._mode, mode)
|
|
27
|
+
"inconsistent mode: %r, expected: %r, got: %r"
|
|
28
|
+
% (get_name(file_object), self._mode, mode)
|
|
29
|
+
)
|
|
30
30
|
self._blocks_sizes.append(self._content_size)
|
|
31
31
|
self._content_size += get_content_size(file_object)
|
|
32
32
|
self._blocks_sizes.append(self._content_size)
|
|
@@ -36,7 +36,7 @@ class CombineReader(Readable, Seekable):
|
|
|
36
36
|
for index, size in enumerate(self._blocks_sizes):
|
|
37
37
|
if self._offset < size:
|
|
38
38
|
return index - 1, self._offset - self._blocks_sizes[index - 1]
|
|
39
|
-
raise IOError(
|
|
39
|
+
raise IOError("offset out of range: %d" % self._offset)
|
|
40
40
|
|
|
41
41
|
@property
|
|
42
42
|
def name(self) -> str:
|
|
@@ -50,12 +50,12 @@ class CombineReader(Readable, Seekable):
|
|
|
50
50
|
return self._offset
|
|
51
51
|
|
|
52
52
|
def _empty_bytes(self) -> AnyStr: # pyre-ignore[34]
|
|
53
|
-
if
|
|
54
|
-
return b
|
|
55
|
-
return
|
|
53
|
+
if "b" in self._mode:
|
|
54
|
+
return b"" # pyre-ignore[7]
|
|
55
|
+
return "" # pyre-ignore[7]
|
|
56
56
|
|
|
57
57
|
def _empty_buffer(self) -> Union[BytesIO, StringIO]:
|
|
58
|
-
if
|
|
58
|
+
if "b" in self._mode:
|
|
59
59
|
return BytesIO()
|
|
60
60
|
return StringIO()
|
|
61
61
|
|
|
@@ -99,6 +99,7 @@ class CombineReader(Readable, Seekable):
|
|
|
99
99
|
return buffer.getvalue() # pyre-ignore[7]
|
|
100
100
|
|
|
101
101
|
def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
|
|
102
|
+
offset = int(offset) # user maybe put offset with 'numpy.uint64' type
|
|
102
103
|
if whence == os.SEEK_SET:
|
|
103
104
|
target_offset = offset
|
|
104
105
|
elif whence == os.SEEK_CUR:
|
|
@@ -106,10 +107,10 @@ class CombineReader(Readable, Seekable):
|
|
|
106
107
|
elif whence == os.SEEK_END:
|
|
107
108
|
target_offset = self._content_size + offset
|
|
108
109
|
else:
|
|
109
|
-
raise ValueError(
|
|
110
|
+
raise ValueError("invalid whence: %r" % whence)
|
|
110
111
|
|
|
111
112
|
if target_offset < 0:
|
|
112
|
-
raise ValueError(
|
|
113
|
+
raise ValueError("negative seek value %r" % target_offset)
|
|
113
114
|
|
|
114
115
|
self._offset = target_offset
|
|
115
116
|
return self._offset
|
megfile/lib/compare.py
CHANGED
|
@@ -5,17 +5,19 @@ from megfile.pathlike import StatResult
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def get_sync_type(src_protocol, dst_protocol):
|
|
8
|
-
if src_protocol ==
|
|
9
|
-
return
|
|
10
|
-
elif src_protocol !=
|
|
11
|
-
return
|
|
8
|
+
if src_protocol == "s3" and dst_protocol != "s3":
|
|
9
|
+
return "download"
|
|
10
|
+
elif src_protocol != "s3" and dst_protocol == "s3":
|
|
11
|
+
return "upload"
|
|
12
12
|
else:
|
|
13
|
-
return
|
|
13
|
+
return "copy"
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def compare_time(
|
|
17
|
-
|
|
18
|
-
|
|
17
|
+
src_stat: Union[StatResult, stat_result],
|
|
18
|
+
dest_stat: Union[StatResult, stat_result],
|
|
19
|
+
sync_type: str,
|
|
20
|
+
):
|
|
19
21
|
"""
|
|
20
22
|
:returns: True if the file does not need updating based on time of
|
|
21
23
|
last modification and type of operation.
|
|
@@ -35,7 +37,6 @@ def compare_time(
|
|
|
35
37
|
# at the source location.
|
|
36
38
|
return False
|
|
37
39
|
elif sync_type == "download":
|
|
38
|
-
|
|
39
40
|
if delta <= 0:
|
|
40
41
|
return True
|
|
41
42
|
else:
|
|
@@ -45,14 +46,17 @@ def compare_time(
|
|
|
45
46
|
|
|
46
47
|
|
|
47
48
|
def is_same_file(
|
|
48
|
-
|
|
49
|
-
|
|
49
|
+
src_stat: Union[StatResult, stat_result],
|
|
50
|
+
dest_stat: Union[StatResult, stat_result],
|
|
51
|
+
sync_type: str,
|
|
52
|
+
):
|
|
50
53
|
"""
|
|
51
|
-
Determines whether or not the source and destination files should be synced based on
|
|
54
|
+
Determines whether or not the source and destination files should be synced based on
|
|
55
|
+
a comparison of their size and last modified time.
|
|
52
56
|
|
|
53
|
-
:param src_stat: A
|
|
57
|
+
:param src_stat: A object representing the source file to be compared.
|
|
54
58
|
:type src_stat: Union[StatResult, stat_result]
|
|
55
|
-
:param dest_stat: A
|
|
59
|
+
:param dest_stat: A object representing the destination file to be compared.
|
|
56
60
|
:type dest_stat: Union[StatResult, stat_result]
|
|
57
61
|
|
|
58
62
|
:return: A boolean value indicating whether or not the files should be synced.
|
megfile/lib/compat.py
CHANGED
megfile/lib/fnmatch.py
CHANGED
|
@@ -9,6 +9,7 @@ expression. They cache the compiled regular expressions for speed.
|
|
|
9
9
|
The function translate(PATTERN) returns a regular expression
|
|
10
10
|
corresponding to PATTERN. (It does not compile it.)
|
|
11
11
|
"""
|
|
12
|
+
|
|
12
13
|
"""Compared with the standard library, syntax '{seq1,seq2}' is supported"""
|
|
13
14
|
|
|
14
15
|
import functools
|
|
@@ -67,7 +68,7 @@ def filter(names: List[str], pat: str) -> List[str]:
|
|
|
67
68
|
|
|
68
69
|
|
|
69
70
|
def _compat(res: str) -> str:
|
|
70
|
-
return r
|
|
71
|
+
return r"(?s:%s)\Z" % res
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
def translate(pat: str) -> str:
|
|
@@ -77,58 +78,56 @@ def translate(pat: str) -> str:
|
|
|
77
78
|
"""
|
|
78
79
|
|
|
79
80
|
i, n = 0, len(pat)
|
|
80
|
-
res =
|
|
81
|
+
res = ""
|
|
81
82
|
while i < n:
|
|
82
83
|
c = pat[i]
|
|
83
84
|
i = i + 1
|
|
84
|
-
if c ==
|
|
85
|
+
if c == "*":
|
|
85
86
|
j = i
|
|
86
|
-
while j < n and pat[j] ==
|
|
87
|
+
while j < n and pat[j] == "*":
|
|
87
88
|
j = j + 1
|
|
88
89
|
if j > i:
|
|
89
|
-
if (j < n and pat[j] ==
|
|
90
|
-
(i <= 1 or pat[i - 2] == '/'):
|
|
90
|
+
if (j < n and pat[j] == "/") and (i <= 1 or pat[i - 2] == "/"):
|
|
91
91
|
# hit /**/ instead of /seq**/
|
|
92
92
|
j = j + 1
|
|
93
|
-
res = res + r
|
|
93
|
+
res = res + r"(.*/)?"
|
|
94
94
|
else:
|
|
95
|
-
res = res + r
|
|
95
|
+
res = res + r".*"
|
|
96
96
|
else:
|
|
97
|
-
res = res + r
|
|
97
|
+
res = res + r"[^/]*"
|
|
98
98
|
i = j
|
|
99
|
-
elif c ==
|
|
100
|
-
res = res + r
|
|
101
|
-
elif c ==
|
|
99
|
+
elif c == "?":
|
|
100
|
+
res = res + r"."
|
|
101
|
+
elif c == "[":
|
|
102
102
|
j = i
|
|
103
|
-
if j < n and pat[j] ==
|
|
103
|
+
if j < n and pat[j] == "!":
|
|
104
104
|
j = j + 1
|
|
105
|
-
if j < n and pat[j] ==
|
|
105
|
+
if j < n and pat[j] == "]":
|
|
106
106
|
j = j + 1
|
|
107
|
-
while j < n and pat[j] !=
|
|
107
|
+
while j < n and pat[j] != "]":
|
|
108
108
|
j = j + 1
|
|
109
109
|
if j >= n:
|
|
110
|
-
res = res + r
|
|
110
|
+
res = res + r"\["
|
|
111
111
|
else:
|
|
112
|
-
stuff = pat[i:j].replace(
|
|
112
|
+
stuff = pat[i:j].replace("\\", r"\\")
|
|
113
113
|
i = j + 1
|
|
114
|
-
if stuff[0] ==
|
|
115
|
-
stuff = r
|
|
116
|
-
elif stuff[0] ==
|
|
117
|
-
stuff =
|
|
118
|
-
res = r
|
|
119
|
-
elif c ==
|
|
114
|
+
if stuff[0] == "!":
|
|
115
|
+
stuff = r"^" + stuff[1:]
|
|
116
|
+
elif stuff[0] == "^":
|
|
117
|
+
stuff = "\\" + stuff
|
|
118
|
+
res = r"%s[%s]" % (res, stuff)
|
|
119
|
+
elif c == "{":
|
|
120
120
|
j = i
|
|
121
|
-
if j < n and pat[j] ==
|
|
121
|
+
if j < n and pat[j] == "}":
|
|
122
122
|
j = j + 1
|
|
123
|
-
while j < n and pat[j] !=
|
|
123
|
+
while j < n and pat[j] != "}":
|
|
124
124
|
j = j + 1
|
|
125
125
|
if j >= n:
|
|
126
|
-
res = res + r
|
|
126
|
+
res = res + r"\{"
|
|
127
127
|
else:
|
|
128
|
-
stuff = pat[i:j].replace(
|
|
129
|
-
stuff = r
|
|
130
|
-
|
|
131
|
-
res = r'%s(%s)' % (res, stuff)
|
|
128
|
+
stuff = pat[i:j].replace("\\", r"\\")
|
|
129
|
+
stuff = r"|".join(map(re.escape, stuff.split(","))) # pyre-ignore[6]
|
|
130
|
+
res = r"%s(%s)" % (res, stuff)
|
|
132
131
|
i = j + 1
|
|
133
132
|
else:
|
|
134
133
|
res = res + re.escape(c)
|
megfile/lib/glob.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Filename globbing utility."""
|
|
2
|
+
|
|
2
3
|
"""remove once py35 is dead"""
|
|
3
4
|
|
|
4
5
|
import os
|
|
@@ -10,16 +11,16 @@ from typing import Iterator, List, Tuple
|
|
|
10
11
|
from megfile.lib import fnmatch
|
|
11
12
|
|
|
12
13
|
# Python 3.5+ Compatible
|
|
13
|
-
|
|
14
|
+
"""
|
|
14
15
|
class FSFunc(NamedTuple):
|
|
15
16
|
exists: Callable[[str], bool]
|
|
16
17
|
isdir: Callable[[str], bool]
|
|
17
18
|
scandir: Callable[[str], Iterator[Tuple[str, bool]]] # name, isdir
|
|
18
19
|
|
|
19
20
|
in Python 3.6+
|
|
20
|
-
|
|
21
|
+
"""
|
|
21
22
|
|
|
22
|
-
FSFunc = NamedTuple(
|
|
23
|
+
FSFunc = NamedTuple("FSFunc", ["exists", "isdir", "scandir"])
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def _exists(path: str) -> bool:
|
|
@@ -39,10 +40,8 @@ DEFAULT_FILESYSTEM_FUNC = FSFunc(_exists, _isdir, _scandir)
|
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
def glob(
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
recursive: bool = False,
|
|
45
|
-
fs: FSFunc = DEFAULT_FILESYSTEM_FUNC) -> List[str]:
|
|
43
|
+
pathname: str, *, recursive: bool = False, fs: FSFunc = DEFAULT_FILESYSTEM_FUNC
|
|
44
|
+
) -> List[str]:
|
|
46
45
|
"""Return a list of paths matching a pathname pattern.
|
|
47
46
|
|
|
48
47
|
The pattern may contain simple shell-style wildcards a la
|
|
@@ -57,10 +56,8 @@ def glob(
|
|
|
57
56
|
|
|
58
57
|
|
|
59
58
|
def iglob(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
recursive: bool = False,
|
|
63
|
-
fs: FSFunc = DEFAULT_FILESYSTEM_FUNC) -> Iterator[str]:
|
|
59
|
+
pathname: str, *, recursive: bool = False, fs: FSFunc = DEFAULT_FILESYSTEM_FUNC
|
|
60
|
+
) -> Iterator[str]:
|
|
64
61
|
"""Return an iterator which yields the paths matching a pathname pattern.
|
|
65
62
|
|
|
66
63
|
The pattern may contain simple shell-style wildcards a la
|
|
@@ -74,21 +71,24 @@ def iglob(
|
|
|
74
71
|
it = _iglob(pathname, recursive, False, fs)
|
|
75
72
|
if recursive and _isrecursive(pathname):
|
|
76
73
|
s = next(it) # skip empty string
|
|
77
|
-
|
|
74
|
+
if s:
|
|
75
|
+
# TODO: replace AssertionError with OSError in 4.0.0
|
|
76
|
+
raise AssertionError("iglob with recursive=True error")
|
|
78
77
|
return it
|
|
79
78
|
|
|
80
79
|
|
|
81
|
-
def _iglob(pathname: str, recursive: bool, dironly: bool,
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
protocol, path_without_protocol = pathname.split('://', 1)
|
|
80
|
+
def _iglob(pathname: str, recursive: bool, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
81
|
+
if "://" in pathname:
|
|
82
|
+
protocol, path_without_protocol = pathname.split("://", 1)
|
|
85
83
|
else:
|
|
86
84
|
protocol, path_without_protocol = "", pathname
|
|
87
85
|
dirname, basename = os.path.split(path_without_protocol)
|
|
88
86
|
if protocol:
|
|
89
87
|
dirname = "://".join([protocol, dirname])
|
|
90
88
|
if not has_magic(pathname):
|
|
91
|
-
|
|
89
|
+
if dironly:
|
|
90
|
+
# TODO: replace AssertionError with OSError in 4.0.0
|
|
91
|
+
raise AssertionError("can't use dironly with non-magic patterns in _iglob")
|
|
92
92
|
if basename:
|
|
93
93
|
if fs.exists(pathname):
|
|
94
94
|
yield pathname
|
|
@@ -148,9 +148,10 @@ def _glob0(dirname: str, basename: str, dironly: bool, fs: FSFunc) -> List[str]:
|
|
|
148
148
|
|
|
149
149
|
# This helper function recursively yields relative pathnames inside a literal
|
|
150
150
|
# directory.
|
|
151
|
-
def _glob2(dirname: str, pattern: str, dironly: bool,
|
|
152
|
-
|
|
153
|
-
|
|
151
|
+
def _glob2(dirname: str, pattern: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
152
|
+
if not _isrecursive(pattern):
|
|
153
|
+
# TODO: replace AssertionError with OSError in 4.0.0
|
|
154
|
+
raise AssertionError("error call '_glob2' with non-glob pattern")
|
|
154
155
|
yield pattern[:0]
|
|
155
156
|
yield from _rlistdir(dirname, dironly, fs)
|
|
156
157
|
|
|
@@ -187,10 +188,10 @@ def _rlistdir(dirname: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
|
187
188
|
yield os.path.join(x, y)
|
|
188
189
|
|
|
189
190
|
|
|
190
|
-
magic_check = re.compile(r
|
|
191
|
-
magic_decheck = re.compile(r
|
|
192
|
-
brace_check = re.compile(r
|
|
193
|
-
unbrace_check = re.compile(r
|
|
191
|
+
magic_check = re.compile(r"([*?[{])")
|
|
192
|
+
magic_decheck = re.compile(r"\[(.)\]")
|
|
193
|
+
brace_check = re.compile(r"(\{.*\})")
|
|
194
|
+
unbrace_check = re.compile(r"([*?[])")
|
|
194
195
|
|
|
195
196
|
|
|
196
197
|
def has_magic(s: str) -> bool:
|
|
@@ -204,46 +205,44 @@ def has_magic_ignore_brace(s: str) -> bool:
|
|
|
204
205
|
|
|
205
206
|
|
|
206
207
|
def _ishidden(path: str) -> bool:
|
|
207
|
-
return path[0] ==
|
|
208
|
+
return path[0] == "."
|
|
208
209
|
|
|
209
210
|
|
|
210
211
|
def _isrecursive(pattern: str) -> bool:
|
|
211
|
-
return pattern ==
|
|
212
|
+
return pattern == "**"
|
|
212
213
|
|
|
213
214
|
|
|
214
215
|
def escape(pathname):
|
|
215
|
-
"""Escape all special characters.
|
|
216
|
-
"""
|
|
216
|
+
"""Escape all special characters."""
|
|
217
217
|
# Escaping is done by wrapping any of "*?[" between square brackets.
|
|
218
218
|
# Metacharacters do not work in the drive part and shouldn't be escaped.
|
|
219
219
|
drive, pathname = os.path.splitdrive(pathname)
|
|
220
|
-
pathname = magic_check.sub(r
|
|
220
|
+
pathname = magic_check.sub(r"[\1]", pathname)
|
|
221
221
|
return drive + pathname
|
|
222
222
|
|
|
223
223
|
|
|
224
224
|
def unescape(pathname):
|
|
225
|
-
"""Unescape all special characters.
|
|
226
|
-
"""
|
|
225
|
+
"""Unescape all special characters."""
|
|
227
226
|
drive, pathname = os.path.splitdrive(pathname)
|
|
228
|
-
pathname = magic_decheck.sub(r
|
|
227
|
+
pathname = magic_decheck.sub(r"\1", pathname)
|
|
229
228
|
return drive + pathname
|
|
230
229
|
|
|
231
230
|
|
|
232
|
-
def _find_suffix(path_list: List[str], prefix: str,
|
|
233
|
-
split_sign: str) -> List[str]:
|
|
231
|
+
def _find_suffix(path_list: List[str], prefix: str, split_sign: str) -> List[str]:
|
|
234
232
|
suffix = []
|
|
235
233
|
temp_path_list = []
|
|
236
234
|
for path_index in range(0, len(path_list)):
|
|
237
|
-
temp_path_list.append(
|
|
238
|
-
path_list[path_index][len(prefix):].split(split_sign))
|
|
235
|
+
temp_path_list.append(path_list[path_index][len(prefix) :].split(split_sign))
|
|
239
236
|
i = 0
|
|
240
237
|
while True:
|
|
241
238
|
i = i - 1
|
|
242
239
|
if len(temp_path_list[0]) <= abs(i):
|
|
243
240
|
return suffix
|
|
244
241
|
for path_index in range(1, len(path_list)):
|
|
245
|
-
if
|
|
246
|
-
|
|
242
|
+
if (
|
|
243
|
+
len(temp_path_list[path_index]) <= abs(i)
|
|
244
|
+
or temp_path_list[path_index][i] != temp_path_list[0][i]
|
|
245
|
+
):
|
|
247
246
|
return suffix
|
|
248
247
|
else:
|
|
249
248
|
suffix.insert(0, temp_path_list[0][i])
|
|
@@ -253,8 +252,8 @@ def globlize(path_list: List[str]) -> str:
|
|
|
253
252
|
path_list = sorted(path_list)
|
|
254
253
|
if path_list[0] == path_list[-1]:
|
|
255
254
|
return path_list[0]
|
|
256
|
-
first_path = path_list[0].split(
|
|
257
|
-
last_path = path_list[-1].split(
|
|
255
|
+
first_path = path_list[0].split("/")
|
|
256
|
+
last_path = path_list[-1].split("/")
|
|
258
257
|
prefix = []
|
|
259
258
|
|
|
260
259
|
for i in range(0, min(len(first_path), len(last_path))):
|
|
@@ -265,46 +264,46 @@ def globlize(path_list: List[str]) -> str:
|
|
|
265
264
|
if len(prefix) == 0:
|
|
266
265
|
prefix = ""
|
|
267
266
|
else:
|
|
268
|
-
prefix =
|
|
269
|
-
suffix = _find_suffix(path_list, prefix,
|
|
267
|
+
prefix = "/".join(prefix) + "/"
|
|
268
|
+
suffix = _find_suffix(path_list, prefix, "/")
|
|
270
269
|
|
|
271
270
|
if len(suffix) == 0:
|
|
272
|
-
suffix = _find_suffix(path_list, prefix,
|
|
271
|
+
suffix = _find_suffix(path_list, prefix, ".")
|
|
273
272
|
if len(suffix) == 0:
|
|
274
273
|
suffix = ""
|
|
275
274
|
else:
|
|
276
|
-
suffix =
|
|
275
|
+
suffix = "." + ".".join(suffix)
|
|
277
276
|
else:
|
|
278
|
-
suffix =
|
|
277
|
+
suffix = "/" + "/".join(suffix)
|
|
279
278
|
|
|
280
279
|
path = []
|
|
281
280
|
for i in path_list:
|
|
282
|
-
if i[len(prefix):len(i) - len(suffix)] not in path:
|
|
283
|
-
path.append(unescape(i[len(prefix):len(i) - len(suffix)]))
|
|
284
|
-
return prefix + "{" +
|
|
281
|
+
if i[len(prefix) : len(i) - len(suffix)] not in path:
|
|
282
|
+
path.append(unescape(i[len(prefix) : len(i) - len(suffix)]))
|
|
283
|
+
return prefix + "{" + ",".join(path) + "}" + suffix
|
|
285
284
|
|
|
286
285
|
|
|
287
286
|
def ungloblize(glob: str) -> List[str]:
|
|
288
287
|
path_list = [glob]
|
|
289
288
|
while True:
|
|
290
289
|
temp_path = path_list[0]
|
|
291
|
-
begin = temp_path.find(
|
|
292
|
-
end = temp_path.find(
|
|
290
|
+
begin = temp_path.find("{")
|
|
291
|
+
end = temp_path.find("}", begin)
|
|
293
292
|
if end == -1:
|
|
294
293
|
break
|
|
295
294
|
path_list.pop(0)
|
|
296
|
-
subpath_list = temp_path[begin + 1:end].split(
|
|
295
|
+
subpath_list = temp_path[begin + 1 : end].split(",")
|
|
297
296
|
for subpath in subpath_list:
|
|
298
|
-
path = temp_path[:begin] + escape(subpath) + temp_path[end + 1:]
|
|
297
|
+
path = temp_path[:begin] + escape(subpath) + temp_path[end + 1 :]
|
|
299
298
|
path_list.append(path)
|
|
300
299
|
return path_list
|
|
301
300
|
|
|
302
301
|
|
|
303
302
|
def get_non_glob_dir(glob: str):
|
|
304
303
|
root_dir = []
|
|
305
|
-
if glob.startswith(
|
|
306
|
-
root_dir.append(
|
|
307
|
-
for name in glob.split(
|
|
304
|
+
if glob.startswith("/"):
|
|
305
|
+
root_dir.append("/")
|
|
306
|
+
for name in glob.split("/"):
|
|
308
307
|
if has_magic(name):
|
|
309
308
|
break
|
|
310
309
|
root_dir.append(name)
|
|
@@ -1,28 +1,38 @@
|
|
|
1
1
|
from io import BytesIO
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
|
-
from megfile.config import
|
|
4
|
+
from megfile.config import (
|
|
5
|
+
DEFAULT_BLOCK_CAPACITY,
|
|
6
|
+
DEFAULT_BLOCK_SIZE,
|
|
7
|
+
HDFS_MAX_RETRY_TIMES,
|
|
8
|
+
)
|
|
5
9
|
from megfile.errors import raise_hdfs_error
|
|
6
10
|
from megfile.lib.base_prefetch_reader import BasePrefetchReader
|
|
7
11
|
|
|
8
12
|
|
|
9
13
|
class HdfsPrefetchReader(BasePrefetchReader):
|
|
10
|
-
|
|
11
|
-
Reader to fast read the hdfs content. This will divide the file content into equal
|
|
12
|
-
|
|
13
|
-
|
|
14
|
+
"""
|
|
15
|
+
Reader to fast read the hdfs content. This will divide the file content into equal
|
|
16
|
+
parts of block_size size, and will use LRU to cache at most block_capacity blocks
|
|
17
|
+
in memory.
|
|
18
|
+
|
|
19
|
+
open(), seek() and read() will trigger prefetch read. The prefetch will cached
|
|
20
|
+
block_forward blocks of data from offset position (the position after reading
|
|
21
|
+
if the called function is read).
|
|
22
|
+
"""
|
|
14
23
|
|
|
15
24
|
def __init__(
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
25
|
+
self,
|
|
26
|
+
hdfs_path: str,
|
|
27
|
+
*,
|
|
28
|
+
client,
|
|
29
|
+
block_size: int = DEFAULT_BLOCK_SIZE,
|
|
30
|
+
block_capacity: int = DEFAULT_BLOCK_CAPACITY,
|
|
31
|
+
block_forward: Optional[int] = None,
|
|
32
|
+
max_retries: int = HDFS_MAX_RETRY_TIMES,
|
|
33
|
+
max_workers: Optional[int] = None,
|
|
34
|
+
profile_name: Optional[str] = None,
|
|
35
|
+
):
|
|
26
36
|
self._path = hdfs_path
|
|
27
37
|
self._client = client
|
|
28
38
|
self._profile_name = profile_name
|
|
@@ -32,22 +42,27 @@ class HdfsPrefetchReader(BasePrefetchReader):
|
|
|
32
42
|
block_capacity=block_capacity,
|
|
33
43
|
block_forward=block_forward,
|
|
34
44
|
max_retries=max_retries,
|
|
35
|
-
max_workers=max_workers
|
|
45
|
+
max_workers=max_workers,
|
|
46
|
+
)
|
|
36
47
|
|
|
37
48
|
def _get_content_size(self):
|
|
38
49
|
with raise_hdfs_error(self._path):
|
|
39
|
-
return self._client.status(self._path)[
|
|
50
|
+
return self._client.status(self._path)["length"]
|
|
40
51
|
|
|
41
52
|
@property
|
|
42
53
|
def name(self) -> str:
|
|
43
|
-
return
|
|
44
|
-
f"+{self._profile_name}" if self._profile_name else "",
|
|
54
|
+
return "hdfs%s://%s" % (
|
|
55
|
+
f"+{self._profile_name}" if self._profile_name else "",
|
|
56
|
+
self._path,
|
|
57
|
+
)
|
|
45
58
|
|
|
46
59
|
def _fetch_response(
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
end: Optional[int] = None) -> dict:
|
|
60
|
+
self, start: Optional[int] = None, end: Optional[int] = None
|
|
61
|
+
) -> dict:
|
|
50
62
|
with raise_hdfs_error(self.name):
|
|
51
|
-
with self._client.read(
|
|
52
|
-
|
|
53
|
-
|
|
63
|
+
with self._client.read(
|
|
64
|
+
self._path,
|
|
65
|
+
offset=start or 0,
|
|
66
|
+
length=end - start if start and end else None,
|
|
67
|
+
) as f:
|
|
68
|
+
return {"Body": BytesIO(f.read())}
|