megfile 3.1.1__py3-none-any.whl → 3.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +126 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +84 -65
- megfile/lib/combine_reader.py +12 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +46 -54
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +58 -51
- megfile/lib/s3_cached_handler.py +13 -14
- megfile/lib/s3_limited_seekable_writer.py +37 -28
- megfile/lib/s3_memory_handler.py +34 -30
- megfile/lib/s3_pipe_handler.py +24 -25
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +7 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +73 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/METADATA +5 -8
- megfile-3.1.3.dist-info/RECORD +55 -0
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.1.dist-info/RECORD +0 -55
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/LICENSE +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.3.dist-info}/top_level.txt +0 -0
megfile/lib/combine_reader.py
CHANGED
|
@@ -5,11 +5,10 @@ from typing import IO, AnyStr, List, Optional, Union
|
|
|
5
5
|
from megfile.interfaces import Readable, Seekable
|
|
6
6
|
from megfile.utils import get_content_size, get_mode, get_name, is_readable
|
|
7
7
|
|
|
8
|
-
NEWLINE = ord(
|
|
8
|
+
NEWLINE = ord("\n")
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class CombineReader(Readable, Seekable):
|
|
12
|
-
|
|
13
12
|
def __init__(self, file_objects: List[IO], name: str):
|
|
14
13
|
self._file_objects = file_objects
|
|
15
14
|
self._blocks_sizes = []
|
|
@@ -19,14 +18,15 @@ class CombineReader(Readable, Seekable):
|
|
|
19
18
|
self._mode = None
|
|
20
19
|
for file_object in self._file_objects:
|
|
21
20
|
if not is_readable(file_object):
|
|
22
|
-
raise IOError(
|
|
21
|
+
raise IOError("not readable: %r" % get_name(file_object))
|
|
23
22
|
mode = get_mode(file_object)
|
|
24
23
|
if self._mode is None:
|
|
25
24
|
self._mode = mode
|
|
26
25
|
if self._mode != mode:
|
|
27
26
|
raise IOError(
|
|
28
|
-
|
|
29
|
-
(get_name(file_object), self._mode, mode)
|
|
27
|
+
"inconsistent mode: %r, expected: %r, got: %r"
|
|
28
|
+
% (get_name(file_object), self._mode, mode)
|
|
29
|
+
)
|
|
30
30
|
self._blocks_sizes.append(self._content_size)
|
|
31
31
|
self._content_size += get_content_size(file_object)
|
|
32
32
|
self._blocks_sizes.append(self._content_size)
|
|
@@ -36,7 +36,7 @@ class CombineReader(Readable, Seekable):
|
|
|
36
36
|
for index, size in enumerate(self._blocks_sizes):
|
|
37
37
|
if self._offset < size:
|
|
38
38
|
return index - 1, self._offset - self._blocks_sizes[index - 1]
|
|
39
|
-
raise IOError(
|
|
39
|
+
raise IOError("offset out of range: %d" % self._offset)
|
|
40
40
|
|
|
41
41
|
@property
|
|
42
42
|
def name(self) -> str:
|
|
@@ -50,12 +50,12 @@ class CombineReader(Readable, Seekable):
|
|
|
50
50
|
return self._offset
|
|
51
51
|
|
|
52
52
|
def _empty_bytes(self) -> AnyStr: # pyre-ignore[34]
|
|
53
|
-
if
|
|
54
|
-
return b
|
|
55
|
-
return
|
|
53
|
+
if "b" in self._mode:
|
|
54
|
+
return b"" # pyre-ignore[7]
|
|
55
|
+
return "" # pyre-ignore[7]
|
|
56
56
|
|
|
57
57
|
def _empty_buffer(self) -> Union[BytesIO, StringIO]:
|
|
58
|
-
if
|
|
58
|
+
if "b" in self._mode:
|
|
59
59
|
return BytesIO()
|
|
60
60
|
return StringIO()
|
|
61
61
|
|
|
@@ -107,10 +107,10 @@ class CombineReader(Readable, Seekable):
|
|
|
107
107
|
elif whence == os.SEEK_END:
|
|
108
108
|
target_offset = self._content_size + offset
|
|
109
109
|
else:
|
|
110
|
-
raise ValueError(
|
|
110
|
+
raise ValueError("invalid whence: %r" % whence)
|
|
111
111
|
|
|
112
112
|
if target_offset < 0:
|
|
113
|
-
raise ValueError(
|
|
113
|
+
raise ValueError("negative seek value %r" % target_offset)
|
|
114
114
|
|
|
115
115
|
self._offset = target_offset
|
|
116
116
|
return self._offset
|
megfile/lib/compare.py
CHANGED
|
@@ -5,17 +5,19 @@ from megfile.pathlike import StatResult
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def get_sync_type(src_protocol, dst_protocol):
|
|
8
|
-
if src_protocol ==
|
|
9
|
-
return
|
|
10
|
-
elif src_protocol !=
|
|
11
|
-
return
|
|
8
|
+
if src_protocol == "s3" and dst_protocol != "s3":
|
|
9
|
+
return "download"
|
|
10
|
+
elif src_protocol != "s3" and dst_protocol == "s3":
|
|
11
|
+
return "upload"
|
|
12
12
|
else:
|
|
13
|
-
return
|
|
13
|
+
return "copy"
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def compare_time(
|
|
17
|
-
|
|
18
|
-
|
|
17
|
+
src_stat: Union[StatResult, stat_result],
|
|
18
|
+
dest_stat: Union[StatResult, stat_result],
|
|
19
|
+
sync_type: str,
|
|
20
|
+
):
|
|
19
21
|
"""
|
|
20
22
|
:returns: True if the file does not need updating based on time of
|
|
21
23
|
last modification and type of operation.
|
|
@@ -35,7 +37,6 @@ def compare_time(
|
|
|
35
37
|
# at the source location.
|
|
36
38
|
return False
|
|
37
39
|
elif sync_type == "download":
|
|
38
|
-
|
|
39
40
|
if delta <= 0:
|
|
40
41
|
return True
|
|
41
42
|
else:
|
|
@@ -45,14 +46,17 @@ def compare_time(
|
|
|
45
46
|
|
|
46
47
|
|
|
47
48
|
def is_same_file(
|
|
48
|
-
|
|
49
|
-
|
|
49
|
+
src_stat: Union[StatResult, stat_result],
|
|
50
|
+
dest_stat: Union[StatResult, stat_result],
|
|
51
|
+
sync_type: str,
|
|
52
|
+
):
|
|
50
53
|
"""
|
|
51
|
-
Determines whether or not the source and destination files should be synced based on
|
|
54
|
+
Determines whether or not the source and destination files should be synced based on
|
|
55
|
+
a comparison of their size and last modified time.
|
|
52
56
|
|
|
53
|
-
:param src_stat: A
|
|
57
|
+
:param src_stat: A object representing the source file to be compared.
|
|
54
58
|
:type src_stat: Union[StatResult, stat_result]
|
|
55
|
-
:param dest_stat: A
|
|
59
|
+
:param dest_stat: A object representing the destination file to be compared.
|
|
56
60
|
:type dest_stat: Union[StatResult, stat_result]
|
|
57
61
|
|
|
58
62
|
:return: A boolean value indicating whether or not the files should be synced.
|
megfile/lib/compat.py
CHANGED
megfile/lib/fnmatch.py
CHANGED
|
@@ -9,6 +9,7 @@ expression. They cache the compiled regular expressions for speed.
|
|
|
9
9
|
The function translate(PATTERN) returns a regular expression
|
|
10
10
|
corresponding to PATTERN. (It does not compile it.)
|
|
11
11
|
"""
|
|
12
|
+
|
|
12
13
|
"""Compared with the standard library, syntax '{seq1,seq2}' is supported"""
|
|
13
14
|
|
|
14
15
|
import functools
|
|
@@ -67,7 +68,7 @@ def filter(names: List[str], pat: str) -> List[str]:
|
|
|
67
68
|
|
|
68
69
|
|
|
69
70
|
def _compat(res: str) -> str:
|
|
70
|
-
return r
|
|
71
|
+
return r"(?s:%s)\Z" % res
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
def translate(pat: str) -> str:
|
|
@@ -77,58 +78,56 @@ def translate(pat: str) -> str:
|
|
|
77
78
|
"""
|
|
78
79
|
|
|
79
80
|
i, n = 0, len(pat)
|
|
80
|
-
res =
|
|
81
|
+
res = ""
|
|
81
82
|
while i < n:
|
|
82
83
|
c = pat[i]
|
|
83
84
|
i = i + 1
|
|
84
|
-
if c ==
|
|
85
|
+
if c == "*":
|
|
85
86
|
j = i
|
|
86
|
-
while j < n and pat[j] ==
|
|
87
|
+
while j < n and pat[j] == "*":
|
|
87
88
|
j = j + 1
|
|
88
89
|
if j > i:
|
|
89
|
-
if (j < n and pat[j] ==
|
|
90
|
-
(i <= 1 or pat[i - 2] == '/'):
|
|
90
|
+
if (j < n and pat[j] == "/") and (i <= 1 or pat[i - 2] == "/"):
|
|
91
91
|
# hit /**/ instead of /seq**/
|
|
92
92
|
j = j + 1
|
|
93
|
-
res = res + r
|
|
93
|
+
res = res + r"(.*/)?"
|
|
94
94
|
else:
|
|
95
|
-
res = res + r
|
|
95
|
+
res = res + r".*"
|
|
96
96
|
else:
|
|
97
|
-
res = res + r
|
|
97
|
+
res = res + r"[^/]*"
|
|
98
98
|
i = j
|
|
99
|
-
elif c ==
|
|
100
|
-
res = res + r
|
|
101
|
-
elif c ==
|
|
99
|
+
elif c == "?":
|
|
100
|
+
res = res + r"."
|
|
101
|
+
elif c == "[":
|
|
102
102
|
j = i
|
|
103
|
-
if j < n and pat[j] ==
|
|
103
|
+
if j < n and pat[j] == "!":
|
|
104
104
|
j = j + 1
|
|
105
|
-
if j < n and pat[j] ==
|
|
105
|
+
if j < n and pat[j] == "]":
|
|
106
106
|
j = j + 1
|
|
107
|
-
while j < n and pat[j] !=
|
|
107
|
+
while j < n and pat[j] != "]":
|
|
108
108
|
j = j + 1
|
|
109
109
|
if j >= n:
|
|
110
|
-
res = res + r
|
|
110
|
+
res = res + r"\["
|
|
111
111
|
else:
|
|
112
|
-
stuff = pat[i:j].replace(
|
|
112
|
+
stuff = pat[i:j].replace("\\", r"\\")
|
|
113
113
|
i = j + 1
|
|
114
|
-
if stuff[0] ==
|
|
115
|
-
stuff = r
|
|
116
|
-
elif stuff[0] ==
|
|
117
|
-
stuff =
|
|
118
|
-
res = r
|
|
119
|
-
elif c ==
|
|
114
|
+
if stuff[0] == "!":
|
|
115
|
+
stuff = r"^" + stuff[1:]
|
|
116
|
+
elif stuff[0] == "^":
|
|
117
|
+
stuff = "\\" + stuff
|
|
118
|
+
res = r"%s[%s]" % (res, stuff)
|
|
119
|
+
elif c == "{":
|
|
120
120
|
j = i
|
|
121
|
-
if j < n and pat[j] ==
|
|
121
|
+
if j < n and pat[j] == "}":
|
|
122
122
|
j = j + 1
|
|
123
|
-
while j < n and pat[j] !=
|
|
123
|
+
while j < n and pat[j] != "}":
|
|
124
124
|
j = j + 1
|
|
125
125
|
if j >= n:
|
|
126
|
-
res = res + r
|
|
126
|
+
res = res + r"\{"
|
|
127
127
|
else:
|
|
128
|
-
stuff = pat[i:j].replace(
|
|
129
|
-
stuff = r
|
|
130
|
-
|
|
131
|
-
res = r'%s(%s)' % (res, stuff)
|
|
128
|
+
stuff = pat[i:j].replace("\\", r"\\")
|
|
129
|
+
stuff = r"|".join(map(re.escape, stuff.split(","))) # pyre-ignore[6]
|
|
130
|
+
res = r"%s(%s)" % (res, stuff)
|
|
132
131
|
i = j + 1
|
|
133
132
|
else:
|
|
134
133
|
res = res + re.escape(c)
|
megfile/lib/glob.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Filename globbing utility."""
|
|
2
|
+
|
|
2
3
|
"""remove once py35 is dead"""
|
|
3
4
|
|
|
4
5
|
import os
|
|
@@ -10,16 +11,16 @@ from typing import Iterator, List, Tuple
|
|
|
10
11
|
from megfile.lib import fnmatch
|
|
11
12
|
|
|
12
13
|
# Python 3.5+ Compatible
|
|
13
|
-
|
|
14
|
+
"""
|
|
14
15
|
class FSFunc(NamedTuple):
|
|
15
16
|
exists: Callable[[str], bool]
|
|
16
17
|
isdir: Callable[[str], bool]
|
|
17
18
|
scandir: Callable[[str], Iterator[Tuple[str, bool]]] # name, isdir
|
|
18
19
|
|
|
19
20
|
in Python 3.6+
|
|
20
|
-
|
|
21
|
+
"""
|
|
21
22
|
|
|
22
|
-
FSFunc = NamedTuple(
|
|
23
|
+
FSFunc = NamedTuple("FSFunc", ["exists", "isdir", "scandir"])
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def _exists(path: str) -> bool:
|
|
@@ -39,10 +40,8 @@ DEFAULT_FILESYSTEM_FUNC = FSFunc(_exists, _isdir, _scandir)
|
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
def glob(
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
recursive: bool = False,
|
|
45
|
-
fs: FSFunc = DEFAULT_FILESYSTEM_FUNC) -> List[str]:
|
|
43
|
+
pathname: str, *, recursive: bool = False, fs: FSFunc = DEFAULT_FILESYSTEM_FUNC
|
|
44
|
+
) -> List[str]:
|
|
46
45
|
"""Return a list of paths matching a pathname pattern.
|
|
47
46
|
|
|
48
47
|
The pattern may contain simple shell-style wildcards a la
|
|
@@ -57,10 +56,8 @@ def glob(
|
|
|
57
56
|
|
|
58
57
|
|
|
59
58
|
def iglob(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
recursive: bool = False,
|
|
63
|
-
fs: FSFunc = DEFAULT_FILESYSTEM_FUNC) -> Iterator[str]:
|
|
59
|
+
pathname: str, *, recursive: bool = False, fs: FSFunc = DEFAULT_FILESYSTEM_FUNC
|
|
60
|
+
) -> Iterator[str]:
|
|
64
61
|
"""Return an iterator which yields the paths matching a pathname pattern.
|
|
65
62
|
|
|
66
63
|
The pattern may contain simple shell-style wildcards a la
|
|
@@ -80,10 +77,9 @@ def iglob(
|
|
|
80
77
|
return it
|
|
81
78
|
|
|
82
79
|
|
|
83
|
-
def _iglob(pathname: str, recursive: bool, dironly: bool,
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
protocol, path_without_protocol = pathname.split('://', 1)
|
|
80
|
+
def _iglob(pathname: str, recursive: bool, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
81
|
+
if "://" in pathname:
|
|
82
|
+
protocol, path_without_protocol = pathname.split("://", 1)
|
|
87
83
|
else:
|
|
88
84
|
protocol, path_without_protocol = "", pathname
|
|
89
85
|
dirname, basename = os.path.split(path_without_protocol)
|
|
@@ -92,8 +88,7 @@ def _iglob(pathname: str, recursive: bool, dironly: bool,
|
|
|
92
88
|
if not has_magic(pathname):
|
|
93
89
|
if dironly:
|
|
94
90
|
# TODO: replace AssertionError with OSError in 4.0.0
|
|
95
|
-
raise AssertionError(
|
|
96
|
-
"can't use dironly with non-magic patterns in _iglob")
|
|
91
|
+
raise AssertionError("can't use dironly with non-magic patterns in _iglob")
|
|
97
92
|
if basename:
|
|
98
93
|
if fs.exists(pathname):
|
|
99
94
|
yield pathname
|
|
@@ -153,8 +148,7 @@ def _glob0(dirname: str, basename: str, dironly: bool, fs: FSFunc) -> List[str]:
|
|
|
153
148
|
|
|
154
149
|
# This helper function recursively yields relative pathnames inside a literal
|
|
155
150
|
# directory.
|
|
156
|
-
def _glob2(dirname: str, pattern: str, dironly: bool,
|
|
157
|
-
fs: FSFunc) -> Iterator[str]:
|
|
151
|
+
def _glob2(dirname: str, pattern: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
158
152
|
if not _isrecursive(pattern):
|
|
159
153
|
# TODO: replace AssertionError with OSError in 4.0.0
|
|
160
154
|
raise AssertionError("error call '_glob2' with non-glob pattern")
|
|
@@ -194,10 +188,10 @@ def _rlistdir(dirname: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
|
|
|
194
188
|
yield os.path.join(x, y)
|
|
195
189
|
|
|
196
190
|
|
|
197
|
-
magic_check = re.compile(r
|
|
198
|
-
magic_decheck = re.compile(r
|
|
199
|
-
brace_check = re.compile(r
|
|
200
|
-
unbrace_check = re.compile(r
|
|
191
|
+
magic_check = re.compile(r"([*?[{])")
|
|
192
|
+
magic_decheck = re.compile(r"\[(.)\]")
|
|
193
|
+
brace_check = re.compile(r"(\{.*\})")
|
|
194
|
+
unbrace_check = re.compile(r"([*?[])")
|
|
201
195
|
|
|
202
196
|
|
|
203
197
|
def has_magic(s: str) -> bool:
|
|
@@ -211,46 +205,44 @@ def has_magic_ignore_brace(s: str) -> bool:
|
|
|
211
205
|
|
|
212
206
|
|
|
213
207
|
def _ishidden(path: str) -> bool:
|
|
214
|
-
return path[0] ==
|
|
208
|
+
return path[0] == "."
|
|
215
209
|
|
|
216
210
|
|
|
217
211
|
def _isrecursive(pattern: str) -> bool:
|
|
218
|
-
return pattern ==
|
|
212
|
+
return pattern == "**"
|
|
219
213
|
|
|
220
214
|
|
|
221
215
|
def escape(pathname):
|
|
222
|
-
"""Escape all special characters.
|
|
223
|
-
"""
|
|
216
|
+
"""Escape all special characters."""
|
|
224
217
|
# Escaping is done by wrapping any of "*?[" between square brackets.
|
|
225
218
|
# Metacharacters do not work in the drive part and shouldn't be escaped.
|
|
226
219
|
drive, pathname = os.path.splitdrive(pathname)
|
|
227
|
-
pathname = magic_check.sub(r
|
|
220
|
+
pathname = magic_check.sub(r"[\1]", pathname)
|
|
228
221
|
return drive + pathname
|
|
229
222
|
|
|
230
223
|
|
|
231
224
|
def unescape(pathname):
|
|
232
|
-
"""Unescape all special characters.
|
|
233
|
-
"""
|
|
225
|
+
"""Unescape all special characters."""
|
|
234
226
|
drive, pathname = os.path.splitdrive(pathname)
|
|
235
|
-
pathname = magic_decheck.sub(r
|
|
227
|
+
pathname = magic_decheck.sub(r"\1", pathname)
|
|
236
228
|
return drive + pathname
|
|
237
229
|
|
|
238
230
|
|
|
239
|
-
def _find_suffix(path_list: List[str], prefix: str,
|
|
240
|
-
split_sign: str) -> List[str]:
|
|
231
|
+
def _find_suffix(path_list: List[str], prefix: str, split_sign: str) -> List[str]:
|
|
241
232
|
suffix = []
|
|
242
233
|
temp_path_list = []
|
|
243
234
|
for path_index in range(0, len(path_list)):
|
|
244
|
-
temp_path_list.append(
|
|
245
|
-
path_list[path_index][len(prefix):].split(split_sign))
|
|
235
|
+
temp_path_list.append(path_list[path_index][len(prefix) :].split(split_sign))
|
|
246
236
|
i = 0
|
|
247
237
|
while True:
|
|
248
238
|
i = i - 1
|
|
249
239
|
if len(temp_path_list[0]) <= abs(i):
|
|
250
240
|
return suffix
|
|
251
241
|
for path_index in range(1, len(path_list)):
|
|
252
|
-
if
|
|
253
|
-
|
|
242
|
+
if (
|
|
243
|
+
len(temp_path_list[path_index]) <= abs(i)
|
|
244
|
+
or temp_path_list[path_index][i] != temp_path_list[0][i]
|
|
245
|
+
):
|
|
254
246
|
return suffix
|
|
255
247
|
else:
|
|
256
248
|
suffix.insert(0, temp_path_list[0][i])
|
|
@@ -260,8 +252,8 @@ def globlize(path_list: List[str]) -> str:
|
|
|
260
252
|
path_list = sorted(path_list)
|
|
261
253
|
if path_list[0] == path_list[-1]:
|
|
262
254
|
return path_list[0]
|
|
263
|
-
first_path = path_list[0].split(
|
|
264
|
-
last_path = path_list[-1].split(
|
|
255
|
+
first_path = path_list[0].split("/")
|
|
256
|
+
last_path = path_list[-1].split("/")
|
|
265
257
|
prefix = []
|
|
266
258
|
|
|
267
259
|
for i in range(0, min(len(first_path), len(last_path))):
|
|
@@ -272,46 +264,46 @@ def globlize(path_list: List[str]) -> str:
|
|
|
272
264
|
if len(prefix) == 0:
|
|
273
265
|
prefix = ""
|
|
274
266
|
else:
|
|
275
|
-
prefix =
|
|
276
|
-
suffix = _find_suffix(path_list, prefix,
|
|
267
|
+
prefix = "/".join(prefix) + "/"
|
|
268
|
+
suffix = _find_suffix(path_list, prefix, "/")
|
|
277
269
|
|
|
278
270
|
if len(suffix) == 0:
|
|
279
|
-
suffix = _find_suffix(path_list, prefix,
|
|
271
|
+
suffix = _find_suffix(path_list, prefix, ".")
|
|
280
272
|
if len(suffix) == 0:
|
|
281
273
|
suffix = ""
|
|
282
274
|
else:
|
|
283
|
-
suffix =
|
|
275
|
+
suffix = "." + ".".join(suffix)
|
|
284
276
|
else:
|
|
285
|
-
suffix =
|
|
277
|
+
suffix = "/" + "/".join(suffix)
|
|
286
278
|
|
|
287
279
|
path = []
|
|
288
280
|
for i in path_list:
|
|
289
|
-
if i[len(prefix):len(i) - len(suffix)] not in path:
|
|
290
|
-
path.append(unescape(i[len(prefix):len(i) - len(suffix)]))
|
|
291
|
-
return prefix + "{" +
|
|
281
|
+
if i[len(prefix) : len(i) - len(suffix)] not in path:
|
|
282
|
+
path.append(unescape(i[len(prefix) : len(i) - len(suffix)]))
|
|
283
|
+
return prefix + "{" + ",".join(path) + "}" + suffix
|
|
292
284
|
|
|
293
285
|
|
|
294
286
|
def ungloblize(glob: str) -> List[str]:
|
|
295
287
|
path_list = [glob]
|
|
296
288
|
while True:
|
|
297
289
|
temp_path = path_list[0]
|
|
298
|
-
begin = temp_path.find(
|
|
299
|
-
end = temp_path.find(
|
|
290
|
+
begin = temp_path.find("{")
|
|
291
|
+
end = temp_path.find("}", begin)
|
|
300
292
|
if end == -1:
|
|
301
293
|
break
|
|
302
294
|
path_list.pop(0)
|
|
303
|
-
subpath_list = temp_path[begin + 1:end].split(
|
|
295
|
+
subpath_list = temp_path[begin + 1 : end].split(",")
|
|
304
296
|
for subpath in subpath_list:
|
|
305
|
-
path = temp_path[:begin] + escape(subpath) + temp_path[end + 1:]
|
|
297
|
+
path = temp_path[:begin] + escape(subpath) + temp_path[end + 1 :]
|
|
306
298
|
path_list.append(path)
|
|
307
299
|
return path_list
|
|
308
300
|
|
|
309
301
|
|
|
310
302
|
def get_non_glob_dir(glob: str):
|
|
311
303
|
root_dir = []
|
|
312
|
-
if glob.startswith(
|
|
313
|
-
root_dir.append(
|
|
314
|
-
for name in glob.split(
|
|
304
|
+
if glob.startswith("/"):
|
|
305
|
+
root_dir.append("/")
|
|
306
|
+
for name in glob.split("/"):
|
|
315
307
|
if has_magic(name):
|
|
316
308
|
break
|
|
317
309
|
root_dir.append(name)
|
megfile/lib/hdfs_prefetch_reader.py
CHANGED
|
@@ -1,28 +1,38 @@
|
|
|
1
1
|
from io import BytesIO
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
|
-
from megfile.config import
|
|
4
|
+
from megfile.config import (
|
|
5
|
+
DEFAULT_BLOCK_CAPACITY,
|
|
6
|
+
DEFAULT_BLOCK_SIZE,
|
|
7
|
+
HDFS_MAX_RETRY_TIMES,
|
|
8
|
+
)
|
|
5
9
|
from megfile.errors import raise_hdfs_error
|
|
6
10
|
from megfile.lib.base_prefetch_reader import BasePrefetchReader
|
|
7
11
|
|
|
8
12
|
|
|
9
13
|
class HdfsPrefetchReader(BasePrefetchReader):
|
|
10
|
-
|
|
11
|
-
Reader to fast read the hdfs content. This will divide the file content into equal
|
|
12
|
-
|
|
13
|
-
|
|
14
|
+
"""
|
|
15
|
+
Reader to fast read the hdfs content. This will divide the file content into equal
|
|
16
|
+
parts of block_size size, and will use LRU to cache at most block_capacity blocks
|
|
17
|
+
in memory.
|
|
18
|
+
|
|
19
|
+
open(), seek() and read() will trigger prefetch read. The prefetch will cached
|
|
20
|
+
block_forward blocks of data from offset position (the position after reading
|
|
21
|
+
if the called function is read).
|
|
22
|
+
"""
|
|
14
23
|
|
|
15
24
|
def __init__(
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
25
|
+
self,
|
|
26
|
+
hdfs_path: str,
|
|
27
|
+
*,
|
|
28
|
+
client,
|
|
29
|
+
block_size: int = DEFAULT_BLOCK_SIZE,
|
|
30
|
+
block_capacity: int = DEFAULT_BLOCK_CAPACITY,
|
|
31
|
+
block_forward: Optional[int] = None,
|
|
32
|
+
max_retries: int = HDFS_MAX_RETRY_TIMES,
|
|
33
|
+
max_workers: Optional[int] = None,
|
|
34
|
+
profile_name: Optional[str] = None,
|
|
35
|
+
):
|
|
26
36
|
self._path = hdfs_path
|
|
27
37
|
self._client = client
|
|
28
38
|
self._profile_name = profile_name
|
|
@@ -32,22 +42,27 @@ class HdfsPrefetchReader(BasePrefetchReader):
|
|
|
32
42
|
block_capacity=block_capacity,
|
|
33
43
|
block_forward=block_forward,
|
|
34
44
|
max_retries=max_retries,
|
|
35
|
-
max_workers=max_workers
|
|
45
|
+
max_workers=max_workers,
|
|
46
|
+
)
|
|
36
47
|
|
|
37
48
|
def _get_content_size(self):
|
|
38
49
|
with raise_hdfs_error(self._path):
|
|
39
|
-
return self._client.status(self._path)[
|
|
50
|
+
return self._client.status(self._path)["length"]
|
|
40
51
|
|
|
41
52
|
@property
|
|
42
53
|
def name(self) -> str:
|
|
43
|
-
return
|
|
44
|
-
f"+{self._profile_name}" if self._profile_name else "",
|
|
54
|
+
return "hdfs%s://%s" % (
|
|
55
|
+
f"+{self._profile_name}" if self._profile_name else "",
|
|
56
|
+
self._path,
|
|
57
|
+
)
|
|
45
58
|
|
|
46
59
|
def _fetch_response(
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
end: Optional[int] = None) -> dict:
|
|
60
|
+
self, start: Optional[int] = None, end: Optional[int] = None
|
|
61
|
+
) -> dict:
|
|
50
62
|
with raise_hdfs_error(self.name):
|
|
51
|
-
with self._client.read(
|
|
52
|
-
|
|
53
|
-
|
|
63
|
+
with self._client.read(
|
|
64
|
+
self._path,
|
|
65
|
+
offset=start or 0,
|
|
66
|
+
length=end - start if start and end else None,
|
|
67
|
+
) as f:
|
|
68
|
+
return {"Body": BytesIO(f.read())}
|