megfile 3.1.1__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +84 -65
- megfile/lib/combine_reader.py +12 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +46 -54
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +58 -51
- megfile/lib/s3_cached_handler.py +13 -14
- megfile/lib/s3_limited_seekable_writer.py +37 -28
- megfile/lib/s3_memory_handler.py +34 -30
- megfile/lib/s3_pipe_handler.py +24 -25
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +7 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +73 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.1.dist-info/RECORD +0 -55
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/lib/s3_memory_handler.py
CHANGED
@@ -2,24 +2,28 @@ import os
 from io import BytesIO, UnsupportedOperation
 from typing import Iterable, List, Optional
 
-from megfile.errors import S3ConfigError, UnknownError, raise_s3_error, translate_s3_error
+from megfile.errors import (
+    S3ConfigError,
+    UnknownError,
+    raise_s3_error,
+    translate_s3_error,
+)
 from megfile.interfaces import Readable, Seekable, Writable
 
 
 class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            mode: str,
-            *,
-            s3_client,
-            profile_name: Optional[str] = None):
-        if mode not in ('rb', 'wb', 'ab', 'rb+', 'wb+', 'ab+'):
+        self,
+        bucket: str,
+        key: str,
+        mode: str,
+        *,
+        s3_client,
+        profile_name: Optional[str] = None,
+    ):
+        if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
             # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError('unacceptable mode: %r' % mode)
+            raise AssertionError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
@@ -32,9 +36,11 @@ class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
 
     @property
     def name(self) -> str:
-        return 's3%s://%s/%s' % (
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket, self._key)
+            self._bucket,
+            self._key,
+        )
 
     @property
     def mode(self) -> str:
@@ -47,46 +53,44 @@ class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
         return self._fileobj.seek(offset, whence)
 
     def readable(self) -> bool:
-        return self._mode[0] == 'r' or self._mode[-1] == '+'
+        return self._mode[0] == "r" or self._mode[-1] == "+"
 
     def read(self, size: Optional[int] = None) -> bytes:
         if not self.readable():
-            raise UnsupportedOperation('not readable')
+            raise UnsupportedOperation("not readable")
         return self._fileobj.read(size)
 
     def readline(self, size: Optional[int] = None) -> bytes:
         if not self.readable():
-            raise UnsupportedOperation('not readable')
+            raise UnsupportedOperation("not readable")
         if size is None:
             size = -1
        return self._fileobj.readline(size)
 
     def readlines(self, hint: Optional[int] = None) -> List[bytes]:
         if not self.readable():
-            raise UnsupportedOperation('not readable')
+            raise UnsupportedOperation("not readable")
         if hint is None:
             hint = -1
         return self._fileobj.readlines(hint)
 
     def writable(self) -> bool:
-        return self._mode[0] == 'w' or \
-            self._mode[0] == 'a' or \
-            self._mode[-1] == '+'
+        return self._mode[0] == "w" or self._mode[0] == "a" or self._mode[-1] == "+"
 
     def flush(self):
         self._fileobj.flush()
 
     def write(self, data: bytes) -> int:
         if not self.writable():
-            raise UnsupportedOperation('not writable')
-        if self._mode[0] == 'a':
+            raise UnsupportedOperation("not writable")
+        if self._mode[0] == "a":
             self.seek(0, os.SEEK_END)
         return self._fileobj.write(data)
 
     def writelines(self, lines: Iterable[bytes]):
         if not self.writable():
-            raise UnsupportedOperation('not writable')
-        if self._mode[0] == 'a':
+            raise UnsupportedOperation("not writable")
+        if self._mode[0] == "a":
             self.seek(0, os.SEEK_END)
         self._fileobj.writelines(lines)
 
@@ -104,17 +108,17 @@ class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
         return True
 
     def _download_fileobj(self):
-        need_download = self._mode[0] == 'r' or (
-            self._mode[0] == 'a' and self._file_exists())
+        need_download = self._mode[0] == "r" or (
+            self._mode[0] == "a" and self._file_exists()
+        )
         if not need_download:
             return
         # directly download to the file handle
         try:
-            self._client.download_fileobj(
-                self._bucket, self._key, self._fileobj)
+            self._client.download_fileobj(self._bucket, self._key, self._fileobj)
         except Exception as error:
             raise self._translate_error(error)
-        if self._mode[0] == 'r':
+        if self._mode[0] == "r":
             self.seek(0, os.SEEK_SET)
 
     def _upload_fileobj(self):
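S3MemoryHandler buffers the whole object in a BytesIO: append modes first pull down any existing content, then re-upload the buffer when the handle is closed. A minimal usage sketch, assuming a configured boto3 client, hypothetical bucket/key names, and context-manager support inherited from megfile's file-like interfaces:

```python
import boto3

from megfile.lib.s3_memory_handler import S3MemoryHandler

client = boto3.client("s3")  # assumes credentials are already configured

# "ab" downloads the existing object into the in-memory buffer (if it exists),
# appends to it, then uploads the whole buffer back on close.
with S3MemoryHandler("my-bucket", "logs/run.txt", "ab", s3_client=client) as f:
    f.write(b"appended line\n")
```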
megfile/lib/s3_pipe_handler.py
CHANGED
@@ -1,5 +1,5 @@
 import atexit
-import concurrent.futures  # don't delete this import, to ensure the _close_s3_pipes registration is earlier than concurrent.futures._python_exit
+import concurrent.futures  # noqa: F401 # don't delete this import, to ensure the _close_s3_pipes registration is earlier than concurrent.futures._python_exit
 import os
 from threading import Thread
 from typing import Optional
@@ -12,7 +12,6 @@ _s3_opened_pipes = []
 
 @atexit.register
 def _close_s3_pipes():  # pragma: no cover
-
     def try_close_pipe(fd):
         try:
             os.close(fd)
@@ -25,20 +24,19 @@ def _close_s3_pipes():  # pragma: no cover
 
 
 class S3PipeHandler(Readable[bytes], Writable[bytes]):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            mode: str,
-            *,
-            s3_client,
-            join_thread: bool = True,
-            profile_name: Optional[str] = None):
-        if mode not in ('rb', 'wb'):
+        self,
+        bucket: str,
+        key: str,
+        mode: str,
+        *,
+        s3_client,
+        join_thread: bool = True,
+        profile_name: Optional[str] = None,
+    ):
+        if mode not in ("rb", "wb"):
             # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError('unacceptable mode: %r' % mode)
+            raise AssertionError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
@@ -52,20 +50,21 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
         self._pipe = os.pipe()
         _s3_opened_pipes.append(self._pipe)
 
-        if self._mode == 'rb':
-            self._fileobj = os.fdopen(self._pipe[0], 'rb')
-            self._async_task = Thread(
-                target=self._download_fileobj, daemon=True)
+        if self._mode == "rb":
+            self._fileobj = os.fdopen(self._pipe[0], "rb")
+            self._async_task = Thread(target=self._download_fileobj, daemon=True)
         else:
-            self._fileobj = os.fdopen(self._pipe[1], 'wb')
+            self._fileobj = os.fdopen(self._pipe[1], "wb")
             self._async_task = Thread(target=self._upload_fileobj, daemon=True)
         self._async_task.start()
 
     @property
     def name(self) -> str:
-        return 's3%s://%s/%s' % (
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket, self._key)
+            self._bucket,
+            self._key,
+        )
 
     @property
     def mode(self) -> str:
@@ -76,7 +75,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
 
     def _download_fileobj(self):
         try:
-            with os.fdopen(self._pipe[1], 'wb') as buffer:
+            with os.fdopen(self._pipe[1], "wb") as buffer:
                 self._client.download_fileobj(self._bucket, self._key, buffer)
         except BrokenPipeError:
             if self._fileobj.closed:
@@ -87,7 +86,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
 
     def _upload_fileobj(self):
         try:
-            with os.fdopen(self._pipe[0], 'rb') as buffer:
+            with os.fdopen(self._pipe[0], "rb") as buffer:
                 self._client.upload_fileobj(buffer, self._bucket, self._key)
         except Exception as error:
             self._exc = error
@@ -97,7 +96,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
         raise translate_s3_error(self._exc, self.name)
 
     def readable(self) -> bool:
-        return self._mode == 'rb'
+        return self._mode == "rb"
 
     def read(self, size: Optional[int] = None) -> bytes:
         self._raise_exception()
@@ -112,7 +111,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
         return data
 
     def writable(self) -> bool:
-        return self._mode == 'wb'
+        return self._mode == "wb"
 
     def flush(self):
         self._fileobj.flush()
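S3PipeHandler takes the opposite approach to the memory handler: the caller holds one end of an OS pipe while a daemon thread moves bytes between the other end and S3, so data streams through instead of being buffered whole. A hedged usage sketch under the same assumptions (configured boto3 client, hypothetical bucket/key):

```python
import boto3

from megfile.lib.s3_pipe_handler import S3PipeHandler

client = boto3.client("s3")

# Writes land in the pipe; the background thread uploads from the read end,
# so the 1 MiB below is never held in memory all at once.
with S3PipeHandler("my-bucket", "big.bin", "wb", s3_client=client) as f:
    for _ in range(1024):
        f.write(b"x" * 1024)
```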
megfile/lib/s3_prefetch_reader.py
CHANGED
@@ -2,41 +2,61 @@ from concurrent.futures import Future
 from io import BytesIO
 from typing import Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE, S3_MAX_RETRY_TIMES
-from megfile.errors import S3FileChangedError, S3InvalidRangeError, patch_method, raise_s3_error, s3_should_retry
+from megfile.config import (
+    BACKOFF_FACTOR,
+    BACKOFF_INITIAL,
+    DEFAULT_BLOCK_CAPACITY,
+    DEFAULT_BLOCK_SIZE,
+    GLOBAL_MAX_WORKERS,
+    NEWLINE,
+    S3_MAX_RETRY_TIMES,
+)
+from megfile.errors import (
+    S3FileChangedError,
+    S3InvalidRangeError,
+    patch_method,
+    raise_s3_error,
+    s3_should_retry,
+)
 from megfile.lib.base_prefetch_reader import BasePrefetchReader, LRUCacheFutureManager
 
 __all__ = [
-    'DEFAULT_BLOCK_CAPACITY',
-    'DEFAULT_BLOCK_SIZE',
-    'GLOBAL_MAX_WORKERS',
-    'BACKOFF_INITIAL',
-    'BACKOFF_FACTOR',
-    'NEWLINE',
-    'S3PrefetchReader',
-    'LRUCacheFutureManager',
+    "DEFAULT_BLOCK_CAPACITY",
+    "DEFAULT_BLOCK_SIZE",
+    "GLOBAL_MAX_WORKERS",
+    "BACKOFF_INITIAL",
+    "BACKOFF_FACTOR",
+    "NEWLINE",
+    "S3PrefetchReader",
+    "LRUCacheFutureManager",
 ]
 
 
 class S3PrefetchReader(BasePrefetchReader):
-    '''
-    Reader to fast read the s3 content.
-    open(), seek() and read() will trigger prefetch read. The prefetch will cached block_forward blocks of data from offset position (the position after reading if the called function is read).
-    '''
+    """
+    Reader to fast read the s3 content.
+
+    This will divide the file content into equal parts of block_size size,
+    and will use LRU to cache at most block_capacity blocks in memory.
+
+    open(), seek() and read() will trigger prefetch read.
+    The prefetch will cached block_forward blocks of data from offset position
+    (the position after reading if the called function is read).
+    """
 
-    def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_BLOCK_SIZE,
-            block_capacity: int = DEFAULT_BLOCK_CAPACITY,
-            block_forward: Optional[int] = None,
-            max_retries: int = S3_MAX_RETRY_TIMES,
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
+    def __init__(
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_BLOCK_SIZE,
+        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_forward: Optional[int] = None,
+        max_retries: int = S3_MAX_RETRY_TIMES,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         self._bucket = bucket
         self._key = key
         self._client = s3_client
@@ -49,65 +69,64 @@ class S3PrefetchReader(BasePrefetchReader):
             block_capacity=block_capacity,
             block_forward=block_forward,
             max_retries=max_retries,
-            max_workers=max_workers)
+            max_workers=max_workers,
+        )
 
     def _get_content_size(self):
         try:
             start, end = 0, self._block_size - 1
             first_index_response = self._fetch_response(start=start, end=end)
-            content_size = int(
-                first_index_response['ContentRange'].split('/')[-1])
+            content_size = int(first_index_response["ContentRange"].split("/")[-1])
         except S3InvalidRangeError:
             # usually when read a empty file
             # can use minio test empty file: https://hub.docker.com/r/minio/minio
             first_index_response = self._fetch_response()
-            content_size = int(first_index_response['ContentLength'])
+            content_size = int(first_index_response["ContentLength"])
 
         first_future = Future()
-        first_future.set_result(first_index_response['Body'])
+        first_future.set_result(first_index_response["Body"])
         self._insert_futures(index=0, future=first_future)
-        self._content_etag = first_index_response['ETag']
+        self._content_etag = first_index_response["ETag"]
         self._content_info = first_index_response
         return content_size
 
     @property
     def name(self) -> str:
-        return 's3%s://%s/%s' % (
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket, self._key)
+            self._bucket,
+            self._key,
+        )
 
     def _fetch_response(
-            self,
-            start: Optional[int] = None,
-            end: Optional[int] = None) -> dict:
-
+        self, start: Optional[int] = None, end: Optional[int] = None
+    ) -> dict:
         def fetch_response() -> dict:
             if start is None or end is None:
-                return self._client.get_object(
-                    Bucket=self._bucket, Key=self._key)
+                return self._client.get_object(Bucket=self._bucket, Key=self._key)
 
-            range_str = f'bytes={start}-{end}'
+            range_str = f"bytes={start}-{end}"
             response = self._client.get_object(
-                Bucket=self._bucket, Key=self._key, Range=range_str)
-            response['Body'] = BytesIO(response['Body'].read())
+                Bucket=self._bucket, Key=self._key, Range=range_str
+            )
+            response["Body"] = BytesIO(response["Body"].read())
             return response
 
         fetch_response = patch_method(
-            fetch_response,
-            max_retries=self._max_retries,
-            should_retry=s3_should_retry)
+            fetch_response, max_retries=self._max_retries, should_retry=s3_should_retry
+        )
 
         with raise_s3_error(self.name):
             return fetch_response()
 
     def _fetch_buffer(self, index: int) -> BytesIO:
-        start, end = index * self._block_size, (
-            index + 1) * self._block_size - 1
+        start, end = index * self._block_size, (index + 1) * self._block_size - 1
         response = self._fetch_response(start=start, end=end)
-        etag = response.get('ETag', None)
+        etag = response.get("ETag", None)
         if etag is not None and etag != self._content_etag:
             raise S3FileChangedError(
-                'File changed: %r, etag before: %s, after: %s' %
-                (self.name, self._content_info, response))
+                "File changed: %r, etag before: %s, after: %s"
+                % (self.name, self._content_info, response)
+            )
 
-        return response['Body']
+        return response["Body"]
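The `_fetch_buffer` arithmetic above maps a block index to an inclusive S3 `Range` header. The same computation standalone; the 8 MiB default below is an assumed value for illustration, the real one lives in `megfile.config.DEFAULT_BLOCK_SIZE`:

```python
DEFAULT_BLOCK_SIZE = 8 * 2**20  # assumed value, for illustration only


def block_range(index: int, block_size: int = DEFAULT_BLOCK_SIZE) -> str:
    # Block i covers bytes [i * block_size, (i + 1) * block_size - 1];
    # HTTP/S3 byte ranges are inclusive on both ends.
    start = index * block_size
    end = (index + 1) * block_size - 1
    return f"bytes={start}-{end}"


assert block_range(0, block_size=4) == "bytes=0-3"
assert block_range(2, block_size=4) == "bytes=8-11"
```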
megfile/lib/s3_share_cache_reader.py
CHANGED
@@ -3,7 +3,11 @@ from concurrent.futures import Future
 from logging import getLogger as get_logger
 from typing import Optional
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, S3_MAX_RETRY_TIMES
+from megfile.config import (
+    DEFAULT_BLOCK_CAPACITY,
+    DEFAULT_BLOCK_SIZE,
+    S3_MAX_RETRY_TIMES,
+)
 from megfile.lib.s3_prefetch_reader import LRUCacheFutureManager, S3PrefetchReader
 from megfile.utils import thread_local
 
@@ -11,25 +15,31 @@ _logger = get_logger(__name__)
 
 
 class S3ShareCacheReader(S3PrefetchReader):
-    '''
-    Reader to fast read the s3 content.
-    open(), seek() and read() will trigger prefetch read. The prefetch will cached block_forward blocks of data from offset position (the position after reading if the called function is read).
-    '''
+    """
+    Reader to fast read the s3 content.
+
+    This will divide the file content into equal parts of block_size size,
+    and will use LRU to cache at most block_capacity blocks in memory.
+
+    open(), seek() and read() will trigger prefetch read.
+    The prefetch will cached block_forward blocks of data from offset position
+    (the position after reading if the called function is read).
+    """
 
-    def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_BLOCK_SIZE,
-            block_capacity: int = DEFAULT_BLOCK_CAPACITY,
-            block_forward: Optional[int] = None,
-            max_retries: int = S3_MAX_RETRY_TIMES,
-            cache_key: str = 'lru',
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
+    def __init__(
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_BLOCK_SIZE,
+        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_forward: Optional[int] = None,
+        max_retries: int = S3_MAX_RETRY_TIMES,
+        cache_key: str = "lru",
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         self._cache_key = cache_key
 
         super().__init__(
@@ -44,14 +54,17 @@ class S3ShareCacheReader(S3PrefetchReader):
             profile_name=profile_name,
         )
 
-    def _get_futures(self) -> 'ShareCacheFutureManager':
+    def _get_futures(self) -> "ShareCacheFutureManager":
         futures = thread_local(
-            'S3ShareCacheReader.' + self._cache_key, ShareCacheFutureManager)
+            "S3ShareCacheReader." + self._cache_key, ShareCacheFutureManager
+        )
         futures.register(self.name)
         return futures
 
     def _seek_buffer(self, index: int, offset: int = 0):
-        # The corresponding block is probably not downloaded when sought to a new position
+        # The corresponding block is probably not downloaded
+        # when sought to a new position
+        #
         # So record the offset first, set it when it is accessed
         self._cached_offset = offset
         self._block_index = index
@@ -60,7 +73,8 @@ class S3ShareCacheReader(S3PrefetchReader):
         if index < 0 or index >= self._block_stop:
             return
         self._futures.submit(
-            self._executor, (self.name, index), self._fetch_buffer, index)
+            self._executor, (self.name, index), self._fetch_buffer, index
+        )
 
     def _insert_futures(self, index: int, future: Future):
         self._futures[(self.name, index)] = future
@@ -72,7 +86,7 @@ class S3ShareCacheReader(S3PrefetchReader):
         self._futures.cleanup(DEFAULT_BLOCK_CAPACITY)
 
     def _close(self):
-        _logger.debug('close file: %r' % self.name)
+        _logger.debug("close file: %r" % self.name)
 
         if not self._is_global_executor:
             self._executor.shutdown()
@@ -80,7 +94,6 @@ class S3ShareCacheReader(S3PrefetchReader):
 
 
 class ShareCacheFutureManager(LRUCacheFutureManager):
-
     def __init__(self):
        super().__init__()
         self._references = Counter()
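`_get_futures` keys the shared future manager by `"S3ShareCacheReader." + cache_key` in thread-local storage, so readers created with the same `cache_key` on the same thread can reuse each other's downloaded blocks. A hedged sketch (hypothetical bucket/key, configured boto3 client assumed):

```python
import boto3

from megfile.lib.s3_share_cache_reader import S3ShareCacheReader

client = boto3.client("s3")

# Both readers use cache_key="lru" (the default), so on this thread they
# resolve to the same ShareCacheFutureManager and share fetched blocks.
r1 = S3ShareCacheReader("my-bucket", "data.bin", s3_client=client)
r2 = S3ShareCacheReader("my-bucket", "data.bin", s3_client=client)
head = r1.read(1024)  # the block fetched here may be served to r2 from cache
```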
megfile/lib/shadow_handler.py
CHANGED
@@ -12,9 +12,13 @@ class BaseShadowHandler(RawIOBase):
 
 
 class ShadowHandler(Readable, Seekable, Writable, BaseShadowHandler):
-    '''Create a File-Like Object, maintaining file pointer, to avoid misunderstanding the position when read / write / seek
-
-    It can be roughly regarded as the copy function of the file handle, but you need to be careful with the write handle, because no matter which copy will modify the data itself.'''
+    """Create a File-Like Object, maintaining file pointer,
+    to avoid misunderstanding the position when read / write / seek.
+
+    It can be roughly regarded as the copy function of the file handle,
+    but you need to be careful with the write handle,
+    because no matter which copy will modify the data itself.
+    """
 
     def __init__(self, file_object: IO, intrusive: bool = True):
         self._file_object = file_object
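A hedged sketch of the behavior that docstring describes; it assumes each shadow tracks its own offset over the shared handle, and the exact semantics of the `intrusive` flag are not shown in this diff:

```python
from io import BytesIO

from megfile.lib.shadow_handler import ShadowHandler

raw = BytesIO(b"0123456789")
a = ShadowHandler(raw)
b = ShadowHandler(raw)

a.seek(5)        # moves a's tracked position
print(b.tell())  # b keeps its own position, unaffected by a's seek
b.write(b"XYZ")  # but writes still mutate the one shared buffer
```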
megfile/lib/stdio_handler.py
CHANGED
@@ -6,27 +6,27 @@ from megfile.interfaces import Readable, Writable
 
 
 class STDHandler:
-
     def tell(self):
-        raise UnsupportedOperation('not tellable')
+        raise UnsupportedOperation("not tellable")
 
     def _close(self):
         pass
 
 
 class STDReader(STDHandler, Readable):
-    """megfile encapsulation of stdin. Avoid direct operation on sys.stdin
+    """megfile encapsulation of stdin. Avoid direct operation on sys.stdin
 
     .. note ::
 
         1. For convenience, use buffer by default
 
-        2. There is currently no demand and no design for seek, so seek is not allowed now
+        2. There is currently no demand and no design for seek,
+           so seek is not allowed now
     """
 
     def __init__(self, mode: str):
         handler = sys.stdin
-        if 'b' in mode:
+        if "b" in mode:
             handler = handler.buffer
 
         self._handler = handler
@@ -48,13 +48,14 @@ class STDReader(STDHandler, Readable):
 
 
 class STDWriter(STDHandler, Writable):
-    """megfile encapsulation of stdin. Avoid direct operation on sys.stdin
+    """megfile encapsulation of stdin. Avoid direct operation on sys.stdin
 
     .. note ::
 
         1. For convenience, use buffer by default
 
-        2. There is currently no demand and no design for seek, so seek is not allowed now
+        2. There is currently no demand and no design for seek,
+           so seek is not allowed now
     """
 
     def __init__(self, path: str, mode: str):
@@ -64,7 +65,7 @@ class STDWriter(STDHandler, Writable):
         else:
             name = "stdout"
             handler = sys.stdout
-        if 'b' in mode:
+        if "b" in mode:
             handler = handler.buffer
 
         self._handler = handler
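The mode handling in both classes follows one pattern: any mode string containing `"b"` swaps the text stream for its underlying binary buffer. Illustrated standalone with a hypothetical helper:

```python
import sys


def pick_stdout_handler(mode: str):
    """Return sys.stdout for text modes, sys.stdout.buffer for binary ones."""
    handler = sys.stdout
    if "b" in mode:
        handler = handler.buffer  # the BufferedWriter beneath the text wrapper
    return handler


pick_stdout_handler("w").write("text out\n")
pick_stdout_handler("wb").write(b"bytes out\n")
```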
megfile/lib/url.py
CHANGED
@@ -1,4 +1,4 @@
 def get_url_scheme(url: str):
-    if '://' in url:
-        return url.split('://', 1)[0]
-    return ''
+    if "://" in url:
+        return url.split("://", 1)[0]
+    return ""
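Exercising the reformatted helper: it returns the scheme before `://`, or an empty string for scheme-less inputs such as local paths.

```python
from megfile.lib.url import get_url_scheme

assert get_url_scheme("s3://bucket/key") == "s3"
assert get_url_scheme("hdfs://namenode/path") == "hdfs"
assert get_url_scheme("/tmp/file.txt") == ""
```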