megfile-3.1.1-py3-none-any.whl → megfile-3.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +84 -65
- megfile/lib/combine_reader.py +12 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +46 -54
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +58 -51
- megfile/lib/s3_cached_handler.py +13 -14
- megfile/lib/s3_limited_seekable_writer.py +37 -28
- megfile/lib/s3_memory_handler.py +34 -30
- megfile/lib/s3_pipe_handler.py +24 -25
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +7 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +73 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.1.dist-info/RECORD +0 -55
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/lib/http_prefetch_reader.py
CHANGED

@@ -3,8 +3,17 @@ from typing import Optional
 
 import requests
 
-from megfile.config import …
-…
+from megfile.config import (
+    DEFAULT_BLOCK_CAPACITY,
+    DEFAULT_BLOCK_SIZE,
+    HTTP_MAX_RETRY_TIMES,
+)
+from megfile.errors import (
+    HttpBodyIncompleteError,
+    UnsupportedError,
+    http_should_retry,
+    patch_method,
+)
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
 from megfile.lib.compat import fspath
 from megfile.pathlike import PathLike
@@ -13,24 +22,29 @@ DEFAULT_TIMEOUT = (60, 60 * 60 * 24)
 
 
 class HttpPrefetchReader(BasePrefetchReader):
-…
-    Reader to fast read the http content, service must support Accept-Ranges.
-    This will divide the file content into equal parts of block_size size, and will use LRU to cache at most block_capacity blocks in memory.
-    open(), seek() and read() will trigger prefetch read.
-    The prefetch will cached block_forward blocks of data from offset position (the position after reading if the called function is read).
-    '''
+    """
+    Reader to fast read the http content, service must support Accept-Ranges.
 
-…
-…
-…
-…
-…
-…
-…
-…
-            max_retries: int = HTTP_MAX_RETRY_TIMES,
-            max_workers: Optional[int] = None):
+    This will divide the file content into equal parts of block_size size, and will use
+    LRU to cache at most block_capacity blocks in memory.
+
+    open(), seek() and read() will trigger prefetch read.
+
+    The prefetch will cached block_forward blocks of data from offset position
+    (the position after reading if the called function is read).
+    """
 
+    def __init__(
+        self,
+        url: PathLike,
+        *,
+        content_size: Optional[int] = None,
+        block_size: int = DEFAULT_BLOCK_SIZE,
+        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_forward: Optional[int] = None,
+        max_retries: int = HTTP_MAX_RETRY_TIMES,
+        max_workers: Optional[int] = None,
+    ):
         self._url = url
         self._content_size = content_size
 
@@ -39,68 +53,77 @@ class HttpPrefetchReader(BasePrefetchReader):
             block_capacity=block_capacity,
             block_forward=block_forward,
             max_retries=max_retries,
-            max_workers=max_workers…
+            max_workers=max_workers,
+        )
 
     def _get_content_size(self) -> int:
         if self._content_size is not None:
             return self._content_size
 
         first_index_response = self._fetch_response()
-        if first_index_response[…
+        if first_index_response["Headers"].get("Accept-Ranges") != "bytes":
             raise UnsupportedError(
-                f…
+                f"Unsupported server, server must support Accept-Ranges: {self._url}",
                 path=fspath(self._url),
             )
-        return first_index_response[…
+        return first_index_response["Headers"]["Content-Length"]
 
     @property
     def name(self) -> str:
         return fspath(self._url)
 
     def _fetch_response(
-…
-…
-            end: Optional[int] = None) -> dict:
-…
+        self, start: Optional[int] = None, end: Optional[int] = None
+    ) -> dict:
         def fetch_response() -> dict:
             request_kwargs = {}
-            if hasattr(self._url,…
+            if hasattr(self._url, "request_kwargs"):
                 request_kwargs = self._url.request_kwargs  # pyre-ignore[16]
-            timeout = request_kwargs.pop(…
-            stream = request_kwargs.pop(…
+            timeout = request_kwargs.pop("timeout", DEFAULT_TIMEOUT)
+            stream = request_kwargs.pop("stream", True)
 
             if start is None or end is None:
-                with requests.get(…
-…
+                with requests.get(
+                    fspath(self._url), timeout=timeout, stream=stream, **request_kwargs
+                ) as response:
                     return {
-…
-…
-…
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                     }
             else:
                 range_end = end
                 if self._content_size is not None:
                     range_end = min(range_end, self._content_size - 1)
-                headers = request_kwargs.pop(…
+                headers = request_kwargs.pop("headers", {})
                 headers["Range"] = f"bytes={start}-{range_end}"
-                with requests.get(…
-…
-…
-…
-…
+                with requests.get(
+                    fspath(self._url),
+                    timeout=timeout,
+                    headers=headers,
+                    stream=stream,
+                    **request_kwargs,
+                ) as response:
+                    if len(response.content) != int(response.headers["Content-Length"]):
                         raise HttpBodyIncompleteError(
-…
+                            "The downloaded content is incomplete, "
+                            "expected size: %s, actual size: %d"
+                            % (
+                                response.headers["Content-Length"],
+                                len(response.content),
+                            )
                         )
                     return {
-…
-…
-…
-…
+                        "Body": BytesIO(response.content),
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                     }
 
         fetch_response = patch_method(
             fetch_response,
             max_retries=self._max_retries,
-            should_retry=http_should_retry…
+            should_retry=http_should_retry,
+        )
 
         return fetch_response()
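The behavioural change in this file is the new guard in _fetch_response: a ranged response whose body is shorter than its advertised Content-Length now raises HttpBodyIncompleteError instead of being cached as a complete block. Below is a minimal sketch of the same check with plain requests, outside megfile; the fetch_range helper and the RuntimeError it raises are illustrative names, not part of megfile's API:

    import requests

    def fetch_range(url: str, start: int, end: int, timeout=(60, 600)) -> bytes:
        # Ask the server for bytes [start, end]; requires Accept-Ranges: bytes support.
        headers = {"Range": f"bytes={start}-{end}"}
        with requests.get(url, headers=headers, timeout=timeout, stream=True) as response:
            body = response.content
            expected = int(response.headers["Content-Length"])
            # Same idea as the new guard: never hand back a truncated block.
            if len(body) != expected:
                raise RuntimeError(
                    "incomplete body: expected %d bytes, got %d" % (expected, len(body))
                )
            return body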
megfile/lib/joinpath.py
CHANGED

@@ -10,22 +10,22 @@ def uri_join(path: str, *other_paths: str) -> str:
         return path
 
     first_path = path
-    if first_path.endswith(…
+    if first_path.endswith("/"):
         first_path = first_path[:-1]
 
     last_path = other_paths[-1]
-    if last_path.startswith(…
+    if last_path.startswith("/"):
         last_path = last_path[1:]
 
     middle_paths = []
     for other_path in other_paths[:-1]:
-        if other_path.startswith(…
+        if other_path.startswith("/"):
             other_path = other_path[1:]
-        if other_path.endswith(…
+        if other_path.endswith("/"):
             other_path = other_path[:-1]
         middle_paths.append(other_path)
 
-    return …
+    return "/".join([first_path, *middle_paths, last_path])
 
     # Imp. 2
     # other_paths = (other_path.lstrip('/') for other_path in other_paths)
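For reference, uri_join reads as below once the quoted separators are restored; this is a runnable restatement of the function as shown in the hunk (the early `return path` sits under a guard for empty other_paths, which is assumed here because the condition line itself is outside the hunk):

    def uri_join(path: str, *other_paths: str) -> str:
        # Strip one joining slash per boundary, then glue the segments back
        # together with single slashes.
        if not other_paths:  # assumed guard, the condition line is outside the hunk
            return path

        first_path = path
        if first_path.endswith("/"):
            first_path = first_path[:-1]

        last_path = other_paths[-1]
        if last_path.startswith("/"):
            last_path = last_path[1:]

        middle_paths = []
        for other_path in other_paths[:-1]:
            if other_path.startswith("/"):
                other_path = other_path[1:]
            if other_path.endswith("/"):
                other_path = other_path[:-1]
            middle_paths.append(other_path)

        return "/".join([first_path, *middle_paths, last_path])

    assert uri_join("s3://bucket/", "/dir/", "file.txt") == "s3://bucket/dir/file.txt"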
megfile/lib/lazy_handler.py
CHANGED

@@ -7,9 +7,13 @@ from megfile.utils import get_content_size
 
 
 class LazyHandler(Readable, Seekable, Writable):
-…
-…
-…
+    """Create a File-Like Object, maintaining file pointer,
+    to avoid misunderstanding the position when read / write / seek.
+
+    It can be roughly regarded as the copy function of the file handle,
+    but you need to be careful with the write handle,
+    because no matter which copy will modify the data itself.
+    """
 
     def __init__(self, path: str, mode: str, open_func: Callable, **options):
         self._open_func = open_func
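The new docstring describes LazyHandler as a per-handle file pointer over shared data: each copy seeks and reads independently, but writes from any copy still land in the same underlying bytes. A small illustration of that idea with io.BytesIO; OffsetView is a made-up class for this sketch, not megfile's LazyHandler:

    import io

    class OffsetView:
        # Made-up class for this sketch, not megfile's LazyHandler: each view
        # keeps its own offset and re-seeks the shared handle before every call.
        def __init__(self, raw: io.BytesIO):
            self._raw = raw
            self._offset = 0

        def read(self, size: int = -1) -> bytes:
            self._raw.seek(self._offset)
            data = self._raw.read(size)
            self._offset = self._raw.tell()
            return data

        def write(self, data: bytes) -> int:
            self._raw.seek(self._offset)
            written = self._raw.write(data)
            self._offset = self._raw.tell()
            return written

    shared = io.BytesIO(b"hello world")
    a, b = OffsetView(shared), OffsetView(shared)
    assert a.read(5) == b"hello"   # moving a's pointer ...
    assert b.read(5) == b"hello"   # ... leaves b's pointer untouched
    a.write(b"!!!")                # but every copy writes into the same bytes
    assert shared.getvalue() == b"hello!!!rld"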
megfile/lib/s3_buffered_writer.py
CHANGED

@@ -5,13 +5,20 @@ from logging import getLogger as get_logger
 from threading import Lock
 from typing import NamedTuple, Optional
 
-from megfile.config import …
+from megfile.config import (
+    BACKOFF_FACTOR,
+    BACKOFF_INITIAL,
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+    GLOBAL_MAX_WORKERS,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Writable
 from megfile.utils import get_human_size, process_local
 
 _logger = get_logger(__name__)
-…
+"""
 class PartResult(NamedTuple):
 
     etag: str
@@ -19,35 +26,31 @@ class PartResult(NamedTuple):
     content_size: int
 
 in Python 3.6+
-…
+"""
 
 _PartResult = NamedTuple(
-…
+    "PartResult", [("etag", str), ("part_number", int), ("content_size", int)]
+)
 
 
 class PartResult(_PartResult):
-…
     def asdict(self):
-        return {
-            'PartNumber': self.part_number,
-            'ETag': self.etag,
-        }
+        return {"PartNumber": self.part_number, "ETag": self.etag}
 
 
 class S3BufferedWriter(Writable[bytes]):
-…
     def __init__(
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         self._bucket = bucket
         self._key = key
         self._client = s3_client
@@ -68,9 +71,10 @@ class S3BufferedWriter(Writable[bytes]):
         self._is_global_executor = False
         if max_workers is None:
             self._executor = process_local(
-…
+                "S3BufferedWriter.executor",
                 ThreadPoolExecutor,
-                max_workers=GLOBAL_MAX_WORKERS…
+                max_workers=GLOBAL_MAX_WORKERS,
+            )
             self._is_global_executor = True
         else:
             self._executor = ThreadPoolExecutor(max_workers=max_workers)
@@ -79,17 +83,19 @@ class S3BufferedWriter(Writable[bytes]):
         self.__upload_id = None
         self.__upload_id_lock = Lock()
 
-        _logger.debug(…
+        _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
 
     @property
     def name(self) -> str:
-        return …
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket,…
+            self._bucket,
+            self._key,
+        )
 
     @property
     def mode(self) -> str:
-        return …
+        return "wb"
 
     def tell(self) -> int:
         return self._offset
@@ -102,8 +108,9 @@ class S3BufferedWriter(Writable[bytes]):
     def _content_size(self, value: int):
         if value > self._backoff_size:
             _logger.debug(
-…
-                (self.name, get_human_size(value))…
+                "writing file: %r, current size: %s"
+                % (self.name, get_human_size(value))
+            )
             while value > self._backoff_size:
                 self._backoff_size *= BACKOFF_FACTOR
         self.__content_size = value
@@ -118,9 +125,8 @@ class S3BufferedWriter(Writable[bytes]):
         if self.__upload_id is None:
             with raise_s3_error(self.name):
                 self.__upload_id = self._client.create_multipart_upload(
-                    Bucket=self._bucket,
-…
-                )['UploadId']
+                    Bucket=self._bucket, Key=self._key
+                )["UploadId"]
         return self.__upload_id
 
     @property
@@ -128,22 +134,19 @@ class S3BufferedWriter(Writable[bytes]):
         return self._total_buffer_size - sum(
             future.result().content_size
             for future in self._futures.values()
-            if future.done()…
+            if future.done()
+        )
 
     @property
     def _uploading_futures(self):
-        return [
-            future for future in self._futures.values() if not future.done()
-        ]
+        return [future for future in self._futures.values() if not future.done()]
 
     @property
     def _multipart_upload(self):
         return {
-…
-…
-…
-                for _, future in sorted(self._futures.items())
-            ],
+            "Parts": [
+                future.result().asdict() for _, future in sorted(self._futures.items())
+            ]
         }
 
     def _upload_buffer(self, part_number, content):
@@ -155,24 +158,29 @@ class S3BufferedWriter(Writable[bytes]):
                 UploadId=self._upload_id,
                 PartNumber=part_number,
                 Body=content,
-            )[…
+            )["ETag"],
+            part_number,
+            len(content),
+        )
 
     def _submit_upload_buffer(self, part_number, content):
         self._futures[part_number] = self._executor.submit(
-            self._upload_buffer, part_number, content…
+            self._upload_buffer, part_number, content
+        )
         self._total_buffer_size += len(content)
         while self._buffer_size > self._max_buffer_size:
             wait(self._uploading_futures, return_when=FIRST_COMPLETED)
 
     def _submit_upload_content(self, content: bytes):
-        # s3 part needs at least 5MB,…
+        # s3 part needs at least 5MB,
+        # so we need to divide content into equal-size parts,
+        # and give last part more size.
         # e.g. 257MB can be divided into 2 parts, 128MB and 129MB
         offset = 0
         while len(content) - offset - self._max_block_size > self._block_size:
             self._part_number += 1
             offset_stop = offset + self._max_block_size
-            self._submit_upload_buffer(
-                self._part_number, content[offset:offset_stop])
+            self._submit_upload_buffer(self._part_number, content[offset:offset_stop])
             offset = offset_stop
         self._part_number += 1
         self._submit_upload_buffer(self._part_number, content[offset:])
@@ -186,7 +194,7 @@ class S3BufferedWriter(Writable[bytes]):
 
     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError(…
+            raise IOError("file already closed: %r" % self.name)
 
         result = self._buffer.write(data)
         if self._buffer.tell() >= self._block_size:
@@ -200,14 +208,13 @@ class S3BufferedWriter(Writable[bytes]):
             self._executor.shutdown()
 
     def _close(self):
-        _logger.debug(…
+        _logger.debug("close file: %r" % self.name)
 
         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
-                    Bucket=self._bucket,
-…
-                    Body=self._buffer.getvalue())
+                    Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue()
+                )
             self._shutdown()
             return
 
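The expanded comment in _submit_upload_content explains the part-splitting rule: emit equal max_block_size parts while enough data remains, and let the final part absorb the remainder so it never falls below the S3 minimum. A standalone sketch of that arithmetic; the 8 MB / 128 MB figures are assumed for illustration, the real values come from megfile.config:

    MB = 1024 * 1024

    def split_parts(total_size: int, block_size: int, max_block_size: int) -> list:
        # Mirrors the loop in _submit_upload_content: emit max_block_size chunks
        # while doing so still leaves more than block_size behind, then fold the
        # remainder into one final, slightly larger part.
        parts, offset = [], 0
        while total_size - offset - max_block_size > block_size:
            parts.append(max_block_size)
            offset += max_block_size
        parts.append(total_size - offset)
        return parts

    # Assumed sizes: an 8 MB minimum block and a 128 MB maximum block (the real
    # defaults live in megfile.config). 257 MB then splits exactly as the
    # comment says: 128 MB plus a 129 MB tail.
    assert split_parts(257 * MB, 8 * MB, 128 * MB) == [128 * MB, 129 * MB]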
megfile/lib/s3_cached_handler.py
CHANGED

@@ -7,21 +7,20 @@ from megfile.utils import generate_cache_path
 
 
 class S3CachedHandler(S3MemoryHandler):
-…
     def __init__(
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-        if mode not in (…
+        self,
+        bucket: str,
+        key: str,
+        mode: str,
+        *,
+        s3_client,
+        cache_path: Optional[str] = None,
+        remove_cache_when_open: bool = True,
+        profile_name: Optional[str] = None,
+    ):
+        if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
             # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError(…
+            raise AssertionError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
@@ -33,7 +32,7 @@ class S3CachedHandler(S3MemoryHandler):
             cache_path = generate_cache_path(self.name)
 
         self._cache_path = cache_path
-        self._fileobj = open(self._cache_path,…
+        self._fileobj = open(self._cache_path, "wb+")
         self._download_fileobj()
 
         if remove_cache_when_open:
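The hunk shows the remove_cache_when_open flag but not its body. A common POSIX idiom for this kind of behaviour, removing the cache file as soon as it is open so the data survives only as long as the handle, is to unlink the path right after opening; the sketch below shows that idiom and is not necessarily megfile's exact implementation (it is also POSIX-only, Windows refuses to unlink an open file):

    import os
    import tempfile

    # Open a scratch cache file, then unlink its path immediately.
    fd, cache_path = tempfile.mkstemp(suffix=".cache")
    fileobj = os.fdopen(fd, "wb+")
    os.unlink(cache_path)                    # the directory entry is gone ...
    fileobj.write(b"downloaded object body")
    fileobj.seek(0)
    assert fileobj.read() == b"downloaded object body"  # ... but the data survives
    fileobj.close()                          # space is reclaimed once the handle closes
    assert not os.path.exists(cache_path)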
megfile/lib/s3_limited_seekable_writer.py
CHANGED

@@ -3,7 +3,11 @@ from io import BytesIO
 from logging import getLogger as get_logger
 from typing import Optional
 
-from megfile.config import …
+from megfile.config import (
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Seekable
 from megfile.lib.s3_buffered_writer import S3BufferedWriter
@@ -12,27 +16,27 @@ _logger = get_logger(__name__)
 
 
 class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
-…
+    """For file format like msgpack and mp4, it's a pain that you need to write
     header before writing the data. So it's kind of hard to make streaming write
     to unseekable file system like s3. In this case, we will try to keep the first
     and last parts of data in memory, so we can come back to head again and write
     the header at the last second.
-…
+    """
 
     def __init__(
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        head_block_size: Optional[int] = None,
+        tail_block_size: Optional[int] = None,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         super().__init__(
             bucket,
             key,
@@ -41,7 +45,8 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
-            profile_name=profile_name…
+            profile_name=profile_name,
+        )
 
         self._head_block_size = head_block_size or block_size
         self._tail_block_size = tail_block_size or block_size
@@ -61,7 +66,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
 
     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
         if self.closed:
-            raise IOError(…
+            raise IOError("file already closed: %r" % self.name)
 
         offset = int(offset)  # user maybe put offset with 'numpy.uint64' type
         if whence == os.SEEK_SET:
@@ -71,7 +76,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         elif whence == os.SEEK_END:
             target_offset = self._content_size + offset
         else:
-            raise OSError(…
+            raise OSError("Unsupported whence value: %d" % whence)
 
         if target_offset < self._head_block_size:
             self._head_buffer.seek(target_offset)
@@ -79,15 +84,16 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._buffer.seek(target_offset - self._tail_offset)
         else:
             raise OSError(
-…
-                % target_offset…
+                "Can only seek inside of head, or seek to tail, target offset: %d"
+                % target_offset
+            )
 
         self._offset = target_offset
         return self._offset
 
     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError(…
+            raise IOError("file already closed: %r" % self.name)
 
         if self._head_size != self._head_block_size:  # no tail part yet
             self._write_to_head(data)
@@ -97,8 +103,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._write_to_tail(data)
         else:
             raise OSError(
-…
-                % self._offset…
+                "Can only write inside of head, or write to tail, current offset: %d"
+                % self._offset
+            )
         return len(data)
 
     def _write_to_head(self, data: bytes):
@@ -117,8 +124,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
     def _write_to_head_after_tail_part_created(self, data: bytes):
         if self._offset + len(data) > self._head_block_size:
             raise Exception(
-…
-                (self._head_block_size - self._offset, len(data))…
+                "Head part overflow, %d bytes left but try to write %d bytes"
+                % (self._head_block_size - self._offset, len(data))
+            )
         self._head_buffer.write(data)
         self._offset += len(data)
 
@@ -140,14 +148,15 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._submit_upload_content(content[:offset])
 
     def _close(self):
-        _logger.debug(…
+        _logger.debug("close file: %r" % self.name)
 
         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
                     Bucket=self._bucket,
                     Key=self._key,
-                    Body=self._head_buffer.getvalue() + self._buffer.getvalue()…
+                    Body=self._head_buffer.getvalue() + self._buffer.getvalue(),
+                )
             self._shutdown()
             return
 
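The seek logic above only accepts offsets inside the in-memory head buffer or at the in-memory tail; everything in between has already been shipped to S3. A small restatement of that branch structure; the exact tail condition lies between the two hunks and is assumed here to be target_offset greater than or equal to the tail offset, and the 8 MB / 64 MB figures are only for illustration:

    MB = 1024 * 1024

    def classify_seek(target_offset: int, head_block_size: int, tail_offset: int) -> str:
        # Offsets inside the in-memory head buffer are fine, offsets at or past
        # the start of the in-memory tail buffer are fine (assumed condition),
        # and everything in between has already been uploaded.
        if target_offset < head_block_size:
            return "head"
        if target_offset >= tail_offset:
            return "tail"
        raise OSError(
            "Can only seek inside of head, or seek to tail, target offset: %d"
            % target_offset
        )

    assert classify_seek(100, 8 * MB, 64 * MB) == "head"
    assert classify_seek(70 * MB, 8 * MB, 64 * MB) == "tail"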