megfile 3.1.0.post2__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +94 -69
- megfile/lib/combine_reader.py +13 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +54 -55
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +61 -52
- megfile/lib/s3_cached_handler.py +14 -13
- megfile/lib/s3_limited_seekable_writer.py +38 -28
- megfile/lib/s3_memory_handler.py +35 -29
- megfile/lib/s3_pipe_handler.py +25 -24
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +8 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +75 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.0.post2.dist-info/RECORD +0 -55
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.0.post2.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/lib/http_prefetch_reader.py
CHANGED

@@ -3,8 +3,17 @@ from typing import Optional

 import requests

-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
-from megfile.errors import HttpBodyIncompleteError, UnsupportedError, http_should_retry, patch_method
+from megfile.config import (
+    DEFAULT_BLOCK_CAPACITY,
+    DEFAULT_BLOCK_SIZE,
+    HTTP_MAX_RETRY_TIMES,
+)
+from megfile.errors import (
+    HttpBodyIncompleteError,
+    UnsupportedError,
+    http_should_retry,
+    patch_method,
+)
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
 from megfile.lib.compat import fspath
 from megfile.pathlike import PathLike
@@ -13,24 +22,29 @@ DEFAULT_TIMEOUT = (60, 60 * 60 * 24)


 class HttpPrefetchReader(BasePrefetchReader):
-    '''
-    Reader to fast read the http content, service must support Accept-Ranges.
-    This will divide the file content into equal parts of block_size size, and will use LRU to cache at most block_capacity blocks in memory.
-    open(), seek() and read() will trigger prefetch read.
-    The prefetch will cached block_forward blocks of data from offset position (the position after reading if the called function is read).
-    '''
+    """
+    Reader to fast read the http content, service must support Accept-Ranges.

-    def __init__(
-            self,
-            url: PathLike,
-            *,
-            content_size: Optional[int] = None,
-            block_size: int = DEFAULT_BLOCK_SIZE,
-            block_capacity: int = DEFAULT_BLOCK_CAPACITY,
-            block_forward: Optional[int] = None,
-            max_retries: int = HTTP_MAX_RETRY_TIMES,
-            max_workers: Optional[int] = None):
+    This will divide the file content into equal parts of block_size size, and will use
+    LRU to cache at most block_capacity blocks in memory.
+
+    open(), seek() and read() will trigger prefetch read.
+
+    The prefetch will cached block_forward blocks of data from offset position
+    (the position after reading if the called function is read).
+    """

+    def __init__(
+        self,
+        url: PathLike,
+        *,
+        content_size: Optional[int] = None,
+        block_size: int = DEFAULT_BLOCK_SIZE,
+        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_forward: Optional[int] = None,
+        max_retries: int = HTTP_MAX_RETRY_TIMES,
+        max_workers: Optional[int] = None,
+    ):
         self._url = url
         self._content_size = content_size

@@ -39,68 +53,77 @@ class HttpPrefetchReader(BasePrefetchReader):
             block_capacity=block_capacity,
             block_forward=block_forward,
             max_retries=max_retries,
-            max_workers=max_workers)
+            max_workers=max_workers,
+        )

     def _get_content_size(self) -> int:
         if self._content_size is not None:
             return self._content_size

         first_index_response = self._fetch_response()
-        if first_index_response['Headers'].get('Accept-Ranges') != 'bytes':
+        if first_index_response["Headers"].get("Accept-Ranges") != "bytes":
             raise UnsupportedError(
-                f'Unsupported server, server must support Accept-Ranges: {self._url}',
+                f"Unsupported server, server must support Accept-Ranges: {self._url}",
                 path=fspath(self._url),
             )
-        return first_index_response['Headers']['Content-Length']
+        return first_index_response["Headers"]["Content-Length"]

     @property
     def name(self) -> str:
         return fspath(self._url)

     def _fetch_response(
-            self,
-            start: Optional[int] = None,
-            end: Optional[int] = None) -> dict:
-
+        self, start: Optional[int] = None, end: Optional[int] = None
+    ) -> dict:
         def fetch_response() -> dict:
             request_kwargs = {}
-            if hasattr(self._url, 'request_kwargs'):
+            if hasattr(self._url, "request_kwargs"):
                 request_kwargs = self._url.request_kwargs  # pyre-ignore[16]
-            timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
-            stream = request_kwargs.pop('stream', True)
+            timeout = request_kwargs.pop("timeout", DEFAULT_TIMEOUT)
+            stream = request_kwargs.pop("stream", True)

             if start is None or end is None:
-                with requests.get(fspath(self._url), timeout=timeout,
-                                  stream=stream, **request_kwargs) as response:
+                with requests.get(
+                    fspath(self._url), timeout=timeout, stream=stream, **request_kwargs
+                ) as response:
                     return {
-                        'Headers': response.headers,
-                        'Cookies': response.cookies,
-                        'StatusCode': response.status_code,
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                     }
             else:
                 range_end = end
                 if self._content_size is not None:
                     range_end = min(range_end, self._content_size - 1)
-                headers = request_kwargs.pop('headers', {})
+                headers = request_kwargs.pop("headers", {})
                 headers["Range"] = f"bytes={start}-{range_end}"
-                with requests.get(
-                        fspath(self._url), timeout=timeout, headers=headers,
-                        stream=stream, **request_kwargs) as response:
-                    if len(response.content) != int(
-                            response.headers['Content-Length']):
+                with requests.get(
+                    fspath(self._url),
+                    timeout=timeout,
+                    headers=headers,
+                    stream=stream,
+                    **request_kwargs,
+                ) as response:
+                    if len(response.content) != int(response.headers["Content-Length"]):
                         raise HttpBodyIncompleteError(
-
+                            "The downloaded content is incomplete, "
+                            "expected size: %s, actual size: %d"
+                            % (
+                                response.headers["Content-Length"],
+                                len(response.content),
+                            )
                         )
                     return {
-                        'Body': BytesIO(response.content),
-                        'Headers': response.headers,
-                        'Cookies': response.cookies,
-                        'StatusCode': response.status_code,
+                        "Body": BytesIO(response.content),
+                        "Headers": response.headers,
+                        "Cookies": response.cookies,
+                        "StatusCode": response.status_code,
                    }

         fetch_response = patch_method(
             fetch_response,
             max_retries=self._max_retries,
-            should_retry=http_should_retry)
+            should_retry=http_should_retry,
+        )

         return fetch_response()
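The reformatted `_fetch_response` above also makes the reader's protocol easy to see: a plain GET to learn `Content-Length` and check `Accept-Ranges`, then ranged GETs per block, cached in an LRU. A minimal usage sketch; the URL is a placeholder and the keyword values are illustrative choices, not package defaults:

    from megfile.lib.http_prefetch_reader import HttpPrefetchReader

    URL = "https://example.com/large-file.bin"  # placeholder; server must answer Accept-Ranges: bytes

    reader = HttpPrefetchReader(URL, block_size=8 * 1024 * 1024, block_capacity=16)
    head = reader.read(1024)   # read()/seek() trigger prefetch of nearby blocks
    reader.seek(0)             # cheap while those blocks are still in the LRU cache
    print(reader.name, len(head))
    reader.close()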
megfile/lib/joinpath.py
CHANGED
@@ -10,22 +10,22 @@ def uri_join(path: str, *other_paths: str) -> str:
         return path

     first_path = path
-    if first_path.endswith('/'):
+    if first_path.endswith("/"):
         first_path = first_path[:-1]

     last_path = other_paths[-1]
-    if last_path.startswith('/'):
+    if last_path.startswith("/"):
         last_path = last_path[1:]

     middle_paths = []
     for other_path in other_paths[:-1]:
-        if other_path.startswith('/'):
+        if other_path.startswith("/"):
             other_path = other_path[1:]
-        if other_path.endswith('/'):
+        if other_path.endswith("/"):
             other_path = other_path[:-1]
         middle_paths.append(other_path)

-    return '/'.join([first_path, *middle_paths, last_path])
+    return "/".join([first_path, *middle_paths, last_path])

     # Imp. 2
     # other_paths = (other_path.lstrip('/') for other_path in other_paths)
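The behaviour this hunk keeps unchanged (only quote style differs): one leading or trailing slash per segment is stripped at each seam, and exactly one "/" is inserted between segments. A small illustration of the visible code path; the example values are mine, not from the package's tests:

    from megfile.lib.joinpath import uri_join

    print(uri_join("s3://bucket/", "/prefix/", "/key.txt"))  # s3://bucket/prefix/key.txt
    print(uri_join("http://host", "a", "b"))                 # http://host/a/b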
megfile/lib/lazy_handler.py
CHANGED
@@ -7,9 +7,13 @@ from megfile.utils import get_content_size


 class LazyHandler(Readable, Seekable, Writable):
-    '''Create a File-Like Object, maintaining file pointer, to avoid misunderstanding the position when read / write / seek.
-    It can be roughly regarded as the copy function of the file handle, but you need to be careful with the write handle, because no matter which copy will modify the data itself.
-    '''
+    """Create a File-Like Object, maintaining file pointer,
+    to avoid misunderstanding the position when read / write / seek.
+
+    It can be roughly regarded as the copy function of the file handle,
+    but you need to be careful with the write handle,
+    because no matter which copy will modify the data itself.
+    """

     def __init__(self, path: str, mode: str, open_func: Callable, **options):
         self._open_func = open_func
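The rewrapped docstring states the core idea: each LazyHandler tracks its own file position over a shared underlying handle, while a write through any copy still mutates the one shared object. The idea can be shown without megfile at all; the toy wrapper below is my own illustration, not the library's implementation:

    import io

    class OffsetView:
        # Toy stand-in for the "own pointer over a shared handle" idea.
        def __init__(self, raw: io.BytesIO):
            self._raw = raw
            self._offset = 0

        def read(self, size: int = -1) -> bytes:
            self._raw.seek(self._offset)      # restore this view's position
            data = self._raw.read(size)
            self._offset = self._raw.tell()   # remember it for the next call
            return data

    shared = io.BytesIO(b"0123456789")
    a, b = OffsetView(shared), OffsetView(shared)
    assert a.read(4) == b"0123"   # each view reads from its own offset
    assert b.read(4) == b"0123"
    shared.seek(0)
    shared.write(b"XX")           # but a write still changes the shared data
    assert a.read(4) == b"4567"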
megfile/lib/s3_buffered_writer.py
CHANGED

@@ -5,13 +5,20 @@ from logging import getLogger as get_logger
 from threading import Lock
 from typing import NamedTuple, Optional

-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_MAX_BLOCK_SIZE, DEFAULT_MAX_BUFFER_SIZE, DEFAULT_MIN_BLOCK_SIZE, GLOBAL_MAX_WORKERS
+from megfile.config import (
+    BACKOFF_FACTOR,
+    BACKOFF_INITIAL,
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+    GLOBAL_MAX_WORKERS,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Writable
 from megfile.utils import get_human_size, process_local

 _logger = get_logger(__name__)
-'''
+"""
 class PartResult(NamedTuple):

     etag: str
@@ -19,41 +26,39 @@ class PartResult(NamedTuple):
     content_size: int

 in Python 3.6+
-'''
+"""

 _PartResult = NamedTuple(
-    'PartResult', [('etag', str), ('part_number', int), ('content_size', int)])
+    "PartResult", [("etag", str), ("part_number", int), ("content_size", int)]
+)


 class PartResult(_PartResult):
-
     def asdict(self):
-        return {
-            'PartNumber': self.part_number,
-            'ETag': self.etag,
-        }
+        return {"PartNumber": self.part_number, "ETag": self.etag}


 class S3BufferedWriter(Writable[bytes]):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-            max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-            max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
-
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         self._bucket = bucket
         self._key = key
         self._client = s3_client
         self._profile_name = profile_name

-        self._block_size = block_size
+        # user maybe put block_size with 'numpy.uint64' type
+        self._block_size = int(block_size)
+
         self._max_block_size = max_block_size
         self._max_buffer_size = max_buffer_size
         self._total_buffer_size = 0
@@ -66,9 +71,10 @@ class S3BufferedWriter(Writable[bytes]):
         self._is_global_executor = False
         if max_workers is None:
             self._executor = process_local(
-                'S3BufferedWriter.executor',
+                "S3BufferedWriter.executor",
                 ThreadPoolExecutor,
-                max_workers=GLOBAL_MAX_WORKERS)
+                max_workers=GLOBAL_MAX_WORKERS,
+            )
             self._is_global_executor = True
         else:
             self._executor = ThreadPoolExecutor(max_workers=max_workers)
@@ -77,17 +83,19 @@ class S3BufferedWriter(Writable[bytes]):
         self.__upload_id = None
         self.__upload_id_lock = Lock()

-        _logger.debug('open file: %r, mode: %s' % (self.name, self.mode))
+        _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))

     @property
     def name(self) -> str:
-        return 's3%s://%s/%s' % (
+        return "s3%s://%s/%s" % (
             f"+{self._profile_name}" if self._profile_name else "",
-            self._bucket, self._key)
+            self._bucket,
+            self._key,
+        )

     @property
     def mode(self) -> str:
-        return 'wb'
+        return "wb"

     def tell(self) -> int:
         return self._offset
@@ -100,8 +108,9 @@ class S3BufferedWriter(Writable[bytes]):
     def _content_size(self, value: int):
         if value > self._backoff_size:
             _logger.debug(
-                'writing file: %r, current size: %s' %
-                (self.name, get_human_size(value)))
+                "writing file: %r, current size: %s"
+                % (self.name, get_human_size(value))
+            )
             while value > self._backoff_size:
                 self._backoff_size *= BACKOFF_FACTOR
         self.__content_size = value
@@ -116,9 +125,8 @@ class S3BufferedWriter(Writable[bytes]):
         if self.__upload_id is None:
             with raise_s3_error(self.name):
                 self.__upload_id = self._client.create_multipart_upload(
-                    Bucket=self._bucket,
-                    Key=self._key,
-                )['UploadId']
+                    Bucket=self._bucket, Key=self._key
+                )["UploadId"]
         return self.__upload_id

     @property
@@ -126,22 +134,19 @@ class S3BufferedWriter(Writable[bytes]):
         return self._total_buffer_size - sum(
             future.result().content_size
             for future in self._futures.values()
-            if future.done())
+            if future.done()
+        )

     @property
     def _uploading_futures(self):
-        return [
-            future for future in self._futures.values() if not future.done()
-        ]
+        return [future for future in self._futures.values() if not future.done()]

     @property
     def _multipart_upload(self):
         return {
-            'Parts':
-                [
-                    future.result().asdict()
-                    for _, future in sorted(self._futures.items())
-                ],
+            "Parts": [
+                future.result().asdict() for _, future in sorted(self._futures.items())
+            ]
         }

     def _upload_buffer(self, part_number, content):
@@ -153,24 +158,29 @@ class S3BufferedWriter(Writable[bytes]):
                 UploadId=self._upload_id,
                 PartNumber=part_number,
                 Body=content,
-            )['ETag'], part_number, len(content))
+            )["ETag"],
+            part_number,
+            len(content),
+        )

     def _submit_upload_buffer(self, part_number, content):
         self._futures[part_number] = self._executor.submit(
-            self._upload_buffer, part_number, content)
+            self._upload_buffer, part_number, content
+        )
         self._total_buffer_size += len(content)
         while self._buffer_size > self._max_buffer_size:
             wait(self._uploading_futures, return_when=FIRST_COMPLETED)

     def _submit_upload_content(self, content: bytes):
-        # s3 part needs at least 5MB, so we need to divide content into equal-size parts, and give last part more size.
+        # s3 part needs at least 5MB,
+        # so we need to divide content into equal-size parts,
+        # and give last part more size.
         # e.g. 257MB can be divided into 2 parts, 128MB and 129MB
         offset = 0
         while len(content) - offset - self._max_block_size > self._block_size:
             self._part_number += 1
             offset_stop = offset + self._max_block_size
-            self._submit_upload_buffer(
-                self._part_number, content[offset:offset_stop])
+            self._submit_upload_buffer(self._part_number, content[offset:offset_stop])
             offset = offset_stop
         self._part_number += 1
         self._submit_upload_buffer(self._part_number, content[offset:])
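The reflowed comment in `_submit_upload_content` carries the one real piece of reasoning in this hunk: non-final multipart parts must be at least 5 MB, so full max_block_size chunks are emitted only while more than block_size bytes would still remain afterwards, and everything left lands in one larger final part. A standalone sketch of that rule, recreated from the loop above; the 8 MB / 128 MB values are illustrative, not the package defaults:

    def split_sizes(total: int, block_size: int, max_block_size: int) -> list:
        # Mirror of the while-loop above: emit full chunks, give the last part the rest.
        sizes, offset = [], 0
        while total - offset - max_block_size > block_size:
            sizes.append(max_block_size)
            offset += max_block_size
        sizes.append(total - offset)
        return sizes

    MB = 1024 * 1024
    print([s // MB for s in split_sizes(257 * MB, 8 * MB, 128 * MB)])  # [128, 129]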
@@ -184,7 +194,7 @@ class S3BufferedWriter(Writable[bytes]):

     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)

         result = self._buffer.write(data)
         if self._buffer.tell() >= self._block_size:
@@ -198,14 +208,13 @@ class S3BufferedWriter(Writable[bytes]):
         self._executor.shutdown()

     def _close(self):
-        _logger.debug('close file: %r' % self.name)
+        _logger.debug("close file: %r" % self.name)

         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
-                    Bucket=self._bucket,
-                    Key=self._key,
-                    Body=self._buffer.getvalue())
+                    Bucket=self._bucket, Key=self._key, Body=self._buffer.getvalue()
+                )
             self._shutdown()
             return

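Taken together, `_upload_id`, `_upload_buffer`, `_multipart_upload` and `_close` drive the standard S3 multipart protocol: create an upload, push numbered parts, then complete with the collected ETags, with a single `put_object` fallback for small writes. A condensed sketch of that protocol against a plain boto3 client; bucket and key are placeholders and error handling is omitted:

    import boto3

    client = boto3.client("s3")
    bucket, key = "my-bucket", "path/to/object"  # placeholders

    upload_id = client.create_multipart_upload(Bucket=bucket, Key=key)["UploadId"]
    parts = []
    for number, body in enumerate([b"a" * (5 * 1024 * 1024), b"tail"], start=1):
        etag = client.upload_part(
            Bucket=bucket, Key=key, UploadId=upload_id, PartNumber=number, Body=body
        )["ETag"]
        parts.append({"PartNumber": number, "ETag": etag})  # same shape as PartResult.asdict()
    client.complete_multipart_upload(
        Bucket=bucket, Key=key, UploadId=upload_id, MultipartUpload={"Parts": parts}
    )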
megfile/lib/s3_cached_handler.py
CHANGED
@@ -7,19 +7,20 @@ from megfile.utils import generate_cache_path


 class S3CachedHandler(S3MemoryHandler):
-
     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            mode: str,
-            *,
-            s3_client,
-            cache_path: Optional[str] = None,
-            remove_cache_when_open: bool = True,
-            profile_name: Optional[str] = None):
-
-
+        self,
+        bucket: str,
+        key: str,
+        mode: str,
+        *,
+        s3_client,
+        cache_path: Optional[str] = None,
+        remove_cache_when_open: bool = True,
+        profile_name: Optional[str] = None,
+    ):
+        if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
+            # TODO: replace AssertionError with ValueError in 4.0.0
+            raise AssertionError("unacceptable mode: %r" % mode)

         self._bucket = bucket
         self._key = key
@@ -31,7 +32,7 @@ class S3CachedHandler(S3MemoryHandler):
             cache_path = generate_cache_path(self.name)

         self._cache_path = cache_path
-        self._fileobj = open(self._cache_path, 'wb+')
+        self._fileobj = open(self._cache_path, "wb+")
         self._download_fileobj()

         if remove_cache_when_open:
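One detail worth noting next to this hunk: the cache file is opened "wb+" and filled by `_download_fileobj()`, and the `remove_cache_when_open` flag suggests the path can then be dropped immediately while reads continue through the already-open descriptor (POSIX keeps the data alive until the last handle closes). The lines behind that branch are not shown here, so the demonstration below only illustrates the underlying mechanism, outside megfile:

    import os
    import tempfile

    fd, path = tempfile.mkstemp()
    os.close(fd)

    f = open(path, "wb+")
    f.write(b"downloaded object body")
    os.unlink(path)        # the name is gone from the filesystem...
    f.seek(0)
    print(f.read())        # ...but the open handle still reads every byte
    f.close()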
megfile/lib/s3_limited_seekable_writer.py
CHANGED

@@ -3,7 +3,11 @@ from io import BytesIO
 from logging import getLogger as get_logger
 from typing import Optional

-from megfile.config import DEFAULT_MAX_BLOCK_SIZE, DEFAULT_MAX_BUFFER_SIZE, DEFAULT_MIN_BLOCK_SIZE
+from megfile.config import (
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MAX_BUFFER_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+)
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Seekable
 from megfile.lib.s3_buffered_writer import S3BufferedWriter
@@ -12,27 +16,27 @@ _logger = get_logger(__name__)


 class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
-    '''For file format like msgpack and mp4, it's a pain that you need to write
+    """For file format like msgpack and mp4, it's a pain that you need to write
     header before writing the data. So it's kind of hard to make streaming write
     to unseekable file system like s3. In this case, we will try to keep the first
     and last parts of data in memory, so we can come back to head again and write
     the header at the last second.
-    '''
+    """

     def __init__(
-            self,
-            bucket: str,
-            key: str,
-            *,
-            s3_client,
-            block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-            head_block_size: Optional[int] = None,
-            tail_block_size: Optional[int] = None,
-            max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-            max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-            max_workers: Optional[int] = None,
-            profile_name: Optional[str] = None):
-
+        self,
+        bucket: str,
+        key: str,
+        *,
+        s3_client,
+        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        head_block_size: Optional[int] = None,
+        tail_block_size: Optional[int] = None,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_workers: Optional[int] = None,
+        profile_name: Optional[str] = None,
+    ):
         super().__init__(
             bucket,
             key,
@@ -41,7 +45,8 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
-            profile_name=profile_name)
+            profile_name=profile_name,
+        )

         self._head_block_size = head_block_size or block_size
         self._tail_block_size = tail_block_size or block_size
@@ -61,8 +66,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):

     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)

+        offset = int(offset)  # user maybe put offset with 'numpy.uint64' type
         if whence == os.SEEK_SET:
             target_offset = offset
         elif whence == os.SEEK_CUR:
@@ -70,7 +76,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         elif whence == os.SEEK_END:
             target_offset = self._content_size + offset
         else:
-            raise OSError('Unsupported whence value: %d' % whence)
+            raise OSError("Unsupported whence value: %d" % whence)

         if target_offset < self._head_block_size:
             self._head_buffer.seek(target_offset)
@@ -78,15 +84,16 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._buffer.seek(target_offset - self._tail_offset)
         else:
             raise OSError(
-                'Can only seek inside of head, or seek to tail, target offset: %d'
-                % target_offset)
+                "Can only seek inside of head, or seek to tail, target offset: %d"
+                % target_offset
+            )

         self._offset = target_offset
         return self._offset

     def write(self, data: bytes) -> int:
         if self.closed:
-            raise IOError('file already closed: %r' % self.name)
+            raise IOError("file already closed: %r" % self.name)

         if self._head_size != self._head_block_size:  # no tail part yet
             self._write_to_head(data)
@@ -96,8 +103,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._write_to_tail(data)
         else:
             raise OSError(
-                'Can only write inside of head, or write to tail, current offset: %d'
-                % self._offset)
+                "Can only write inside of head, or write to tail, current offset: %d"
+                % self._offset
+            )
         return len(data)

     def _write_to_head(self, data: bytes):
@@ -116,8 +124,9 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
     def _write_to_head_after_tail_part_created(self, data: bytes):
         if self._offset + len(data) > self._head_block_size:
             raise Exception(
-                'Head part overflow, %d bytes left but try to write %d bytes' %
-                (self._head_block_size - self._offset, len(data)))
+                "Head part overflow, %d bytes left but try to write %d bytes"
+                % (self._head_block_size - self._offset, len(data))
+            )
         self._head_buffer.write(data)
         self._offset += len(data)

@@ -139,14 +148,15 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             self._submit_upload_content(content[:offset])

     def _close(self):
-        _logger.debug('close file: %r' % self.name)
+        _logger.debug("close file: %r" % self.name)

         if not self._is_multipart:
             with raise_s3_error(self.name):
                 self._client.put_object(
                     Bucket=self._bucket,
                     Key=self._key,
-                    Body=self._head_buffer.getvalue() + self._buffer.getvalue())
+                    Body=self._head_buffer.getvalue() + self._buffer.getvalue(),
+                )
             self._shutdown()
             return

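The docstring explains the intent of this class: formats like mp4 or msgpack need their header written last, so the writer keeps the head block (and the tail) in memory and allows seeking back into the head until close. A hedged usage sketch; bucket, key, the boto3 client and head_block_size are placeholders, and running it would perform a real upload:

    import boto3
    from megfile.lib.s3_limited_seekable_writer import S3LimitedSeekableWriter

    writer = S3LimitedSeekableWriter(
        "my-bucket", "video.mp4", s3_client=boto3.client("s3"), head_block_size=1024
    )
    writer.write(b"\0" * 16)              # reserve 16 bytes for a header we do not know yet
    writer.write(b"... payload ...")      # body; large payloads past the head stream out as parts
    writer.seek(0)                        # allowed: the target offset is inside the head block
    writer.write(b"REALHEADER" + b"\0" * 6)  # patch the real 16-byte header in place
    writer.close()                        # flushes head + tail and finishes the upload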