megfile 3.1.6__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +34 -44
- megfile/fs.py +169 -11
- megfile/fs_path.py +183 -259
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +67 -64
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +15 -20
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +150 -224
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +11 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/METADATA +7 -7
- megfile-4.0.0.dist-info/RECORD +52 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/entry_points.txt +0 -0
megfile/lib/base_prefetch_reader.py CHANGED

@@ -9,16 +9,14 @@ from statistics import mean
 from typing import Optional
 
 from megfile.config import (
-    BACKOFF_FACTOR,
-    BACKOFF_INITIAL,
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     DEFAULT_MAX_RETRY_TIMES,
     GLOBAL_MAX_WORKERS,
     NEWLINE,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.interfaces import Readable, Seekable
-from megfile.utils import ProcessLocal, get_human_size, process_local
+from megfile.utils import ProcessLocal, process_local
 
 _logger = get_logger(__name__)
 
@@ -31,35 +29,31 @@ class SeekRecord:
 
 
 class BasePrefetchReader(Readable[bytes], Seekable, ABC):
-    """
-    Reader to fast read the remote file content.
-    This will divide the file content into equal parts of block_size size,
-    and will use LRU to cache at most block_capacity blocks in memory.
-    open(), seek() and read() will trigger prefetch read.
-    The prefetch will cached block_forward blocks of data from offset position
-    (the position after reading if the called function is read).
-    """
-
     def __init__(
         self,
         *,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
         max_retries: int = DEFAULT_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
         **kwargs,
     ):
-
+        if max_buffer_size == 0:
+            block_capacity = block_forward = 0
+        else:
+            block_capacity = max(max_buffer_size // block_size, 1)
+
+        self._is_auto_scaling = False
         if block_forward is None:
-            block_forward = max(block_capacity - 1, 1)
-
-
-        if block_capacity > 0 and block_capacity <= block_forward:
-            raise ValueError(
-                "block_capacity should greater than block_forward, "
-                "got: block_capacity=%s, block_forward=%s"
-                % (block_capacity, block_forward)
+            block_forward = max(block_capacity - 1, 0)
+            self._is_auto_scaling = block_forward > 0
+
+        if 0 < block_capacity <= block_forward:
+            raise ValueError(
+                "max_buffer_size should greater than block_forward * block_size, "
+                "got: max_buffer_size=%s, block_size=%s, block_forward=%s"
+                % (max_buffer_size, block_size, block_forward)
             )
 
         # user maybe put block_size with 'numpy.uint64' type
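The block_capacity parameter is gone from the constructor: 4.0.0 derives the cache capacity from a byte budget instead of a block count. A minimal sketch of the new arithmetic, using illustrative sizes (the actual defaults live in READER_BLOCK_SIZE and READER_MAX_BUFFER_SIZE, not the numbers below):

    # Sketch of the 4.0.0 derivation above; the byte values are assumptions.
    block_size = 8 * 2**20           # assume 8 MiB blocks
    max_buffer_size = 128 * 2**20    # assume a 128 MiB cache budget

    if max_buffer_size == 0:
        block_capacity = block_forward = 0          # prefetching disabled
    else:
        block_capacity = max(max_buffer_size // block_size, 1)  # 16 blocks

    # Default read-ahead window when block_forward is not passed explicitly:
    block_forward = max(block_capacity - 1, 0)      # 15 blocks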
@@ -77,8 +71,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         self._content_size = self._get_content_size()
         self._block_stop = ceil(self._content_size / block_size)
 
-        self.__offset = 0
-        self._backoff_size = BACKOFF_INITIAL
+        self._offset = 0
         self._cached_buffer = None
         self._block_index = None  # Current block index
         self._seek_history = []
@@ -102,7 +95,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         pass
 
     @property
-    def _futures(self):
+    def _futures(self) -> "LRUCacheFutureManager":
        return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
@@ -120,21 +113,6 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
     def tell(self) -> int:
         return self._offset
 
-    @property
-    def _offset(self) -> int:
-        return self.__offset
-
-    @_offset.setter
-    def _offset(self, value: int):
-        if value > self._backoff_size:
-            _logger.debug(
-                "reading file: %r, current offset / total size: %s / %s"
-                % (self.name, get_human_size(value), get_human_size(self._content_size))
-            )
-            while value > self._backoff_size:
-                self._backoff_size *= BACKOFF_FACTOR
-        self.__offset = value
-
     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
         """Change stream position.
 
@@ -176,9 +154,6 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if self.closed:
             raise IOError("file already closed: %r" % self.name)
 
-        if len(self._seek_history) > 0:
-            self._seek_history[-1].read_count += 1
-
         if self._offset >= self._content_size:
             return b""
 
@@ -187,31 +162,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         else:
             size = min(size, self._content_size - self._offset)
 
-
-        block_index = self._offset // self._block_size
-        if len(self._seek_history) > 0:
-            mean_read_count = mean(item.read_count for item in self._seek_history)
-        else:
-            mean_read_count = 0
-        if block_index not in self._futures and mean_read_count < 3:
-            # No using LRP will be better if read() are always called less than 3
-            # times after seek()
-            return self._read(size)
-
-        data = self._buffer.read(size)
-        if len(data) == size:
-            self._offset += len(data)
-            return data
-
-        buffer = BytesIO()
-        buffer.write(data)
-        while buffer.tell() < size:
-            remain_size = size - buffer.tell()
-            data = self._next_buffer.read(remain_size)
-            buffer.write(data)
-
-        self._offset += buffer.tell()
-        return buffer.getvalue()
+        buffer = bytearray(size)
+        self.readinto(buffer)
+        return bytes(buffer)
 
     def readline(self, size: Optional[int] = None) -> bytes:
         """Next line from the file, as a bytes object.
@@ -270,12 +223,31 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if self.closed:
             raise IOError("file already closed: %r" % self.name)
 
+        if len(self._seek_history) > 0:
+            self._seek_history[-1].read_count += 1
+
         if self._offset >= self._content_size:
             return 0
 
         size = len(buffer)
         size = min(size, self._content_size - self._offset)
 
+        if self._block_capacity == 0:
+            buffer[:size] = self._read(size)
+            return size
+
+        if self._block_forward == 0:
+            block_index = self._offset // self._block_size
+            if len(self._seek_history) > 0:
+                mean_read_count = mean(item.read_count for item in self._seek_history)
+            else:
+                mean_read_count = 0
+            if block_index not in self._futures and mean_read_count < 3:
+                # No using LRP will be better if read() are always called less than 3
+                # times after seek()
+                buffer[:size] = self._read(size)
+                return size
+
         data = self._buffer.read(size)
         buffer[: len(data)] = data
         if len(data) == size:
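read() now allocates a bytearray and defers to readinto(), so the offset and seek-history bookkeeping lives in one place. A self-contained sketch of the delegation pattern; MemoryReader is a stand-in class, not megfile API:

    import io

    class MemoryReader:
        """Stand-in reader showing read() delegating to readinto()."""

        def __init__(self, data: bytes):
            self._raw = io.BytesIO(data)

        def readinto(self, buffer) -> int:
            # Single code path that would track offsets, seek history, etc.
            return self._raw.readinto(buffer)

        def read(self, size: int) -> bytes:
            buffer = bytearray(size)
            n = self.readinto(buffer)
            return bytes(buffer[:n])

    reader = MemoryReader(b"hello world")
    assert reader.read(5) == b"hello"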
@@ -306,13 +278,22 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
     @property
     def _buffer(self) -> BytesIO:
+        if self._block_capacity == 0:
+            buffer = self._fetch_buffer(index=self._block_index)
+            buffer.seek(self._cached_offset)
+            self._cached_offset = None
+            return buffer
+
         if self._cached_offset is not None:
-            start = self._block_index
-            stop = min(start + self._block_forward, self._block_stop)
+            if self._block_forward > 0:  # pyre-ignore[58]
+                start = self._block_index
+                stop = min(start + self._block_forward, self._block_stop)
 
-            # reversed(range(start, stop))
-            for index in range(stop, start - 1, -1):
-                self._submit_future(index)
+                # reversed(range(start, stop))
+                for index in range(stop, start - 1, -1):
+                    self._submit_future(index)
+            else:
+                self._submit_future(self._block_index)
             self._cleanup_futures()
 
             self._cached_buffer = self._fetch_future_result(self._block_index)
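Note the reverse submission order in the hunk above: blocks are submitted from the far end of the window back to the current block, which presumably leaves the block about to be read as the most recently used entry in the LRU-managed future cache. A toy illustration with an OrderedDict standing in for LRUCacheFutureManager:

    from collections import OrderedDict

    lru = OrderedDict()  # stand-in for the futures LRU; the real manager also runs tasks

    def submit(index: int) -> None:
        # Re-inserting moves the key to the "most recently used" end.
        lru.pop(index, None)
        lru[index] = f"future-for-block-{index}"

    start, stop = 4, 8  # current block 4, prefetch forward to block 8
    for index in range(stop, start - 1, -1):  # submits 8, 7, 6, 5, 4
        submit(index)

    # Block 4 (the one read next) is newest, so LRU eviction drops it last.
    assert next(reversed(lru)) == 4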
@@ -335,7 +316,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
     def _seek_buffer(self, index: int, offset: int = 0):
         # The corresponding block is probably not downloaded when seek to a new position
         # So record the offset first, set it when it is accessed
-        if self._is_auto_scaling:
+        if self._is_auto_scaling:
             history = []
             for item in self._seek_history:
                 if item.seek_count > self._block_capacity * 2:
@@ -349,8 +330,11 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
             history.append(SeekRecord(index))
             self._seek_history = history
             self._block_forward = max(
-                self._block_capacity // len(self._seek_history), 1
+                self._block_capacity // len(self._seek_history), 0
             )
+            if self._block_forward == 0:
+                self._is_auto_scaling = False
+                self._seek_history = []
 
         self._cached_offset = offset
         self._block_index = index
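Taken together, the last two hunks make the read-ahead window self-tuning: block_forward shrinks as distinct seek patterns accumulate, and auto-scaling switches itself off once the window reaches zero. A worked sketch of the decay (the capacity value is an assumption):

    block_capacity = 16  # assumed cache capacity in blocks

    # One entry per seek pattern still considered "live" in _seek_history.
    for seek_records in (1, 2, 4, 16, 20):
        block_forward = max(block_capacity // seek_records, 0)
        print(seek_records, "->", block_forward)
    # 1 -> 16, 2 -> 8, 4 -> 4, 16 -> 1, 20 -> 0 (auto-scaling disabled)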
megfile/lib/combine_reader.py CHANGED

@@ -118,3 +118,8 @@ class CombineReader(Readable, Seekable):
     def _close(self):
         for file_object in self._file_objects:
             file_object.close()
+
+    def __del__(self) -> None:
+        # CombineReader not close files in __del__
+        # user should use `close()` or use `with`
+        pass
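The no-op __del__ makes the lifetime contract explicit: the caller closes the underlying files, via close() or a with block. A usage sketch; the constructor arguments here are assumptions, so check megfile.lib.combine_reader for the real signature:

    from megfile.lib.combine_reader import CombineReader

    parts = [open("part1.bin", "rb"), open("part2.bin", "rb")]
    # Assumed construction: a list of open file objects plus a display name.
    with CombineReader(parts, "combined.bin") as reader:
        header = reader.read(16)
    # Exiting the with-block calls close(), which closes every part;
    # dropping the last reference alone would not, per __del__ above.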
megfile/lib/glob.py CHANGED

@@ -72,8 +72,7 @@ def iglob(
     if recursive and _isrecursive(pathname):
         s = next(it)  # skip empty string
         if s:
-
-            raise AssertionError("iglob with recursive=True error")
+            raise OSError("iglob with recursive=True error")
     return it
 
 
@@ -87,8 +86,7 @@ def _iglob(pathname: str, recursive: bool, dironly: bool, fs: FSFunc) -> Iterato
         dirname = "://".join([protocol, dirname])
     if not has_magic(pathname):
         if dironly:
-
-            raise AssertionError("can't use dironly with non-magic patterns in _iglob")
+            raise OSError("can't use dironly with non-magic patterns in _iglob")
         if basename:
             if fs.exists(pathname):
                 yield pathname
@@ -150,8 +148,7 @@ def _glob0(dirname: str, basename: str, dironly: bool, fs: FSFunc) -> List[str]:
 # directory.
 def _glob2(dirname: str, pattern: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
     if not _isrecursive(pattern):
-
-        raise AssertionError("error call '_glob2' with non-glob pattern")
+        raise OSError("error call '_glob2' with non-glob pattern")
     yield pattern[:0]
     yield from _rlistdir(dirname, dironly, fs)
 
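All three guards now raise OSError instead of AssertionError. That matters to callers because assert-style errors disappear under python -O and are rarely caught deliberately. A self-contained sketch of the caller-visible change (strict_glob is a hypothetical stand-in for the module's guards, not megfile API):

    def strict_glob(pattern: str, recursive: bool):
        # Mirrors the guard style megfile 4.0.0 uses in megfile/lib/glob.py.
        if recursive and "**" not in pattern:
            raise OSError("iglob with recursive=True error")
        yield pattern

    try:
        list(strict_glob("data/*.txt", recursive=True))
    except OSError as error:
        print("caught:", error)  # in 3.1.6 this guard raised AssertionError instead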
megfile/lib/hdfs_prefetch_reader.py CHANGED

@@ -2,9 +2,9 @@ from io import BytesIO
 from typing import Optional
 
 from megfile.config import (
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     HDFS_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_hdfs_error
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
@@ -13,8 +13,8 @@ from megfile.lib.base_prefetch_reader import BasePrefetchReader
 class HdfsPrefetchReader(BasePrefetchReader):
     """
     Reader to fast read the hdfs content. This will divide the file content into equal
-    parts of block_size size, and will use LRU to cache at most
-    block_capacity blocks in memory.
+    parts of block_size size, and will use LRU to cache at most blocks in
+    max_buffer_size memory.
 
     open(), seek() and read() will trigger prefetch read. The prefetch will cached
     block_forward blocks of data from offset position (the position after reading
@@ -26,8 +26,8 @@ class HdfsPrefetchReader(BasePrefetchReader):
         hdfs_path: str,
         *,
         client,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
         max_retries: int = HDFS_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
@@ -39,7 +39,7 @@ class HdfsPrefetchReader(BasePrefetchReader):
 
         super().__init__(
             block_size=block_size,
-            block_capacity=block_capacity,
+            max_buffer_size=max_buffer_size,
             block_forward=block_forward,
             max_retries=max_retries,
             max_workers=max_workers,
megfile/lib/http_prefetch_reader.py CHANGED

@@ -4,9 +4,9 @@ from typing import Optional
 import requests
 
 from megfile.config import (
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     HTTP_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import (
     HttpBodyIncompleteError,
@@ -26,7 +26,7 @@ class HttpPrefetchReader(BasePrefetchReader):
     Reader to fast read the http content, service must support Accept-Ranges.
 
     This will divide the file content into equal parts of block_size size, and will use
-    LRU to cache at most block_capacity blocks in memory.
+    LRU to cache at most blocks in max_buffer_size memory.
 
     open(), seek() and read() will trigger prefetch read.
 
@@ -39,8 +39,8 @@ class HttpPrefetchReader(BasePrefetchReader):
         url: PathLike,
         *,
         content_size: Optional[int] = None,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
         max_retries: int = HTTP_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
@@ -50,7 +50,7 @@ class HttpPrefetchReader(BasePrefetchReader):
 
         super().__init__(
             block_size=block_size,
-            block_capacity=block_capacity,
+            max_buffer_size=max_buffer_size,
             block_forward=block_forward,
             max_retries=max_retries,
             max_workers=max_workers,
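Both reader subclasses lose block_capacity and gain max_buffer_size, which they forward to BasePrefetchReader. A construction sketch; the URL and byte values are placeholders, and only the renamed keyword is the point:

    from megfile.lib.http_prefetch_reader import HttpPrefetchReader

    # 3.1.6: HttpPrefetchReader(url, block_size=..., block_capacity=16)
    # 4.0.0: the cache is budgeted in bytes rather than blocks.
    reader = HttpPrefetchReader(
        "https://example.com/large-file.bin",  # placeholder URL
        block_size=8 * 2**20,                  # assumed 8 MiB blocks
        max_buffer_size=64 * 2**20,            # at most ~8 cached blocks
    )
    data = reader.read(1024)
    reader.close()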
megfile/lib/s3_buffered_writer.py CHANGED

@@ -1,3 +1,4 @@
+import os
 from collections import OrderedDict
 from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from io import BytesIO
@@ -6,16 +7,14 @@ from threading import Lock
 from typing import NamedTuple, Optional
 
 from megfile.config import (
-    BACKOFF_FACTOR,
-    BACKOFF_INITIAL,
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MAX_BUFFER_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
+    DEFAULT_WRITER_BLOCK_AUTOSCALE,
     GLOBAL_MAX_WORKERS,
+    WRITER_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Writable
-from megfile.utils import get_human_size, process_local
+from megfile.utils import process_local
 
 _logger = get_logger(__name__)
 """
@@ -39,15 +38,19 @@ class PartResult(_PartResult):
 
 
 class S3BufferedWriter(Writable[bytes]):
+    # Multi-upload part size must be between 5 MiB and 5 GiB.
+    # There is no minimum size limit on the last part of your multipart upload.
+    MIN_BLOCK_SIZE = 8 * 2**20
+
     def __init__(
         self,
         bucket: str,
         key: str,
         *,
         s3_client,
-        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        block_size: int = WRITER_BLOCK_SIZE,
+        block_autoscale: bool = DEFAULT_WRITER_BLOCK_AUTOSCALE,
+        max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
     ):
@@ -57,17 +60,17 @@ class S3BufferedWriter(Writable[bytes]):
         self._profile_name = profile_name
 
         # user maybe put block_size with 'numpy.uint64' type
-        self._block_size = int(block_size)
+        self._base_block_size = int(block_size)
+        self._block_autoscale = block_autoscale
 
-        self._max_block_size = max_block_size
         self._max_buffer_size = max_buffer_size
         self._total_buffer_size = 0
         self._offset = 0
-        self.__content_size = 0
-        self._backoff_size = BACKOFF_INITIAL
+        self._content_size = 0
         self._buffer = BytesIO()
 
-        self._futures = OrderedDict()
+        self._futures_result = OrderedDict()
+        self._uploading_futures = set()
         self._is_global_executor = False
         if max_workers is None:
             self._executor = process_local(
@@ -101,53 +104,42 @@ class S3BufferedWriter(Writable[bytes]):
         return self._offset
 
     @property
-    def _content_size(self) -> int:
-        return self.__content_size
-
-    @_content_size.setter
-    def _content_size(self, value: int):
-        if value > self._backoff_size:
-            _logger.debug(
-                "writing file: %r, current size: %s"
-                % (self.name, get_human_size(value))
-            )
-            while value > self._backoff_size:
-                self._backoff_size *= BACKOFF_FACTOR
-        self.__content_size = value
+    def _block_size(self) -> int:
+        if self._block_autoscale:
+            if self._part_number < 10:
+                return self._base_block_size
+            elif self._part_number < 100:
+                return min(self._base_block_size * 2, self._max_buffer_size)
+            elif self._part_number < 1000:
+                return min(self._base_block_size * 4, self._max_buffer_size)
+            elif self._part_number < 10000:
+                return min(self._base_block_size * 8, self._max_buffer_size)
+            return min(self._base_block_size * 16, self._max_buffer_size)  # unreachable
+        return self._base_block_size
 
     @property
     def _is_multipart(self) -> bool:
-        return len(self._futures) > 0
+        return len(self._futures_result) > 0 or len(self._uploading_futures) > 0
 
     @property
     def _upload_id(self) -> str:
-
-
-
-
-
-
-
-
-    @property
-    def _buffer_size(self):
-        return self._total_buffer_size - sum(
-            future.result().content_size
-            for future in self._futures.values()
-            if future.done()
-        )
-
-    @property
-    def _uploading_futures(self):
-        return [future for future in self._futures.values() if not future.done()]
+        if self.__upload_id is None:
+            with self.__upload_id_lock:
+                if self.__upload_id is None:
+                    with raise_s3_error(self.name):
+                        self.__upload_id = self._client.create_multipart_upload(
+                            Bucket=self._bucket, Key=self._key
+                        )["UploadId"]
+        return self.__upload_id
 
     @property
     def _multipart_upload(self):
-
-
-
-            ]
-
+        for future in self._uploading_futures:
+            result = future.result()
+            self._total_buffer_size -= result.content_size
+            self._futures_result[result.part_number] = result.asdict()
+        self._uploading_futures = set()
+        return {"Parts": [result for _, result in sorted(self._futures_result.items())]}
 
     def _upload_buffer(self, part_number, content):
         with raise_s3_error(self.name):
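The new _block_size property scales part sizes geometrically with the part number, stretching how large an object fits under S3's 10,000-part cap. A worked sketch assuming an 8 MiB base block (matching MIN_BLOCK_SIZE above; the real default comes from WRITER_BLOCK_SIZE) and ignoring the max_buffer_size clamp:

    MiB = 2**20
    base = 8 * MiB  # assumption: base block size

    def block_size(part_number: int) -> int:
        # Mirror of the autoscale ladder added above, without the clamp.
        if part_number < 10:
            return base
        elif part_number < 100:
            return base * 2
        elif part_number < 1000:
            return base * 4
        elif part_number < 10000:
            return base * 8
        return base * 16  # unreachable: S3 caps multipart uploads at 10,000 parts

    total = sum(block_size(n) for n in range(1, 10001))
    print(total / 2**40)  # ~0.58 TiB reachable, vs ~0.076 TiB with fixed 8 MiB parts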
@@ -164,32 +156,43 @@ class S3BufferedWriter(Writable[bytes]):
             )
 
     def _submit_upload_buffer(self, part_number, content):
-        self._futures[part_number] = self._executor.submit(
-            self._upload_buffer, part_number, content
+        self._uploading_futures.add(
+            self._executor.submit(self._upload_buffer, part_number, content)
         )
         self._total_buffer_size += len(content)
-
-
+
+        while (
+            self._uploading_futures and self._total_buffer_size >= self._max_buffer_size
+        ):
+            wait_result = wait(self._uploading_futures, return_when=FIRST_COMPLETED)
+            for future in wait_result.done:
+                result = future.result()
+                self._total_buffer_size -= result.content_size
+                self._futures_result[result.part_number] = result.asdict()
+            self._uploading_futures = wait_result.not_done
 
     def _submit_upload_content(self, content: bytes):
         # s3 part needs at least 5MB,
         # so we need to divide content into equal-size parts,
         # and give last part more size.
         # e.g. 257MB can be divided into 2 parts, 128MB and 129MB
-        offset = 0
-        while len(content) - offset - self._max_block_size > self._block_size:
+        while len(content) - self._block_size > self.MIN_BLOCK_SIZE:
             self._part_number += 1
-
-
-
-
-
+            current_content, content = (
+                content[: self._block_size],
+                content[self._block_size :],
+            )
+            self._submit_upload_buffer(self._part_number, current_content)
+        if content:
+            self._part_number += 1
+            self._submit_upload_buffer(self._part_number, content)
 
     def _submit_futures(self):
         content = self._buffer.getvalue()
         if len(content) == 0:
             return
-        self._buffer = BytesIO()
+        self._buffer.seek(0, os.SEEK_SET)
+        self._buffer.truncate()
         self._submit_upload_content(content)
 
     def write(self, data: bytes) -> int:
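_submit_upload_buffer now applies backpressure: once buffered bytes reach max_buffer_size, it blocks on wait(..., FIRST_COMPLETED) until some in-flight part finishes. The same pattern in self-contained form, counting parts instead of bytes for brevity:

    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
    import time

    executor = ThreadPoolExecutor(4)
    in_flight = set()
    buffered = 0
    MAX_BUFFER = 4  # assumed budget, measured in parts for this sketch

    def upload(part: int) -> int:
        time.sleep(0.01)  # pretend network I/O
        return part

    for part in range(10):
        in_flight.add(executor.submit(upload, part))
        buffered += 1
        while in_flight and buffered >= MAX_BUFFER:
            done_and_pending = wait(in_flight, return_when=FIRST_COMPLETED)
            buffered -= len(done_and_pending.done)  # account finished parts
            in_flight = done_and_pending.not_done
    executor.shutdown()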
megfile/lib/s3_cached_handler.py CHANGED

@@ -19,8 +19,7 @@ class S3CachedHandler(S3MemoryHandler):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
-
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
megfile/lib/s3_limited_seekable_writer.py CHANGED

@@ -4,9 +4,7 @@ from logging import getLogger as get_logger
 from typing import Optional
 
 from megfile.config import (
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MAX_BUFFER_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Seekable
@@ -29,11 +27,10 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         key: str,
         *,
         s3_client,
-        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        block_size: int = S3BufferedWriter.MIN_BLOCK_SIZE,
         head_block_size: Optional[int] = None,
         tail_block_size: Optional[int] = None,
-        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
     ):
@@ -42,7 +39,6 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             key,
             s3_client=s3_client,
             block_size=block_size,
-            max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
             profile_name=profile_name,
megfile/lib/s3_memory_handler.py CHANGED

@@ -22,8 +22,7 @@ class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
-
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
megfile/lib/s3_pipe_handler.py CHANGED

@@ -35,8 +35,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb"):
-
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
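As with the glob module, every handler's mode check now raises ValueError rather than AssertionError, matching how the built-in open() reports a bad mode. A minimal sketch of the guard and its caller-side effect:

    def check_mode(mode: str) -> None:
        # Same guard shape as the handlers above; ValueError survives python -O,
        # while an assert-based check would be stripped.
        if mode not in ("rb", "wb"):
            raise ValueError("unacceptable mode: %r" % mode)

    try:
        check_mode("r+")
    except ValueError as error:
        print(error)  # unacceptable mode: 'r+'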