megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. megfile/cli.py +12 -7
  2. megfile/config.py +27 -39
  3. megfile/fs.py +169 -12
  4. megfile/fs_path.py +183 -260
  5. megfile/hdfs.py +106 -5
  6. megfile/hdfs_path.py +34 -90
  7. megfile/http.py +50 -1
  8. megfile/http_path.py +27 -65
  9. megfile/interfaces.py +1 -8
  10. megfile/lib/base_prefetch_reader.py +62 -78
  11. megfile/lib/combine_reader.py +5 -0
  12. megfile/lib/glob.py +3 -6
  13. megfile/lib/hdfs_prefetch_reader.py +7 -7
  14. megfile/lib/http_prefetch_reader.py +6 -6
  15. megfile/lib/s3_buffered_writer.py +71 -65
  16. megfile/lib/s3_cached_handler.py +1 -2
  17. megfile/lib/s3_limited_seekable_writer.py +3 -7
  18. megfile/lib/s3_memory_handler.py +1 -2
  19. megfile/lib/s3_pipe_handler.py +1 -2
  20. megfile/lib/s3_prefetch_reader.py +10 -19
  21. megfile/lib/s3_share_cache_reader.py +8 -5
  22. megfile/pathlike.py +397 -401
  23. megfile/s3.py +118 -17
  24. megfile/s3_path.py +126 -209
  25. megfile/sftp.py +300 -10
  26. megfile/sftp_path.py +46 -322
  27. megfile/smart.py +33 -27
  28. megfile/smart_path.py +9 -14
  29. megfile/stdio.py +1 -1
  30. megfile/stdio_path.py +2 -2
  31. megfile/utils/__init__.py +3 -4
  32. megfile/version.py +1 -1
  33. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
  34. megfile-4.0.0.post1.dist-info/RECORD +52 -0
  35. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
  36. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
  37. docs/conf.py +0 -65
  38. megfile-3.1.6.post1.dist-info/RECORD +0 -55
  39. scripts/convert_results_to_sarif.py +0 -91
  40. scripts/generate_file.py +0 -344
  41. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
  42. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
  43. {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/lib/base_prefetch_reader.py CHANGED
@@ -9,16 +9,14 @@ from statistics import mean
 from typing import Optional
 
 from megfile.config import (
-    BACKOFF_FACTOR,
-    BACKOFF_INITIAL,
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     DEFAULT_MAX_RETRY_TIMES,
     GLOBAL_MAX_WORKERS,
     NEWLINE,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.interfaces import Readable, Seekable
-from megfile.utils import ProcessLocal, get_human_size, process_local
+from megfile.utils import ProcessLocal, process_local
 
 _logger = get_logger(__name__)
 
@@ -31,35 +29,31 @@ class SeekRecord:
 
 
 class BasePrefetchReader(Readable[bytes], Seekable, ABC):
-    """
-    Reader to fast read the remote file content.
-    This will divide the file content into equal parts of block_size size,
-    and will use LRU to cache at most block_capacity blocks in memory.
-    open(), seek() and read() will trigger prefetch read.
-    The prefetch will cached block_forward blocks of data from offset position
-    (the position after reading if the called function is read).
-    """
-
     def __init__(
         self,
         *,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
        max_retries: int = DEFAULT_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
         **kwargs,
     ):
-        self._is_auto_scaling = block_forward is None
+        if max_buffer_size == 0:
+            block_capacity = block_forward = 0
+        else:
+            block_capacity = max(max_buffer_size // block_size, 1)
+
+        self._is_auto_scaling = False
         if block_forward is None:
-            block_forward = max(block_capacity - 1, 1)
-
-        if block_capacity <= block_forward:
-            # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError(
-                "block_capacity should greater than block_forward, "
-                "got: block_capacity=%s, block_forward=%s"
-                % (block_capacity, block_forward)
+            block_forward = max(block_capacity - 1, 0)
+            self._is_auto_scaling = block_forward > 0
+
+        if 0 < block_capacity <= block_forward:
+            raise ValueError(
+                "max_buffer_size should greater than block_forward * block_size, "
+                "got: max_buffer_size=%s, block_size=%s, block_forward=%s"
+                % (max_buffer_size, block_size, block_forward)
             )
 
         # user maybe put block_size with 'numpy.uint64' type
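
The practical change in this hunk: the reader's cache is now sized in bytes (max_buffer_size) instead of in blocks (block_capacity), and block_forward may legitimately drop to 0, which disables prefetch instead of raising. A minimal sketch of the new sizing rule, mirroring the constructor above (standalone helper for illustration only; the 8 MiB / 128 MiB figures are assumptions, the real defaults come from megfile.config):

    # Sketch: how BasePrefetchReader 4.x derives its cache shape (see hunk above).
    def derive_cache_shape(block_size: int, max_buffer_size: int):
        if max_buffer_size == 0:
            return 0, 0  # no cached blocks, no prefetch
        block_capacity = max(max_buffer_size // block_size, 1)
        block_forward = max(block_capacity - 1, 0)  # default when block_forward is None
        return block_capacity, block_forward

    # e.g. an assumed 8 MiB block and 128 MiB buffer keep 16 blocks cached:
    print(derive_cache_shape(8 * 2**20, 128 * 2**20))  # (16, 15)

Passing an explicit block_forward whose window would not fit inside max_buffer_size now raises ValueError rather than AssertionError.
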
@@ -77,8 +71,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         self._content_size = self._get_content_size()
         self._block_stop = ceil(self._content_size / block_size)
 
-        self.__offset = 0
-        self._backoff_size = BACKOFF_INITIAL
+        self._offset = 0
         self._cached_buffer = None
         self._block_index = None  # Current block index
         self._seek_history = []
@@ -102,7 +95,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         pass
 
     @property
-    def _futures(self):
+    def _futures(self) -> "LRUCacheFutureManager":
         return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
@@ -120,21 +113,6 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
     def tell(self) -> int:
         return self._offset
 
-    @property
-    def _offset(self) -> int:
-        return self.__offset
-
-    @_offset.setter
-    def _offset(self, value: int):
-        if value > self._backoff_size:
-            _logger.debug(
-                "reading file: %r, current offset / total size: %s / %s"
-                % (self.name, get_human_size(value), get_human_size(self._content_size))
-            )
-            while value > self._backoff_size:
-                self._backoff_size *= BACKOFF_FACTOR
-        self.__offset = value
-
     def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
         """Change stream position.
 
@@ -176,9 +154,6 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if self.closed:
             raise IOError("file already closed: %r" % self.name)
 
-        if len(self._seek_history) > 0:
-            self._seek_history[-1].read_count += 1
-
         if self._offset >= self._content_size:
             return b""
 
@@ -187,31 +162,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         else:
             size = min(size, self._content_size - self._offset)
 
-        if self._block_forward == 1:
-            block_index = self._offset // self._block_size
-            if len(self._seek_history) > 0:
-                mean_read_count = mean(item.read_count for item in self._seek_history)
-            else:
-                mean_read_count = 0
-            if block_index not in self._futures and mean_read_count < 3:
-                # No using LRP will be better if read() are always called less than 3
-                # times after seek()
-                return self._read(size)
-
-        data = self._buffer.read(size)
-        if len(data) == size:
-            self._offset += len(data)
-            return data
-
-        buffer = BytesIO()
-        buffer.write(data)
-        while buffer.tell() < size:
-            remain_size = size - buffer.tell()
-            data = self._next_buffer.read(remain_size)
-            buffer.write(data)
-
-        self._offset += buffer.tell()
-        return buffer.getvalue()
+        buffer = bytearray(size)
+        self.readinto(buffer)
+        return bytes(buffer)
 
     def readline(self, size: Optional[int] = None) -> bytes:
         """Next line from the file, as a bytes object.
@@ -270,12 +223,31 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if self.closed:
             raise IOError("file already closed: %r" % self.name)
 
+        if len(self._seek_history) > 0:
+            self._seek_history[-1].read_count += 1
+
         if self._offset >= self._content_size:
             return 0
 
         size = len(buffer)
         size = min(size, self._content_size - self._offset)
 
+        if self._block_capacity == 0:
+            buffer[:size] = self._read(size)
+            return size
+
+        if self._block_forward == 0:
+            block_index = self._offset // self._block_size
+            if len(self._seek_history) > 0:
+                mean_read_count = mean(item.read_count for item in self._seek_history)
+            else:
+                mean_read_count = 0
+            if block_index not in self._futures and mean_read_count < 3:
+                # No using LRP will be better if read() are always called less than 3
+                # times after seek()
+                buffer[:size] = self._read(size)
+                return size
+
         data = self._buffer.read(size)
         buffer[: len(data)] = data
         if len(data) == size:
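
With the two hunks above, read() is now a thin wrapper around readinto(): it allocates a bytearray of the already clamped size and delegates, so both entry points share one code path. The same pattern on any readinto-capable object, as a sketch (not megfile code):

    def read_via_readinto(reader, size: int) -> bytes:
        # General form of the pattern above: let readinto() fill a preallocated
        # buffer, then trim to what was actually written.
        buffer = bytearray(size)
        filled = reader.readinto(buffer)
        return bytes(buffer[:filled])

BasePrefetchReader.read() can skip the trim because size is clamped to the remaining content before the bytearray is allocated.
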
@@ -306,13 +278,22 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
     @property
     def _buffer(self) -> BytesIO:
+        if self._block_capacity == 0:
+            buffer = self._fetch_buffer(index=self._block_index)
+            buffer.seek(self._cached_offset)
+            self._cached_offset = None
+            return buffer
+
         if self._cached_offset is not None:
-            start = self._block_index
-            stop = min(start + self._block_forward, self._block_stop)
+            if self._block_forward > 0:  # pyre-ignore[58]
+                start = self._block_index
+                stop = min(start + self._block_forward, self._block_stop)
 
-            # reversed(range(start, stop))
-            for index in range(stop - 1, start - 1, -1):
-                self._submit_future(index)
+                # reversed(range(start, stop))
+                for index in range(stop, start - 1, -1):
+                    self._submit_future(index)
+            else:
+                self._submit_future(self._block_index)
             self._cleanup_futures()
 
             self._cached_buffer = self._fetch_future_result(self._block_index)
@@ -335,7 +316,7 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
     def _seek_buffer(self, index: int, offset: int = 0):
         # The corresponding block is probably not downloaded when seek to a new position
         # So record the offset first, set it when it is accessed
-        if self._is_auto_scaling:  # When user doesn't define forward
+        if self._is_auto_scaling:
             history = []
             for item in self._seek_history:
                 if item.seek_count > self._block_capacity * 2:
@@ -349,8 +330,11 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
             history.append(SeekRecord(index))
             self._seek_history = history
             self._block_forward = max(
-                (self._block_capacity - 1) // len(self._seek_history), 1
+                self._block_capacity // len(self._seek_history), 0
             )
+            if self._block_forward == 0:
+                self._is_auto_scaling = False
+                self._seek_history = []
 
         self._cached_offset = offset
megfile/lib/combine_reader.py CHANGED
@@ -118,3 +118,8 @@ class CombineReader(Readable, Seekable):
     def _close(self):
         for file_object in self._file_objects:
             file_object.close()
+
+    def __del__(self) -> None:
+        # CombineReader not close files in __del__
+        # user should use `close()` or use `with`
+        pass
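
Since __del__ is now an explicit no-op, a CombineReader leaks its underlying file objects unless close() is called, directly or via a with block. A small sketch of the intended discipline; how the reader is obtained is not shown, and the helper name is ours, not megfile's:

    def read_head(reader, size: int = 16) -> bytes:
        # `reader` stands for any CombineReader-like object.
        try:
            return reader.read(size)
        finally:
            reader.close()  # __del__ deliberately will not do this for you
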
megfile/lib/glob.py CHANGED
@@ -72,8 +72,7 @@ def iglob(
     if recursive and _isrecursive(pathname):
         s = next(it)  # skip empty string
         if s:
-            # TODO: replace AssertionError with OSError in 4.0.0
-            raise AssertionError("iglob with recursive=True error")
+            raise OSError("iglob with recursive=True error")
     return it
 
 
@@ -87,8 +86,7 @@ def _iglob(pathname: str, recursive: bool, dironly: bool, fs: FSFunc) -> Iterato
         dirname = "://".join([protocol, dirname])
     if not has_magic(pathname):
         if dironly:
-            # TODO: replace AssertionError with OSError in 4.0.0
-            raise AssertionError("can't use dironly with non-magic patterns in _iglob")
+            raise OSError("can't use dironly with non-magic patterns in _iglob")
         if basename:
             if fs.exists(pathname):
                 yield pathname
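
The user-visible effect of these glob hunks (and of the similar handler hunks later in this diff) is the exception type: internal sanity checks that used to surface as AssertionError now surface as OSError, and bad open modes as ValueError. A before/after sketch against megfile's public API (the S3 pattern is illustrative):

    import megfile

    try:
        paths = list(megfile.smart_glob("s3://bucket/prefix/**", recursive=True))
    except OSError as error:  # megfile < 4.0.0 raised AssertionError here
        print("glob failed:", error)
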
@@ -150,8 +148,7 @@ def _glob0(dirname: str, basename: str, dironly: bool, fs: FSFunc) -> List[str]:
 # directory.
 def _glob2(dirname: str, pattern: str, dironly: bool, fs: FSFunc) -> Iterator[str]:
     if not _isrecursive(pattern):
-        # TODO: replace AssertionError with OSError in 4.0.0
-        raise AssertionError("error call '_glob2' with non-glob pattern")
+        raise OSError("error call '_glob2' with non-glob pattern")
     yield pattern[:0]
     yield from _rlistdir(dirname, dironly, fs)
 
megfile/lib/hdfs_prefetch_reader.py CHANGED
@@ -2,9 +2,9 @@ from io import BytesIO
 from typing import Optional
 
 from megfile.config import (
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     HDFS_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_hdfs_error
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
@@ -13,8 +13,8 @@ from megfile.lib.base_prefetch_reader import BasePrefetchReader
 class HdfsPrefetchReader(BasePrefetchReader):
     """
     Reader to fast read the hdfs content. This will divide the file content into equal
-    parts of block_size size, and will use LRU to cache at most block_capacity blocks
-    in memory.
+    parts of block_size size, and will use LRU to cache at most blocks in
+    max_buffer_size memory.
 
     open(), seek() and read() will trigger prefetch read. The prefetch will cached
     block_forward blocks of data from offset position (the position after reading
@@ -26,8 +26,8 @@ class HdfsPrefetchReader(BasePrefetchReader):
         hdfs_path: str,
         *,
         client,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
         max_retries: int = HDFS_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
@@ -39,7 +39,7 @@
 
         super().__init__(
             block_size=block_size,
-            block_capacity=block_capacity,
+            max_buffer_size=max_buffer_size,
            block_forward=block_forward,
             max_retries=max_retries,
             max_workers=max_workers,
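
For HdfsPrefetchReader (and the other prefetch readers in this release) the constructor migration is mechanical: the old block count becomes a byte budget, roughly block_capacity * block_size. A hedged sketch; the path, client object, and sizes are placeholders:

    from megfile.lib.hdfs_prefetch_reader import HdfsPrefetchReader

    # 3.x: HdfsPrefetchReader(path, client=client, block_size=block, block_capacity=16)
    # 4.x: the same budget, expressed in bytes.
    reader = HdfsPrefetchReader(
        "/warehouse/data.bin",
        client=hdfs_client,               # created elsewhere; not shown here
        block_size=8 * 2**20,             # assumed block size
        max_buffer_size=16 * 8 * 2**20,   # ~ old block_capacity * block_size
    )
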
megfile/lib/http_prefetch_reader.py CHANGED
@@ -4,9 +4,9 @@
 import requests
 
 from megfile.config import (
-    DEFAULT_BLOCK_CAPACITY,
-    DEFAULT_BLOCK_SIZE,
     HTTP_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import (
     HttpBodyIncompleteError,
@@ -26,7 +26,7 @@ class HttpPrefetchReader(BasePrefetchReader):
     Reader to fast read the http content, service must support Accept-Ranges.
 
     This will divide the file content into equal parts of block_size size, and will use
-    LRU to cache at most block_capacity blocks in memory.
+    LRU to cache at most blocks in max_buffer_size memory.
 
     open(), seek() and read() will trigger prefetch read.
 
@@ -39,8 +39,8 @@ class HttpPrefetchReader(BasePrefetchReader):
         url: PathLike,
         *,
         content_size: Optional[int] = None,
-        block_size: int = DEFAULT_BLOCK_SIZE,
-        block_capacity: int = DEFAULT_BLOCK_CAPACITY,
+        block_size: int = READER_BLOCK_SIZE,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
         block_forward: Optional[int] = None,
         max_retries: int = HTTP_MAX_RETRY_TIMES,
         max_workers: Optional[int] = None,
@@ -50,7 +50,7 @@
 
         super().__init__(
             block_size=block_size,
-            block_capacity=block_capacity,
+            max_buffer_size=max_buffer_size,
             block_forward=block_forward,
             max_retries=max_retries,
             max_workers=max_workers,
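
Nothing about the Accept-Ranges requirement changed here, but it is worth restating while the docstring is being touched: prefetch only works if the server honours range requests. A quick, hedged way to check a URL before relying on HttpPrefetchReader (illustrative URL):

    import requests

    response = requests.head("https://example.com/big.bin", allow_redirects=True)
    # Servers that support range requests usually advertise "Accept-Ranges: bytes".
    print(response.headers.get("Accept-Ranges", "none"))
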
megfile/lib/s3_buffered_writer.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from collections import OrderedDict
 from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from io import BytesIO
@@ -6,16 +7,14 @@ from threading import Lock
 from typing import NamedTuple, Optional
 
 from megfile.config import (
-    BACKOFF_FACTOR,
-    BACKOFF_INITIAL,
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MAX_BUFFER_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
+    DEFAULT_WRITER_BLOCK_AUTOSCALE,
     GLOBAL_MAX_WORKERS,
+    WRITER_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Writable
-from megfile.utils import get_human_size, process_local
+from megfile.utils import process_local
 
 _logger = get_logger(__name__)
 """
@@ -39,15 +38,19 @@
 
 
 class S3BufferedWriter(Writable[bytes]):
+    # Multi-upload part size must be between 5 MiB and 5 GiB.
+    # There is no minimum size limit on the last part of your multipart upload.
+    MIN_BLOCK_SIZE = 8 * 2**20
+
     def __init__(
         self,
         bucket: str,
         key: str,
         *,
         s3_client,
-        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
-        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        block_size: int = WRITER_BLOCK_SIZE,
+        block_autoscale: bool = DEFAULT_WRITER_BLOCK_AUTOSCALE,
+        max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
     ):
@@ -57,17 +60,17 @@
         self._profile_name = profile_name
 
         # user maybe put block_size with 'numpy.uint64' type
-        self._block_size = int(block_size)
+        self._base_block_size = int(block_size)
+        self._block_autoscale = block_autoscale
 
-        self._max_block_size = max_block_size
         self._max_buffer_size = max_buffer_size
         self._total_buffer_size = 0
         self._offset = 0
-        self.__content_size = 0
-        self._backoff_size = BACKOFF_INITIAL
+        self._content_size = 0
         self._buffer = BytesIO()
 
-        self._futures = OrderedDict()
+        self._futures_result = OrderedDict()
+        self._uploading_futures = set()
         self._is_global_executor = False
         if max_workers is None:
             self._executor = process_local(
@@ -101,53 +104,42 @@
         return self._offset
 
     @property
-    def _content_size(self) -> int:
-        return self.__content_size
-
-    @_content_size.setter
-    def _content_size(self, value: int):
-        if value > self._backoff_size:
-            _logger.debug(
-                "writing file: %r, current size: %s"
-                % (self.name, get_human_size(value))
-            )
-            while value > self._backoff_size:
-                self._backoff_size *= BACKOFF_FACTOR
-        self.__content_size = value
+    def _block_size(self) -> int:
+        if self._block_autoscale:
+            if self._part_number < 10:
+                return self._base_block_size
+            elif self._part_number < 100:
+                return min(self._base_block_size * 2, self._max_buffer_size)
+            elif self._part_number < 1000:
+                return min(self._base_block_size * 4, self._max_buffer_size)
+            elif self._part_number < 10000:
+                return min(self._base_block_size * 8, self._max_buffer_size)
+            return min(self._base_block_size * 16, self._max_buffer_size)  # unreachable
+        return self._base_block_size
 
     @property
     def _is_multipart(self) -> bool:
-        return len(self._futures) > 0
+        return len(self._futures_result) > 0 or len(self._uploading_futures) > 0
 
     @property
     def _upload_id(self) -> str:
-        with self.__upload_id_lock:
-            if self.__upload_id is None:
-                with raise_s3_error(self.name):
-                    self.__upload_id = self._client.create_multipart_upload(
-                        Bucket=self._bucket, Key=self._key
-                    )["UploadId"]
-        return self.__upload_id
-
-    @property
-    def _buffer_size(self):
-        return self._total_buffer_size - sum(
-            future.result().content_size
-            for future in self._futures.values()
-            if future.done()
-        )
-
-    @property
-    def _uploading_futures(self):
-        return [future for future in self._futures.values() if not future.done()]
+        if self.__upload_id is None:
+            with self.__upload_id_lock:
+                if self.__upload_id is None:
+                    with raise_s3_error(self.name):
+                        self.__upload_id = self._client.create_multipart_upload(
+                            Bucket=self._bucket, Key=self._key
+                        )["UploadId"]
+        return self.__upload_id
 
     @property
     def _multipart_upload(self):
-        return {
-            "Parts": [
-                future.result().asdict() for _, future in sorted(self._futures.items())
-            ]
-        }
+        for future in self._uploading_futures:
+            result = future.result()
+            self._total_buffer_size -= result.content_size
+            self._futures_result[result.part_number] = result.asdict()
+        self._uploading_futures = set()
+        return {"Parts": [result for _, result in sorted(self._futures_result.items())]}
 
     def _upload_buffer(self, part_number, content):
         with raise_s3_error(self.name):
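
The new _block_size property is the writer-side counterpart of the reader changes: instead of a fixed max_block_size, the part size now grows with the part count so a single upload can get past S3's 10,000-part limit without making small uploads buffer more than block_size. Restated as a standalone sketch (the 8 MiB base and 128 MiB cap are assumed figures):

    def scaled_block_size(part_number: int, base: int = 8 * 2**20,
                          cap: int = 128 * 2**20) -> int:
        # Mirrors the _block_size schedule in the hunk above.
        if part_number < 10:
            return base
        if part_number < 100:
            return min(base * 2, cap)
        if part_number < 1000:
            return min(base * 4, cap)
        if part_number < 10000:
            return min(base * 8, cap)
        return min(base * 16, cap)  # parts beyond 10,000 cannot happen on S3 anyway

    for part in (1, 10, 100, 1000, 10000):
        print(part, scaled_block_size(part) // 2**20, "MiB")  # 8, 16, 32, 64, 128 MiB

Under those assumed figures the schedule allows roughly 600 GB in a single multipart upload before hitting the part cap, versus about 80 GB with a fixed 8 MiB part.
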
@@ -163,33 +155,47 @@
             len(content),
         )
 
-    def _submit_upload_buffer(self, part_number, content):
-        self._futures[part_number] = self._executor.submit(
-            self._upload_buffer, part_number, content
+    def _submit_upload_buffer(self, part_number: int, content: bytes):
+        self._uploading_futures.add(
+            self._executor.submit(self._upload_buffer, part_number, content)
         )
         self._total_buffer_size += len(content)
-        while self._buffer_size > self._max_buffer_size:
-            wait(self._uploading_futures, return_when=FIRST_COMPLETED)
+
+        while (
+            self._uploading_futures and self._total_buffer_size >= self._max_buffer_size
+        ):
+            wait_result = wait(self._uploading_futures, return_when=FIRST_COMPLETED)
+            for future in wait_result.done:
+                result = future.result()
+                self._total_buffer_size -= result.content_size
+                self._futures_result[result.part_number] = result.asdict()
+            self._uploading_futures = wait_result.not_done
 
     def _submit_upload_content(self, content: bytes):
         # s3 part needs at least 5MB,
         # so we need to divide content into equal-size parts,
         # and give last part more size.
         # e.g. 257MB can be divided into 2 parts, 128MB and 129MB
-        offset = 0
-        while len(content) - offset - self._max_block_size > self._block_size:
+        block_size = self._block_size
+        while len(content) - block_size > self.MIN_BLOCK_SIZE:
+            self._part_number += 1
+            current_content, content = (
+                content[:block_size],
+                content[block_size:],
+            )
+            self._submit_upload_buffer(self._part_number, current_content)
+            block_size = self._block_size
+
+        if content:
             self._part_number += 1
-            offset_stop = offset + self._max_block_size
-            self._submit_upload_buffer(self._part_number, content[offset:offset_stop])
-            offset = offset_stop
-        self._part_number += 1
-        self._submit_upload_buffer(self._part_number, content[offset:])
+            self._submit_upload_buffer(self._part_number, content)
 
     def _submit_futures(self):
         content = self._buffer.getvalue()
         if len(content) == 0:
             return
-        self._buffer = BytesIO()
+        self._buffer.seek(0, os.SEEK_SET)
+        self._buffer.truncate()
         self._submit_upload_content(content)
 
     def write(self, data: bytes) -> int:
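
_submit_upload_content keeps slicing block_size-sized parts off the front of the pending bytes as long as what would remain still exceeds MIN_BLOCK_SIZE, so every part except possibly the oversized tail stays above S3's 5 MiB minimum. The split, isolated from the upload machinery (fixed block size for simplicity; the real loop re-reads the autoscaling _block_size each iteration):

    MIN_BLOCK_SIZE = 8 * 2**20

    def split_parts(content: bytes, block_size: int = 8 * 2**20) -> list:
        parts = []
        while len(content) - block_size > MIN_BLOCK_SIZE:
            parts.append(content[:block_size])
            content = content[block_size:]
        if content:
            parts.append(content)  # the tail may be bigger than block_size
        return parts

    sizes = [len(part) // 2**20 for part in split_parts(b"\x00" * (20 * 2**20))]
    print(sizes)  # [8, 12]: the last part absorbs the remainder
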
megfile/lib/s3_cached_handler.py CHANGED
@@ -19,8 +19,7 @@ class S3CachedHandler(S3MemoryHandler):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
-            # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
megfile/lib/s3_limited_seekable_writer.py CHANGED
@@ -4,9 +4,7 @@ from logging import getLogger as get_logger
 from typing import Optional
 
 from megfile.config import (
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MAX_BUFFER_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
 )
 from megfile.errors import raise_s3_error
 from megfile.interfaces import Seekable
@@ -29,11 +27,10 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         key: str,
         *,
         s3_client,
-        block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        block_size: int = S3BufferedWriter.MIN_BLOCK_SIZE,
         head_block_size: Optional[int] = None,
         tail_block_size: Optional[int] = None,
-        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+        max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
     ):
@@ -42,7 +39,6 @@
             key,
             s3_client=s3_client,
             block_size=block_size,
-            max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
             profile_name=profile_name,
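
S3LimitedSeekableWriter follows the same migration as its parent: max_block_size is gone, the default block_size is now the writer's 8 MiB MIN_BLOCK_SIZE, and memory pressure is controlled solely by max_buffer_size. A hedged construction sketch; the bucket, key, and 256 MiB budget are placeholders:

    import boto3
    from megfile.lib.s3_limited_seekable_writer import S3LimitedSeekableWriter

    writer = S3LimitedSeekableWriter(
        "my-bucket",
        "path/to/key",
        s3_client=boto3.client("s3"),
        max_buffer_size=256 * 2**20,  # replaces the removed max_block_size knob
    )
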
megfile/lib/s3_memory_handler.py CHANGED
@@ -22,8 +22,7 @@ class S3MemoryHandler(Readable[bytes], Seekable, Writable[bytes]):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
-            # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key
megfile/lib/s3_pipe_handler.py CHANGED
@@ -35,8 +35,7 @@ class S3PipeHandler(Readable[bytes], Writable[bytes]):
         profile_name: Optional[str] = None,
     ):
         if mode not in ("rb", "wb"):
-            # TODO: replace AssertionError with ValueError in 4.0.0
-            raise AssertionError("unacceptable mode: %r" % mode)
+            raise ValueError("unacceptable mode: %r" % mode)
 
         self._bucket = bucket
         self._key = key