megfile 3.0.4__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
megfile/http_path.py CHANGED
@@ -1,17 +1,19 @@
-import io
 import time
+from copy import deepcopy
 from functools import partial
-from io import BufferedReader
+from io import BufferedReader, BytesIO
 from logging import getLogger as get_logger
+from threading import Lock
 from typing import Iterable, Iterator, Optional, Tuple, Union
 
 import requests
+from urllib3 import HTTPResponse
 
-from megfile.config import DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
 from megfile.errors import http_should_retry, patch_method, translate_http_error
-from megfile.interfaces import PathLike, StatResult, URIPath
+from megfile.interfaces import PathLike, Readable, StatResult, URIPath
 from megfile.lib.compat import fspath
-from megfile.lib.http_prefetch_reader import HttpPrefetchReader
+from megfile.lib.http_prefetch_reader import DEFAULT_TIMEOUT, HttpPrefetchReader
 from megfile.lib.s3_buffered_writer import DEFAULT_MAX_BUFFER_SIZE
 from megfile.lib.url import get_url_scheme
 from megfile.pathlike import PathLike
@@ -27,11 +29,11 @@ __all__ = [
 ]
 
 _logger = get_logger(__name__)
-max_retries = 10
+max_retries = HTTP_MAX_RETRY_TIMES
 
 
 def get_http_session(
-        timeout: Union[int, Tuple[int, int]] = (9, 60),
+        timeout: Optional[Union[int, Tuple[int, int]]] = DEFAULT_TIMEOUT,
         status_forcelist: Iterable[int] = (500, 502, 503, 504)
 ) -> requests.Session:
     session = requests.Session()
@@ -81,7 +83,7 @@ def get_http_session(
             return file_object
         elif hasattr(file_object, 'name'):
             with SmartPath(file_object.name).open('rb') as f:
-                return io.BytesIO(f.read())
+                return BytesIO(f.read())
         else:
             _logger.warning(
                 f'Can not retry http request, because the file object is not seekable and unsupport "name"'
@@ -171,10 +173,12 @@ class HttpPath(URIPath):
     protocol = "http"
 
     def __init__(self, path: PathLike, *other_paths: PathLike):
-        if str(path).startswith('https://'):
-            self.protocol = 'https'
         super().__init__(path, *other_paths)
 
+        if fspath(path).startswith('https://'):
+            self.protocol = 'https'
+        self.request_kwargs = {}
+
     @binary_open
     def open(
             self,
@@ -203,9 +207,15 @@ class HttpPath(URIPath):
             raise ValueError('unacceptable mode: %r' % mode)
 
         response = None
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
         try:
-            response = get_http_session(status_forcelist=()).get(
-                self.path_with_protocol, stream=True)
+            response = get_http_session(
+                timeout=timeout,
+                status_forcelist=(),
+            ).get(
+                self.path_with_protocol, stream=stream, **request_kwargs)
             response.raise_for_status()
         except Exception as error:
             if response:
@@ -213,8 +223,9 @@ class HttpPath(URIPath):
             raise translate_http_error(error, self.path_with_protocol)
 
         content_size = int(response.headers['Content-Length'])
-        if response.headers.get(
-                'Accept-Ranges') == 'bytes' and content_size >= block_size * 2:
+        if (response.headers.get('Accept-Ranges') == 'bytes' and
+                content_size >= block_size * 2 and
+                not response.headers.get('Content-Encoding')):
             response.close()
 
             block_capacity = max_buffer_size // block_size
@@ -224,7 +235,7 @@ class HttpPath(URIPath):
             block_forward = max(int(block_capacity * forward_ratio), 1)
 
             reader = HttpPrefetchReader(
-                self.path_with_protocol,
+                self,
                 content_size=content_size,
                 max_retries=max_retries,
                 max_workers=max_concurrency,
@@ -233,12 +244,15 @@ class HttpPath(URIPath):
                 block_size=block_size,
             )
             if _is_pickle(reader):  # pytype: disable=wrong-arg-types
-                reader = io.BufferedReader(reader)  # pytype: disable=wrong-arg-types
+                reader = BufferedReader(reader)  # pytype: disable=wrong-arg-types
             return reader
 
-        response.raw.auto_close = False
         response.raw.name = self.path_with_protocol
-        return BufferedReader(response.raw)
+        # TODO: When python version must bigger than 3.10, use urllib3>=2.0.0 instead of 'Response'
+        # response.raw.auto_close = False
+        # response.raw.decode_content = True
+        # return BufferedReader(response.raw)
+        return BufferedReader(Response(response.raw))  # pytype: disable=wrong-arg-types
 
     def stat(self, follow_symlinks=True) -> StatResult:
         '''
@@ -249,9 +263,14 @@ class HttpPath(URIPath):
         :raises: HttpPermissionError, HttpFileNotFoundError
         '''
 
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
+
         try:
-            with get_http_session(status_forcelist=()).get(
-                    self.path_with_protocol, stream=True) as response:
+            with get_http_session(timeout=timeout, status_forcelist=()).get(
+                    self.path_with_protocol, stream=stream,
+                    **request_kwargs) as response:
                 response.raise_for_status()
                 headers = response.headers
         except Exception as error:
@@ -302,9 +321,14 @@ class HttpPath(URIPath):
        :return: return True if exists
        :rtype: bool
        """
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
+
        try:
-            with get_http_session(status_forcelist=()).get(
-                    self.path_with_protocol, stream=True) as response:
+            with get_http_session(timeout=timeout, status_forcelist=()).get(
+                    self.path_with_protocol, stream=stream,
+                    **request_kwargs) as response:
                 if response.status_code == 404:
                     return False
                 return True
@@ -316,3 +340,108 @@ class HttpPath(URIPath):
 class HttpsPath(HttpPath):
 
     protocol = "https"
+
+
+class Response(Readable):
+
+    def __init__(self, raw: HTTPResponse) -> None:
+        super().__init__()
+
+        raw.auto_close = False
+        self._block_size = 128 * 2**10  # 128KB
+        self._raw = raw
+        self._offset = 0
+        self._buffer = BytesIO()
+        self._lock = Lock()
+
+    @property
+    def name(self):
+        return self._raw.name
+
+    @property
+    def mode(self):
+        return 'rb'
+
+    def tell(self) -> int:
+        return self._offset
+
+    def _clear_buffer(self) -> None:
+        self._buffer.seek(0)
+        self._buffer.truncate()
+
+    def read(self, size: Optional[int] = None) -> bytes:
+        if size == 0:
+            return b''
+        if size is not None and size < 0:
+            size = None
+
+        with self._lock:
+            while not size or self._buffer.tell() < size:
+                data = self._raw.read(self._block_size, decode_content=True)
+                if not data:
+                    break
+                self._buffer.write(data)
+            self._buffer.seek(0)
+            content = self._buffer.read(size)
+            residue = self._buffer.read()
+            self._clear_buffer()
+            if residue:
+                self._buffer.write(residue)
+            self._offset += len(content)
+            return content
+
+    def readline(self, size: Optional[int] = None) -> bytes:
+        if size == 0:
+            return b''
+        if size is not None and size < 0:
+            size = None
+
+        with self._lock:
+            self._buffer.seek(0)
+            buffer = self._buffer.read()
+            self._clear_buffer()
+            if b'\n' in buffer:
+                content = buffer[:buffer.index(b'\n') + 1]
+                if size:
+                    content = content[:size]
+                self._buffer.write(buffer[len(content):])
+            elif size and len(buffer) >= size:
+                content = buffer[:size]
+                self._buffer.write(buffer[size:])
+            else:
+                content = None
+                self._buffer.write(buffer)
+                while True:
+                    if size and self._buffer.tell() >= size:
+                        break
+                    data = self._raw.read(self._block_size, decode_content=True)
+                    if not data:
+                        break
+                    elif b"\n" in data:
+                        last_content, residue = data.split(b"\n", 1)
+                        self._buffer.write(last_content)
+                        self._buffer.write(b"\n")
+                        self._buffer.seek(0)
+                        content = self._buffer.read()
+                        self._clear_buffer()
+                        if size and len(content) > size:
+                            self._buffer.write(content[size:])
+                            content = content[:size]
+                        if residue:
+                            self._buffer.write(residue)
+                        break
+                    else:
+                        self._buffer.write(data)
+
+            if content is None:
+                self._buffer.seek(0)
+                content = self._buffer.read(size)
+                residue = self._buffer.read()
+                self._clear_buffer()
+                if residue:
+                    self._buffer.write(residue)
+            self._offset += len(content)
+            return content
+
+    def _close(self) -> None:
+        return self._raw.close()
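Taken together, the http_path.py changes do two things: each `HttpPath` now carries a `request_kwargs` dict that `open()`, `stat()`, and `exists()` forward to `requests` (with `timeout` and `stream` popped out and passed explicitly), and small or non-range-capable responses are wrapped in the new `Response` adapter, a `Readable` with a locked internal buffer, instead of handing `response.raw` directly to `BufferedReader`. A minimal usage sketch, assuming only what the diff shows; the URL and header values are illustrative:

    from megfile.http_path import HttpPath

    path = HttpPath('https://example.com/data.txt')
    # Forwarded to the session's get() by open()/stat()/exists().
    path.request_kwargs = {
        'headers': {'Authorization': 'Bearer <token>'},  # hypothetical header
        'timeout': (10, 300),  # overrides DEFAULT_TIMEOUT
    }
    with path.open('rb') as f:
        first_line = f.readline()  # served through the Response wrapper's buffer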
megfile/lib/base_prefetch_reader.py CHANGED
@@ -8,7 +8,7 @@ from math import ceil
 from statistics import mean
 from typing import Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE
+from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, DEFAULT_MAX_RETRY_TIMES, GLOBAL_MAX_WORKERS, NEWLINE
 from megfile.interfaces import Readable, Seekable
 from megfile.utils import get_human_size, process_local
 
@@ -39,7 +39,7 @@ class BasePrefetchReader(Readable, Seekable, ABC):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = DEFAULT_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             **kwargs):
 
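The same substitution repeats in every reader below: the hard-coded `max_retries: int = 10` defaults become per-protocol constants from `megfile.config`. Their definitions are not shown in this diff; a plausible sketch, assuming they follow the usual pattern of environment-variable overrides (the `MEGFILE_*` names here are an assumption):

    import os

    # Hypothetical shape of the new constants in megfile/config.py;
    # the environment variable names are not confirmed by this diff.
    DEFAULT_MAX_RETRY_TIMES = int(os.getenv('MEGFILE_MAX_RETRY_TIMES', 10))
    S3_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_S3_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))
    HTTP_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_HTTP_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))
    HDFS_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_HDFS_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))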
megfile/lib/hdfs_prefetch_reader.py CHANGED
@@ -1,7 +1,7 @@
 from io import BytesIO
 from typing import Optional
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HDFS_MAX_RETRY_TIMES
 from megfile.errors import raise_hdfs_error
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
 
@@ -20,7 +20,7 @@ class HdfsPrefetchReader(BasePrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = HDFS_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
         self._path = hdfs_path
megfile/lib/http_prefetch_reader.py CHANGED
@@ -1,12 +1,15 @@
-import os
 from io import BytesIO
 from typing import Optional
 
 import requests
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
-from megfile.errors import UnsupportedError, http_should_retry, patch_method
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
+from megfile.errors import HttpBodyIncompleteError, UnsupportedError, http_should_retry, patch_method
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
+from megfile.lib.compat import fspath
+from megfile.pathlike import PathLike
+
+DEFAULT_TIMEOUT = (60, 60 * 60 * 24)
 
 
 class HttpPrefetchReader(BasePrefetchReader):
@@ -19,13 +22,13 @@ class HttpPrefetchReader(BasePrefetchReader):
 
     def __init__(
             self,
-            url: str,
+            url: PathLike,
             *,
             content_size: Optional[int] = None,
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = HTTP_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None):
 
         self._url = url
@@ -46,22 +49,28 @@ class HttpPrefetchReader(BasePrefetchReader):
         if first_index_response['Headers'].get('Accept-Ranges') != 'bytes':
             raise UnsupportedError(
                 f'Unsupported server, server must support Accept-Ranges: {self._url}',
-                path=self._url,
+                path=fspath(self._url),
             )
         return first_index_response['Headers']['Content-Length']
 
     @property
     def name(self) -> str:
-        return self._url
+        return fspath(self._url)
 
     def _fetch_response(
             self, start: Optional[int] = None,
             end: Optional[int] = None) -> dict:
 
         def fetch_response() -> dict:
+            request_kwargs = {}
+            if hasattr(self._url, 'request_kwargs'):
+                request_kwargs = self._url.request_kwargs
+            timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+            stream = request_kwargs.pop('stream', True)
+
             if start is None or end is None:
-                with requests.get(self._url, timeout=10,
-                                  stream=True) as response:
+                with requests.get(fspath(self._url), timeout=timeout,
+                                  stream=stream, **request_kwargs) as response:
                     return {
                         'Headers': response.headers,
                         'Cookies': response.cookies,
@@ -71,9 +80,16 @@ class HttpPrefetchReader(BasePrefetchReader):
             range_end = end
             if self._content_size is not None:
                 range_end = min(range_end, self._content_size - 1)
-            headers = {"Range": f"bytes={start}-{range_end}"}
-            with requests.get(self._url, timeout=10, headers=headers,
-                              stream=True) as response:
+            headers = request_kwargs.pop('headers', {})
+            headers["Range"] = f"bytes={start}-{range_end}"
+            with requests.get(fspath(self._url), timeout=timeout,
                              headers=headers, stream=stream,
+                              **request_kwargs) as response:
+                if len(response.content) != int(
+                        response.headers['Content-Length']):
+                    raise HttpBodyIncompleteError(
+                        f"The downloaded content is incomplete, expected size: {response.headers['Content-Length']}, actual size: {len(response.content)}",
+                    )
                 return {
                     'Body': BytesIO(response.content),
                     'Headers': response.headers,
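`HttpPrefetchReader` now takes the `HttpPath` object itself rather than a bare URL string: when the object exposes `request_kwargs`, every fetch reuses those options, and each Range response is length-checked against `Content-Length`, raising `HttpBodyIncompleteError` on a truncated body so the retry machinery can re-fetch the block instead of silently returning short data. A sketch of direct use, with illustrative URL, header, and sizes:

    from megfile.http_path import HttpPath
    from megfile.lib.http_prefetch_reader import HttpPrefetchReader

    path = HttpPath('https://example.com/large.bin')  # hypothetical URL
    path.request_kwargs = {'headers': {'X-Trace-Id': 'demo'}}  # hypothetical header

    # Ranged block fetches inherit path.request_kwargs; incomplete bodies
    # raise HttpBodyIncompleteError and are retried up to max_retries times.
    reader = HttpPrefetchReader(path, block_size=8 * 2**20)
    chunk = reader.read(1024)
    reader.close()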
megfile/lib/s3_prefetch_reader.py CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import Future
 from io import BytesIO
 from typing import Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE
+from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE, S3_MAX_RETRY_TIMES
 from megfile.errors import S3FileChangedError, S3InvalidRangeError, patch_method, raise_s3_error, s3_should_retry
 from megfile.lib.base_prefetch_reader import BasePrefetchReader, LRUCacheFutureManager
 
@@ -34,7 +34,7 @@ class S3PrefetchReader(BasePrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = S3_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
 
megfile/lib/s3_share_cache_reader.py CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import Future
 from logging import getLogger as get_logger
 from typing import Optional
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, S3_MAX_RETRY_TIMES
 from megfile.lib.s3_prefetch_reader import LRUCacheFutureManager, S3PrefetchReader
 from megfile.utils import thread_local
 
@@ -25,7 +25,7 @@ class S3ShareCacheReader(S3PrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = S3_MAX_RETRY_TIMES,
             cache_key: str = 'lru',
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
megfile/pathlike.py CHANGED
@@ -730,16 +730,23 @@ class URIPath(BaseURIPath):
         with self.open(mode='r') as f:
             return f.read()
 
-    def rename(self, dst_path: PathLike) -> 'URIPath':
+    def rename(self, dst_path: PathLike, overwrite: bool = True) -> 'URIPath':
+        '''
+        rename file
+
+        :param dst_path: Given destination path
+        :param overwrite: whether or not overwrite file when exists
+        '''
         raise NotImplementedError(f"'rename' is unsupported on '{type(self)}'")
 
-    def replace(self, dst_path: PathLike) -> 'URIPath':
+    def replace(self, dst_path: PathLike, overwrite: bool = True) -> 'URIPath':
         '''
         move file
 
         :param dst_path: Given destination path
+        :param overwrite: whether or not overwrite file when exists
         '''
-        return self.rename(dst_path=dst_path)
+        return self.rename(dst_path=dst_path, overwrite=overwrite)
 
     def rglob(self, pattern) -> List['URIPath']:
         '''
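`rename()` gains a documented `overwrite` flag in the abstract base so every concrete path class shares the same contract, and `replace()` forwards it. A minimal sketch, assuming an S3 path whose `rename` implements the flag; the bucket and keys are illustrative:

    from megfile import SmartPath

    src = SmartPath('s3://bucket/src.txt')  # hypothetical path
    # The default overwrite=True keeps the old behavior; with
    # overwrite=False, an existing destination file is not overwritten.
    src.rename('s3://bucket/dst.txt', overwrite=False)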
megfile/s3.py CHANGED
@@ -169,14 +169,16 @@ def s3_hasbucket(path: PathLike) -> bool:
     return S3Path(path).hasbucket()
 
 
-def s3_move(src_url: PathLike, dst_url: PathLike) -> None:
+def s3_move(
+        src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
     '''
     Move file/directory path from src_url to dst_url
 
     :param src_url: Given path
     :param dst_url: Given destination path
+    :param overwrite: whether or not overwrite file when exists
     '''
-    return S3Path(src_url).move(dst_url)
+    return S3Path(src_url).move(dst_url, overwrite)
 
 
 def s3_remove(path: PathLike, missing_ok: bool = False) -> None:
@@ -305,8 +307,9 @@ def s3_getmd5(
 def s3_copy(
         src_url: PathLike,
         dst_url: PathLike,
+        callback: Optional[Callable[[int], None]] = None,
         followlinks: bool = False,
-        callback: Optional[Callable[[int], None]] = None) -> None:
+        overwrite: bool = True) -> None:
     ''' File copy on S3
     Copy content of file on `src_path` to `dst_path`.
     It's caller's responsebility to ensure the s3_isfile(src_url) == True
@@ -314,24 +317,28 @@ def s3_copy(
     :param src_url: Given path
     :param dst_path: Target file path
     :param callback: Called periodically during copy, and the input parameter is the data size (in bytes) of copy since the last call
+    :param followlinks: False if regard symlink as file, else True
+    :param overwrite: whether or not overwrite file when exists, default is True
     '''
-    return S3Path(src_url).copy(dst_url, followlinks, callback)
+    return S3Path(src_url).copy(dst_url, callback, followlinks, overwrite)
 
 
 def s3_sync(
         src_url: PathLike,
         dst_url: PathLike,
         followlinks: bool = False,
-        force: bool = False) -> None:
+        force: bool = False,
+        overwrite: bool = True) -> None:
     '''
     Copy file/directory on src_url to dst_url
 
     :param src_url: Given path
     :param dst_url: Given destination path
     :param followlinks: False if regard symlink as file, else True
-    :param force: Sync file forcely, do not ignore same files
+    :param force: Sync file forcely, do not ignore same files, priority is higher than 'overwrite', default is False
+    :param overwrite: whether or not overwrite file when exists, default is True
     '''
-    return S3Path(src_url).sync(dst_url, followlinks, force)
+    return S3Path(src_url).sync(dst_url, followlinks, force, overwrite)
 
 
 def s3_symlink(src_path: PathLike, dst_path: PathLike) -> None:
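Note that `s3_copy` is a signature change, not a pure addition: `callback` moved ahead of `followlinks`, so 3.0.4 code calling `s3_copy(src, dst, some_flag)` positionally now binds that third argument to `callback`. Passing the trailing parameters by keyword sidesteps the reordering. A usage sketch with illustrative bucket names and a hypothetical callback:

    from megfile.s3 import s3_copy, s3_sync

    def on_progress(nbytes: int) -> None:  # hypothetical progress callback
        print(f'copied {nbytes} bytes')

    # Keyword arguments are robust against the parameter reordering.
    s3_copy('s3://bucket/a.txt', 's3://bucket/b.txt',
            callback=on_progress, overwrite=False)
    # force=True re-syncs even identical files and takes priority over overwrite.
    s3_sync('s3://bucket/dir', 's3://backup/dir', force=False, overwrite=True)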