megfile 3.0.4__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
megfile/http_path.py CHANGED
@@ -1,17 +1,19 @@
-import io
 import time
+from copy import deepcopy
 from functools import partial
-from io import BufferedReader
+from io import BufferedReader, BytesIO
 from logging import getLogger as get_logger
+from threading import Lock
 from typing import Iterable, Iterator, Optional, Tuple, Union
 
 import requests
+from urllib3 import HTTPResponse
 
-from megfile.config import DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
 from megfile.errors import http_should_retry, patch_method, translate_http_error
-from megfile.interfaces import PathLike, StatResult, URIPath
+from megfile.interfaces import PathLike, Readable, StatResult, URIPath
 from megfile.lib.compat import fspath
-from megfile.lib.http_prefetch_reader import HttpPrefetchReader
+from megfile.lib.http_prefetch_reader import DEFAULT_TIMEOUT, HttpPrefetchReader
 from megfile.lib.s3_buffered_writer import DEFAULT_MAX_BUFFER_SIZE
 from megfile.lib.url import get_url_scheme
 from megfile.pathlike import PathLike
@@ -27,11 +29,11 @@ __all__ = [
 ]
 
 _logger = get_logger(__name__)
-max_retries = 10
+max_retries = HTTP_MAX_RETRY_TIMES
 
 
 def get_http_session(
-        timeout: Union[int, Tuple[int, int]] = (9, 60),
+        timeout: Optional[Union[int, Tuple[int, int]]] = DEFAULT_TIMEOUT,
         status_forcelist: Iterable[int] = (500, 502, 503, 504)
 ) -> requests.Session:
     session = requests.Session()
@@ -81,7 +83,7 @@ def get_http_session(
             return file_object
         elif hasattr(file_object, 'name'):
             with SmartPath(file_object.name).open('rb') as f:
-                return io.BytesIO(f.read())
+                return BytesIO(f.read())
         else:
             _logger.warning(
                 f'Can not retry http request, because the file object is not seekable and unsupport "name"'
@@ -171,10 +173,12 @@ class HttpPath(URIPath):
     protocol = "http"
 
     def __init__(self, path: PathLike, *other_paths: PathLike):
-        if str(path).startswith('https://'):
-            self.protocol = 'https'
         super().__init__(path, *other_paths)
 
+        if fspath(path).startswith('https://'):
+            self.protocol = 'https'
+        self.request_kwargs = {}
+
     @binary_open
     def open(
             self,
@@ -203,9 +207,15 @@ class HttpPath(URIPath):
             raise ValueError('unacceptable mode: %r' % mode)
 
         response = None
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
         try:
-            response = get_http_session(status_forcelist=()).get(
-                self.path_with_protocol, stream=True)
+            response = get_http_session(
+                timeout=timeout,
+                status_forcelist=(),
+            ).get(
+                self.path_with_protocol, stream=stream, **request_kwargs)
             response.raise_for_status()
         except Exception as error:
             if response:
@@ -213,8 +223,9 @@ class HttpPath(URIPath):
             raise translate_http_error(error, self.path_with_protocol)
 
         content_size = int(response.headers['Content-Length'])
-        if response.headers.get(
-                'Accept-Ranges') == 'bytes' and content_size >= block_size * 2:
+        if (response.headers.get('Accept-Ranges') == 'bytes' and
+                content_size >= block_size * 2 and
+                not response.headers.get('Content-Encoding')):
             response.close()
 
             block_capacity = max_buffer_size // block_size
@@ -224,7 +235,7 @@ class HttpPath(URIPath):
             block_forward = max(int(block_capacity * forward_ratio), 1)
 
             reader = HttpPrefetchReader(
-                self.path_with_protocol,
+                self,
                 content_size=content_size,
                 max_retries=max_retries,
                 max_workers=max_concurrency,
@@ -233,12 +244,15 @@ class HttpPath(URIPath):
                 block_size=block_size,
             )
             if _is_pickle(reader):  # pytype: disable=wrong-arg-types
-                reader = io.BufferedReader(reader)  # pytype: disable=wrong-arg-types
+                reader = BufferedReader(reader)  # pytype: disable=wrong-arg-types
             return reader
 
-        response.raw.auto_close = False
         response.raw.name = self.path_with_protocol
-        return BufferedReader(response.raw)
+        # TODO: When python version must bigger than 3.10, use urllib3>=2.0.0 instead of 'Response'
+        # response.raw.auto_close = False
+        # response.raw.decode_content = True
+        # return BufferedReader(response.raw)
+        return BufferedReader(Response(response.raw))  # pytype: disable=wrong-arg-types
 
     def stat(self, follow_symlinks=True) -> StatResult:
         '''
@@ -249,9 +263,14 @@ class HttpPath(URIPath):
         :raises: HttpPermissionError, HttpFileNotFoundError
         '''
 
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
+
         try:
-            with get_http_session(status_forcelist=()).get(
-                    self.path_with_protocol, stream=True) as response:
+            with get_http_session(timeout=timeout, status_forcelist=()).get(
+                    self.path_with_protocol, stream=stream,
+                    **request_kwargs) as response:
                 response.raise_for_status()
                 headers = response.headers
         except Exception as error:
@@ -302,9 +321,14 @@ class HttpPath(URIPath):
        :return: return True if exists
        :rtype: bool
        """
+        request_kwargs = deepcopy(self.request_kwargs)
+        timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+        stream = request_kwargs.pop('stream', True)
+
        try:
-            with get_http_session(status_forcelist=()).get(
-                    self.path_with_protocol, stream=True) as response:
+            with get_http_session(timeout=timeout, status_forcelist=()).get(
+                    self.path_with_protocol, stream=stream,
+                    **request_kwargs) as response:
                 if response.status_code == 404:
                     return False
                 return True
@@ -316,3 +340,108 @@ class HttpPath(URIPath):
 class HttpsPath(HttpPath):
 
     protocol = "https"
+
+
+class Response(Readable):
+
+    def __init__(self, raw: HTTPResponse) -> None:
+        super().__init__()
+
+        raw.auto_close = False
+        self._block_size = 128 * 2**10  # 128KB
+        self._raw = raw
+        self._offset = 0
+        self._buffer = BytesIO()
+        self._lock = Lock()
+
+    @property
+    def name(self):
+        return self._raw.name
+
+    @property
+    def mode(self):
+        return 'rb'
+
+    def tell(self) -> int:
+        return self._offset
+
+    def _clear_buffer(self) -> None:
+        self._buffer.seek(0)
+        self._buffer.truncate()
+
+    def read(self, size: Optional[int] = None) -> bytes:
+        if size == 0:
+            return b''
+        if size is not None and size < 0:
+            size = None
+
+        with self._lock:
+            while not size or self._buffer.tell() < size:
+                data = self._raw.read(self._block_size, decode_content=True)
+                if not data:
+                    break
+                self._buffer.write(data)
+            self._buffer.seek(0)
+            content = self._buffer.read(size)
+            residue = self._buffer.read()
+            self._clear_buffer()
+            if residue:
+                self._buffer.write(residue)
+            self._offset += len(content)
+            return content
+
+    def readline(self, size: Optional[int] = None) -> bytes:
+        if size == 0:
+            return b''
+        if size is not None and size < 0:
+            size = None
+
+        with self._lock:
+            self._buffer.seek(0)
+            buffer = self._buffer.read()
+            self._clear_buffer()
+            if b'\n' in buffer:
+                content = buffer[:buffer.index(b'\n') + 1]
+                if size:
+                    content = content[:size]
+                self._buffer.write(buffer[len(content):])
+            elif size and len(buffer) >= size:
+                content = buffer[:size]
+                self._buffer.write(buffer[size:])
+            else:
+                content = None
+                self._buffer.write(buffer)
+                while True:
+                    if size and self._buffer.tell() >= size:
+                        break
+                    data = self._raw.read(self._block_size, decode_content=True)
+                    if not data:
+                        break
+                    elif b"\n" in data:
+                        last_content, residue = data.split(b"\n", 1)
+                        self._buffer.write(last_content)
+                        self._buffer.write(b"\n")
+                        self._buffer.seek(0)
+                        content = self._buffer.read()
+                        self._clear_buffer()
+                        if size and len(content) > size:
+                            self._buffer.write(content[size:])
+                            content = content[:size]
+                        if residue:
+                            self._buffer.write(residue)
+                        break
+                    else:
+                        self._buffer.write(data)
+
+            if content is None:
+                self._buffer.seek(0)
+                content = self._buffer.read(size)
+                residue = self._buffer.read()
+                self._clear_buffer()
+                if residue:
+                    self._buffer.write(residue)
+            self._offset += len(content)
+            return content
+
+    def _close(self) -> None:
+        return self._raw.close()
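Taken together, the http_path.py changes do two things: each `HttpPath` now carries a `request_kwargs` dict that `open()`, `stat()`, and `exists()` forward to `requests` (with `timeout` and `stream` popped out and passed explicitly), and small or non-range-capable responses are wrapped in the new `Response` adapter, a `Readable` with a locked internal buffer, instead of handing `response.raw` directly to `BufferedReader`. A minimal usage sketch, assuming only what the diff shows; the URL and header values are illustrative:

    from megfile.http_path import HttpPath

    path = HttpPath('https://example.com/data.txt')
    # Forwarded to the session's get() by open()/stat()/exists().
    path.request_kwargs = {
        'headers': {'Authorization': 'Bearer <token>'},  # hypothetical header
        'timeout': (10, 300),  # overrides DEFAULT_TIMEOUT
    }
    with path.open('rb') as f:
        first_line = f.readline()  # served through the Response wrapper's buffer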
megfile/lib/base_prefetch_reader.py CHANGED
@@ -8,7 +8,7 @@ from math import ceil
 from statistics import mean
 from typing import Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE
+from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, DEFAULT_MAX_RETRY_TIMES, GLOBAL_MAX_WORKERS, NEWLINE
 from megfile.interfaces import Readable, Seekable
 from megfile.utils import get_human_size, process_local
 
@@ -39,7 +39,7 @@ class BasePrefetchReader(Readable, Seekable, ABC):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = DEFAULT_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             **kwargs):
 
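The same substitution repeats in every reader below: the hard-coded `max_retries: int = 10` defaults become per-protocol constants from `megfile.config`. Their definitions are not shown in this diff; a plausible sketch, assuming they follow the usual pattern of environment-variable overrides (the `MEGFILE_*` names here are an assumption):

    import os

    # Hypothetical shape of the new constants in megfile/config.py;
    # the environment variable names are not confirmed by this diff.
    DEFAULT_MAX_RETRY_TIMES = int(os.getenv('MEGFILE_MAX_RETRY_TIMES', 10))
    S3_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_S3_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))
    HTTP_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_HTTP_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))
    HDFS_MAX_RETRY_TIMES = int(
        os.getenv('MEGFILE_HDFS_MAX_RETRY_TIMES', DEFAULT_MAX_RETRY_TIMES))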
megfile/lib/hdfs_prefetch_reader.py CHANGED
@@ -1,7 +1,7 @@
 from io import BytesIO
 from typing import Optional
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HDFS_MAX_RETRY_TIMES
 from megfile.errors import raise_hdfs_error
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
 
@@ -20,7 +20,7 @@ class HdfsPrefetchReader(BasePrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = HDFS_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
         self._path = hdfs_path
megfile/lib/http_prefetch_reader.py CHANGED
@@ -1,12 +1,15 @@
-import os
 from io import BytesIO
 from typing import Optional
 
 import requests
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
-from megfile.errors import UnsupportedError, http_should_retry, patch_method
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, HTTP_MAX_RETRY_TIMES
+from megfile.errors import HttpBodyIncompleteError, UnsupportedError, http_should_retry, patch_method
 from megfile.lib.base_prefetch_reader import BasePrefetchReader
+from megfile.lib.compat import fspath
+from megfile.pathlike import PathLike
+
+DEFAULT_TIMEOUT = (60, 60 * 60 * 24)
 
 
 class HttpPrefetchReader(BasePrefetchReader):
@@ -19,13 +22,13 @@ class HttpPrefetchReader(BasePrefetchReader):
 
     def __init__(
             self,
-            url: str,
+            url: PathLike,
             *,
             content_size: Optional[int] = None,
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = HTTP_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None):
 
         self._url = url
@@ -46,22 +49,28 @@ class HttpPrefetchReader(BasePrefetchReader):
         if first_index_response['Headers'].get('Accept-Ranges') != 'bytes':
             raise UnsupportedError(
                 f'Unsupported server, server must support Accept-Ranges: {self._url}',
-                path=self._url,
+                path=fspath(self._url),
             )
         return first_index_response['Headers']['Content-Length']
 
     @property
     def name(self) -> str:
-        return self._url
+        return fspath(self._url)
 
     def _fetch_response(
             self, start: Optional[int] = None,
             end: Optional[int] = None) -> dict:
 
         def fetch_response() -> dict:
+            request_kwargs = {}
+            if hasattr(self._url, 'request_kwargs'):
+                request_kwargs = self._url.request_kwargs
+            timeout = request_kwargs.pop('timeout', DEFAULT_TIMEOUT)
+            stream = request_kwargs.pop('stream', True)
+
             if start is None or end is None:
-                with requests.get(self._url, timeout=10,
-                                  stream=True) as response:
+                with requests.get(fspath(self._url), timeout=timeout,
+                                  stream=stream, **request_kwargs) as response:
                     return {
                         'Headers': response.headers,
                         'Cookies': response.cookies,
@@ -71,9 +80,16 @@ class HttpPrefetchReader(BasePrefetchReader):
             range_end = end
             if self._content_size is not None:
                 range_end = min(range_end, self._content_size - 1)
-            headers = {"Range": f"bytes={start}-{range_end}"}
-            with requests.get(self._url, timeout=10, headers=headers,
-                              stream=True) as response:
+            headers = request_kwargs.pop('headers', {})
+            headers["Range"] = f"bytes={start}-{range_end}"
+            with requests.get(fspath(self._url), timeout=timeout,
                              headers=headers, stream=stream,
+                              **request_kwargs) as response:
+                if len(response.content) != int(
+                        response.headers['Content-Length']):
+                    raise HttpBodyIncompleteError(
+                        f"The downloaded content is incomplete, expected size: {response.headers['Content-Length']}, actual size: {len(response.content)}",
+                    )
                 return {
                     'Body': BytesIO(response.content),
                     'Headers': response.headers,
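`HttpPrefetchReader` now takes the `HttpPath` object itself rather than a bare URL string: when the object exposes `request_kwargs`, every fetch reuses those options, and each Range response is length-checked against `Content-Length`, raising `HttpBodyIncompleteError` on a truncated body so the retry machinery can re-fetch the block instead of silently returning short data. A sketch of direct use, with illustrative URL, header, and sizes:

    from megfile.http_path import HttpPath
    from megfile.lib.http_prefetch_reader import HttpPrefetchReader

    path = HttpPath('https://example.com/large.bin')  # hypothetical URL
    path.request_kwargs = {'headers': {'X-Trace-Id': 'demo'}}  # hypothetical header

    # Ranged block fetches inherit path.request_kwargs; incomplete bodies
    # raise HttpBodyIncompleteError and are retried up to max_retries times.
    reader = HttpPrefetchReader(path, block_size=8 * 2**20)
    chunk = reader.read(1024)
    reader.close()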
megfile/lib/s3_prefetch_reader.py CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import Future
 from io import BytesIO
 from typing import Optional
 
-from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE
+from megfile.config import BACKOFF_FACTOR, BACKOFF_INITIAL, DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, NEWLINE, S3_MAX_RETRY_TIMES
 from megfile.errors import S3FileChangedError, S3InvalidRangeError, patch_method, raise_s3_error, s3_should_retry
 from megfile.lib.base_prefetch_reader import BasePrefetchReader, LRUCacheFutureManager
 
@@ -34,7 +34,7 @@ class S3PrefetchReader(BasePrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = S3_MAX_RETRY_TIMES,
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
 
megfile/lib/s3_share_cache_reader.py CHANGED
@@ -3,7 +3,7 @@ from concurrent.futures import Future
 from logging import getLogger as get_logger
 from typing import Optional
 
-from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE
+from megfile.config import DEFAULT_BLOCK_CAPACITY, DEFAULT_BLOCK_SIZE, S3_MAX_RETRY_TIMES
 from megfile.lib.s3_prefetch_reader import LRUCacheFutureManager, S3PrefetchReader
 from megfile.utils import thread_local
 
@@ -25,7 +25,7 @@ class S3ShareCacheReader(S3PrefetchReader):
             block_size: int = DEFAULT_BLOCK_SIZE,
             block_capacity: int = DEFAULT_BLOCK_CAPACITY,
             block_forward: Optional[int] = None,
-            max_retries: int = 10,
+            max_retries: int = S3_MAX_RETRY_TIMES,
             cache_key: str = 'lru',
             max_workers: Optional[int] = None,
             profile_name: Optional[str] = None):
megfile/pathlike.py CHANGED
@@ -730,16 +730,23 @@ class URIPath(BaseURIPath):
         with self.open(mode='r') as f:
             return f.read()
 
-    def rename(self, dst_path: PathLike) -> 'URIPath':
+    def rename(self, dst_path: PathLike, overwrite: bool = True) -> 'URIPath':
+        '''
+        rename file
+
+        :param dst_path: Given destination path
+        :param overwrite: whether or not overwrite file when exists
+        '''
         raise NotImplementedError(f"'rename' is unsupported on '{type(self)}'")
 
-    def replace(self, dst_path: PathLike) -> 'URIPath':
+    def replace(self, dst_path: PathLike, overwrite: bool = True) -> 'URIPath':
         '''
         move file
 
         :param dst_path: Given destination path
+        :param overwrite: whether or not overwrite file when exists
         '''
-        return self.rename(dst_path=dst_path)
+        return self.rename(dst_path=dst_path, overwrite=overwrite)
 
     def rglob(self, pattern) -> List['URIPath']:
         '''
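`rename()` gains a documented `overwrite` flag in the abstract base so every concrete path class shares the same contract, and `replace()` forwards it. A minimal sketch, assuming an S3 path whose `rename` implements the flag; the bucket and keys are illustrative:

    from megfile import SmartPath

    src = SmartPath('s3://bucket/src.txt')  # hypothetical path
    # The default overwrite=True keeps the old behavior; with
    # overwrite=False, an existing destination file is not overwritten.
    src.rename('s3://bucket/dst.txt', overwrite=False)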
megfile/s3.py CHANGED
@@ -169,14 +169,16 @@ def s3_hasbucket(path: PathLike) -> bool:
     return S3Path(path).hasbucket()
 
 
-def s3_move(src_url: PathLike, dst_url: PathLike) -> None:
+def s3_move(
+        src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
     '''
     Move file/directory path from src_url to dst_url
 
     :param src_url: Given path
     :param dst_url: Given destination path
+    :param overwrite: whether or not overwrite file when exists
     '''
-    return S3Path(src_url).move(dst_url)
+    return S3Path(src_url).move(dst_url, overwrite)
 
 
 def s3_remove(path: PathLike, missing_ok: bool = False) -> None:
@@ -305,8 +307,9 @@ def s3_getmd5(
 def s3_copy(
         src_url: PathLike,
         dst_url: PathLike,
+        callback: Optional[Callable[[int], None]] = None,
         followlinks: bool = False,
-        callback: Optional[Callable[[int], None]] = None) -> None:
+        overwrite: bool = True) -> None:
     ''' File copy on S3
     Copy content of file on `src_path` to `dst_path`.
     It's caller's responsebility to ensure the s3_isfile(src_url) == True
@@ -314,24 +317,28 @@ def s3_copy(
     :param src_url: Given path
     :param dst_path: Target file path
     :param callback: Called periodically during copy, and the input parameter is the data size (in bytes) of copy since the last call
+    :param followlinks: False if regard symlink as file, else True
+    :param overwrite: whether or not overwrite file when exists, default is True
     '''
-    return S3Path(src_url).copy(dst_url, followlinks, callback)
+    return S3Path(src_url).copy(dst_url, callback, followlinks, overwrite)
 
 
 def s3_sync(
         src_url: PathLike,
         dst_url: PathLike,
         followlinks: bool = False,
-        force: bool = False) -> None:
+        force: bool = False,
+        overwrite: bool = True) -> None:
     '''
     Copy file/directory on src_url to dst_url
 
     :param src_url: Given path
     :param dst_url: Given destination path
     :param followlinks: False if regard symlink as file, else True
-    :param force: Sync file forcely, do not ignore same files
+    :param force: Sync file forcely, do not ignore same files, priority is higher than 'overwrite', default is False
+    :param overwrite: whether or not overwrite file when exists, default is True
     '''
-    return S3Path(src_url).sync(dst_url, followlinks, force)
+    return S3Path(src_url).sync(dst_url, followlinks, force, overwrite)
 
 
 def s3_symlink(src_path: PathLike, dst_path: PathLike) -> None:
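Note that `s3_copy` is a signature change, not a pure addition: `callback` moved ahead of `followlinks`, so 3.0.4 code calling `s3_copy(src, dst, some_flag)` positionally now binds that third argument to `callback`. Passing the trailing parameters by keyword sidesteps the reordering. A usage sketch with illustrative bucket names and a hypothetical callback:

    from megfile.s3 import s3_copy, s3_sync

    def on_progress(nbytes: int) -> None:  # hypothetical progress callback
        print(f'copied {nbytes} bytes')

    # Keyword arguments are robust against the parameter reordering.
    s3_copy('s3://bucket/a.txt', 's3://bucket/b.txt',
            callback=on_progress, overwrite=False)
    # force=True re-syncs even identical files and takes priority over overwrite.
    s3_sync('s3://bucket/dir', 's3://backup/dir', force=False, overwrite=True)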