megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +27 -39
- megfile/fs.py +169 -12
- megfile/fs_path.py +183 -260
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +71 -65
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +10 -19
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +126 -209
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +3 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
- megfile-4.0.0.post1.dist-info/RECORD +52 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.post1.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/hdfs.py
CHANGED
@@ -1,11 +1,8 @@
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
 
+from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
 from megfile.hdfs_path import (
     HdfsPath,
-    hdfs_glob,
-    hdfs_glob_stat,
-    hdfs_iglob,
-    hdfs_makedirs,
     is_hdfs,
 )
 from megfile.interfaces import FileEntry, PathLike, StatResult
@@ -300,8 +297,112 @@ def hdfs_open(
     buffering: Optional[int] = None,
     encoding: Optional[str] = None,
     errors: Optional[str] = None,
+    max_workers: Optional[int] = None,
+    max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+    block_forward: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
     **kwargs,
 ) -> IO:
+    """
+    Open a file on the specified path.
+
+    :param path: Given path
+    :param mode: Mode to open the file. Supports 'r', 'rb', 'w', 'wb', 'a', 'ab'.
+    :param buffering: Optional integer used to set the buffering policy.
+    :param encoding: Name of the encoding used to decode or encode the file.
+        Should only be used in text mode.
+    :param errors: Optional string specifying how encoding and decoding errors are
+        to be handled. Cannot be used in binary mode.
+    :param max_workers: Max download thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: Number of blocks of data for reader cached from the
+        offset position.
+    :param block_size: Size of a single block for reader, default is 8MB.
+    :returns: A file-like object.
+    :raises ValueError: If an unacceptable mode is provided.
+    """
     return HdfsPath(path).open(
-        mode,
+        mode,
+        buffering=buffering,
+        encoding=encoding,
+        errors=errors,
+        max_workers=max_workers,
+        max_buffer_size=max_buffer_size,
+        block_forward=block_forward,
+        block_size=block_size,
     )
+
+
+def hdfs_glob(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> List[str]:
+    """Return hdfs path list in ascending alphabetical order,
+    in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: A list contains paths match `path`
+    """
+    return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
+
+
+def hdfs_glob_stat(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> Iterator[FileEntry]:
+    """Return a generator contains tuples of path and file stat,
+    in ascending alphabetical order, in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: A generator contains tuples of path and file stat,
+        in which paths match `path`
+    """
+    return HdfsPath(path).glob_stat(
+        pattern="", recursive=recursive, missing_ok=missing_ok
+    )
+
+
+def hdfs_iglob(
+    path: PathLike, recursive: bool = True, missing_ok: bool = True
+) -> Iterator[str]:
+    """Return hdfs path iterator in ascending alphabetical order,
+    in which path matches glob pattern
+
+    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
+    raise UnsupportedError
+
+    :param recursive: If False, `**` will not search directory recursively
+    :param missing_ok: If False and target path doesn't match any file,
+        raise FileNotFoundError
+    :raises: UnsupportedError, when bucket part contains wildcard characters
+    :returns: An iterator contains paths match `path`
+    """
+    for path_obj in HdfsPath(path).iglob(
+        pattern="", recursive=recursive, missing_ok=missing_ok
+    ):
+        yield path_obj.path_with_protocol
+
+
+def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
+    """
+    Create an hdfs directory.
+    Purely creating directory is invalid because it's unavailable on OSS.
+    This function is to test the target bucket have WRITE access.
+
+    :param path: Given path
+    :param exist_ok: If False and target directory exists, raise S3FileExistsError
+    :raises: FileExistsError
+    """
+    return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
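
In 4.0.0, `hdfs_open` takes the prefetch-reader tuning options as explicit keyword arguments instead of passing them through `**kwargs`. A minimal usage sketch of the new signature; the HDFS path and the tuning values below are illustrative, not defaults:

from megfile.hdfs import hdfs_open

# Hypothetical path; values are illustrative overrides of the documented defaults.
with hdfs_open(
    "hdfs://mybucket/data/sample.bin",
    "rb",
    max_workers=4,               # download threads (None = shared 8-thread pool)
    max_buffer_size=32 * 2**20,  # cap the in-memory prefetch cache at 32MB (0 disables it)
    block_size=8 * 2**20,        # one 8MB block per download request
) as f:
    header = f.read(1024)
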
megfile/hdfs_path.py
CHANGED
@@ -6,6 +6,11 @@ import sys
 from functools import cached_property, lru_cache
 from typing import IO, BinaryIO, Iterator, List, Optional, Tuple
 
+from megfile.config import (
+    HDFS_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
+)
 from megfile.errors import _create_missing_ok_generator, raise_hdfs_error
 from megfile.interfaces import FileEntry, PathLike, StatResult, URIPath
 from megfile.lib.compat import fspath
@@ -19,17 +24,13 @@ from megfile.utils import _is_pickle
 __all__ = [
     "HdfsPath",
     "is_hdfs",
-    "hdfs_glob",
-    "hdfs_glob_stat",
-    "hdfs_iglob",
-    "hdfs_makedirs",
 ]
 
 HDFS_USER = "HDFS_USER"
 HDFS_URL = "HDFS_URL"
 HDFS_ROOT = "HDFS_ROOT"
 HDFS_TIMEOUT = "HDFS_TIMEOUT"
-HDFS_TOKEN = "HDFS_TOKEN"
+HDFS_TOKEN = "HDFS_TOKEN"  # nosec B105
 HDFS_CONFIG_PATH = "HDFS_CONFIG_PATH"
 MAX_RETRIES = 10
 DEFAULT_HDFS_TIMEOUT = 10
@@ -97,79 +98,6 @@ def get_hdfs_client(profile_name: Optional[str] = None):
     return hdfs_api.InsecureClient(**config)
 
 
-def hdfs_glob(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> List[str]:
-    """Return hdfs path list in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A list contains paths match `path`
-    """
-    return list(hdfs_iglob(path, recursive=recursive, missing_ok=missing_ok))
-
-
-def hdfs_glob_stat(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> Iterator[FileEntry]:
-    """Return a generator contains tuples of path and file stat,
-    in ascending alphabetical order, in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A generator contains tuples of path and file stat,
-        in which paths match `path`
-    """
-    return HdfsPath(path).glob_stat(
-        pattern="", recursive=recursive, missing_ok=missing_ok
-    )
-
-
-def hdfs_iglob(
-    path: PathLike, recursive: bool = True, missing_ok: bool = True
-) -> Iterator[str]:
-    """Return hdfs path iterator in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: An iterator contains paths match `path`
-    """
-    for path_obj in HdfsPath(path).iglob(
-        pattern="", recursive=recursive, missing_ok=missing_ok
-    ):
-        yield path_obj.path_with_protocol
-
-
-def hdfs_makedirs(path: PathLike, exist_ok: bool = False):
-    """
-    Create an hdfs directory.
-    Purely creating directory is invalid because it's unavailable on OSS.
-    This function is to test the target bucket have WRITE access.
-
-    :param path: Given path
-    :param exist_ok: If False and target directory exists, raise S3FileExistsError
-    :raises: FileExistsError
-    """
-    return HdfsPath(path).mkdir(parents=True, exist_ok=exist_ok)
-
-
 @SmartPath.register
 class HdfsPath(URIPath):
     protocol = "hdfs"
@@ -641,8 +569,31 @@ class HdfsPath(URIPath):
         buffering: Optional[int] = None,
         encoding: Optional[str] = None,
         errors: Optional[str] = None,
+        max_workers: Optional[int] = None,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+        block_forward: Optional[int] = None,
+        block_size: int = READER_BLOCK_SIZE,
         **kwargs,
     ) -> IO:
+        """
+        Open a file on the specified path.
+
+        :param mode: Mode to open the file. Supports 'r', 'rb', 'w', 'wb', 'a', 'ab'.
+        :param buffering: Optional integer used to set the buffering policy.
+        :param encoding: Name of the encoding used to decode or encode the file.
+            Should only be used in text mode.
+        :param errors: Optional string specifying how encoding and decoding errors are
+            to be handled. Cannot be used in binary mode.
+        :param max_workers: Max download thread number, `None` by default,
+            will use global thread pool with 8 threads.
+        :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+            Set to `0` will disable cache.
+        :param block_forward: Number of blocks of data for reader cached from the
+            offset position.
+        :param block_size: Size of a single block for reader, default is 8MB.
+        :returns: A file-like object.
+        :raises ValueError: If an unacceptable mode is provided.
+        """
         if "+" in mode:
             raise ValueError("unacceptable mode: %r" % mode)
 
@@ -653,22 +604,15 @@ class HdfsPath(URIPath):
 
         with raise_hdfs_error(self.path_with_protocol):
             if mode in ("r", "rb"):
-                keys = [
-                    "block_size",
-                    "block_capacity",
-                    "block_forward",
-                    "max_retries",
-                    "max_workers",
-                ]
-                input_kwargs = {}
-                for key in keys:
-                    if key in kwargs:
-                        input_kwargs[key] = kwargs[key]
                 file_obj = HdfsPrefetchReader(
                     hdfs_path=self.path_without_protocol,
                     client=self._client,
                     profile_name=self._profile_name,
-                    **input_kwargs,
+                    block_size=block_size,
+                    max_buffer_size=max_buffer_size,
+                    block_forward=block_forward,
+                    max_retries=HDFS_MAX_RETRY_TIMES,
+                    max_workers=max_workers,
                 )
                 if _is_pickle(file_obj):
                     file_obj = io.BufferedReader(file_obj)  # type: ignore
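
Because the `hdfs_glob` family now lives in `megfile.hdfs` and is gone from `megfile.hdfs_path`'s `__all__`, imports from the old location need updating. A sketch of the one-line migration (the paths shown are hypothetical):

# megfile < 4.0.0
# from megfile.hdfs_path import hdfs_glob, hdfs_makedirs

# megfile >= 4.0.0
from megfile.hdfs import hdfs_glob, hdfs_makedirs

hdfs_makedirs("hdfs://mybucket/output", exist_ok=True)
for path in hdfs_glob("hdfs://mybucket/output/**/*.json"):
    print(path)
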
megfile/http.py
CHANGED
@@ -1,4 +1,8 @@
-from
+from io import BufferedReader
+from typing import Optional, Union
+
+from megfile.config import READER_BLOCK_SIZE, READER_MAX_BUFFER_SIZE
+from megfile.http_path import HttpPath, HttpPrefetchReader, get_http_session, is_http
 from megfile.interfaces import PathLike, StatResult
 
 __all__ = [
@@ -12,6 +16,51 @@ __all__ = [
 ]
 
 
+def http_open(
+    path: PathLike,
+    mode: str = "rb",
+    *,
+    encoding: Optional[str] = None,
+    errors: Optional[str] = None,
+    max_workers: Optional[int] = None,
+    max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+    block_forward: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
+    **kwargs,
+) -> Union[BufferedReader, HttpPrefetchReader]:
+    """Open a BytesIO to read binary data of given http(s) url
+
+    .. note ::
+
+        Essentially, it reads data of http(s) url to memory by requests,
+        and then return BytesIO to user.
+
+    :param path: Given path
+    :param mode: Only supports 'r' or 'rb' mode now
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and decoding
+        errors are to be handled—this cannot be used in binary mode.
+    :param max_workers: Max download thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: How many blocks of data cached from offset position
+    :param block_size: Size of single block, 8MB by default. Each block will be uploaded
+        or downloaded by single thread.
+    :return: A file-like object with http(s) data
+    """
+    return HttpPath(path).open(
+        mode,
+        encoding=encoding,
+        errors=errors,
+        max_workers=max_workers,
+        max_buffer_size=max_buffer_size,
+        block_forward=block_forward,
+        block_size=block_size,
+    )
+
+
 def http_stat(path: PathLike, follow_symlinks=True) -> StatResult:
     """
     Get StatResult of http_url response, including size and mtime,
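
`http_open` is now defined in `megfile.http` and accepts the same reader-tuning keywords as the other protocols. A sketch, assuming the (hypothetical) URL is reachable:

from megfile.http import http_open

with http_open(
    "https://example.com/dataset.bin",  # hypothetical URL
    "rb",
    max_workers=2,    # per-reader download threads (None = shared pool)
    block_forward=4,  # prefetch at most 4 blocks ahead of the current offset
) as f:
    chunk = f.read(4096)
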
megfile/http_path.py
CHANGED
@@ -9,20 +9,27 @@ from typing import Iterable, Iterator, Optional, Tuple, Union
 import requests
 from urllib3 import HTTPResponse
 
-from megfile.config import
+from megfile.config import (
+    HTTP_MAX_RETRY_TIMES,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
+)
 from megfile.errors import http_should_retry, patch_method, translate_http_error
 from megfile.interfaces import PathLike, Readable, StatResult, URIPath
 from megfile.lib.compat import fspath
 from megfile.lib.http_prefetch_reader import DEFAULT_TIMEOUT, HttpPrefetchReader
-from megfile.lib.s3_buffered_writer import DEFAULT_MAX_BUFFER_SIZE
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
 from megfile.utils import _is_pickle, binary_open
 
-__all__ = [
+__all__ = [
+    "HttpPath",
+    "HttpsPath",
+    "get_http_session",
+    "is_http",
+]
 
 _logger = get_logger(__name__)
-max_retries = HTTP_MAX_RETRY_TIMES
 
 
 def get_http_session(
@@ -101,7 +108,7 @@ def get_http_session(
 
     session.request = patch_method(
         partial(session.request, timeout=timeout),
-        max_retries=max_retries,
+        max_retries=HTTP_MAX_RETRY_TIMES,
         should_retry=http_should_retry,
         before_callback=before_callback,
         after_callback=after_callback,
@@ -127,48 +134,6 @@ def is_http(path: PathLike) -> bool:
     return scheme == "http" or scheme == "https"
 
 
-def http_open(
-    path: PathLike,
-    mode: str = "rb",
-    *,
-    encoding: Optional[str] = None,
-    errors: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
-    max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-    forward_ratio: Optional[float] = None,
-    block_size: int = DEFAULT_BLOCK_SIZE,
-    **kwargs,
-) -> Union[BufferedReader, HttpPrefetchReader]:
-    """Open a BytesIO to read binary data of given http(s) url
-
-    .. note ::
-
-        Essentially, it reads data of http(s) url to memory by requests,
-        and then return BytesIO to user.
-
-    :param path: Given path
-    :param mode: Only supports 'rb' mode now
-    :param encoding: encoding is the name of the encoding used to decode or encode
-        the file. This should only be used in text mode.
-    :param errors: errors is an optional string that specifies how encoding and decoding
-        errors are to be handled—this cannot be used in binary mode.
-    :param max_concurrency: Max download thread number, None by default
-    :param max_buffer_size: Max cached buffer size in memory, 128MB by default
-    :param block_size: Size of single block, 8MB by default. Each block will be uploaded
-        or downloaded by single thread.
-    :return: BytesIO initialized with http(s) data
-    """
-    return HttpPath(path).open(
-        mode,
-        encoding=encoding,
-        errors=errors,
-        max_concurrency=max_concurrency,
-        max_buffer_size=max_buffer_size,
-        forward_ratio=forward_ratio,
-        block_size=block_size,
-    )
-
-
 @SmartPath.register
 class HttpPath(URIPath):
     protocol = "http"
@@ -185,10 +150,10 @@ class HttpPath(URIPath):
         self,
         mode: str = "rb",
         *,
-        max_concurrency: Optional[int] = None,
-        max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-        forward_ratio: Optional[float] = None,
-        block_size: int = DEFAULT_BLOCK_SIZE,
+        max_workers: Optional[int] = None,
+        max_buffer_size: int = READER_MAX_BUFFER_SIZE,
+        block_forward: Optional[int] = None,
+        block_size: int = READER_BLOCK_SIZE,
         **kwargs,
     ) -> Union[BufferedReader, HttpPrefetchReader]:
         """Open a BytesIO to read binary data of given http(s) url
@@ -198,16 +163,19 @@ class HttpPath(URIPath):
         Essentially, it reads data of http(s) url to memory by requests,
         and then return BytesIO to user.
 
-        :param mode: Only supports 'rb' mode now
+        :param mode: Only supports 'r' or 'rb' mode now
         :param encoding: encoding is the name of the encoding used to decode or encode
             the file. This should only be used in text mode.
         :param errors: errors is an optional string that specifies how encoding and
             decoding errors are to be handled—this cannot be used in binary mode.
-        :param max_concurrency: Max download thread number, None by default
-        :param max_buffer_size: Max cached buffer size in memory, 128MB by default
+        :param max_workers: Max download thread number, `None` by default,
+            will use global thread pool with 8 threads.
+        :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+            Set to `0` will disable cache.
+        :param block_forward: How many blocks of data cached from offset position
         :param block_size: Size of single block, 8MB by default. Each block will
             be uploaded or downloaded by single thread.
-        :return: BytesIO initialized with http(s) data
+        :return: A file-like object with http(s) data
         """
         if mode not in ("rb",):
             raise ValueError("unacceptable mode: %r" % mode)
@@ -234,20 +202,14 @@ class HttpPath(URIPath):
         ):
             response.close()
 
-        block_capacity = max_buffer_size // block_size
-        if forward_ratio is None:
-            block_forward = None
-        else:
-            block_forward = max(int(block_capacity * forward_ratio), 1)
-
         reader = HttpPrefetchReader(
             self,
             content_size=content_size,
-            max_retries=max_retries,
-            max_workers=max_concurrency,
-            block_capacity=block_capacity,
-            block_forward=block_forward,
             block_size=block_size,
+            max_buffer_size=max_buffer_size,
+            block_forward=block_forward,
+            max_retries=HTTP_MAX_RETRY_TIMES,
+            max_workers=max_workers,
         )
         if _is_pickle(reader):
             reader = BufferedReader(reader)  # type: ignore
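
For callers of `HttpPath.open`, the old keywords map onto the new ones: `max_concurrency` became `max_workers`, and the relative `forward_ratio` was replaced by an absolute `block_forward`. The removed body above shows the old conversion, so an equivalent `block_forward` can still be derived from a former ratio. A sketch with illustrative values (the URL is hypothetical):

from megfile.http_path import HttpPath

block_size = 8 * 2**20         # 8MB, the documented default
max_buffer_size = 128 * 2**20  # 128MB, the documented default
forward_ratio = 0.25           # illustrative pre-4.0.0 value

# Mirrors the removed pre-4.0.0 computation:
block_forward = max(int((max_buffer_size // block_size) * forward_ratio), 1)  # -> 4

reader = HttpPath("https://example.com/big.bin").open(
    "rb",
    max_workers=2,                # was: max_concurrency=2
    max_buffer_size=max_buffer_size,
    block_forward=block_forward,  # was: forward_ratio=0.25
    block_size=block_size,
)
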
megfile/interfaces.py
CHANGED
@@ -6,7 +6,6 @@ from typing import IO, AnyStr, Iterable, List, Optional
 from megfile.pathlike import (
     Access,
     BasePath,
-    BaseURIPath,
     FileEntry,
     PathLike,
     Self,
@@ -17,11 +16,9 @@ from megfile.pathlike import (
 __all__ = [
     "Access",
     "BasePath",
-    "BaseURIPath",
     "FileEntry",
     "PathLike",
     "StatResult",
-    "URIPath",
     "fullname",
     "Closable",
     "FileLike",
@@ -31,6 +28,7 @@ __all__ = [
     "FileCacher",
     "NullCacher",
     "ContextIterator",
+    "URIPath",
 ]
 
 
@@ -115,11 +113,6 @@ class FileLike(Closable, IOBase, IO[AnyStr], ABC):  # pytype: disable=signature-
         This is not implemented for read-only and non-blocking streams.
         """
 
-    def __del__(self) -> None:
-        # TODO: Next version should turn on __del__ for auto closing,
-        # and disable this in child class like CombineReader
-        pass
-
 
 class Seekable(FileLike, ABC):
     def seekable(self) -> bool: