megfile 4.2.2__py3-none-any.whl → 4.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/__init__.py CHANGED
@@ -206,6 +206,11 @@ from megfile.stdio import is_stdio, stdio_open
 from megfile.stdio_path import StdioPath
 from megfile.version import VERSION as __version__  # noqa: F401
 
+try:
+    from megfile.sftp2_path import Sftp2Path
+except ImportError:
+    Sftp2Path = None
+
 __all__ = [
     "smart_access",
     "smart_cache",
megfile/cli.py CHANGED
@@ -47,7 +47,7 @@ from megfile.smart import (
     smart_unlink,
 )
 from megfile.smart_path import SmartPath
-from megfile.utils import get_human_size
+from megfile.utils import copyfileobj_multi, get_human_size
 from megfile.version import VERSION
 
 options = {}
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
         smart_open(path, mode) as f,
         smart_open("stdio://1", "wb") as stdout_fd,
     ):
-        length = 16 * 1024
-        while True:
-            buf = stdin.read(length)
-            if not buf:
-                break
-            f.write(buf)
-            if stdout:
-                stdout_fd.write(buf)
+        destinations = [f]
+        if stdout:
+            destinations.append(stdout_fd)
+        copyfileobj_multi(stdin, destinations)
 
 
 @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
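
The removed read/write loop is replaced by `copyfileobj_multi` from `megfile.utils`, whose implementation is not shown in this diff. A rough sketch of what the call site implies (one source fanned out to several destinations); the signature is an assumption, not the shipped code:

    from megfile.config import DEFAULT_COPY_BUFFER_SIZE


    def copyfileobj_multi(fsrc, fdsts, length=DEFAULT_COPY_BUFFER_SIZE):
        # Sketch only: read from one source and write each chunk to every
        # destination, mirroring the loop that the cli.py change removed.
        while True:
            buf = fsrc.read(length)
            if not buf:
                break
            for fdst in fdsts:
                fdst.write(buf)
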
megfile/config.py CHANGED
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
 READER_MAX_BUFFER_SIZE = parse_quantity(
     os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
 )
+READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)
 
 # Multi-upload in aws s3 has a maximum of 10,000 parts,
 # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
 
 NEWLINE = ord("\n")
 
+# Default buffer sizes for various operations
+DEFAULT_COPY_BUFFER_SIZE = 16 * 1024  # 16KB, same as shutil.copyfileobj
+DEFAULT_HASH_BUFFER_SIZE = 4 * 1024  # 4KB for hash calculations
+
 S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
 
 DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
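
The new `READER_LAZY_PREFETCH` flag is parsed once from an environment variable when `megfile.config` is imported. A usage sketch (the accepted string values depend on `parse_boolean`):

    import os

    # Set the flag before megfile.config is imported; it is read only once,
    # at module import time.
    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"

    from megfile.config import READER_LAZY_PREFETCH

    print(READER_LAZY_PREFETCH)  # expected: True, assuming parse_boolean accepts "true"
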
megfile/fs_path.py CHANGED
@@ -27,7 +27,7 @@ from megfile.lib.glob import iglob
 from megfile.lib.joinpath import path_join
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import calculate_md5
+from megfile.utils import calculate_md5, copyfd
 
 __all__ = [
     "FSPath",
@@ -737,15 +737,7 @@ class FSPath(URIPath):
     ):
         if isinstance(self.path_without_protocol, int):
             with open(fspath(dst_path), "wb") as fdst:
-                # This magic number is copied from copyfileobj
-                length = 16 * 1024
-                while True:
-                    buf = os.read(self.path_without_protocol, length)  # pyre-ignore[6]
-                    if not buf:
-                        break
-                    fdst.write(buf)
-                    if callback:
-                        callback(len(buf))
+                copyfd(self.path_without_protocol, fdst, callback)
         else:
             shutil.copy2(
                 self.path_without_protocol,  # pyre-ignore[6]
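
As with `copyfileobj_multi`, the `copyfd` helper from `megfile.utils` is not shown in this diff. Judging from the loop it replaces, it copies from a raw file descriptor into a file object while reporting progress; the following is a sketch under that assumption, not the shipped implementation:

    import os

    from megfile.config import DEFAULT_COPY_BUFFER_SIZE


    def copyfd(fd, fdst, callback=None, length=DEFAULT_COPY_BUFFER_SIZE):
        # Sketch only: mirrors the removed FSPath loop -- read raw bytes from
        # the descriptor, write them out, and report progress per chunk.
        while True:
            buf = os.read(fd, length)
            if not buf:
                break
            fdst.write(buf)
            if callback:
                callback(len(buf))
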
@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
         self._offset = 0
         self._cached_buffer = None
-        self._block_index = None  # Current block index
+        self._block_index = 0  # Current block index
+        self._cached_offset = 0  # Current offset in the current block
         self._seek_history = []
-
         self._seek_buffer(0)
 
         _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
-        return LRUCacheFutureManager()
+        futures = LRUCacheFutureManager()
+        futures.register(self.name)
+        return futures
 
     @property
     @abstractmethod
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if size == 0 or self._offset >= self._content_size:
             return b""
 
-        data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
-            "Body"
-        ].read()
+        resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
+        data = resp["Body"].read()
         self.seek(size, os.SEEK_CUR)
         return data
 
@@ -369,12 +370,17 @@
 class LRUCacheFutureManager(OrderedDict):
     def __init__(self):
         super().__init__()
+        self._name = None
+
+    def register(self, name):
+        self._name = name
 
     def submit(self, executor, key, *args, **kwargs):
         if key in self:
             self.move_to_end(key, last=True)
             return
         self[key] = executor.submit(*args, **kwargs)
+        _logger.debug("submit future: %r, key: %r" % (self._name, key))
 
     @property
     def finished(self):
@@ -385,7 +391,12 @@ class LRUCacheFutureManager(OrderedDict):
         return self[key].result()
 
     def cleanup(self, block_capacity: int):
+        keys = []
         while len(self) > block_capacity:
-            _, future = self.popitem(last=False)
+            key, future = self.popitem(last=False)
+            keys.append(key)
             if not future.done():
                 future.cancel()
+        if keys:
+            _logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
+        return keys
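
Taken together, the `LRUCacheFutureManager` changes add a `register(name)` hook for log context and make `cleanup()` return the evicted keys instead of discarding them. A small hedged demo; the import path is a guess based on the class names shown in this diff:

    from concurrent.futures import ThreadPoolExecutor

    from megfile.lib.base_prefetch_reader import LRUCacheFutureManager

    executor = ThreadPoolExecutor(max_workers=2)
    futures = LRUCacheFutureManager()
    futures.register("demo-reader")  # the name is only used for debug logging

    for block_index in range(4):
        futures.submit(executor, block_index, lambda i=block_index: i * 2)

    evicted = futures.cleanup(block_capacity=2)  # now returns the evicted keys
    print(evicted)  # expected: the two oldest keys, [0, 1]
    executor.shutdown()
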
megfile/lib/compare.py CHANGED
@@ -5,10 +5,10 @@ from megfile.pathlike import StatResult
 
 
 def get_sync_type(src_protocol, dst_protocol):
-    if src_protocol == "s3" and dst_protocol != "s3":
-        return "download"
-    elif src_protocol != "s3" and dst_protocol == "s3":
+    if dst_protocol == "s3" or dst_protocol.startswith("s3+"):
         return "upload"
+    elif src_protocol == "s3" or src_protocol.startswith("s3+"):
+        return "download"
     else:
         return "copy"
 
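
The reordered checks test the destination protocol first, and profile-qualified schemes such as `s3+profile` now count as S3. A quick illustration (the profile name is only an example):

    from megfile.lib.compare import get_sync_type

    # Destination wins: anything going to s3 (or an s3+profile variant) is an upload.
    print(get_sync_type("fs", "s3+my-profile"))  # upload
    print(get_sync_type("s3", "fs"))             # download
    print(get_sync_type("fs", "fs"))             # copy
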
megfile/lib/glob.py CHANGED
@@ -289,6 +289,9 @@ def get_non_glob_dir(glob: str):
     root_dir = []
     if glob.startswith("/"):
         root_dir.append("/")
+    elif "://" in glob:
+        protocol, glob = glob.split("://", 1)
+        root_dir.append(f"{protocol}://")
     for name in glob.split("/"):
         if has_magic(name):
             break
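
With the new `://` branch, the non-glob prefix of a protocol-qualified pattern keeps its scheme instead of being treated as a relative path. An illustrative call; exact return values are not asserted here:

    from megfile.lib.glob import get_non_glob_dir

    # The s3 pattern should now resolve to a prefix under "s3://" rather than a
    # scheme-less relative path; the local pattern behaves as before.
    for pattern in ("s3://bucket/logs/*.txt", "/data/*/checkpoint.pt"):
        print(pattern, "->", get_non_glob_dir(pattern))
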
@@ -4,6 +4,7 @@ from typing import Optional
 
 from megfile.config import (
     READER_BLOCK_SIZE,
+    READER_LAZY_PREFETCH,
     READER_MAX_BUFFER_SIZE,
     S3_MAX_RETRY_TIMES,
 )
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
         )
 
     def _get_content_size(self):
-        if self._block_capacity <= 0:
+        if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
            response = self._client.head_object(Bucket=self._bucket, Key=self._key)
            self._content_etag = response.get("ETag")
            return int(response["ContentLength"])
@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
         super().__init__()
         self._references = Counter()
 
-    def register(self, key):
-        self._references[key] += 1
-
-    def unregister(self, key):
-        self._references[key] -= 1
-        if self._references[key] == 0:
-            self._references.pop(key)
-            for key_tuple in list(self):
-                if key_tuple[0] != key:
+    def register(self, name):
+        self._references[name] += 1
+        _logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
+
+    def unregister(self, name):
+        self._references[name] -= 1
+        _logger.debug(
+            "unregister reader: %r, count: %d" % (name, self._references[name])
+        )
+        if self._references[name] == 0:
+            self._references.pop(name)
+            for key in list(self):
+                if key[0] != name:
                     continue
-                future = self.pop(key_tuple)
+                future = self.pop(key)
                 if not future.done():
                     future.cancel()  # pragma: no cover
+            _logger.debug("cleanup all futures of reader: %r" % name)
megfile/s3_path.py CHANGED
@@ -610,6 +610,7 @@ def _s3_glob_stat_single_path(
             yield FileEntry(S3Path(path).name, path, _make_stat(content))
             dirname = os.path.dirname(path)
             while dirname not in dirnames and dirname != top_dir:
+                # TODO: optimize memory usage and file path order
                 dirnames.add(dirname)
                 path = dirname + "/" if search_dir else dirname
                 if pattern.match(path):
@@ -968,6 +969,8 @@
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
+    :param buffered: If you are operating pickle file without .pkl or .pickle extension,
+        please set this to True to avoid the performance issue.
     :returns: An opened File object
     :raises: S3FileNotFoundError
     """
@@ -999,6 +1002,7 @@
     )
 
     if mode == "rb":
+        block_size = block_size or READER_BLOCK_SIZE
        if share_cache_key is not None:
            reader = S3ShareCacheReader(
                bucket,
@@ -1007,7 +1011,7 @@
                s3_client=client,
                max_retries=max_retries,
                max_workers=max_workers,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                block_forward=block_forward,
                profile_name=s3_url._profile_name,
            )
@@ -1022,13 +1026,14 @@
                max_workers=max_workers,
                max_buffer_size=max_buffer_size,
                block_forward=block_forward,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                profile_name=s3_url._profile_name,
            )
        if buffered or _is_pickle(reader):
            reader = io.BufferedReader(reader)  # type: ignore
        return reader
 
+    block_size = block_size or WRITER_BLOCK_SIZE
    if limited_seekable:
        if max_buffer_size is None:
            max_buffer_size = WRITER_MAX_BUFFER_SIZE
@@ -1037,7 +1042,7 @@
            key,
            s3_client=client,
            max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
            max_buffer_size=max_buffer_size,
            profile_name=s3_url._profile_name,
        )
@@ -1049,7 +1054,7 @@
            key,
            s3_client=client,
            max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
            max_buffer_size=max_buffer_size,
            profile_name=s3_url._profile_name,
        )
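
After these hunks, an omitted `block_size` is resolved once per mode: `READER_BLOCK_SIZE` for read handles and `WRITER_BLOCK_SIZE` for write handles, rather than being re-defaulted at each constructor call. A hedged usage sketch; the bucket and key are placeholders and the positional `(path, mode)` call shape is assumed:

    from megfile.s3_path import s3_buffered_open

    # With block_size omitted, a read handle now uses READER_BLOCK_SIZE and a
    # write handle uses WRITER_BLOCK_SIZE as its block size.
    reader = s3_buffered_open("s3://my-bucket/data.bin", "rb")
    try:
        header = reader.read(1024)
    finally:
        reader.close()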