megfile 4.2.3__py3-none-any.whl → 4.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/__init__.py +5 -0
- megfile/cli.py +5 -9
- megfile/config.py +5 -0
- megfile/fs_path.py +2 -10
- megfile/lib/base_prefetch_reader.py +18 -7
- megfile/lib/s3_prefetch_reader.py +2 -1
- megfile/lib/s3_share_cache_reader.py +15 -10
- megfile/s3_path.py +8 -4
- megfile/sftp2.py +827 -0
- megfile/sftp2_path.py +1084 -0
- megfile/sftp_path.py +3 -15
- megfile/smart.py +5 -17
- megfile/utils/__init__.py +92 -9
- megfile/version.py +1 -1
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/METADATA +3 -1
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/RECORD +21 -19
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/WHEEL +0 -0
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/entry_points.txt +0 -0
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/licenses/LICENSE +0 -0
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/licenses/LICENSE.pyre +0 -0
- {megfile-4.2.3.dist-info → megfile-4.2.4.dist-info}/top_level.txt +0 -0
megfile/__init__.py
CHANGED
@@ -206,6 +206,11 @@ from megfile.stdio import is_stdio, stdio_open
 from megfile.stdio_path import StdioPath
 from megfile.version import VERSION as __version__  # noqa: F401
 
+try:
+    from megfile.sftp2_path import Sftp2Path
+except ImportError:
+    Sftp2Path = None
+
 __all__ = [
     "smart_access",
     "smart_cache",
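The new export is wrapped in try/except ImportError so that importing megfile still succeeds when the optional sftp2 backend (the new megfile/sftp2.py and megfile/sftp2_path.py modules) cannot be loaded. A minimal sketch of feature-detecting it from calling code; the sftp2:// URL is a placeholder that assumes the same convention as megfile's sftp:// paths:

    import megfile

    # Sftp2Path is None when the optional backend failed to import,
    # so test for None instead of importing megfile.sftp2_path directly.
    if megfile.Sftp2Path is not None:
        path = megfile.Sftp2Path("sftp2://user@host//data/file.txt")  # placeholder URL
    else:
        print("sftp2 backend unavailable")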
megfile/cli.py
CHANGED
@@ -47,7 +47,7 @@ from megfile.smart import (
     smart_unlink,
 )
 from megfile.smart_path import SmartPath
-from megfile.utils import get_human_size
+from megfile.utils import copyfileobj_multi, get_human_size
 from megfile.version import VERSION
 
 options = {}
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
         smart_open(path, mode) as f,
         smart_open("stdio://1", "wb") as stdout_fd,
     ):
-        length = 16 * 1024
-        while True:
-            buf = stdin.read(length)
-            if not buf:
-                break
-            f.write(buf)
-            if stdout:
-                stdout_fd.write(buf)
+        destinations = [f]
+        if stdout:
+            destinations.append(stdout_fd)
+        copyfileobj_multi(stdin, destinations)
 
 
 @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
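The rewritten `to` command replaces the hand-rolled read/write loop with the new megfile.utils.copyfileobj_multi helper, which fans one input stream out to several destinations (it is added in the megfile/utils/__init__.py change, not expanded in this diff). A plausible sketch inferred from the call site; the buffer size and exact signature are assumptions, not megfile's actual code:

    from typing import IO, List

    # Plausible sketch of copyfileobj_multi, inferred from the call site
    # copyfileobj_multi(stdin, destinations); buffer size is an assumption.
    def copyfileobj_multi(fsrc: IO[bytes], fdsts: List[IO[bytes]], length: int = 16 * 1024) -> None:
        """Copy fsrc to every destination in fdsts, one buffer at a time."""
        while True:
            buf = fsrc.read(length)
            if not buf:
                break
            for fdst in fdsts:
                fdst.write(buf)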
megfile/config.py
CHANGED
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
 READER_MAX_BUFFER_SIZE = parse_quantity(
     os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
 )
+READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)
 
 # Multi-upload in aws s3 has a maximum of 10,000 parts,
 # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
 
 NEWLINE = ord("\n")
 
+# Default buffer sizes for various operations
+DEFAULT_COPY_BUFFER_SIZE = 16 * 1024  # 16KB, same as shutil.copyfileobj
+DEFAULT_HASH_BUFFER_SIZE = 4 * 1024  # 4KB for hash calculations
+
 S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
 
 DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
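These settings are read from the environment once, when megfile.config is first imported, so the new MEGFILE_READER_LAZY_PREFETCH toggle must be set beforehand. A minimal sketch; the "true" spelling is an assumption about what parse_boolean accepts:

    import os

    # Must happen before the first `import megfile`, since megfile.config
    # evaluates os.getenv(...) at module import time.
    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"

    from megfile import config

    print(config.READER_LAZY_PREFETCH)  # expected: True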
megfile/fs_path.py
CHANGED
@@ -27,7 +27,7 @@ from megfile.lib.glob import iglob
 from megfile.lib.joinpath import path_join
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import calculate_md5
+from megfile.utils import calculate_md5, copyfd
 
 __all__ = [
     "FSPath",
@@ -737,15 +737,7 @@ class FSPath(URIPath):
         ):
             if isinstance(self.path_without_protocol, int):
                 with open(fspath(dst_path), "wb") as fdst:
-
-                    length = 16 * 1024
-                    while True:
-                        buf = os.read(self.path_without_protocol, length)  # pyre-ignore[6]
-                        if not buf:
-                            break
-                        fdst.write(buf)
-                        if callback:
-                            callback(len(buf))
+                    copyfd(self.path_without_protocol, fdst, callback)
             else:
                 shutil.copy2(
                     self.path_without_protocol,  # pyre-ignore[6]
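FSPath.copy now delegates its file-descriptor branch to the new megfile.utils.copyfd helper. Judging from the loop it replaces, a plausible sketch looks like this; the signature comes from the call site copyfd(fd, fdst, callback), while the body and the 16 KiB default are assumptions:

    import os
    from typing import BinaryIO, Callable, Optional

    # Plausible sketch of copyfd, mirroring the loop it replaced: copy from
    # an OS file descriptor to a writable file object, reporting progress.
    def copyfd(
        fsrc: int,
        fdst: BinaryIO,
        callback: Optional[Callable[[int], None]] = None,
        length: int = 16 * 1024,
    ) -> None:
        while True:
            buf = os.read(fsrc, length)
            if not buf:
                break
            fdst.write(buf)
            if callback:
                callback(len(buf))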
megfile/lib/base_prefetch_reader.py
CHANGED

@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
         self._offset = 0
         self._cached_buffer = None
-        self._block_index =
+        self._block_index = 0  # Current block index
+        self._cached_offset = 0  # Current offset in the current block
         self._seek_history = []
-
         self._seek_buffer(0)
 
         _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
-        return LRUCacheFutureManager()
+        futures = LRUCacheFutureManager()
+        futures.register(self.name)
+        return futures
 
     @property
     @abstractmethod
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if size == 0 or self._offset >= self._content_size:
             return b""
 
-        data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
-            "Body"
-        ].read()
+        resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
+        data = resp["Body"].read()
         self.seek(size, os.SEEK_CUR)
         return data
 
@@ -369,12 +370,17 @@
 class LRUCacheFutureManager(OrderedDict):
     def __init__(self):
         super().__init__()
+        self._name = None
+
+    def register(self, name):
+        self._name = name
 
     def submit(self, executor, key, *args, **kwargs):
         if key in self:
             self.move_to_end(key, last=True)
             return
         self[key] = executor.submit(*args, **kwargs)
+        _logger.debug("submit future: %r, key: %r" % (self._name, key))
 
     @property
     def finished(self):
@@ -385,7 +391,12 @@
             return self[key].result()
 
     def cleanup(self, block_capacity: int):
+        keys = []
         while len(self) > block_capacity:
-            _, future = self.popitem(last=False)
+            key, future = self.popitem(last=False)
+            keys.append(key)
             if not future.done():
                 future.cancel()
+        if keys:
+            _logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
+        return keys
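LRUCacheFutureManager is an OrderedDict keyed by prefetch block: submit() bumps re-requested keys to the most-recently-used end, and cleanup() now returns the keys of the least-recently-used futures it evicted instead of dropping them silently. An illustrative stand-alone sketch of that eviction pattern, with strings in place of futures:

    from collections import OrderedDict

    # Illustrative LRU eviction via OrderedDict, mirroring move_to_end() on
    # hits and popitem(last=False) to drop the oldest entry first.
    cache = OrderedDict()
    for block_index in range(4):
        cache[block_index] = "future-%d" % block_index

    cache.move_to_end(1, last=True)      # block 1 was touched again -> most recent
    evicted = []
    while len(cache) > 2:                # keep a capacity of 2 blocks
        key, _ = cache.popitem(last=False)
        evicted.append(key)

    print(evicted)        # [0, 2]
    print(list(cache))    # [3, 1]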
megfile/lib/s3_prefetch_reader.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Optional
 
 from megfile.config import (
     READER_BLOCK_SIZE,
+    READER_LAZY_PREFETCH,
     READER_MAX_BUFFER_SIZE,
     S3_MAX_RETRY_TIMES,
 )
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
         )
 
     def _get_content_size(self):
-        if self._block_capacity <= 0:
+        if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
             response = self._client.head_object(Bucket=self._bucket, Key=self._key)
             self._content_etag = response.get("ETag")
             return int(response["ContentLength"])
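With the new toggle on, _get_content_size always takes the head_object branch, so opening a reader costs a single HEAD request and no block downloads start until the first read. A hedged usage sketch; the bucket and key are placeholders:

    import os

    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"  # set before importing megfile

    from megfile import smart_open

    # Opening now only issues a HEAD request for the file size; block
    # prefetching starts lazily, on the first read() call.
    with smart_open("s3://my-bucket/big-file.bin", "rb") as f:  # placeholder URL
        header = f.read(1024)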
megfile/lib/s3_share_cache_reader.py
CHANGED

@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
         super().__init__()
         self._references = Counter()
 
-    def register(self,
-        self._references[
-
-
-
-
-
-
-
+    def register(self, name):
+        self._references[name] += 1
+        _logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
+
+    def unregister(self, name):
+        self._references[name] -= 1
+        _logger.debug(
+            "unregister reader: %r, count: %d" % (name, self._references[name])
+        )
+        if self._references[name] == 0:
+            self._references.pop(name)
+            for key in list(self):
+                if key[0] != name:
                     continue
-                future = self.pop(
+                future = self.pop(key)
                 if not future.done():
                     future.cancel()  # pragma: no cover
+            _logger.debug("cleanup all futures of reader: %r" % name)
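ShareCacheFutureManager layers reference counting over the LRU manager: every reader register()s its name on open and unregister()s on close, and the last unregister cancels and removes all futures whose keys belong to that reader. An illustrative sketch of the counting pattern, not megfile code:

    from collections import Counter

    # Futures are shared per (name, block) key; the last reader of a name
    # is responsible for cleaning up all of that name's entries.
    references = Counter()
    shared = {("file-a", 0): "future-a0", ("file-a", 1): "future-a1", ("file-b", 0): "future-b0"}

    def register(name):
        references[name] += 1

    def unregister(name):
        references[name] -= 1
        if references[name] == 0:
            references.pop(name)
            for key in list(shared):
                if key[0] == name:
                    shared.pop(key)

    register("file-a"); register("file-a")
    unregister("file-a")      # one reader left -> futures kept
    unregister("file-a")      # last reader -> file-a futures dropped
    print(sorted(shared))     # [('file-b', 0)]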
megfile/s3_path.py
CHANGED
@@ -969,6 +969,8 @@ def s3_buffered_open(
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
+    :param buffered: If you are operating pickle file without .pkl or .pickle extension,
+        please set this to True to avoid the performance issue.
     :returns: An opened File object
     :raises: S3FileNotFoundError
     """
@@ -1000,6 +1002,7 @@
     )
 
     if mode == "rb":
+        block_size = block_size or READER_BLOCK_SIZE
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -1008,7 +1011,7 @@
                 s3_client=client,
                 max_retries=max_retries,
                 max_workers=max_workers,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -1023,13 +1026,14 @@
                 max_workers=max_workers,
                 max_buffer_size=max_buffer_size,
                 block_forward=block_forward,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
             reader = io.BufferedReader(reader)  # type: ignore
         return reader
 
+    block_size = block_size or WRITER_BLOCK_SIZE
     if limited_seekable:
         if max_buffer_size is None:
             max_buffer_size = WRITER_MAX_BUFFER_SIZE
@@ -1038,7 +1042,7 @@
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
             max_buffer_size=max_buffer_size,
             profile_name=s3_url._profile_name,
         )
@@ -1050,7 +1054,7 @@
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
             max_buffer_size=max_buffer_size,
             profile_name=s3_url._profile_name,
         )