megfile 4.2.2__py3-none-any.whl → 4.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/__init__.py +5 -0
- megfile/cli.py +5 -9
- megfile/config.py +5 -0
- megfile/fs_path.py +2 -10
- megfile/lib/base_prefetch_reader.py +18 -7
- megfile/lib/compare.py +3 -3
- megfile/lib/glob.py +3 -0
- megfile/lib/s3_prefetch_reader.py +2 -1
- megfile/lib/s3_share_cache_reader.py +15 -10
- megfile/s3_path.py +9 -4
- megfile/sftp2.py +827 -0
- megfile/sftp2_path.py +1084 -0
- megfile/sftp_path.py +3 -15
- megfile/smart.py +5 -17
- megfile/utils/__init__.py +92 -9
- megfile/version.py +1 -1
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/METADATA +3 -1
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/RECORD +23 -21
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/WHEEL +0 -0
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/entry_points.txt +0 -0
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/licenses/LICENSE +0 -0
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/licenses/LICENSE.pyre +0 -0
- {megfile-4.2.2.dist-info → megfile-4.2.4.dist-info}/top_level.txt +0 -0
megfile/__init__.py
CHANGED
|
@@ -206,6 +206,11 @@ from megfile.stdio import is_stdio, stdio_open
|
|
|
206
206
|
from megfile.stdio_path import StdioPath
|
|
207
207
|
from megfile.version import VERSION as __version__ # noqa: F401
|
|
208
208
|
|
|
209
|
+
try:
|
|
210
|
+
from megfile.sftp2_path import Sftp2Path
|
|
211
|
+
except ImportError:
|
|
212
|
+
Sftp2Path = None
|
|
213
|
+
|
|
209
214
|
__all__ = [
|
|
210
215
|
"smart_access",
|
|
211
216
|
"smart_cache",
|
megfile/cli.py
CHANGED
|
@@ -47,7 +47,7 @@ from megfile.smart import (
|
|
|
47
47
|
smart_unlink,
|
|
48
48
|
)
|
|
49
49
|
from megfile.smart_path import SmartPath
|
|
50
|
-
from megfile.utils import get_human_size
|
|
50
|
+
from megfile.utils import copyfileobj_multi, get_human_size
|
|
51
51
|
from megfile.version import VERSION
|
|
52
52
|
|
|
53
53
|
options = {}
|
|
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
|
|
|
646
646
|
smart_open(path, mode) as f,
|
|
647
647
|
smart_open("stdio://1", "wb") as stdout_fd,
|
|
648
648
|
):
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
break
|
|
654
|
-
f.write(buf)
|
|
655
|
-
if stdout:
|
|
656
|
-
stdout_fd.write(buf)
|
|
649
|
+
destinations = [f]
|
|
650
|
+
if stdout:
|
|
651
|
+
destinations.append(stdout_fd)
|
|
652
|
+
copyfileobj_multi(stdin, destinations)
|
|
657
653
|
|
|
658
654
|
|
|
659
655
|
@cli.command(short_help="Produce an md5sum file for all the objects in the path.")
|
megfile/config.py
CHANGED
|
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
|
|
|
83
83
|
READER_MAX_BUFFER_SIZE = parse_quantity(
|
|
84
84
|
os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
|
|
85
85
|
)
|
|
86
|
+
READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)
|
|
86
87
|
|
|
87
88
|
# Multi-upload in aws s3 has a maximum of 10,000 parts,
|
|
88
89
|
# so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
|
|
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
|
|
|
105
106
|
|
|
106
107
|
NEWLINE = ord("\n")
|
|
107
108
|
|
|
109
|
+
# Default buffer sizes for various operations
|
|
110
|
+
DEFAULT_COPY_BUFFER_SIZE = 16 * 1024 # 16KB, same as shutil.copyfileobj
|
|
111
|
+
DEFAULT_HASH_BUFFER_SIZE = 4 * 1024 # 4KB for hash calculations
|
|
112
|
+
|
|
108
113
|
S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
|
|
109
114
|
|
|
110
115
|
DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
|
megfile/fs_path.py
CHANGED
|
@@ -27,7 +27,7 @@ from megfile.lib.glob import iglob
|
|
|
27
27
|
from megfile.lib.joinpath import path_join
|
|
28
28
|
from megfile.lib.url import get_url_scheme
|
|
29
29
|
from megfile.smart_path import SmartPath
|
|
30
|
-
from megfile.utils import calculate_md5
|
|
30
|
+
from megfile.utils import calculate_md5, copyfd
|
|
31
31
|
|
|
32
32
|
__all__ = [
|
|
33
33
|
"FSPath",
|
|
@@ -737,15 +737,7 @@ class FSPath(URIPath):
|
|
|
737
737
|
):
|
|
738
738
|
if isinstance(self.path_without_protocol, int):
|
|
739
739
|
with open(fspath(dst_path), "wb") as fdst:
|
|
740
|
-
|
|
741
|
-
length = 16 * 1024
|
|
742
|
-
while True:
|
|
743
|
-
buf = os.read(self.path_without_protocol, length) # pyre-ignore[6]
|
|
744
|
-
if not buf:
|
|
745
|
-
break
|
|
746
|
-
fdst.write(buf)
|
|
747
|
-
if callback:
|
|
748
|
-
callback(len(buf))
|
|
740
|
+
copyfd(self.path_without_protocol, fdst, callback)
|
|
749
741
|
else:
|
|
750
742
|
shutil.copy2(
|
|
751
743
|
self.path_without_protocol, # pyre-ignore[6]
|
|
@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
82
82
|
|
|
83
83
|
self._offset = 0
|
|
84
84
|
self._cached_buffer = None
|
|
85
|
-
self._block_index =
|
|
85
|
+
self._block_index = 0 # Current block index
|
|
86
|
+
self._cached_offset = 0 # Current offset in the current block
|
|
86
87
|
self._seek_history = []
|
|
87
|
-
|
|
88
88
|
self._seek_buffer(0)
|
|
89
89
|
|
|
90
90
|
_logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
|
|
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
98
98
|
return self._process_local("futures", self._get_futures)
|
|
99
99
|
|
|
100
100
|
def _get_futures(self):
|
|
101
|
-
|
|
101
|
+
futures = LRUCacheFutureManager()
|
|
102
|
+
futures.register(self.name)
|
|
103
|
+
return futures
|
|
102
104
|
|
|
103
105
|
@property
|
|
104
106
|
@abstractmethod
|
|
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
207
209
|
if size == 0 or self._offset >= self._content_size:
|
|
208
210
|
return b""
|
|
209
211
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
].read()
|
|
212
|
+
resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
|
|
213
|
+
data = resp["Body"].read()
|
|
213
214
|
self.seek(size, os.SEEK_CUR)
|
|
214
215
|
return data
|
|
215
216
|
|
|
@@ -369,12 +370,17 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
|
|
|
369
370
|
class LRUCacheFutureManager(OrderedDict):
|
|
370
371
|
def __init__(self):
|
|
371
372
|
super().__init__()
|
|
373
|
+
self._name = None
|
|
374
|
+
|
|
375
|
+
def register(self, name):
|
|
376
|
+
self._name = name
|
|
372
377
|
|
|
373
378
|
def submit(self, executor, key, *args, **kwargs):
|
|
374
379
|
if key in self:
|
|
375
380
|
self.move_to_end(key, last=True)
|
|
376
381
|
return
|
|
377
382
|
self[key] = executor.submit(*args, **kwargs)
|
|
383
|
+
_logger.debug("submit future: %r, key: %r" % (self._name, key))
|
|
378
384
|
|
|
379
385
|
@property
|
|
380
386
|
def finished(self):
|
|
@@ -385,7 +391,12 @@ class LRUCacheFutureManager(OrderedDict):
|
|
|
385
391
|
return self[key].result()
|
|
386
392
|
|
|
387
393
|
def cleanup(self, block_capacity: int):
|
|
394
|
+
keys = []
|
|
388
395
|
while len(self) > block_capacity:
|
|
389
|
-
|
|
396
|
+
key, future = self.popitem(last=False)
|
|
397
|
+
keys.append(key)
|
|
390
398
|
if not future.done():
|
|
391
399
|
future.cancel()
|
|
400
|
+
if keys:
|
|
401
|
+
_logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
|
|
402
|
+
return keys
|
megfile/lib/compare.py
CHANGED
|
@@ -5,10 +5,10 @@ from megfile.pathlike import StatResult
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
def get_sync_type(src_protocol, dst_protocol):
|
|
8
|
-
if
|
|
9
|
-
return "download"
|
|
10
|
-
elif src_protocol != "s3" and dst_protocol == "s3":
|
|
8
|
+
if dst_protocol == "s3" or dst_protocol.startswith("s3+"):
|
|
11
9
|
return "upload"
|
|
10
|
+
elif src_protocol == "s3" or src_protocol.startswith("s3+"):
|
|
11
|
+
return "download"
|
|
12
12
|
else:
|
|
13
13
|
return "copy"
|
|
14
14
|
|
megfile/lib/glob.py
CHANGED
|
@@ -289,6 +289,9 @@ def get_non_glob_dir(glob: str):
|
|
|
289
289
|
root_dir = []
|
|
290
290
|
if glob.startswith("/"):
|
|
291
291
|
root_dir.append("/")
|
|
292
|
+
elif "://" in glob:
|
|
293
|
+
protocol, glob = glob.split("://", 1)
|
|
294
|
+
root_dir.append(f"{protocol}://")
|
|
292
295
|
for name in glob.split("/"):
|
|
293
296
|
if has_magic(name):
|
|
294
297
|
break
|
|
@@ -4,6 +4,7 @@ from typing import Optional
|
|
|
4
4
|
|
|
5
5
|
from megfile.config import (
|
|
6
6
|
READER_BLOCK_SIZE,
|
|
7
|
+
READER_LAZY_PREFETCH,
|
|
7
8
|
READER_MAX_BUFFER_SIZE,
|
|
8
9
|
S3_MAX_RETRY_TIMES,
|
|
9
10
|
)
|
|
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
|
|
|
62
63
|
)
|
|
63
64
|
|
|
64
65
|
def _get_content_size(self):
|
|
65
|
-
if self._block_capacity <= 0:
|
|
66
|
+
if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
|
|
66
67
|
response = self._client.head_object(Bucket=self._bucket, Key=self._key)
|
|
67
68
|
self._content_etag = response.get("ETag")
|
|
68
69
|
return int(response["ContentLength"])
|
|
@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
|
|
|
101
101
|
super().__init__()
|
|
102
102
|
self._references = Counter()
|
|
103
103
|
|
|
104
|
-
def register(self,
|
|
105
|
-
self._references[
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
104
|
+
def register(self, name):
|
|
105
|
+
self._references[name] += 1
|
|
106
|
+
_logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
|
|
107
|
+
|
|
108
|
+
def unregister(self, name):
|
|
109
|
+
self._references[name] -= 1
|
|
110
|
+
_logger.debug(
|
|
111
|
+
"unregister reader: %r, count: %d" % (name, self._references[name])
|
|
112
|
+
)
|
|
113
|
+
if self._references[name] == 0:
|
|
114
|
+
self._references.pop(name)
|
|
115
|
+
for key in list(self):
|
|
116
|
+
if key[0] != name:
|
|
113
117
|
continue
|
|
114
|
-
future = self.pop(
|
|
118
|
+
future = self.pop(key)
|
|
115
119
|
if not future.done():
|
|
116
120
|
future.cancel() # pragma: no cover
|
|
121
|
+
_logger.debug("cleanup all futures of reader: %r" % name)
|
megfile/s3_path.py
CHANGED
|
@@ -610,6 +610,7 @@ def _s3_glob_stat_single_path(
|
|
|
610
610
|
yield FileEntry(S3Path(path).name, path, _make_stat(content))
|
|
611
611
|
dirname = os.path.dirname(path)
|
|
612
612
|
while dirname not in dirnames and dirname != top_dir:
|
|
613
|
+
# TODO: optimize memory usage and file path order
|
|
613
614
|
dirnames.add(dirname)
|
|
614
615
|
path = dirname + "/" if search_dir else dirname
|
|
615
616
|
if pattern.match(path):
|
|
@@ -968,6 +969,8 @@ def s3_buffered_open(
|
|
|
968
969
|
(both file head part and tail part can seek block_size).
|
|
969
970
|
Notes: This parameter are valid only for write-handle.
|
|
970
971
|
Read-handle support arbitrary seek
|
|
972
|
+
:param buffered: If you are operating pickle file without .pkl or .pickle extension,
|
|
973
|
+
please set this to True to avoid the performance issue.
|
|
971
974
|
:returns: An opened File object
|
|
972
975
|
:raises: S3FileNotFoundError
|
|
973
976
|
"""
|
|
@@ -999,6 +1002,7 @@ def s3_buffered_open(
|
|
|
999
1002
|
)
|
|
1000
1003
|
|
|
1001
1004
|
if mode == "rb":
|
|
1005
|
+
block_size = block_size or READER_BLOCK_SIZE
|
|
1002
1006
|
if share_cache_key is not None:
|
|
1003
1007
|
reader = S3ShareCacheReader(
|
|
1004
1008
|
bucket,
|
|
@@ -1007,7 +1011,7 @@ def s3_buffered_open(
|
|
|
1007
1011
|
s3_client=client,
|
|
1008
1012
|
max_retries=max_retries,
|
|
1009
1013
|
max_workers=max_workers,
|
|
1010
|
-
block_size=block_size
|
|
1014
|
+
block_size=block_size,
|
|
1011
1015
|
block_forward=block_forward,
|
|
1012
1016
|
profile_name=s3_url._profile_name,
|
|
1013
1017
|
)
|
|
@@ -1022,13 +1026,14 @@ def s3_buffered_open(
|
|
|
1022
1026
|
max_workers=max_workers,
|
|
1023
1027
|
max_buffer_size=max_buffer_size,
|
|
1024
1028
|
block_forward=block_forward,
|
|
1025
|
-
block_size=block_size
|
|
1029
|
+
block_size=block_size,
|
|
1026
1030
|
profile_name=s3_url._profile_name,
|
|
1027
1031
|
)
|
|
1028
1032
|
if buffered or _is_pickle(reader):
|
|
1029
1033
|
reader = io.BufferedReader(reader) # type: ignore
|
|
1030
1034
|
return reader
|
|
1031
1035
|
|
|
1036
|
+
block_size = block_size or WRITER_BLOCK_SIZE
|
|
1032
1037
|
if limited_seekable:
|
|
1033
1038
|
if max_buffer_size is None:
|
|
1034
1039
|
max_buffer_size = WRITER_MAX_BUFFER_SIZE
|
|
@@ -1037,7 +1042,7 @@ def s3_buffered_open(
|
|
|
1037
1042
|
key,
|
|
1038
1043
|
s3_client=client,
|
|
1039
1044
|
max_workers=max_workers,
|
|
1040
|
-
block_size=block_size
|
|
1045
|
+
block_size=block_size,
|
|
1041
1046
|
max_buffer_size=max_buffer_size,
|
|
1042
1047
|
profile_name=s3_url._profile_name,
|
|
1043
1048
|
)
|
|
@@ -1049,7 +1054,7 @@ def s3_buffered_open(
|
|
|
1049
1054
|
key,
|
|
1050
1055
|
s3_client=client,
|
|
1051
1056
|
max_workers=max_workers,
|
|
1052
|
-
block_size=block_size
|
|
1057
|
+
block_size=block_size,
|
|
1053
1058
|
max_buffer_size=max_buffer_size,
|
|
1054
1059
|
profile_name=s3_url._profile_name,
|
|
1055
1060
|
)
|