megfile 4.2.2__py3-none-any.whl → 4.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/__init__.py CHANGED
@@ -206,6 +206,11 @@ from megfile.stdio import is_stdio, stdio_open
 from megfile.stdio_path import StdioPath
 from megfile.version import VERSION as __version__  # noqa: F401
 
+try:
+    from megfile.sftp2_path import Sftp2Path
+except ImportError:
+    Sftp2Path = None
+
 __all__ = [
     "smart_access",
     "smart_cache",
megfile/cli.py CHANGED
@@ -47,7 +47,7 @@ from megfile.smart import (
     smart_unlink,
 )
 from megfile.smart_path import SmartPath
-from megfile.utils import get_human_size
+from megfile.utils import copyfileobj_multi, get_human_size
 from megfile.version import VERSION
 
 options = {}
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
         smart_open(path, mode) as f,
         smart_open("stdio://1", "wb") as stdout_fd,
     ):
-        length = 16 * 1024
-        while True:
-            buf = stdin.read(length)
-            if not buf:
-                break
-            f.write(buf)
-            if stdout:
-                stdout_fd.write(buf)
+        destinations = [f]
+        if stdout:
+            destinations.append(stdout_fd)
+        copyfileobj_multi(stdin, destinations)
 
 
 @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
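
The removed read/write loop is replaced by `copyfileobj_multi` from `megfile.utils`, whose implementation is not shown in this diff. A rough sketch of what the call site implies (one source fanned out to several destinations); the signature is an assumption, not the shipped code:

    from megfile.config import DEFAULT_COPY_BUFFER_SIZE


    def copyfileobj_multi(fsrc, fdsts, length=DEFAULT_COPY_BUFFER_SIZE):
        # Sketch only: read from one source and write each chunk to every
        # destination, mirroring the loop that the cli.py change removed.
        while True:
            buf = fsrc.read(length)
            if not buf:
                break
            for fdst in fdsts:
                fdst.write(buf)
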
megfile/config.py CHANGED
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
 READER_MAX_BUFFER_SIZE = parse_quantity(
     os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
 )
+READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)
 
 # Multi-upload in aws s3 has a maximum of 10,000 parts,
 # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
 
 NEWLINE = ord("\n")
 
+# Default buffer sizes for various operations
+DEFAULT_COPY_BUFFER_SIZE = 16 * 1024  # 16KB, same as shutil.copyfileobj
+DEFAULT_HASH_BUFFER_SIZE = 4 * 1024  # 4KB for hash calculations
+
 S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
 
 DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
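
The new `READER_LAZY_PREFETCH` flag is parsed once from an environment variable when `megfile.config` is imported. A usage sketch (the accepted string values depend on `parse_boolean`):

    import os

    # Set the flag before megfile.config is imported; it is read only once,
    # at module import time.
    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"

    from megfile.config import READER_LAZY_PREFETCH

    print(READER_LAZY_PREFETCH)  # expected: True, assuming parse_boolean accepts "true"
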
megfile/fs_path.py CHANGED
@@ -27,7 +27,7 @@ from megfile.lib.glob import iglob
 from megfile.lib.joinpath import path_join
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import calculate_md5
+from megfile.utils import calculate_md5, copyfd
 
 __all__ = [
     "FSPath",
@@ -737,15 +737,7 @@ class FSPath(URIPath):
     ):
         if isinstance(self.path_without_protocol, int):
             with open(fspath(dst_path), "wb") as fdst:
-                # This magic number is copied from copyfileobj
-                length = 16 * 1024
-                while True:
-                    buf = os.read(self.path_without_protocol, length)  # pyre-ignore[6]
-                    if not buf:
-                        break
-                    fdst.write(buf)
-                    if callback:
-                        callback(len(buf))
+                copyfd(self.path_without_protocol, fdst, callback)
         else:
             shutil.copy2(
                 self.path_without_protocol,  # pyre-ignore[6]
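
As with `copyfileobj_multi`, the `copyfd` helper from `megfile.utils` is not shown in this diff. Judging from the loop it replaces, it copies from a raw file descriptor into a file object while reporting progress; the following is a sketch under that assumption, not the shipped implementation:

    import os

    from megfile.config import DEFAULT_COPY_BUFFER_SIZE


    def copyfd(fd, fdst, callback=None, length=DEFAULT_COPY_BUFFER_SIZE):
        # Sketch only: mirrors the removed FSPath loop -- read raw bytes from
        # the descriptor, write them out, and report progress per chunk.
        while True:
            buf = os.read(fd, length)
            if not buf:
                break
            fdst.write(buf)
            if callback:
                callback(len(buf))
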
@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
         self._offset = 0
         self._cached_buffer = None
-        self._block_index = None  # Current block index
+        self._block_index = 0  # Current block index
+        self._cached_offset = 0  # Current offset in the current block
         self._seek_history = []
-
         self._seek_buffer(0)
 
         _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
-        return LRUCacheFutureManager()
+        futures = LRUCacheFutureManager()
+        futures.register(self.name)
+        return futures
 
     @property
     @abstractmethod
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if size == 0 or self._offset >= self._content_size:
             return b""
 
-        data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
-            "Body"
-        ].read()
+        resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
+        data = resp["Body"].read()
         self.seek(size, os.SEEK_CUR)
         return data
 
@@ -369,12 +370,17 @@
 class LRUCacheFutureManager(OrderedDict):
     def __init__(self):
         super().__init__()
+        self._name = None
+
+    def register(self, name):
+        self._name = name
 
     def submit(self, executor, key, *args, **kwargs):
         if key in self:
             self.move_to_end(key, last=True)
             return
         self[key] = executor.submit(*args, **kwargs)
+        _logger.debug("submit future: %r, key: %r" % (self._name, key))
 
     @property
     def finished(self):
@@ -385,7 +391,12 @@ class LRUCacheFutureManager(OrderedDict):
         return self[key].result()
 
     def cleanup(self, block_capacity: int):
+        keys = []
         while len(self) > block_capacity:
-            _, future = self.popitem(last=False)
+            key, future = self.popitem(last=False)
+            keys.append(key)
             if not future.done():
                 future.cancel()
+        if keys:
+            _logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
+        return keys
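
Taken together, the `LRUCacheFutureManager` changes add a `register(name)` hook for log context and make `cleanup()` return the evicted keys instead of discarding them. A small hedged demo; the import path is a guess based on the class names shown in this diff:

    from concurrent.futures import ThreadPoolExecutor

    from megfile.lib.base_prefetch_reader import LRUCacheFutureManager

    executor = ThreadPoolExecutor(max_workers=2)
    futures = LRUCacheFutureManager()
    futures.register("demo-reader")  # the name is only used for debug logging

    for block_index in range(4):
        futures.submit(executor, block_index, lambda i=block_index: i * 2)

    evicted = futures.cleanup(block_capacity=2)  # now returns the evicted keys
    print(evicted)  # expected: the two oldest keys, [0, 1]
    executor.shutdown()
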
megfile/lib/compare.py CHANGED
@@ -5,10 +5,10 @@ from megfile.pathlike import StatResult
 
 
 def get_sync_type(src_protocol, dst_protocol):
-    if src_protocol == "s3" and dst_protocol != "s3":
-        return "download"
-    elif src_protocol != "s3" and dst_protocol == "s3":
+    if dst_protocol == "s3" or dst_protocol.startswith("s3+"):
         return "upload"
+    elif src_protocol == "s3" or src_protocol.startswith("s3+"):
+        return "download"
     else:
         return "copy"
 
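
The reordered checks test the destination protocol first, and profile-qualified schemes such as `s3+profile` now count as S3. A quick illustration (the profile name is only an example):

    from megfile.lib.compare import get_sync_type

    # Destination wins: anything going to s3 (or an s3+profile variant) is an upload.
    print(get_sync_type("fs", "s3+my-profile"))  # upload
    print(get_sync_type("s3", "fs"))             # download
    print(get_sync_type("fs", "fs"))             # copy
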
megfile/lib/glob.py CHANGED
@@ -289,6 +289,9 @@ def get_non_glob_dir(glob: str):
     root_dir = []
     if glob.startswith("/"):
         root_dir.append("/")
+    elif "://" in glob:
+        protocol, glob = glob.split("://", 1)
+        root_dir.append(f"{protocol}://")
     for name in glob.split("/"):
         if has_magic(name):
             break
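
With the new `://` branch, the non-glob prefix of a protocol-qualified pattern keeps its scheme instead of being treated as a relative path. An illustrative call; exact return values are not asserted here:

    from megfile.lib.glob import get_non_glob_dir

    # The s3 pattern should now resolve to a prefix under "s3://" rather than a
    # scheme-less relative path; the local pattern behaves as before.
    for pattern in ("s3://bucket/logs/*.txt", "/data/*/checkpoint.pt"):
        print(pattern, "->", get_non_glob_dir(pattern))
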
@@ -4,6 +4,7 @@ from typing import Optional
 
 from megfile.config import (
     READER_BLOCK_SIZE,
+    READER_LAZY_PREFETCH,
     READER_MAX_BUFFER_SIZE,
     S3_MAX_RETRY_TIMES,
 )
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
         )
 
     def _get_content_size(self):
-        if self._block_capacity <= 0:
+        if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
            response = self._client.head_object(Bucket=self._bucket, Key=self._key)
            self._content_etag = response.get("ETag")
            return int(response["ContentLength"])
@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
         super().__init__()
         self._references = Counter()
 
-    def register(self, key):
-        self._references[key] += 1
-
-    def unregister(self, key):
-        self._references[key] -= 1
-        if self._references[key] == 0:
-            self._references.pop(key)
-            for key_tuple in list(self):
-                if key_tuple[0] != key:
+    def register(self, name):
+        self._references[name] += 1
+        _logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
+
+    def unregister(self, name):
+        self._references[name] -= 1
+        _logger.debug(
+            "unregister reader: %r, count: %d" % (name, self._references[name])
+        )
+        if self._references[name] == 0:
+            self._references.pop(name)
+            for key in list(self):
+                if key[0] != name:
                     continue
-                future = self.pop(key_tuple)
+                future = self.pop(key)
                 if not future.done():
                     future.cancel()  # pragma: no cover
+            _logger.debug("cleanup all futures of reader: %r" % name)
megfile/s3_path.py CHANGED
@@ -610,6 +610,7 @@ def _s3_glob_stat_single_path(
             yield FileEntry(S3Path(path).name, path, _make_stat(content))
             dirname = os.path.dirname(path)
             while dirname not in dirnames and dirname != top_dir:
+                # TODO: optimize memory usage and file path order
                 dirnames.add(dirname)
                 path = dirname + "/" if search_dir else dirname
                 if pattern.match(path):
@@ -968,6 +969,8 @@
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
+    :param buffered: If you are operating pickle file without .pkl or .pickle extension,
+        please set this to True to avoid the performance issue.
     :returns: An opened File object
     :raises: S3FileNotFoundError
     """
@@ -999,6 +1002,7 @@
     )
 
     if mode == "rb":
+        block_size = block_size or READER_BLOCK_SIZE
        if share_cache_key is not None:
            reader = S3ShareCacheReader(
                bucket,
@@ -1007,7 +1011,7 @@
                s3_client=client,
                max_retries=max_retries,
                max_workers=max_workers,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                block_forward=block_forward,
                profile_name=s3_url._profile_name,
            )
@@ -1022,13 +1026,14 @@
                max_workers=max_workers,
                max_buffer_size=max_buffer_size,
                block_forward=block_forward,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                profile_name=s3_url._profile_name,
            )
        if buffered or _is_pickle(reader):
            reader = io.BufferedReader(reader)  # type: ignore
        return reader
 
+    block_size = block_size or WRITER_BLOCK_SIZE
    if limited_seekable:
        if max_buffer_size is None:
            max_buffer_size = WRITER_MAX_BUFFER_SIZE
@@ -1037,7 +1042,7 @@
            key,
            s3_client=client,
            max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
            max_buffer_size=max_buffer_size,
            profile_name=s3_url._profile_name,
        )
@@ -1049,7 +1054,7 @@
            key,
            s3_client=client,
            max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
            max_buffer_size=max_buffer_size,
            profile_name=s3_url._profile_name,
        )
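
After these hunks, an omitted `block_size` is resolved once per mode: `READER_BLOCK_SIZE` for read handles and `WRITER_BLOCK_SIZE` for write handles, rather than being re-defaulted at each constructor call. A hedged usage sketch; the bucket and key are placeholders and the positional `(path, mode)` call shape is assumed:

    from megfile.s3_path import s3_buffered_open

    # With block_size omitted, a read handle now uses READER_BLOCK_SIZE and a
    # write handle uses WRITER_BLOCK_SIZE as its block size.
    reader = s3_buffered_open("s3://my-bucket/data.bin", "rb")
    try:
        header = reader.read(1024)
    finally:
        reader.close()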