megfile 4.2.3__py3-none-any.whl → 4.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/__init__.py CHANGED
@@ -206,6 +206,11 @@ from megfile.stdio import is_stdio, stdio_open
 from megfile.stdio_path import StdioPath
 from megfile.version import VERSION as __version__  # noqa: F401
 
+try:
+    from megfile.sftp2_path import Sftp2Path
+except ImportError:
+    Sftp2Path = None
+
 __all__ = [
     "smart_access",
     "smart_cache",
megfile/cli.py CHANGED
@@ -47,7 +47,7 @@ from megfile.smart import (
     smart_unlink,
 )
 from megfile.smart_path import SmartPath
-from megfile.utils import get_human_size
+from megfile.utils import copyfileobj_multi, get_human_size
 from megfile.version import VERSION
 
 options = {}
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
         smart_open(path, mode) as f,
         smart_open("stdio://1", "wb") as stdout_fd,
     ):
-        length = 16 * 1024
-        while True:
-            buf = stdin.read(length)
-            if not buf:
-                break
-            f.write(buf)
-            if stdout:
-                stdout_fd.write(buf)
+        destinations = [f]
+        if stdout:
+            destinations.append(stdout_fd)
+        copyfileobj_multi(stdin, destinations)
 
 
 @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
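
The rewritten `to` command hands the read/write loop to copyfileobj_multi, which copies one source stream to several destinations at once. Its implementation is not shown in this diff; a minimal sketch of such a helper, using the 16 KB chunk size the removed loop used (names and defaults here are assumptions, not the package's actual code):

    from typing import IO, List

    def copyfileobj_multi(
        fsrc: IO[bytes], fdsts: List[IO[bytes]], length: int = 16 * 1024
    ) -> None:
        # Read the source in fixed-size chunks and write each chunk to every destination.
        while True:
            buf = fsrc.read(length)
            if not buf:
                break
            for fdst in fdsts:
                fdst.write(buf)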
megfile/config.py CHANGED
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
 READER_MAX_BUFFER_SIZE = parse_quantity(
     os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
 )
+READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)
 
 # Multi-upload in aws s3 has a maximum of 10,000 parts,
 # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)
 
 NEWLINE = ord("\n")
 
+# Default buffer sizes for various operations
+DEFAULT_COPY_BUFFER_SIZE = 16 * 1024  # 16KB, same as shutil.copyfileobj
+DEFAULT_HASH_BUFFER_SIZE = 4 * 1024  # 4KB for hash calculations
+
 S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"
 
 DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
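
Like the other MEGFILE_* settings in this module, MEGFILE_READER_LAZY_PREFETCH is read once when megfile.config is imported, so it has to be set in the environment before the first import. A hedged example (the accepted truthy spellings depend on megfile's parse_boolean, which is not shown in this diff):

    import os

    # Must be set before the first `import megfile`, because config values
    # are evaluated at module import time.
    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"

    from megfile.config import READER_LAZY_PREFETCH
    print(READER_LAZY_PREFETCH)  # expected: True, if "true" is an accepted spelling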
megfile/fs_path.py CHANGED
@@ -27,7 +27,7 @@ from megfile.lib.glob import iglob
 from megfile.lib.joinpath import path_join
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import calculate_md5
+from megfile.utils import calculate_md5, copyfd
 
 __all__ = [
     "FSPath",
@@ -737,15 +737,7 @@ class FSPath(URIPath):
     ):
         if isinstance(self.path_without_protocol, int):
             with open(fspath(dst_path), "wb") as fdst:
-                # This magic number is copied from copyfileobj
-                length = 16 * 1024
-                while True:
-                    buf = os.read(self.path_without_protocol, length)  # pyre-ignore[6]
-                    if not buf:
-                        break
-                    fdst.write(buf)
-                    if callback:
-                        callback(len(buf))
+                copyfd(self.path_without_protocol, fdst, callback)
         else:
             shutil.copy2(
                 self.path_without_protocol,  # pyre-ignore[6]
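
When the source is a raw file descriptor, FSPath now calls copyfd from megfile.utils instead of inlining the read loop. The helper's body is not part of this diff; a sketch that mirrors the removed loop, with the callback reporting the size of each chunk (signature and defaults are assumptions), could be:

    import os
    from typing import IO, Callable, Optional

    def copyfd(
        fsrc: int,
        fdst: IO[bytes],
        callback: Optional[Callable[[int], None]] = None,
        length: int = 16 * 1024,
    ) -> None:
        # Copy from an OS-level file descriptor to a writable file object,
        # invoking the callback with the number of bytes in each chunk.
        while True:
            buf = os.read(fsrc, length)
            if not buf:
                break
            fdst.write(buf)
            if callback:
                callback(len(buf))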
@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 
         self._offset = 0
         self._cached_buffer = None
-        self._block_index = None  # Current block index
+        self._block_index = 0  # Current block index
+        self._cached_offset = 0  # Current offset in the current block
         self._seek_history = []
-
         self._seek_buffer(0)
 
         _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         return self._process_local("futures", self._get_futures)
 
     def _get_futures(self):
-        return LRUCacheFutureManager()
+        futures = LRUCacheFutureManager()
+        futures.register(self.name)
+        return futures
 
     @property
     @abstractmethod
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if size == 0 or self._offset >= self._content_size:
             return b""
 
-        data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
-            "Body"
-        ].read()
+        resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
+        data = resp["Body"].read()
         self.seek(size, os.SEEK_CUR)
         return data
 
@@ -369,12 +370,17 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 class LRUCacheFutureManager(OrderedDict):
     def __init__(self):
         super().__init__()
+        self._name = None
+
+    def register(self, name):
+        self._name = name
 
     def submit(self, executor, key, *args, **kwargs):
         if key in self:
             self.move_to_end(key, last=True)
             return
         self[key] = executor.submit(*args, **kwargs)
+        _logger.debug("submit future: %r, key: %r" % (self._name, key))
 
     @property
     def finished(self):
@@ -385,7 +391,12 @@ class LRUCacheFutureManager(OrderedDict):
         return self[key].result()
 
     def cleanup(self, block_capacity: int):
+        keys = []
         while len(self) > block_capacity:
-            _, future = self.popitem(last=False)
+            key, future = self.popitem(last=False)
+            keys.append(key)
             if not future.done():
                 future.cancel()
+        if keys:
+            _logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
+        return keys
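
LRUCacheFutureManager stores prefetch futures in an OrderedDict so the least-recently-used block can be evicted first, and cleanup now collects and returns the evicted keys for logging. A small standalone sketch of that eviction pattern, using plain strings in place of futures (illustration only):

    from collections import OrderedDict

    cache = OrderedDict((i, "future-%d" % i) for i in range(4))

    # Re-submitting an existing key moves it to the most-recently-used end,
    # mirroring move_to_end(key, last=True) in submit().
    cache.move_to_end(1, last=True)

    # Evict from the least-recently-used end until the capacity is respected,
    # collecting the evicted keys as the new cleanup() does.
    evicted = []
    while len(cache) > 2:
        key, _ = cache.popitem(last=False)
        evicted.append(key)

    print(evicted)      # [0, 2]
    print(list(cache))  # [3, 1]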
@@ -4,6 +4,7 @@ from typing import Optional
 
 from megfile.config import (
     READER_BLOCK_SIZE,
+    READER_LAZY_PREFETCH,
     READER_MAX_BUFFER_SIZE,
     S3_MAX_RETRY_TIMES,
 )
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
         )
 
     def _get_content_size(self):
-        if self._block_capacity <= 0:
+        if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
             response = self._client.head_object(Bucket=self._bucket, Key=self._key)
             self._content_etag = response.get("ETag")
             return int(response["ContentLength"])
@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
         super().__init__()
         self._references = Counter()
 
-    def register(self, key):
-        self._references[key] += 1
-
-    def unregister(self, key):
-        self._references[key] -= 1
-        if self._references[key] == 0:
-            self._references.pop(key)
-            for key_tuple in list(self):
-                if key_tuple[0] != key:
+    def register(self, name):
+        self._references[name] += 1
+        _logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
+
+    def unregister(self, name):
+        self._references[name] -= 1
+        _logger.debug(
+            "unregister reader: %r, count: %d" % (name, self._references[name])
+        )
+        if self._references[name] == 0:
+            self._references.pop(name)
+            for key in list(self):
+                if key[0] != name:
                     continue
-                future = self.pop(key_tuple)
+                future = self.pop(key)
                 if not future.done():
                     future.cancel()  # pragma: no cover
+            _logger.debug("cleanup all futures of reader: %r" % name)
megfile/s3_path.py CHANGED
@@ -969,6 +969,8 @@ def s3_buffered_open(
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
+    :param buffered: If you are operating pickle file without .pkl or .pickle extension,
+        please set this to True to avoid the performance issue.
     :returns: An opened File object
     :raises: S3FileNotFoundError
     """
@@ -1000,6 +1002,7 @@ def s3_buffered_open(
     )
 
     if mode == "rb":
+        block_size = block_size or READER_BLOCK_SIZE
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -1008,7 +1011,7 @@ def s3_buffered_open(
                 s3_client=client,
                 max_retries=max_retries,
                 max_workers=max_workers,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -1023,13 +1026,14 @@ def s3_buffered_open(
                 max_workers=max_workers,
                 max_buffer_size=max_buffer_size,
                 block_forward=block_forward,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
             reader = io.BufferedReader(reader)  # type: ignore
         return reader
 
+    block_size = block_size or WRITER_BLOCK_SIZE
     if limited_seekable:
         if max_buffer_size is None:
             max_buffer_size = WRITER_MAX_BUFFER_SIZE
@@ -1038,7 +1042,7 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
            max_buffer_size=max_buffer_size,
            profile_name=s3_url._profile_name,
        )
@@ -1050,7 +1054,7 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
             max_buffer_size=max_buffer_size,
             profile_name=s3_url._profile_name,
         )
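
Per the added docstring note, buffered should be set to True when reading pickle data stored without a .pkl or .pickle extension; the handle is then wrapped in io.BufferedReader, which absorbs pickle's many small reads instead of sending them straight to the prefetch reader. A hedged usage sketch (bucket and key are placeholders, and passing mode as a keyword is an assumption about the signature):

    import pickle

    from megfile.s3_path import s3_buffered_open

    # A pickle stored without a .pkl/.pickle suffix: request buffering explicitly.
    with s3_buffered_open(
        "s3://my-bucket/checkpoints/model-weights", mode="rb", buffered=True
    ) as f:
        obj = pickle.load(f)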