megfile 4.2.3__py3-none-any.whl → 4.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megfile/__init__.py CHANGED
@@ -206,6 +206,16 @@ from megfile.stdio import is_stdio, stdio_open
 from megfile.stdio_path import StdioPath
 from megfile.version import VERSION as __version__ # noqa: F401

+try:
+    from megfile.sftp2_path import Sftp2Path
+except ImportError:
+    Sftp2Path = None
+
+try:
+    from megfile.webdav_path import WebdavPath
+except ImportError:
+    WebdavPath = None
+
 __all__ = [
     "smart_access",
     "smart_cache",
megfile/cli.py CHANGED
@@ -47,7 +47,7 @@ from megfile.smart import (
     smart_unlink,
 )
 from megfile.smart_path import SmartPath
-from megfile.utils import get_human_size
+from megfile.utils import copyfileobj_multi, get_human_size
 from megfile.version import VERSION

 options = {}
@@ -646,14 +646,10 @@ def to(path: str, append: bool, stdout: bool):
         smart_open(path, mode) as f,
         smart_open("stdio://1", "wb") as stdout_fd,
     ):
-        length = 16 * 1024
-        while True:
-            buf = stdin.read(length)
-            if not buf:
-                break
-            f.write(buf)
-            if stdout:
-                stdout_fd.write(buf)
+        destinations = [f]
+        if stdout:
+            destinations.append(stdout_fd)
+        copyfileobj_multi(stdin, destinations)


 @cli.command(short_help="Produce an md5sum file for all the objects in the path.")
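
The `to` command now delegates its manual read/write loop to copyfileobj_multi from megfile.utils, which copies one source stream into several destinations. The diff does not show that helper's implementation; a rough sketch of the behaviour implied by the call site, with the chunk size assumed to match the old 16 KB loop:

    from typing import IO, List

    def copyfileobj_multi(
        fsrc: IO[bytes], destinations: List[IO[bytes]], length: int = 16 * 1024
    ) -> None:
        # Assumed behaviour: read fsrc in chunks and fan each chunk out to
        # every destination, mirroring the loop this change removes.
        while True:
            buf = fsrc.read(length)
            if not buf:
                break
            for fdst in destinations:
                fdst.write(buf)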
megfile/config.py CHANGED
@@ -83,6 +83,7 @@ if READER_BLOCK_SIZE <= 0:
 READER_MAX_BUFFER_SIZE = parse_quantity(
     os.getenv("MEGFILE_READER_MAX_BUFFER_SIZE") or 128 * 2**20
 )
+READER_LAZY_PREFETCH = parse_boolean(os.getenv("MEGFILE_READER_LAZY_PREFETCH"), False)

 # Multi-upload in aws s3 has a maximum of 10,000 parts,
 # so the maximum supported file size is MEGFILE_WRITE_BLOCK_SIZE * 10,000,
@@ -105,6 +106,10 @@ GLOBAL_MAX_WORKERS = int(os.getenv("MEGFILE_MAX_WORKERS") or 8)

 NEWLINE = ord("\n")

+# Default buffer sizes for various operations
+DEFAULT_COPY_BUFFER_SIZE = 16 * 1024 # 16KB, same as shutil.copyfileobj
+DEFAULT_HASH_BUFFER_SIZE = 4 * 1024 # 4KB for hash calculations
+
 S3_CLIENT_CACHE_MODE = os.getenv("MEGFILE_S3_CLIENT_CACHE_MODE") or "thread_local"

 DEFAULT_MAX_RETRY_TIMES = int(os.getenv("MEGFILE_MAX_RETRY_TIMES") or 10)
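
The new READER_LAZY_PREFETCH flag is read once from the environment at import time, alongside the new default buffer-size constants. A small check of the resulting values, assuming parse_boolean accepts common truthy strings such as "true" or "1":

    import os

    # Must be set before megfile is imported, since the flag is read at import time.
    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "true"

    from megfile.config import (
        DEFAULT_COPY_BUFFER_SIZE,
        DEFAULT_HASH_BUFFER_SIZE,
        READER_LAZY_PREFETCH,
    )

    print(READER_LAZY_PREFETCH)        # expected: True
    print(DEFAULT_COPY_BUFFER_SIZE)    # expected: 16384
    print(DEFAULT_HASH_BUFFER_SIZE)    # expected: 4096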
megfile/fs.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from stat import S_ISDIR as stat_isdir
 from stat import S_ISLNK as stat_islnk
-from typing import BinaryIO, Callable, Iterator, List, Optional, Tuple
+from typing import IO, BinaryIO, Callable, Iterator, List, Optional, Tuple

 from megfile.fs_path import (
     FSPath,
@@ -52,6 +52,7 @@ __all__ = [
     "fs_islink",
     "fs_ismount",
     "fs_save_as",
+    "fs_open",
 ]


@@ -612,3 +613,15 @@ def fs_move(src_path: PathLike, dst_path: PathLike, overwrite: bool = True) -> N
     :param overwrite: whether or not overwrite file when exists
     """
     return fs_rename(src_path, dst_path, overwrite)
+
+
+def fs_open(path: PathLike, mode: str = "r", **kwargs) -> IO:
+    """
+    Open file on fs
+
+    :param path: Given path
+    :param mode: File open mode, like built-in open function
+    :param buffering: Buffering policy, like built-in open function
+    :returns: A file-like object
+    """
+    return FSPath(path).open(mode, **kwargs)
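
fs_open is a thin module-level wrapper around FSPath(path).open(...), mirroring the other fs_* helpers. A short usage sketch (the /tmp path is only an example):

    from megfile.fs import fs_open

    with fs_open("/tmp/example.txt", "w") as f:
        f.write("hello")

    with fs_open("/tmp/example.txt") as f:
        print(f.read())  # -> hello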
megfile/fs_path.py CHANGED
@@ -17,6 +17,7 @@ from megfile.interfaces import (
     Access,
     ContextIterator,
     FileEntry,
+    FileLike,
     PathLike,
     StatResult,
     URIPath,
@@ -27,7 +28,7 @@ from megfile.lib.glob import iglob
 from megfile.lib.joinpath import path_join
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import calculate_md5
+from megfile.utils import calculate_md5, copyfd

 __all__ = [
     "FSPath",
@@ -85,6 +86,36 @@ def _fs_rename_file(
     shutil.move(src_path, dst_path)


+class WrapAtomic(FileLike):
+    __atomic__ = True
+
+    def __init__(self, fileobj):
+        self.fileobj = fileobj
+        self.temp_name = f"{self.name}.temp"
+        os.rename(self.name, self.temp_name)
+
+    @property
+    def name(self):
+        return self.fileobj.name
+
+    @property
+    def mode(self):
+        return self.fileobj.mode
+
+    def _close(self):
+        self.fileobj.close()
+        os.rename(self.temp_name, self.name)
+
+    def _abort(self):
+        try:
+            os.unlink(self.temp_name)
+        except FileNotFoundError:
+            pass
+
+    def __getattr__(self, name: str):
+        return getattr(self.fileobj, name)
+
+
 @SmartPath.register
 class FSPath(URIPath):
     """file protocol
@@ -627,9 +658,11 @@ class FSPath(URIPath):
         """
         self._check_int_path()

-        if missing_ok and not self.exists():
-            return
-        os.unlink(self.path_without_protocol) # pyre-ignore[6]
+        try:
+            os.unlink(self.path_without_protocol) # pyre-ignore[6]
+        except FileNotFoundError:
+            if not missing_ok:
+                raise

     def walk(
         self, followlinks: bool = False
@@ -737,15 +770,7 @@ class FSPath(URIPath):
     ):
         if isinstance(self.path_without_protocol, int):
             with open(fspath(dst_path), "wb") as fdst:
-                # This magic number is copied from copyfileobj
-                length = 16 * 1024
-                while True:
-                    buf = os.read(self.path_without_protocol, length) # pyre-ignore[6]
-                    if not buf:
-                        break
-                    fdst.write(buf)
-                    if callback:
-                        callback(len(buf))
+                copyfd(self.path_without_protocol, fdst, callback)
         else:
             shutil.copy2(
                 self.path_without_protocol, # pyre-ignore[6]
@@ -925,11 +950,12 @@ class FSPath(URIPath):
     def open(
         self,
         mode: str = "r",
-        buffering=-1,
-        encoding=None,
-        errors=None,
-        newline=None,
-        closefd=True,
+        buffering: int = -1,
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        newline: Optional[str] = None,
+        closefd: bool = True,
+        atomic: bool = False,
         **kwargs,
     ) -> IO:
         if not isinstance(self.path_without_protocol, int) and (
@@ -940,7 +966,7 @@ class FSPath(URIPath):
                    self.path_without_protocol # pyre-ignore[6]
                )
            ).mkdir(parents=True, exist_ok=True)
-        return io.open(
+        fp = io.open(
             self.path_without_protocol,
             mode,
             buffering=buffering,
@@ -949,6 +975,9 @@ class FSPath(URIPath):
             newline=newline,
             closefd=closefd,
         )
+        if atomic and ("w" in mode or "x" in mode or "a" in mode):
+            return WrapAtomic(fp)
+        return fp

     @cached_property
     def parts(self) -> Tuple[str, ...]:
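
The new atomic option on FSPath.open wraps the writer in WrapAtomic: the just-created file is immediately renamed to "<path>.temp", writes go to that temp file, and only a successful close() renames it back to the target path. If an exception escapes the with-block, Closable.__exit__ calls _abort() and the partially written temp file is deleted instead of being renamed into place. A short usage sketch (the path is illustrative):

    from megfile.fs_path import FSPath

    with FSPath("/tmp/report.json").open("w", atomic=True) as f:
        f.write('{"status": "ok"}')
    # On success, /tmp/report.json.temp has been renamed to /tmp/report.json.
    # Had an exception been raised inside the block, the .temp file would have
    # been removed and no half-written file renamed into place.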
megfile/interfaces.py CHANGED
@@ -1,6 +1,7 @@
 import os
 from abc import ABC, abstractmethod
 from io import IOBase, UnsupportedOperation
+from logging import getLogger as get_logger
 from typing import IO, AnyStr, Iterable, List, Optional

 from megfile.pathlike import (
@@ -31,6 +32,8 @@ __all__ = [
     "URIPath",
 ]

+_logger = get_logger(__name__)
+

 def fullname(o):
     klass = o.__class__
@@ -43,16 +46,28 @@ def fullname(o):
 # 1. Default value of closed is False
 # 2. closed is set to True when close() are called
 # 3. close() will only be called once
+# 4. atomic means the file-like object should not be closed automatically
+# when an exception is raised in the context manager or when the object is
+# garbage collected.
+# 5. atomic is False by default
 class Closable(ABC):
     @property
     def closed(self) -> bool:
         """Return True if the file-like object is closed."""
         return getattr(self, "__closed__", False)

+    @property
+    def atomic(self) -> bool:
+        """Return True if the file-like object is atomic."""
+        return getattr(self, "__atomic__", False)
+
     @abstractmethod
     def _close(self) -> None:
         pass # pragma: no cover

+    def _abort(self) -> None:
+        pass
+
     def close(self) -> None:
         """Flush and close the file-like object.

@@ -66,6 +81,24 @@ class Closable(ABC):
         return self

     def __exit__(self, type, value, traceback) -> None:
+        if self.atomic and value is not None:
+            from megfile.errors import full_error_message
+
+            _logger.warning(
+                f"skip closing atomic file-like object: {self}, "
+                f"since error encountered: {full_error_message(value)}"
+            )
+            self._abort()
+            return
+        self.close()
+
+    def __del__(self):
+        if self.atomic:
+            _logger.warning(
+                f"skip closing atomic file-like object before deletion: {self}"
+            )
+            self._abort()
+            return
         self.close()


@@ -82,9 +82,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):

         self._offset = 0
         self._cached_buffer = None
-        self._block_index = None # Current block index
+        self._block_index = 0 # Current block index
+        self._cached_offset = 0 # Current offset in the current block
         self._seek_history = []
-
         self._seek_buffer(0)

         _logger.debug("open file: %r, mode: %s" % (self.name, self.mode))
@@ -98,7 +98,9 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         return self._process_local("futures", self._get_futures)

     def _get_futures(self):
-        return LRUCacheFutureManager()
+        futures = LRUCacheFutureManager()
+        futures.register(self.name)
+        return futures

     @property
     @abstractmethod
@@ -207,9 +209,8 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
         if size == 0 or self._offset >= self._content_size:
             return b""

-        data = self._fetch_response(start=self._offset, end=self._offset + size - 1)[
-            "Body"
-        ].read()
+        resp = self._fetch_response(start=self._offset, end=self._offset + size - 1)
+        data = resp["Body"].read()
         self.seek(size, os.SEEK_CUR)
         return data

@@ -369,12 +370,17 @@ class BasePrefetchReader(Readable[bytes], Seekable, ABC):
 class LRUCacheFutureManager(OrderedDict):
     def __init__(self):
         super().__init__()
+        self._name = None
+
+    def register(self, name):
+        self._name = name

     def submit(self, executor, key, *args, **kwargs):
         if key in self:
             self.move_to_end(key, last=True)
             return
         self[key] = executor.submit(*args, **kwargs)
+        _logger.debug("submit future: %r, key: %r" % (self._name, key))

     @property
     def finished(self):
@@ -385,7 +391,12 @@ class LRUCacheFutureManager(OrderedDict):
         return self[key].result()

     def cleanup(self, block_capacity: int):
+        keys = []
         while len(self) > block_capacity:
-            _, future = self.popitem(last=False)
+            key, future = self.popitem(last=False)
+            keys.append(key)
             if not future.done():
                 future.cancel()
+        if keys:
+            _logger.debug("cleanup futures: %r, keys: %s" % (self._name, keys))
+        return keys
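
LRUCacheFutureManager now remembers the reader name for debug logging (register) and cleanup() returns the keys it evicted. A standalone sketch of that interface; normally BasePrefetchReader drives this internally, and the module path below is assumed:

    import time
    from concurrent.futures import ThreadPoolExecutor

    from megfile.lib.base_prefetch_reader import LRUCacheFutureManager  # module path assumed

    manager = LRUCacheFutureManager()
    manager.register("s3://bucket/key")  # name is only used for debug logging

    with ThreadPoolExecutor(max_workers=2) as executor:
        for block_index in range(4):
            manager.submit(executor, block_index, time.sleep, 0.01)
        evicted = manager.cleanup(block_capacity=2)  # keeps the 2 most recent keys
        print(evicted)  # -> [0, 1]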
megfile/lib/joinpath.py CHANGED
@@ -33,3 +33,16 @@ def uri_join(path: str, *other_paths: str) -> str:

     # Imp. 3
     # return '/'.join((path, *other_paths))
+
+
+def uri_norm(path: str) -> str:
+    parts = path.split("/")
+    new_parts = []
+    for part in parts:
+        if part == ".":
+            continue
+        if part == ".." and new_parts and new_parts[-1] != "..":
+            new_parts.pop()
+        else:
+            new_parts.append(part)
+    return "/".join(new_parts)
@@ -53,11 +53,13 @@ class S3BufferedWriter(Writable[bytes]):
         max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
+        atomic: bool = False,
     ):
         self._bucket = bucket
         self._key = key
         self._client = s3_client
         self._profile_name = profile_name
+        self.__atomic__ = atomic

         # user maybe put block_size with 'numpy.uint64' type
         self._base_block_size = int(block_size)
@@ -213,6 +215,17 @@
         if not self._is_global_executor:
             self._executor.shutdown()

+    def _abort(self):
+        _logger.debug("abort file: %r" % self.name)
+
+        if self._is_multipart:
+            with raise_s3_error(self.name):
+                self._client.abort_multipart_upload(
+                    Bucket=self._bucket, Key=self._key, UploadId=self._upload_id
+                )
+
+        self._shutdown()
+
     def _close(self):
         _logger.debug("close file: %r" % self.name)

@@ -33,6 +33,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
         max_buffer_size: int = WRITER_MAX_BUFFER_SIZE,
         max_workers: Optional[int] = None,
         profile_name: Optional[str] = None,
+        atomic: bool = False,
     ):
         super().__init__(
             bucket,
@@ -42,6 +43,7 @@ class S3LimitedSeekableWriter(S3BufferedWriter, Seekable):
             max_buffer_size=max_buffer_size,
             max_workers=max_workers,
             profile_name=profile_name,
+            atomic=atomic,
         )

         self._head_block_size = head_block_size or block_size
@@ -4,6 +4,7 @@ from typing import Optional

 from megfile.config import (
     READER_BLOCK_SIZE,
+    READER_LAZY_PREFETCH,
     READER_MAX_BUFFER_SIZE,
     S3_MAX_RETRY_TIMES,
 )
@@ -62,7 +63,7 @@ class S3PrefetchReader(BasePrefetchReader):
         )

     def _get_content_size(self):
-        if self._block_capacity <= 0:
+        if self._block_capacity <= 0 or READER_LAZY_PREFETCH:
             response = self._client.head_object(Bucket=self._bucket, Key=self._key)
             self._content_etag = response.get("ETag")
             return int(response["ContentLength"])
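
With READER_LAZY_PREFETCH enabled, _get_content_size learns the object size via head_object instead of eagerly downloading the first block, so nothing is fetched before the first read. An illustrative end-to-end sketch (bucket and key are placeholders):

    import os

    os.environ["MEGFILE_READER_LAZY_PREFETCH"] = "1"  # must be set before importing megfile

    from megfile import smart_open

    with smart_open("s3://bucket/large.bin", "rb") as f:
        header = f.read(1024)  # size comes from head_object; blocks are fetched only once reading starts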
@@ -101,16 +101,21 @@ class ShareCacheFutureManager(LRUCacheFutureManager):
         super().__init__()
         self._references = Counter()

-    def register(self, key):
-        self._references[key] += 1
-
-    def unregister(self, key):
-        self._references[key] -= 1
-        if self._references[key] == 0:
-            self._references.pop(key)
-            for key_tuple in list(self):
-                if key_tuple[0] != key:
+    def register(self, name):
+        self._references[name] += 1
+        _logger.debug("register reader: %r, count: %d" % (name, self._references[name]))
+
+    def unregister(self, name):
+        self._references[name] -= 1
+        _logger.debug(
+            "unregister reader: %r, count: %d" % (name, self._references[name])
+        )
+        if self._references[name] == 0:
+            self._references.pop(name)
+            for key in list(self):
+                if key[0] != name:
                     continue
-                future = self.pop(key_tuple)
+                future = self.pop(key)
                 if not future.done():
                     future.cancel() # pragma: no cover
+            _logger.debug("cleanup all futures of reader: %r" % name)
megfile/s3_path.py CHANGED
@@ -230,7 +230,7 @@ def get_endpoint_url(profile_name: Optional[str] = None) -> str:
         config_endpoint_url = config.get("s3", {}).get("endpoint_url")
         config_endpoint_url = config_endpoint_url or config.get("endpoint_url")
         if config_endpoint_url:
-            warning_endpoint_url("~/.aws/config", config_endpoint_url)
+            warning_endpoint_url("~/.aws/config or ~/.aws/credentials", config_endpoint_url)
             return config_endpoint_url
     return endpoint_url

@@ -937,6 +937,7 @@ def s3_buffered_open(
     buffered: bool = False,
     share_cache_key: Optional[str] = None,
     cache_path: Optional[str] = None,
+    atomic: bool = False,
 ) -> IO:
     """Open an asynchronous prefetch reader, to support fast sequential read

@@ -969,6 +970,8 @@ def s3_buffered_open(
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
+    :param buffered: If you are operating pickle file without .pkl or .pickle extension,
+        please set this to True to avoid the performance issue.
     :returns: An opened File object
     :raises: S3FileNotFoundError
     """
@@ -1000,6 +1003,7 @@ def s3_buffered_open(
     )

     if mode == "rb":
+        block_size = block_size or READER_BLOCK_SIZE
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -1008,7 +1012,7 @@ def s3_buffered_open(
                 s3_client=client,
                 max_retries=max_retries,
                 max_workers=max_workers,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -1023,13 +1027,14 @@ def s3_buffered_open(
                 max_workers=max_workers,
                 max_buffer_size=max_buffer_size,
                 block_forward=block_forward,
-                block_size=block_size or READER_BLOCK_SIZE,
+                block_size=block_size,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
             reader = io.BufferedReader(reader) # type: ignore
         return reader

+    block_size = block_size or WRITER_BLOCK_SIZE
     if limited_seekable:
         if max_buffer_size is None:
             max_buffer_size = WRITER_MAX_BUFFER_SIZE
@@ -1038,9 +1043,10 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
             max_buffer_size=max_buffer_size,
             profile_name=s3_url._profile_name,
+            atomic=atomic,
         )
     else:
         if max_buffer_size is None:
@@ -1050,9 +1056,10 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_workers,
-            block_size=block_size or WRITER_BLOCK_SIZE,
+            block_size=block_size,
             max_buffer_size=max_buffer_size,
             profile_name=s3_url._profile_name,
+            atomic=atomic,
         )
     if buffered or _is_pickle(writer):
         writer = io.BufferedWriter(writer) # type: ignore
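
s3_buffered_open now threads the new atomic flag through to both S3 writer classes, so a failed write calls _abort() and aborts any in-progress multipart upload rather than committing a partial object on close. A usage sketch, assuming the usual (path, mode) positional signature and a placeholder bucket/key:

    from megfile.s3_path import s3_buffered_open

    with s3_buffered_open("s3://bucket/key", "wb", atomic=True) as f:
        f.write(b"payload")
    # On a clean exit the upload completes as usual; if an exception had escaped
    # the with-block, __exit__ would have called _abort() and any in-progress
    # multipart upload would have been aborted instead of finished.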