megfile 3.1.6.post1__py3-none-any.whl → 4.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +27 -39
- megfile/fs.py +169 -12
- megfile/fs_path.py +183 -260
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +71 -65
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +10 -19
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +126 -209
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +3 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/METADATA +7 -7
- megfile-4.0.0.post1.dist-info/RECORD +52 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.post1.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.post1.dist-info → megfile-4.0.0.post1.dist-info}/entry_points.txt +0 -0
megfile/s3_path.py
CHANGED
```diff
@@ -11,16 +11,19 @@ from urllib.parse import urlparse
 
 import boto3
 import botocore
+from boto3.s3.transfer import TransferConfig
 from botocore.awsrequest import AWSPreparedRequest, AWSResponse
 
 from megfile.config import (
-    DEFAULT_BLOCK_SIZE,
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
     GLOBAL_MAX_WORKERS,
     HTTP_AUTH_HEADERS,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
     S3_CLIENT_CACHE_MODE,
     S3_MAX_RETRY_TIMES,
+    WRITER_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
+    to_boolean,
 )
 from megfile.errors import (
     S3BucketNotFoundError,
```
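The import hunk summarizes the 4.0.0 buffering rework in one place: the generic `DEFAULT_BLOCK_SIZE` / `DEFAULT_MIN_BLOCK_SIZE` / `DEFAULT_MAX_BLOCK_SIZE` constants are dropped in favour of reader- and writer-specific sizes, `to_boolean` is pulled in from `megfile.config`, and `boto3.s3.transfer.TransferConfig` is imported for the download/upload paths further below. A minimal sketch of how downstream code that imported the old constants could stay compatible across both versions; treat the old-to-new pairing as an assumption inferred from this diff, not a documented mapping:

```python
# Hypothetical compatibility shim; the fallback pairing is an assumption
# inferred from this diff, not a documented megfile mapping.
try:
    from megfile.config import READER_BLOCK_SIZE, WRITER_BLOCK_SIZE  # megfile >= 4.0
except ImportError:  # megfile 3.x still exposes the generic constant
    from megfile.config import DEFAULT_BLOCK_SIZE as READER_BLOCK_SIZE
    from megfile.config import DEFAULT_BLOCK_SIZE as WRITER_BLOCK_SIZE

print(READER_BLOCK_SIZE, WRITER_BLOCK_SIZE)
```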
```diff
@@ -61,7 +64,6 @@ from megfile.lib.fnmatch import translate
 from megfile.lib.glob import has_magic, has_magic_ignore_brace, ungloblize
 from megfile.lib.joinpath import uri_join
 from megfile.lib.s3_buffered_writer import (
-    DEFAULT_MAX_BUFFER_SIZE,
     S3BufferedWriter,
 )
 from megfile.lib.s3_cached_handler import S3CachedHandler
@@ -101,26 +103,14 @@ __all__ = [
     "s3_share_cache_open",
     "s3_open",
     "S3Cacher",
-    "S3BufferedWriter",
-    "S3LimitedSeekableWriter",
-    "S3PrefetchReader",
-    "S3ShareCacheReader",
     "s3_upload",
     "s3_download",
     "s3_load_content",
-    "s3_readlink",
-    "s3_glob",
-    "s3_glob_stat",
-    "s3_iglob",
-    "s3_rename",
-    "s3_makedirs",
     "s3_concat",
-    "s3_lstat",
 ]
 _logger = get_logger(__name__)
 content_md5_header = "megfile-content-md5"
 endpoint_url = "https://s3.amazonaws.com"
-max_pool_connections = GLOBAL_MAX_WORKERS  # for compatibility
 max_retries = S3_MAX_RETRY_TIMES
 max_keys = 1000
 
@@ -266,7 +256,7 @@ def get_env_var(env_name: str, profile_name=None):
 def parse_boolean(value: Optional[str], default: bool = False) -> bool:
     if value is None:
         return default
-    return value
+    return to_boolean(value)
 
 
 def get_access_token(profile_name=None):
```
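The `parse_boolean` hunk is a behaviour fix, not a rename: the old body returned the raw string, so an environment value such as "false" or "0" was truthy to callers that tested the result. A small sketch of the corrected semantics, assuming `to_boolean` accepts the usual truthy spellings (the exact accepted set lives in `megfile/config.py` and is not shown in this diff):

```python
# Sketch only: the accepted spellings below are an assumption, not megfile's list.
def to_boolean(value: str) -> bool:
    return value.strip().lower() in ("true", "yes", "on", "1")

def parse_boolean(value, default=False):
    if value is None:
        return default
    return to_boolean(value)

assert parse_boolean(None, default=True) is True
assert parse_boolean("false") is False  # the 3.x code returned the truthy string "false"
```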
```diff
@@ -638,7 +628,7 @@ def _s3_scan_pairs(
     src_url: PathLike, dst_url: PathLike
 ) -> Iterator[Tuple[PathLike, PathLike]]:
     for src_file_path in S3Path(src_url).scan():
-        content_path = src_file_path[len(src_url) :]
+        content_path = src_file_path[len(fspath(src_url)) :]
         if len(content_path) > 0:
             dst_file_path = s3_path_join(dst_url, content_path)
         else:
```
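The `fspath` wrapper matters because `src_url` is typed `PathLike`, not `str`: `len()` on a path object raises `TypeError`, while `len(fspath(src_url))` measures its string form. A standalone illustration using `pathlib.PurePosixPath` as a stand-in for a non-str path object (megfile's own path classes are the real case here):

```python
from os import fspath
from pathlib import PurePosixPath

src_url = PurePosixPath("bucket/prefix")        # stand-in for a PathLike argument
src_file_path = "bucket/prefix/data/part-0001"

# len(src_url) would raise TypeError: object of type 'PurePosixPath' has no len()
content_path = src_file_path[len(fspath(src_url)):]
print(content_path)  # -> /data/part-0001
```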
```diff
@@ -698,8 +688,8 @@ def s3_prefetch_open(
     mode: str = "rb",
     followlinks: bool = False,
     *,
-
-
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3PrefetchReader:
     """Open a asynchronous prefetch reader, to support fast sequential
     read and random read
@@ -710,11 +700,18 @@
 
     Supports context manager
 
-    Some parameter setting may perform well:
-
-
-    :param
-    :param
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, `None` by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
@@ -730,15 +727,15 @@
         pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PrefetchReader(
         bucket,
         key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=
-        block_size=
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )
 
```
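Net effect of the three hunks above: `s3_prefetch_open` now takes keyword-only `max_workers` and `block_size` arguments, defaulting to the shared thread pool and `READER_BLOCK_SIZE`, and the client is built with `max_pool_connections=GLOBAL_MAX_WORKERS`. A usage sketch against the new signature; the bucket, key and tuning values are placeholders:

```python
from megfile.s3_path import s3_prefetch_open

# Placeholders throughout; the tuning values follow the docstring's suggestion.
with s3_prefetch_open(
    "s3://my-bucket/big-file.bin",
    mode="rb",
    max_workers=10,          # dedicated download threads instead of the global pool
    block_size=16 * 2**20,   # 16 MB fetched per thread per block
) as reader:
    header = reader.read(4096)
```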
```diff
@@ -750,8 +747,8 @@ def s3_share_cache_open(
     followlinks: bool = False,
     *,
     cache_key: str = "lru",
-
-
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3ShareCacheReader:
     """Open a asynchronous prefetch reader, to support fast sequential read and
     random read
@@ -762,11 +759,18 @@
 
     Supports context manager
 
-    Some parameter setting may perform well:
-
-
-    :param
-    :param
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, None by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3ShareCacheReader object
     :raises: S3FileNotFoundError
@@ -783,7 +787,7 @@
         pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3ShareCacheReader(
         bucket,
@@ -791,8 +795,8 @@
         cache_key=cache_key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=
-        block_size=
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )
 
@@ -819,7 +823,13 @@ def s3_pipe_open(
     But asynchronous behavior can guarantee the file are successfully written,
     and frequent execution may cause thread and file handle exhaustion
 
-    :param
+    :param s3_url: s3 path
+    :param mode: Mode to open file, either "r", "rb", "w" or "wb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param join_thread: If wait after function execution until s3 finishes writing
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -838,7 +848,7 @@
         pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PipeHandler(
         bucket,
@@ -869,7 +879,14 @@
     cache_path can specify the path of cache file. Performance could be better
     if cache file path is on ssd or tmpfs
 
-    :param
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param cache_path: cache file path
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -884,7 +901,7 @@
         pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3CachedHandler(
         bucket,
@@ -902,16 +919,14 @@ def s3_buffered_open(
     mode: str,
     followlinks: bool = False,
     *,
-
-    max_buffer_size: int =
-
+    max_workers: Optional[int] = None,
+    max_buffer_size: Optional[int] = None,
+    block_forward: Optional[int] = None,
     block_size: Optional[int] = None,
     limited_seekable: bool = False,
     buffered: bool = False,
     share_cache_key: Optional[str] = None,
     cache_path: Optional[str] = None,
-    min_block_size: Optional[int] = None,
-    max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
 ) -> IO:
     """Open an asynchronous prefetch reader, to support fast sequential read
 
@@ -921,22 +936,30 @@
 
     Supports context manager
 
-    Some parameter setting may perform well:
-
-
-    :param
-    :param
-
-
-
-
-
+    Some parameter setting may perform well: max_workers=10 or 20,
+    default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download / upload thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: How many blocks of data cached from offset position, only for
+        read mode.
+    :param block_size: Size of single block.
         Each block will be uploaded by single thread.
     :param limited_seekable: If write-handle supports limited seek
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
        Read-handle support arbitrary seek
-    :returns: An opened
+    :returns: An opened File object
     :raises: S3FileNotFoundError
     """
     if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
```
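After this hunk `s3_buffered_open` exposes one coherent set of knobs for both directions: `max_workers`, `max_buffer_size`, `block_forward` and `block_size`, all optional and keyword-only, falling back to the global thread pool and the `READER_*` / `WRITER_*` defaults. A usage sketch against the new signature; URLs and sizes are placeholders:

```python
from megfile.s3_path import s3_buffered_open

# Read side: explicit tuning (placeholder values).
with s3_buffered_open(
    "s3://my-bucket/input.bin",
    "rb",
    max_workers=10,
    block_size=8 * 2**20,   # 8 MB per block
    block_forward=4,        # keep 4 blocks prefetched ahead of the read offset
) as f:
    chunk = f.read(1024)

# Write side: rely on the WRITER_BLOCK_SIZE / WRITER_MAX_BUFFER_SIZE defaults.
with s3_buffered_open("s3://my-bucket/output.bin", "wb") as f:
    f.write(b"payload")
```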
```diff
@@ -948,11 +971,8 @@
         s3_url = s3_url.readlink()
     except S3NotALinkError:
         pass
-    min_block_size = min_block_size or block_size or DEFAULT_MIN_BLOCK_SIZE
-    block_size = block_size or DEFAULT_BLOCK_SIZE
-
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
 
     if "a" in mode or "+" in mode:
@@ -970,13 +990,6 @@
         )
 
     if mode == "rb":
-        # A rough conversion algorithm to align 2 types of Reader / Writer parameters
-        # TODO: Optimize the conversion algorithm
-        block_capacity = max_buffer_size // block_size
-        if forward_ratio is None:
-            block_forward = None
-        else:
-            block_forward = max(int(block_capacity * forward_ratio), 1)
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -984,8 +997,8 @@
                 cache_key=share_cache_key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=
-                block_size=block_size,
+                max_workers=max_workers,
+                block_size=block_size or READER_BLOCK_SIZE,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -995,10 +1008,10 @@
                 key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=
-
+                max_workers=max_workers,
+                max_buffer_size=max_buffer_size or READER_MAX_BUFFER_SIZE,
                 block_forward=block_forward,
-                block_size=block_size,
+                block_size=block_size or READER_BLOCK_SIZE,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
@@ -1010,10 +1023,9 @@
             bucket,
             key,
             s3_client=client,
-            max_workers=
-            block_size=
-
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
             profile_name=s3_url._profile_name,
         )
     else:
@@ -1021,10 +1033,9 @@
             bucket,
             key,
             s3_client=client,
-            max_workers=
-            block_size=
-
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
             profile_name=s3_url._profile_name,
         )
     if buffered or _is_pickle(writer):
@@ -1059,7 +1070,7 @@ def s3_memory_open(
         pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3MemoryHandler(
         bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
@@ -1118,7 +1129,7 @@ def s3_download(
     if not src_url.is_file():
         raise S3IsADirectoryError("Is a directory: %r" % src_url.path_with_protocol)
 
-    dst_directory = os.path.dirname(dst_path.path_without_protocol)
+    dst_directory = os.path.dirname(dst_path.path_without_protocol)  # pyre-ignore[6]
     if dst_directory != "":
         os.makedirs(dst_directory, exist_ok=True)
 
@@ -1126,9 +1137,21 @@
     download_file = patch_method(
         client.download_file, max_retries=max_retries, should_retry=s3_should_retry
     )
+
+    transfer_config = TransferConfig(
+        multipart_threshold=READER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=READER_BLOCK_SIZE,
+        num_download_attempts=S3_MAX_RETRY_TIMES,
+        max_io_queue=max(READER_MAX_BUFFER_SIZE // READER_BLOCK_SIZE, 1),
+    )
     try:
         download_file(
-            src_bucket,
+            src_bucket,
+            src_key,
+            dst_path.path_without_protocol,
+            Callback=callback,
+            Config=transfer_config,
         )
     except Exception as error:
         error = translate_fs_error(error, dst_url)
@@ -1179,8 +1202,19 @@ def s3_upload(
         client.upload_fileobj, max_retries=max_retries, should_retry=s3_should_retry
     )
 
+    transfer_config = TransferConfig(
+        multipart_threshold=WRITER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=WRITER_BLOCK_SIZE,
+    )
     with open(src_path.path_without_protocol, "rb") as src, raise_s3_error(dst_url):
-        upload_fileobj(
+        upload_fileobj(
+            src,
+            Bucket=dst_bucket,
+            Key=dst_key,
+            Callback=callback,
+            Config=transfer_config,
+        )
 
 
 def s3_load_content(
```
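Both `s3_download` and `s3_upload` now route their transfers through `boto3.s3.transfer.TransferConfig`, so part size, concurrency and per-part retry behaviour follow megfile's own constants instead of boto3's built-in defaults (8 MB parts, 10 threads). For reference, a plain-boto3 equivalent of the new download call; bucket, key, local path and sizes are placeholders:

```python
import boto3
from boto3.s3.transfer import TransferConfig

client = boto3.client("s3")

# Mirrors the shape of the new s3_download call, with illustrative sizes.
config = TransferConfig(
    multipart_threshold=8 * 2**20,   # switch to multipart above 8 MB
    multipart_chunksize=8 * 2**20,   # 8 MB parts
    max_concurrency=8,               # parallel part transfers
    num_download_attempts=10,        # per-part retry budget
)
client.download_file("my-bucket", "path/to/key", "/tmp/local-copy", Config=config)
```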
```diff
@@ -1226,27 +1260,6 @@ def s3_load_content(
     )(client, bucket, key, range_str)
 
 
-def s3_readlink(path) -> str:
-    """
-    Return a string representing the path to which the symbolic link points.
-
-    :returns: Return a string representing the path to which the symbolic link points.
-    :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
-        S3NotALinkError
-    """
-    return S3Path(path).readlink().path_with_protocol
-
-
-def s3_rename(src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
-    """
-    Move s3 file path from src_url to dst_url
-
-    :param dst_url: Given destination path
-    :param overwrite: whether or not overwrite file when exists
-    """
-    S3Path(src_url).rename(dst_url, overwrite)
-
-
 class S3Cacher(FileCacher):
     cache_path = None
 
@@ -1268,97 +1281,8 @@ class S3Cacher(FileCacher):
         os.unlink(self.cache_path)
 
 
-def s3_glob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> List[str]:
-    """Return s3 path list in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A list contains paths match `s3_pathname`
-    """
-    return list(
-        s3_iglob(
-            path=path,
-            recursive=recursive,
-            missing_ok=missing_ok,
-            followlinks=followlinks,
-        )
-    )
-
-
-def s3_glob_stat(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[FileEntry]:
-    """Return a generator contains tuples of path and file stat,
-    in ascending alphabetical order, in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A generator contains tuples of path and file stat,
-        in which paths match `s3_pathname`
-    """
-    return S3Path(path).glob_stat(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    )
-
-
-def s3_iglob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[str]:
-    """Return s3 path iterator in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: An iterator contains paths match `s3_pathname`
-    """
-    for path_obj in S3Path(path).iglob(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    ):
-        yield path_obj.path_with_protocol
-
-
-def s3_makedirs(path: PathLike, exist_ok: bool = False):
-    """
-    Create an s3 directory.
-    Purely creating directory is invalid because it's unavailable on OSS.
-    This function is to test the target bucket have WRITE access.
-
-    :param path: Given path
-    :param exist_ok: If False and target directory exists, raise S3FileExistsError
-    :raises: S3BucketNotFoundError, S3FileExistsError
-    """
-    return S3Path(path).mkdir(parents=True, exist_ok=exist_ok)
-
-
 def _group_src_paths_by_block(
-    src_paths: List[PathLike], block_size: int =
+    src_paths: List[PathLike], block_size: int = READER_BLOCK_SIZE
 ) -> List[List[Tuple[PathLike, Optional[str]]]]:
     groups = []
     current_group, current_group_size = [], 0
```
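The removed wrappers (`s3_readlink`, `s3_rename`, `s3_glob`, `s3_glob_stat`, `s3_iglob`, `s3_makedirs`, and `s3_lstat` further down) were thin delegations to `S3Path`, so their deleted bodies double as a migration guide. A sketch of the object-style equivalents; bucket names and patterns are placeholders, and whether `megfile.smart` keeps matching `smart_*` helpers is not shown in this hunk:

```python
from megfile.s3_path import S3Path

# Each line mirrors a deleted module-level wrapper from this diff.
S3Path("s3://my-bucket/dir").mkdir(parents=True, exist_ok=True)              # was s3_makedirs
S3Path("s3://my-bucket/a.txt").rename("s3://my-bucket/b.txt", True)          # was s3_rename
link_target = S3Path("s3://my-bucket/link").readlink().path_with_protocol    # was s3_readlink
stat = S3Path("s3://my-bucket/link").lstat()                                 # was s3_lstat

# was s3_iglob / s3_glob: iterate S3Path objects and take path_with_protocol
paths = [p.path_with_protocol
         for p in S3Path("s3://my-bucket/**/*.json").iglob(pattern="")]
```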
```diff
@@ -1404,7 +1328,7 @@ def _group_src_paths_by_block(
 def s3_concat(
     src_paths: List[PathLike],
     dst_path: PathLike,
-    block_size: int =
+    block_size: int = READER_BLOCK_SIZE,
     max_workers: int = GLOBAL_MAX_WORKERS,
 ) -> None:
     """Concatenate s3 files to one file.
@@ -1419,9 +1343,10 @@
     else:
         groups = _group_src_paths_by_block(src_paths, block_size=block_size)
 
-    with
-
-
+    with (
+        MultiPartWriter(client, dst_path) as writer,
+        ThreadPoolExecutor(max_workers=max_workers) as executor,
+    ):
         for index, group in enumerate(groups, start=1):
             if len(group) == 1:
                 executor.submit(
```
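The rewritten `s3_concat` body uses a parenthesized multi-item `with` statement, which is only officially documented from Python 3.10 (CPython 3.9's new parser happens to accept it). On older interpreters the same pairing can be spelled on one logical line instead; a self-contained sketch of both forms, using `contextlib.nullcontext` as stand-in context managers:

```python
from contextlib import nullcontext

# Python 3.10+ spelling, as used in the diff:
with (
    nullcontext("writer") as writer,
    nullcontext("executor") as executor,
):
    pass

# Equivalent spelling that also works on earlier versions:
with nullcontext("writer") as writer, \
        nullcontext("executor") as executor:
    pass
```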
```diff
@@ -1431,14 +1356,6 @@
                 executor.submit(writer.upload_part_by_paths, index, group)
 
 
-def s3_lstat(path: PathLike) -> StatResult:
-    """
-    Like Path.stat() but, if the path points to a symbolic link,
-    return the symbolic link’s information rather than its target’s.
-    """
-    return S3Path(path).lstat()
-
-
 @SmartPath.register
 class S3Path(URIPath):
     protocol = "s3"
```