megfile 3.1.6__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megfile/cli.py +12 -7
- megfile/config.py +34 -44
- megfile/fs.py +169 -11
- megfile/fs_path.py +183 -259
- megfile/hdfs.py +106 -5
- megfile/hdfs_path.py +34 -90
- megfile/http.py +50 -1
- megfile/http_path.py +27 -65
- megfile/interfaces.py +1 -8
- megfile/lib/base_prefetch_reader.py +62 -78
- megfile/lib/combine_reader.py +5 -0
- megfile/lib/glob.py +3 -6
- megfile/lib/hdfs_prefetch_reader.py +7 -7
- megfile/lib/http_prefetch_reader.py +6 -6
- megfile/lib/s3_buffered_writer.py +67 -64
- megfile/lib/s3_cached_handler.py +1 -2
- megfile/lib/s3_limited_seekable_writer.py +3 -7
- megfile/lib/s3_memory_handler.py +1 -2
- megfile/lib/s3_pipe_handler.py +1 -2
- megfile/lib/s3_prefetch_reader.py +15 -20
- megfile/lib/s3_share_cache_reader.py +8 -5
- megfile/pathlike.py +397 -401
- megfile/s3.py +118 -17
- megfile/s3_path.py +150 -224
- megfile/sftp.py +300 -10
- megfile/sftp_path.py +46 -322
- megfile/smart.py +33 -27
- megfile/smart_path.py +9 -14
- megfile/stdio.py +1 -1
- megfile/stdio_path.py +2 -2
- megfile/utils/__init__.py +11 -4
- megfile/version.py +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/METADATA +7 -7
- megfile-4.0.0.dist-info/RECORD +52 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/WHEEL +1 -1
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/top_level.txt +0 -2
- docs/conf.py +0 -65
- megfile-3.1.6.dist-info/RECORD +0 -55
- scripts/convert_results_to_sarif.py +0 -91
- scripts/generate_file.py +0 -344
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/entry_points.txt +0 -0
megfile/s3_path.py
CHANGED

@@ -7,18 +7,23 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property, lru_cache, wraps
 from logging import getLogger as get_logger
 from typing import IO, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple
+from urllib.parse import urlparse

 import boto3
 import botocore
-from
+from boto3.s3.transfer import TransferConfig
+from botocore.awsrequest import AWSPreparedRequest, AWSResponse

 from megfile.config import (
-    DEFAULT_BLOCK_SIZE,
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
     GLOBAL_MAX_WORKERS,
+    HTTP_AUTH_HEADERS,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
     S3_CLIENT_CACHE_MODE,
     S3_MAX_RETRY_TIMES,
+    WRITER_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
+    to_boolean,
 )
 from megfile.errors import (
     S3BucketNotFoundError,
@@ -59,7 +64,6 @@ from megfile.lib.fnmatch import translate
 from megfile.lib.glob import has_magic, has_magic_ignore_brace, ungloblize
 from megfile.lib.joinpath import uri_join
 from megfile.lib.s3_buffered_writer import (
-    DEFAULT_MAX_BUFFER_SIZE,
     S3BufferedWriter,
 )
 from megfile.lib.s3_cached_handler import S3CachedHandler
@@ -76,6 +80,7 @@ from megfile.utils import (
     generate_cache_path,
     get_binary_mode,
     get_content_offset,
+    is_domain_or_subdomain,
     is_readable,
     necessary_params,
     process_local,
@@ -98,26 +103,14 @@ __all__ = [
     "s3_share_cache_open",
     "s3_open",
     "S3Cacher",
-    "S3BufferedWriter",
-    "S3LimitedSeekableWriter",
-    "S3PrefetchReader",
-    "S3ShareCacheReader",
     "s3_upload",
     "s3_download",
     "s3_load_content",
-    "s3_readlink",
-    "s3_glob",
-    "s3_glob_stat",
-    "s3_iglob",
-    "s3_rename",
-    "s3_makedirs",
     "s3_concat",
-    "s3_lstat",
 ]
 _logger = get_logger(__name__)
 content_md5_header = "megfile-content-md5"
 endpoint_url = "https://s3.amazonaws.com"
-max_pool_connections = GLOBAL_MAX_WORKERS  # for compatibility
 max_retries = S3_MAX_RETRY_TIMES
 max_keys = 1000

@@ -162,24 +155,30 @@ def _patch_make_request(client: botocore.client.BaseClient, redirect: bool = Fal
         retry_callback=retry_callback,
     )

-    def
-    def
-
+    def patch_send(send):
+        def patched_send(request: AWSPreparedRequest) -> AWSResponse:
+            response: AWSResponse = send(request)
             if (
-
-                and
-                and "Location" in
+                request.method == "GET"  # only support GET method for now
+                and response.status_code in (301, 302, 307, 308)
+                and "Location" in response.headers
             ):
-
-
-
-
-
+                # Permit sending auth/cookie headers from "foo.com" to "sub.foo.com".
+                # See also: https://go.dev/src/net/http/client.go#L980
+                location = response.headers["Location"]
+                ihost = urlparse(request.url).hostname
+                dhost = urlparse(location).hostname
+                if not is_domain_or_subdomain(dhost, ihost):
+                    for name in HTTP_AUTH_HEADERS:
+                        request.headers.pop(name, None)
+                request.url = location
+                response = send(request)
+            return response
+
+        return patched_send

     if redirect:
-        client._endpoint.
-            client._endpoint._send_request
-        )
+        client._endpoint._send = patch_send(client._endpoint._send)

     return client

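The patched send hook above only follows GET redirects, and before re-sending it strips the credential headers listed in HTTP_AUTH_HEADERS whenever the Location target is not the same site. That decision is delegated to megfile.utils.is_domain_or_subdomain; the sketch below illustrates the rule the Go net/http reference describes, and is not the library's actual implementation.

from typing import Optional

def is_domain_or_subdomain(sub: Optional[str], parent: Optional[str]) -> bool:
    # True when `sub` is `parent` itself or a subdomain of it, e.g.
    # ("sub.foo.com", "foo.com") -> True, ("bar.com", "foo.com") -> False.
    if not sub or not parent:
        return False
    if sub == parent:
        return True
    return sub.endswith("." + parent)

# A redirect from foo.com to sub.foo.com keeps Authorization/Cookie headers;
# a redirect to another-site.com would have them stripped before re-sending.
assert is_domain_or_subdomain("sub.foo.com", "foo.com")
assert not is_domain_or_subdomain("another-site.com", "foo.com")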
@@ -257,7 +256,7 @@ def get_env_var(env_name: str, profile_name=None):
 def parse_boolean(value: Optional[str], default: bool = False) -> bool:
     if value is None:
         return default
-    return value
+    return to_boolean(value)


 def get_access_token(profile_name=None):
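The one-line change matters because parse_boolean is fed environment-variable strings: any non-empty string is truthy in Python, so the old return path treated "false" and "0" as enabled. to_boolean (now imported from megfile.config) centralizes the string-to-bool conversion; the stand-in below is a plausible sketch, not the library's actual code.

def to_boolean(value: str) -> bool:
    # Hypothetical stand-in for megfile.config.to_boolean; the real definition
    # lives in megfile/config.py.
    return value.strip().lower() in ("1", "true", "yes", "on")

assert to_boolean("TRUE") is True
assert to_boolean("false") is False   # old behavior: bool("false") was True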
@@ -629,7 +628,7 @@ def _s3_scan_pairs(
     src_url: PathLike, dst_url: PathLike
 ) -> Iterator[Tuple[PathLike, PathLike]]:
     for src_file_path in S3Path(src_url).scan():
-        content_path = src_file_path[len(src_url) :]
+        content_path = src_file_path[len(fspath(src_url)) :]
         if len(content_path) > 0:
             dst_file_path = s3_path_join(dst_url, content_path)
         else:
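The fspath() wrapper fixes the prefix slice when src_url arrives as a path object rather than a plain string, since len() is undefined on path-like objects. A minimal stdlib illustration of the same bug class, using a pathlib type as a stand-in for megfile's own path objects:

from os import fspath
from pathlib import PurePosixPath

src_url = PurePosixPath("bucket/prefix")        # stand-in for a path-like argument
src_file_path = "bucket/prefix/data/file.txt"

# len(src_url) would raise: object of type 'PurePosixPath' has no len()
content_path = src_file_path[len(fspath(src_url)):]
assert content_path == "/data/file.txt"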
@@ -689,8 +688,8 @@ def s3_prefetch_open(
     mode: str = "rb",
     followlinks: bool = False,
     *,
-
-
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3PrefetchReader:
     """Open a asynchronous prefetch reader, to support fast sequential
     read and random read
@@ -701,11 +700,18 @@ def s3_prefetch_open(

     Supports context manager

-    Some parameter setting may perform well:
-
-
-    :param
-    :param
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, `None` by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
@@ -721,15 +727,15 @@ def s3_prefetch_open(
         pass

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PrefetchReader(
         bucket,
         key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=
-        block_size=
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )

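With this change the prefetch reader is tuned directly through the two new keyword-only arguments. A usage sketch following the docstring's suggested values; the import path is assumed to match earlier releases and the object name is made up:

from megfile.s3_path import s3_prefetch_open

# 16 MB blocks fetched by 10 dedicated download threads; max_workers=None
# would fall back to the global thread pool instead.
with s3_prefetch_open(
    "s3://my-bucket/big-file.bin",   # hypothetical object
    max_workers=10,
    block_size=16 * 2**20,
) as reader:
    header = reader.read(1024)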
@@ -741,8 +747,8 @@ def s3_share_cache_open(
     followlinks: bool = False,
     *,
     cache_key: str = "lru",
-
-
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3ShareCacheReader:
     """Open a asynchronous prefetch reader, to support fast sequential read and
     random read
@@ -753,11 +759,18 @@ def s3_share_cache_open(

     Supports context manager

-    Some parameter setting may perform well:
-
-
-    :param
-    :param
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, None by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3ShareCacheReader object
     :raises: S3FileNotFoundError
@@ -774,7 +787,7 @@ def s3_share_cache_open(
         pass

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3ShareCacheReader(
         bucket,
@@ -782,8 +795,8 @@ def s3_share_cache_open(
         cache_key=cache_key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=
-        block_size=
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )

@@ -810,7 +823,13 @@ def s3_pipe_open(
     But asynchronous behavior can guarantee the file are successfully written,
     and frequent execution may cause thread and file handle exhaustion

-    :param
+    :param s3_url: s3 path
+    :param mode: Mode to open file, either "r", "rb", "w" or "wb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param join_thread: If wait after function execution until s3 finishes writing
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -829,7 +848,7 @@ def s3_pipe_open(
         pass

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PipeHandler(
         bucket,
@@ -860,7 +879,14 @@ def s3_cached_open(
     cache_path can specify the path of cache file. Performance could be better
     if cache file path is on ssd or tmpfs

-    :param
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param cache_path: cache file path
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -875,7 +901,7 @@ def s3_cached_open(
         pass

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3CachedHandler(
         bucket,
@@ -893,16 +919,14 @@ def s3_buffered_open(
     mode: str,
     followlinks: bool = False,
     *,
-
-    max_buffer_size: int =
-
+    max_workers: Optional[int] = None,
+    max_buffer_size: Optional[int] = None,
+    block_forward: Optional[int] = None,
     block_size: Optional[int] = None,
     limited_seekable: bool = False,
     buffered: bool = False,
     share_cache_key: Optional[str] = None,
     cache_path: Optional[str] = None,
-    min_block_size: Optional[int] = None,
-    max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
 ) -> IO:
     """Open an asynchronous prefetch reader, to support fast sequential read

@@ -912,22 +936,30 @@ def s3_buffered_open(

     Supports context manager

-    Some parameter setting may perform well:
-
-
-    :param
-    :param
-
-
-
-
-
+    Some parameter setting may perform well: max_workers=10 or 20,
+    default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download / upload thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: How many blocks of data cached from offset position, only for
+        read mode.
+    :param block_size: Size of single block.
         Each block will be uploaded by single thread.
     :param limited_seekable: If write-handle supports limited seek
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
-    :returns: An opened
+    :returns: An opened File object
     :raises: S3FileNotFoundError
     """
     if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
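The buffering knobs are now expressed directly in reader/writer terms (max_workers, max_buffer_size, block_forward, block_size) instead of the old min/max block-size parameters. A usage sketch under the documented defaults (8 MB blocks, 128 MB buffer); the import path is assumed unchanged and the bucket and keys are made up:

from megfile.s3_path import s3_buffered_open

# Read path: 8 MB blocks, prefetch 4 blocks ahead of the current offset,
# 16 dedicated worker threads instead of the global pool.
with s3_buffered_open(
    "s3://my-bucket/dataset.bin",    # hypothetical object
    "rb",
    max_workers=16,
    block_size=8 * 2**20,
    block_forward=4,
) as f:
    chunk = f.read(4096)

# Write path: cap the in-memory upload buffer at 64 MB (default is 128 MB).
with s3_buffered_open(
    "s3://my-bucket/output.bin",
    "wb",
    max_buffer_size=64 * 2**20,
) as f:
    f.write(b"hello")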
@@ -939,11 +971,8 @@ def s3_buffered_open(
             s3_url = s3_url.readlink()
         except S3NotALinkError:
             pass
-    min_block_size = min_block_size or block_size or DEFAULT_MIN_BLOCK_SIZE
-    block_size = block_size or DEFAULT_BLOCK_SIZE
-
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)

     if "a" in mode or "+" in mode:
@@ -961,13 +990,6 @@
         )

     if mode == "rb":
-        # A rough conversion algorithm to align 2 types of Reader / Writer parameters
-        # TODO: Optimize the conversion algorithm
-        block_capacity = max_buffer_size // block_size
-        if forward_ratio is None:
-            block_forward = None
-        else:
-            block_forward = max(int(block_capacity * forward_ratio), 1)
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -975,8 +997,8 @@
                 cache_key=share_cache_key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=
-                block_size=block_size,
+                max_workers=max_workers,
+                block_size=block_size or READER_BLOCK_SIZE,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -986,10 +1008,10 @@
                 key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=
-
+                max_workers=max_workers,
+                max_buffer_size=max_buffer_size or READER_MAX_BUFFER_SIZE,
                 block_forward=block_forward,
-                block_size=block_size,
+                block_size=block_size or READER_BLOCK_SIZE,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
@@ -1001,10 +1023,9 @@
             bucket,
             key,
             s3_client=client,
-            max_workers=
-            block_size=
-
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
             profile_name=s3_url._profile_name,
         )
     else:
@@ -1012,10 +1033,9 @@
             bucket,
             key,
             s3_client=client,
-            max_workers=
-            block_size=
-
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
             profile_name=s3_url._profile_name,
         )
     if buffered or _is_pickle(writer):
@@ -1050,7 +1070,7 @@ def s3_memory_open(
         pass

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3MemoryHandler(
         bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
@@ -1109,7 +1129,7 @@ def s3_download(
     if not src_url.is_file():
         raise S3IsADirectoryError("Is a directory: %r" % src_url.path_with_protocol)

-    dst_directory = os.path.dirname(dst_path.path_without_protocol)
+    dst_directory = os.path.dirname(dst_path.path_without_protocol)  # pyre-ignore[6]
     if dst_directory != "":
         os.makedirs(dst_directory, exist_ok=True)

@@ -1117,9 +1137,21 @@
     download_file = patch_method(
         client.download_file, max_retries=max_retries, should_retry=s3_should_retry
     )
+
+    transfer_config = TransferConfig(
+        multipart_threshold=READER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=READER_BLOCK_SIZE,
+        num_download_attempts=S3_MAX_RETRY_TIMES,
+        max_io_queue=max(READER_MAX_BUFFER_SIZE // READER_BLOCK_SIZE, 1),
+    )
     try:
         download_file(
-            src_bucket,
+            src_bucket,
+            src_key,
+            dst_path.path_without_protocol,
+            Callback=callback,
+            Config=transfer_config,
         )
     except Exception as error:
         error = translate_fs_error(error, dst_url)
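Downloads now delegate chunking and concurrency to boto3's TransferConfig instead of megfile's own reader, so the reader-side config constants map onto boto3 transfer settings. The snippet below re-creates that mapping with the defaults stated in the docstrings above (8 MB blocks, 128 MB buffer, 8 global workers); the retry count of 10 is an assumption for illustration, not taken from this diff:

from boto3.s3.transfer import TransferConfig

READER_BLOCK_SIZE = 8 * 2**20          # per-docstring default block size
READER_MAX_BUFFER_SIZE = 128 * 2**20   # per-docstring default buffer size
GLOBAL_MAX_WORKERS = 8                 # per-docstring global thread pool size
S3_MAX_RETRY_TIMES = 10                # assumed value for illustration

transfer_config = TransferConfig(
    multipart_threshold=READER_BLOCK_SIZE,       # switch to multipart beyond one block
    max_concurrency=GLOBAL_MAX_WORKERS,          # parallel part downloads
    multipart_chunksize=READER_BLOCK_SIZE,       # one block per part
    num_download_attempts=S3_MAX_RETRY_TIMES,
    max_io_queue=max(READER_MAX_BUFFER_SIZE // READER_BLOCK_SIZE, 1),  # 16 queued chunks
)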
@@ -1170,8 +1202,19 @@ def s3_upload(
         client.upload_fileobj, max_retries=max_retries, should_retry=s3_should_retry
     )

+    transfer_config = TransferConfig(
+        multipart_threshold=WRITER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=WRITER_BLOCK_SIZE,
+    )
     with open(src_path.path_without_protocol, "rb") as src, raise_s3_error(dst_url):
-        upload_fileobj(
+        upload_fileobj(
+            src,
+            Bucket=dst_bucket,
+            Key=dst_key,
+            Callback=callback,
+            Config=transfer_config,
+        )


 def s3_load_content(
@@ -1217,27 +1260,6 @@ def s3_load_content(
     )(client, bucket, key, range_str)


-def s3_readlink(path) -> str:
-    """
-    Return a string representing the path to which the symbolic link points.
-
-    :returns: Return a string representing the path to which the symbolic link points.
-    :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
-        S3NotALinkError
-    """
-    return S3Path(path).readlink().path_with_protocol
-
-
-def s3_rename(src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
-    """
-    Move s3 file path from src_url to dst_url
-
-    :param dst_url: Given destination path
-    :param overwrite: whether or not overwrite file when exists
-    """
-    S3Path(src_url).rename(dst_url, overwrite)
-
-
 class S3Cacher(FileCacher):
     cache_path = None

@@ -1259,97 +1281,8 @@ class S3Cacher(FileCacher):
             os.unlink(self.cache_path)


-def s3_glob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> List[str]:
-    """Return s3 path list in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A list contains paths match `s3_pathname`
-    """
-    return list(
-        s3_iglob(
-            path=path,
-            recursive=recursive,
-            missing_ok=missing_ok,
-            followlinks=followlinks,
-        )
-    )
-
-
-def s3_glob_stat(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[FileEntry]:
-    """Return a generator contains tuples of path and file stat,
-    in ascending alphabetical order, in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A generator contains tuples of path and file stat,
-        in which paths match `s3_pathname`
-    """
-    return S3Path(path).glob_stat(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    )
-
-
-def s3_iglob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[str]:
-    """Return s3 path iterator in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: An iterator contains paths match `s3_pathname`
-    """
-    for path_obj in S3Path(path).iglob(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    ):
-        yield path_obj.path_with_protocol
-
-
-def s3_makedirs(path: PathLike, exist_ok: bool = False):
-    """
-    Create an s3 directory.
-    Purely creating directory is invalid because it's unavailable on OSS.
-    This function is to test the target bucket have WRITE access.
-
-    :param path: Given path
-    :param exist_ok: If False and target directory exists, raise S3FileExistsError
-    :raises: S3BucketNotFoundError, S3FileExistsError
-    """
-    return S3Path(path).mkdir(parents=True, exist_ok=exist_ok)
-
-
 def _group_src_paths_by_block(
-    src_paths: List[PathLike], block_size: int =
+    src_paths: List[PathLike], block_size: int = READER_BLOCK_SIZE
 ) -> List[List[Tuple[PathLike, Optional[str]]]]:
     groups = []
     current_group, current_group_size = [], 0
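With the block of module-level wrappers above removed (they were also dropped from __all__ earlier in this diff), callers move to the S3Path object API. The equivalences below are read straight off the deleted function bodies, with a made-up pattern for illustration:

from megfile.s3_path import S3Path

path = "s3://my-bucket/prefix/**/*.json"    # hypothetical glob pattern

# s3_glob(path)       -> collect matches from S3Path(path).iglob()
matches = [p.path_with_protocol for p in S3Path(path).iglob(pattern="")]

# s3_glob_stat(path)  -> S3Path(path).glob_stat(pattern="")
entries = list(S3Path(path).glob_stat(pattern=""))

# s3_rename(src, dst)         -> S3Path(src).rename(dst, overwrite)
# s3_makedirs(path, exist_ok) -> S3Path(path).mkdir(parents=True, exist_ok=exist_ok)
# s3_lstat(path)              -> S3Path(path).lstat()
# s3_readlink(path)           -> S3Path(path).readlink().path_with_protocol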
@@ -1395,7 +1328,7 @@ def _group_src_paths_by_block(
 def s3_concat(
     src_paths: List[PathLike],
     dst_path: PathLike,
-    block_size: int =
+    block_size: int = READER_BLOCK_SIZE,
     max_workers: int = GLOBAL_MAX_WORKERS,
 ) -> None:
     """Concatenate s3 files to one file.
@@ -1410,9 +1343,10 @@ def s3_concat(
     else:
         groups = _group_src_paths_by_block(src_paths, block_size=block_size)

-    with
-
-
+    with (
+        MultiPartWriter(client, dst_path) as writer,
+        ThreadPoolExecutor(max_workers=max_workers) as executor,
+    ):
         for index, group in enumerate(groups, start=1):
             if len(group) == 1:
                 executor.submit(
@@ -1422,14 +1356,6 @@
                 executor.submit(writer.upload_part_by_paths, index, group)


-def s3_lstat(path: PathLike) -> StatResult:
-    """
-    Like Path.stat() but, if the path points to a symbolic link,
-    return the symbolic link’s information rather than its target’s.
-    """
-    return S3Path(path).lstat()
-
-
 @SmartPath.register
 class S3Path(URIPath):
     protocol = "s3"