megfile 3.1.1__py3-none-any.whl → 3.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +2 -4
- megfile/__init__.py +394 -203
- megfile/cli.py +258 -238
- megfile/config.py +25 -21
- megfile/errors.py +124 -114
- megfile/fs.py +174 -140
- megfile/fs_path.py +462 -354
- megfile/hdfs.py +133 -101
- megfile/hdfs_path.py +290 -236
- megfile/http.py +15 -14
- megfile/http_path.py +111 -107
- megfile/interfaces.py +70 -65
- megfile/lib/base_prefetch_reader.py +84 -65
- megfile/lib/combine_reader.py +12 -12
- megfile/lib/compare.py +17 -13
- megfile/lib/compat.py +1 -5
- megfile/lib/fnmatch.py +29 -30
- megfile/lib/glob.py +46 -54
- megfile/lib/hdfs_prefetch_reader.py +40 -25
- megfile/lib/hdfs_tools.py +1 -3
- megfile/lib/http_prefetch_reader.py +69 -46
- megfile/lib/joinpath.py +5 -5
- megfile/lib/lazy_handler.py +7 -3
- megfile/lib/s3_buffered_writer.py +58 -51
- megfile/lib/s3_cached_handler.py +13 -14
- megfile/lib/s3_limited_seekable_writer.py +37 -28
- megfile/lib/s3_memory_handler.py +34 -30
- megfile/lib/s3_pipe_handler.py +24 -25
- megfile/lib/s3_prefetch_reader.py +71 -52
- megfile/lib/s3_share_cache_reader.py +37 -24
- megfile/lib/shadow_handler.py +7 -3
- megfile/lib/stdio_handler.py +9 -8
- megfile/lib/url.py +3 -3
- megfile/pathlike.py +259 -228
- megfile/s3.py +220 -153
- megfile/s3_path.py +977 -802
- megfile/sftp.py +190 -156
- megfile/sftp_path.py +540 -450
- megfile/smart.py +397 -330
- megfile/smart_path.py +100 -105
- megfile/stdio.py +10 -9
- megfile/stdio_path.py +32 -35
- megfile/utils/__init__.py +73 -54
- megfile/utils/mutex.py +11 -14
- megfile/version.py +1 -1
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/METADATA +5 -8
- megfile-3.1.2.dist-info/RECORD +55 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/WHEEL +1 -1
- scripts/convert_results_to_sarif.py +45 -78
- scripts/generate_file.py +140 -64
- megfile-3.1.1.dist-info/RECORD +0 -55
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/entry_points.txt +0 -0
- {megfile-3.1.1.dist-info → megfile-3.1.2.dist-info}/top_level.txt +0 -0
megfile/s3_path.py
CHANGED
@@ -12,17 +12,56 @@ import boto3
 import botocore
 from botocore.awsrequest import AWSResponse

-from megfile.config import
-
-
-
-
+from megfile.config import (
+    DEFAULT_BLOCK_SIZE,
+    DEFAULT_MAX_BLOCK_SIZE,
+    DEFAULT_MIN_BLOCK_SIZE,
+    GLOBAL_MAX_WORKERS,
+    S3_CLIENT_CACHE_MODE,
+    S3_MAX_RETRY_TIMES,
+)
+from megfile.errors import (
+    S3BucketNotFoundError,
+    S3ConfigError,
+    S3FileExistsError,
+    S3FileNotFoundError,
+    S3IsADirectoryError,
+    S3NameTooLongError,
+    S3NotADirectoryError,
+    S3NotALinkError,
+    S3PermissionError,
+    S3UnknownError,
+    SameFileError,
+    UnsupportedError,
+    _create_missing_ok_generator,
+    patch_method,
+    raise_s3_error,
+    s3_error_code_should_retry,
+    s3_should_retry,
+    translate_fs_error,
+    translate_s3_error,
+)
+from megfile.errors import (
+    _logger as error_logger,
+)
+from megfile.interfaces import (
+    Access,
+    ContextIterator,
+    FileCacher,
+    FileEntry,
+    PathLike,
+    StatResult,
+    URIPath,
+)
 from megfile.lib.compare import is_same_file
 from megfile.lib.compat import fspath
 from megfile.lib.fnmatch import translate
 from megfile.lib.glob import has_magic, has_magic_ignore_brace, ungloblize
 from megfile.lib.joinpath import uri_join
-from megfile.lib.s3_buffered_writer import
+from megfile.lib.s3_buffered_writer import (
+    DEFAULT_MAX_BUFFER_SIZE,
+    S3BufferedWriter,
+)
 from megfile.lib.s3_cached_handler import S3CachedHandler
 from megfile.lib.s3_limited_seekable_writer import S3LimitedSeekableWriter
 from megfile.lib.s3_memory_handler import S3MemoryHandler
@@ -31,72 +70,88 @@ from megfile.lib.s3_prefetch_reader import S3PrefetchReader
 from megfile.lib.s3_share_cache_reader import S3ShareCacheReader
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import
+from megfile.utils import (
+    _is_pickle,
+    calculate_md5,
+    generate_cache_path,
+    get_binary_mode,
+    get_content_offset,
+    is_readable,
+    necessary_params,
+    process_local,
+    thread_local,
+)

 __all__ = [
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    "S3Path",
+    "parse_s3_url",
+    "get_endpoint_url",
+    "get_s3_session",
+    "get_s3_client",
+    "s3_path_join",
+    "is_s3",
+    "s3_buffered_open",
+    "s3_cached_open",
+    "s3_memory_open",
+    "s3_pipe_open",
+    "s3_prefetch_open",
+    "s3_share_cache_open",
+    "s3_open",
+    "S3Cacher",
+    "S3BufferedWriter",
+    "S3LimitedSeekableWriter",
+    "S3PrefetchReader",
+    "S3ShareCacheReader",
+    "s3_upload",
+    "s3_download",
+    "s3_load_content",
+    "s3_readlink",
+    "s3_glob",
+    "s3_glob_stat",
+    "s3_iglob",
+    "s3_rename",
+    "s3_makedirs",
+    "s3_concat",
+    "s3_lstat",
 ]
 _logger = get_logger(__name__)
-content_md5_header =
-endpoint_url =
+content_md5_header = "megfile-content-md5"
+endpoint_url = "https://s3.amazonaws.com"
 max_pool_connections = GLOBAL_MAX_WORKERS  # for compatibility
 max_retries = S3_MAX_RETRY_TIMES
 max_keys = 1000


 def _patch_make_request(client: botocore.client.BaseClient):
-
     def after_callback(result: Tuple[AWSResponse, dict], *args, **kwargs):
-        if
-
+        if (
+            not isinstance(result, tuple)
+            or len(result) != 2
+            or not isinstance(result[0], AWSResponse)
+            or not isinstance(result[1], dict)
+        ):
             return result
         http, parsed_response = result
-        if http.status_code >=
+        if http.status_code >= 400:
             error_code = parsed_response.get("Error", {}).get("Code")
-            operation_model = kwargs.get(
-                args[0] if args else None
-
+            operation_model = kwargs.get("operation_model") or (
+                args[0] if args else None
+            )
+            operation_name = operation_model.name if operation_model else "ProxyMethod"
             error_class = client.exceptions.from_code(error_code)
             raise error_class(parsed_response, operation_name)
         return result

     def retry_callback(error, operation_model, request_dict, request_context):
-        if is_readable(request_dict[
-            request_dict[
+        if is_readable(request_dict["body"]):
+            request_dict["body"].seek(0)

     def before_callback(operation_model, request_dict, request_context):
         _logger.debug(
-
-
+            "send s3 request: %r, with parameters: %s",
+            operation_model.name,
+            request_dict,
+        )

     client._make_request = patch_method(
         client._make_request,
@@ -104,28 +159,28 @@ def _patch_make_request(client: botocore.client.BaseClient):
         should_retry=s3_should_retry,
         after_callback=after_callback,
         before_callback=before_callback,
-        retry_callback=retry_callback
+        retry_callback=retry_callback,
+    )
     return client


 def parse_s3_url(s3_url: PathLike) -> Tuple[str, str]:
     s3_url = fspath(s3_url)
     if not is_s3(s3_url):
-        raise ValueError(
-    right_part = s3_url.split(
-    bucket_pattern = re.match(
+        raise ValueError("Not a s3 url: %r" % s3_url)
+    right_part = s3_url.split("://", maxsplit=1)[1]
+    bucket_pattern = re.match("(.*?)/", right_part)
     if bucket_pattern is None:
         bucket = right_part
-        path =
+        path = ""
     else:
         bucket = bucket_pattern.group(1)
-        path = right_part[len(bucket) + 1:]
+        path = right_part[len(bucket) + 1 :]
     return bucket, path


 def get_scoped_config(profile_name: Optional[str] = None) -> Dict:
-    return get_s3_session(
-        profile_name=profile_name)._session.get_scoped_config()
+    return get_s3_session(profile_name=profile_name)._session.get_scoped_config()


 @lru_cache()
@@ -134,15 +189,14 @@ def warning_endpoint_url(key: str, endpoint_url: str):


 def get_endpoint_url(profile_name: Optional[str] = None) -> str:
-
+    """Get the endpoint url of S3

     :returns: S3 endpoint url
-
+    """
     if profile_name:
-        environ_keys = (f
+        environ_keys = (f"{profile_name}__OSS_ENDPOINT".upper(),)
     else:
-        environ_keys = (
-            'OSS_ENDPOINT', 'AWS_ENDPOINT_URL_S3', 'AWS_ENDPOINT_URL')
+        environ_keys = ("OSS_ENDPOINT", "AWS_ENDPOINT_URL_S3", "AWS_ENDPOINT_URL")
     for environ_key in environ_keys:
         environ_endpoint_url = os.environ.get(environ_key)
         if environ_endpoint_url:
@@ -150,10 +204,10 @@ def get_endpoint_url(profile_name: Optional[str] = None) -> str:
             return environ_endpoint_url
     try:
         config = get_scoped_config(profile_name=profile_name)
-        config_endpoint_url = config.get(
-        config_endpoint_url = config_endpoint_url or config.get(
+        config_endpoint_url = config.get("s3", {}).get("endpoint_url")
+        config_endpoint_url = config_endpoint_url or config.get("endpoint_url")
         if config_endpoint_url:
-            warning_endpoint_url(
+            warning_endpoint_url("~/.aws/config", config_endpoint_url)
             return config_endpoint_url
     except botocore.exceptions.ProfileNotFound:
         pass
@@ -161,27 +215,33 @@ def get_endpoint_url(profile_name: Optional[str] = None) -> str:


 def get_s3_session(profile_name=None) -> boto3.Session:
-
+    """Get S3 session

     :returns: S3 session
-
+    """
     return thread_local(
-        f
+        f"s3_session:{profile_name}", boto3.Session, profile_name=profile_name
+    )


 def get_access_token(profile_name=None):
-    access_key_env_name =
-
-
-
+    access_key_env_name = (
+        f"{profile_name}__AWS_ACCESS_KEY_ID".upper()
+        if profile_name
+        else "AWS_ACCESS_KEY_ID"
+    )
+    secret_key_env_name = (
+        f"{profile_name}__AWS_SECRET_ACCESS_KEY".upper()
+        if profile_name
+        else "AWS_SECRET_ACCESS_KEY"
+    )
     access_key = os.getenv(access_key_env_name)
     secret_key = os.getenv(secret_key_env_name)
     if access_key and secret_key:
         return access_key, secret_key

     try:
-        credentials = get_s3_session(
-            profile_name=profile_name).get_credentials()
+        credentials = get_s3_session(profile_name=profile_name).get_credentials()
     except botocore.exceptions.ProfileNotFound:
         credentials = None
     if credentials:
@@ -193,39 +253,42 @@ def get_access_token(profile_name=None):


 def get_s3_client(
-
-
-
-
+    config: Optional[botocore.config.Config] = None,
+    cache_key: Optional[str] = None,
+    profile_name: Optional[str] = None,
+):
+    """Get S3 client

     :returns: S3 client
-
+    """
     if cache_key is not None:
         local_storage = thread_local
-        if S3_CLIENT_CACHE_MODE ==
+        if S3_CLIENT_CACHE_MODE == "process_local":
             local_storage = process_local
         return local_storage(
             f"{cache_key}:{profile_name}",
             get_s3_client,
             config=config,
-            profile_name=profile_name
+            profile_name=profile_name,
+        )

     if config:
         config = botocore.config.Config(
-            connect_timeout=5,
-
+            connect_timeout=5, max_pool_connections=GLOBAL_MAX_WORKERS
+        ).merge(config)
     else:
         config = botocore.config.Config(
-            connect_timeout=5, max_pool_connections=GLOBAL_MAX_WORKERS
+            connect_timeout=5, max_pool_connections=GLOBAL_MAX_WORKERS
+        )

-    addressing_style_env_key =
+    addressing_style_env_key = "AWS_S3_ADDRESSING_STYLE"
     if profile_name:
-        addressing_style_env_key = f
-        )
+        addressing_style_env_key = f"{profile_name}__AWS_S3_ADDRESSING_STYLE".upper()
     addressing_style = os.environ.get(addressing_style_env_key)
     if addressing_style:
         config = config.merge(
-            botocore.config.Config(s3={
+            botocore.config.Config(s3={"addressing_style": addressing_style})
+        )

     access_key, secret_key = get_access_token(profile_name)
     try:
@@ -233,7 +296,7 @@ def get_s3_client(
     except botocore.exceptions.ProfileNotFound:
         session = get_s3_session()
     client = session.client(
-
+        "s3",
         endpoint_url=get_endpoint_url(profile_name=profile_name),
         config=config,
         aws_access_key_id=access_key,
@@ -244,16 +307,15 @@ def get_s3_client(


 def get_s3_client_with_cache(
-
-
+    config: Optional[botocore.config.Config] = None, profile_name: Optional[str] = None
+):
     return get_s3_client(
-        config=config,
-
-        profile_name=profile_name)
+        config=config, cache_key="s3_filelike_client", profile_name=profile_name
+    )


 def s3_path_join(path: PathLike, *other_paths: PathLike) -> str:
-
+    """
     Concat 2 or more path to a complete path

     :param path: Given path
@@ -262,27 +324,31 @@ def s3_path_join(path: PathLike, *other_paths: PathLike) -> str:

     .. note ::

-        The difference between this function and ``os.path.join`` is that this function
-
-
+        The difference between this function and ``os.path.join`` is that this function
+        ignores left side slash (which indicates absolute path) in ``other_paths``
+        and will directly concat.
+
+        e.g. os.path.join('/path', 'to', '/file') => '/file',
+        but s3_path_join('/path', 'to', '/file') => '/path/to/file'
+    """
     return uri_join(fspath(path), *map(fspath, other_paths))


 def _list_all_buckets(profile_name: Optional[str] = None) -> List[str]:
     client = get_s3_client_with_cache(profile_name=profile_name)
     response = client.list_buckets()
-    return [content[
+    return [content["Name"] for content in response["Buckets"]]


 def _parse_s3_url_ignore_brace(s3_url: str) -> Tuple[str, str]:
     s3_url = fspath(s3_url)
     s3_scheme, right_part = s3_url[:5], s3_url[5:]
-    if s3_scheme !=
-        raise ValueError(
+    if s3_scheme != "s3://":
+        raise ValueError("Not a s3 url: %r" % s3_url)
     left_brace = False
     for current_index, current_character in enumerate(right_part):
         if current_character == "/" and left_brace is False:
-            return right_part[:current_index], right_part[current_index + 1:]
+            return right_part[:current_index], right_part[current_index + 1 :]
         elif current_character == "{":
             left_brace = True
         elif current_character == "}":
@@ -291,12 +357,13 @@ def _parse_s3_url_ignore_brace(s3_url: str) -> Tuple[str, str]:


 def _group_s3path_by_bucket(
-
+    s3_pathname: str, profile_name: Optional[str] = None
+) -> List[str]:
     bucket, key = _parse_s3_url_ignore_brace(s3_pathname)
     if not bucket:
         if not key:
-            raise UnsupportedError(
-        raise S3BucketNotFoundError(
+            raise UnsupportedError("Glob whole s3", s3_pathname)
+        raise S3BucketNotFoundError("Empty bucket name: %r" % s3_pathname)

     grouped_path = []

@@ -312,7 +379,7 @@ def _group_s3path_by_bucket(
     path_part = None
     if len(split_bucket_name) == 2:
         bucket_name, path_part = split_bucket_name
-    pattern = re.compile(translate(re.sub(r
+    pattern = re.compile(translate(re.sub(r"\*{2,}", "*", bucket_name)))

     for bucket in all_bucket(profile_name):
         if pattern.fullmatch(bucket) is not None:
@@ -347,8 +414,8 @@ def _s3_split_magic_ignore_brace(s3_pathname: str) -> Tuple[str, str]:
         if current_character == "/" and left_brace is False:
             if has_magic_ignore_brace(s3_pathname[left_index:current_index]):
                 magic_parts.append(s3_pathname[left_index:current_index])
-                if s3_pathname[current_index + 1:]:
-                    magic_parts.append(s3_pathname[current_index + 1:])
+                if s3_pathname[current_index + 1 :]:
+                    magic_parts.append(s3_pathname[current_index + 1 :])
                 left_index = len(s3_pathname)
                 break
             normal_parts.append(s3_pathname[left_index:current_index])
@@ -377,7 +444,6 @@ def _s3_split_magic_ignore_brace(s3_pathname: str) -> Tuple[str, str]:


 def _group_s3path_by_prefix(s3_pathname: str) -> List[str]:
-
     _, key = parse_s3_url(s3_pathname)
     if not key:
         return ungloblize(s3_pathname)
@@ -394,15 +460,15 @@ def _group_s3path_by_prefix(s3_pathname: str) -> List[str]:


 def _become_prefix(prefix: str) -> str:
-    if prefix !=
-        prefix +=
+    if prefix != "" and not prefix.endswith("/"):
+        prefix += "/"
     return prefix


 def _s3_split_magic(s3_pathname: str) -> Tuple[str, str]:
     if not has_magic(s3_pathname):
-        return s3_pathname,
-    delimiter =
+        return s3_pathname, ""
+    delimiter = "/"
     normal_parts = []
     magic_parts = []
     all_parts = s3_pathname.split(delimiter)
@@ -415,53 +481,54 @@ def _s3_split_magic(s3_pathname: str) -> Tuple[str, str]:
     return delimiter.join(normal_parts), delimiter.join(magic_parts)


-def _list_objects_recursive(
-        s3_client, bucket: str, prefix: str, delimiter: str = ''):
-
+def _list_objects_recursive(s3_client, bucket: str, prefix: str, delimiter: str = ""):
     resp = s3_client.list_objects_v2(
-        Bucket=bucket, Prefix=prefix, Delimiter=delimiter, MaxKeys=max_keys
+        Bucket=bucket, Prefix=prefix, Delimiter=delimiter, MaxKeys=max_keys
+    )

     while True:
         yield resp

-        if not resp[
+        if not resp["IsTruncated"]:
             break

         resp = s3_client.list_objects_v2(
             Bucket=bucket,
             Prefix=prefix,
             Delimiter=delimiter,
-            ContinuationToken=resp[
-            MaxKeys=max_keys
+            ContinuationToken=resp["NextContinuationToken"],
+            MaxKeys=max_keys,
+        )


 def _make_stat(content: Dict[str, Any]):
     return StatResult(
-        islnk=content.get(
-        size=content[
-        mtime=content[
+        islnk=content.get("islnk", False),
+        size=content["Size"],
+        mtime=content["LastModified"].timestamp(),
         extra=content,
     )


 def _s3_glob_stat_single_path(
-
-
-
-
-
+    s3_pathname: PathLike,
+    recursive: bool = True,
+    missing_ok: bool = True,
+    followlinks: bool = False,
+    profile_name: Optional[str] = None,
+) -> Iterator[FileEntry]:
     s3_pathname = fspath(s3_pathname)
     if not recursive:
         # If not recursive, replace ** with *
-        s3_pathname = re.sub(r
+        s3_pathname = re.sub(r"\*{2,}", "*", s3_pathname)
     top_dir, wildcard_part = _s3_split_magic(s3_pathname)
-    search_dir = wildcard_part.endswith(
+    search_dir = wildcard_part.endswith("/")

     def should_recursive(wildcard_part: str) -> bool:
-        if
+        if "**" in wildcard_part:
             return True
         for expanded_path in ungloblize(wildcard_part):
-            parts_length = len(expanded_path.split(
+            parts_length = len(expanded_path.split("/"))
             if parts_length + search_dir >= 2:
                 return True
         return False
@@ -469,24 +536,23 @@ def _s3_glob_stat_single_path(
     def create_generator(_s3_pathname) -> Iterator[FileEntry]:
         top_dir_with_profile = top_dir
         if profile_name:
-            top_dir_with_profile = f
+            top_dir_with_profile = f"s3+{profile_name}://{top_dir[5:]}"
         if not S3Path(top_dir_with_profile).exists():
             return
         if not has_magic(_s3_pathname):
             _s3_pathname_obj = S3Path(_s3_pathname)
             if _s3_pathname_obj.is_file():
                 stat = S3Path(_s3_pathname).stat(follow_symlinks=followlinks)
-                yield FileEntry(
-                    _s3_pathname_obj.name, _s3_pathname_obj.path, stat)
+                yield FileEntry(_s3_pathname_obj.name, _s3_pathname_obj.path, stat)
             if _s3_pathname_obj.is_dir():
                 yield FileEntry(
-                    _s3_pathname_obj.name, _s3_pathname_obj.path,
-
+                    _s3_pathname_obj.name, _s3_pathname_obj.path, StatResult(isdir=True)
+                )
             return

-        delimiter =
+        delimiter = ""
         if not should_recursive(wildcard_part):
-            delimiter =
+            delimiter = "/"

         dirnames = set()
         pattern = re.compile(translate(_s3_pathname))
@@ -494,39 +560,39 @@ def _s3_glob_stat_single_path(
         prefix = _become_prefix(key)
         client = get_s3_client_with_cache(profile_name=profile_name)
         with raise_s3_error(_s3_pathname):
-            for resp in _list_objects_recursive(client, bucket, prefix,
-
-
-                    path = s3_path_join('s3://', bucket, content['Key'])
+            for resp in _list_objects_recursive(client, bucket, prefix, delimiter):
+                for content in resp.get("Contents", []):
+                    path = s3_path_join("s3://", bucket, content["Key"])
                     if not search_dir and pattern.match(path):
-                        yield FileEntry(
-                            S3Path(path).name, path, _make_stat(content))
+                        yield FileEntry(S3Path(path).name, path, _make_stat(content))
                     dirname = os.path.dirname(path)
                     while dirname not in dirnames and dirname != top_dir:
                         dirnames.add(dirname)
-                        path = dirname +
+                        path = dirname + "/" if search_dir else dirname
                         if pattern.match(path):
                             yield FileEntry(
-                                S3Path(path).name, path, StatResult(isdir=True)
+                                S3Path(path).name, path, StatResult(isdir=True)
+                            )
                         dirname = os.path.dirname(dirname)
-                for common_prefix in resp.get(
-                    path = s3_path_join(
-                        's3://', bucket, common_prefix['Prefix'])
+                for common_prefix in resp.get("CommonPrefixes", []):
+                    path = s3_path_join("s3://", bucket, common_prefix["Prefix"])
                     dirname = os.path.dirname(path)
                     if dirname not in dirnames and dirname != top_dir:
                         dirnames.add(dirname)
-                        path = dirname +
+                        path = dirname + "/" if search_dir else dirname
                         if pattern.match(path):
                             yield FileEntry(
-                                S3Path(path).name, path, StatResult(isdir=True)
+                                S3Path(path).name, path, StatResult(isdir=True)
+                            )

     return create_generator(s3_pathname)


-def _s3_scan_pairs(
-
+def _s3_scan_pairs(
+    src_url: PathLike, dst_url: PathLike
+) -> Iterator[Tuple[PathLike, PathLike]]:
     for src_file_path in S3Path(src_url).scan():
-        content_path = src_file_path[len(src_url):]
+        content_path = src_file_path[len(src_url) :]
         if len(content_path) > 0:
             dst_file_path = s3_path_join(dst_url, content_path)
         else:
@@ -535,44 +601,45 @@ def _s3_scan_pairs(src_url: PathLike,


 def is_s3(path: PathLike) -> bool:
-
-    1. According to
+    """
+    1. According to
+    `aws-cli <https://docs.aws.amazon.com/cli/latest/reference/s3/index.html>`_ ,
+    test if a path is s3 path.
     2. megfile also support the path like `s3[+profile_name]://bucket/key`

     :param path: Path to be tested
     :returns: True if path is s3 path, else False
-
+    """
     path = fspath(path)
-    if re.match(r
+    if re.match(r"^s3(\+\w+)?:\/\/", path):
         return True
     return False


 def _s3_binary_mode(s3_open_func):
-
     @wraps(s3_open_func)
     def wrapper(
-
-
-
-
-
+        s3_url,
+        mode: str = "rb",
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        **kwargs,
+    ):
         bucket, key = parse_s3_url(s3_url)
         if not bucket:
-            raise S3BucketNotFoundError(
+            raise S3BucketNotFoundError("Empty bucket name: %r" % s3_url)

-        if not key or key.endswith(
-            raise S3IsADirectoryError(
+        if not key or key.endswith("/"):
+            raise S3IsADirectoryError("Is a directory: %r" % s3_url)

-        if
+        if "x" in mode:
             if S3Path(s3_url).is_file():
-                raise S3FileExistsError(
-            mode = mode.replace(
+                raise S3FileExistsError("File exists: %r" % s3_url)
+            mode = mode.replace("x", "w")

         fileobj = s3_open_func(s3_url, get_binary_mode(mode), **kwargs)
-        if
-            fileobj = io.TextIOWrapper(
-                fileobj, encoding=encoding, errors=errors)  # type: ignore
+        if "b" not in mode:
+            fileobj = io.TextIOWrapper(fileobj, encoding=encoding, errors=errors)  # type: ignore
         fileobj.mode = mode  # pyre-ignore[41]
         return fileobj
@@ -581,13 +648,15 @@ def _s3_binary_mode(s3_open_func):

 @_s3_binary_mode
 def s3_prefetch_open(
-
-
-
-
-
-
-
+    s3_url: PathLike,
+    mode: str = "rb",
+    followlinks: bool = False,
+    *,
+    max_concurrency: Optional[int] = None,
+    max_block_size: int = DEFAULT_BLOCK_SIZE,
+) -> S3PrefetchReader:
+    """Open a asynchronous prefetch reader, to support fast sequential
+    read and random read

     .. note ::

@@ -595,15 +664,17 @@ def s3_prefetch_open(

         Supports context manager

-        Some parameter setting may perform well: max_concurrency=10 or 20,
+        Some parameter setting may perform well: max_concurrency=10 or 20,
+        max_block_size=8 or 16 MB, default value None means using global thread pool

     :param max_concurrency: Max download thread number, None by default
-    :param max_block_size: Max data size downloaded by each thread, in bytes,
+    :param max_block_size: Max data size downloaded by each thread, in bytes,
+        8MB by default
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
-
-    if mode !=
-        raise ValueError(
+    """
+    if mode != "rb":
+        raise ValueError("unacceptable mode: %r" % mode)
     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
     if followlinks:
@@ -614,8 +685,7 @@ def s3_prefetch_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PrefetchReader(
         bucket,
         key,
@@ -623,19 +693,22 @@ def s3_prefetch_open(
         max_retries=max_retries,
         max_workers=max_concurrency,
         block_size=max_block_size,
-        profile_name=s3_url._profile_name
+        profile_name=s3_url._profile_name,
+    )


 @_s3_binary_mode
 def s3_share_cache_open(
-
-
-
-
-
-
-
-
+    s3_url: PathLike,
+    mode: str = "rb",
+    followlinks: bool = False,
+    *,
+    cache_key: str = "lru",
+    max_concurrency: Optional[int] = None,
+    max_block_size: int = DEFAULT_BLOCK_SIZE,
+) -> S3ShareCacheReader:
+    """Open a asynchronous prefetch reader, to support fast sequential read and
+    random read

     .. note ::

@@ -643,15 +716,17 @@ def s3_share_cache_open(

         Supports context manager

-        Some parameter setting may perform well: max_concurrency=10 or 20,
+        Some parameter setting may perform well: max_concurrency=10 or 20,
+        max_block_size=8 or 16 MB, default value None means using global thread pool

     :param max_concurrency: Max download thread number, None by default
-    :param max_block_size: Max data size downloaded by each thread, in bytes,
+    :param max_block_size: Max data size downloaded by each thread, in bytes,
+        8MB by default
     :returns: An opened S3ShareCacheReader object
     :raises: S3FileNotFoundError
-
-    if mode !=
-        raise ValueError(
+    """
+    if mode != "rb":
+        raise ValueError("unacceptable mode: %r" % mode)

     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
@@ -663,8 +738,7 @@ def s3_share_cache_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3ShareCacheReader(
         bucket,
         key,
@@ -673,17 +747,16 @@ def s3_share_cache_open(
         max_retries=max_retries,
         max_workers=max_concurrency,
         block_size=max_block_size,
-        profile_name=s3_url._profile_name
+        profile_name=s3_url._profile_name,
+    )


 @_s3_binary_mode
 def s3_pipe_open(
-
-
-
-
-        join_thread: bool = True) -> S3PipeHandler:
-    '''Open a asynchronous read-write reader / writer, to support fast sequential read / write
+    s3_url: PathLike, mode: str, followlinks: bool = False, *, join_thread: bool = True
+) -> S3PipeHandler:
+    """Open a asynchronous read-write reader / writer, to support fast sequential
+    read / write

     .. note ::

@@ -691,19 +764,24 @@ def s3_pipe_open(

         Supports context manager

-        When join_thread is False, while the file handle are closing,
-
-
+        When join_thread is False, while the file handle are closing,
+        this function will not wait until the asynchronous writing finishes;
+
+        False doesn't affect read-handle, but this can speed up write-handle because
+        file will be written asynchronously.
+
+        But asynchronous behavior can guarantee the file are successfully written,
+        and frequent execution may cause thread and file handle exhaustion

     :param mode: Mode to open file, either "rb" or "wb"
     :param join_thread: If wait after function execution until s3 finishes writing
     :returns: An opened BufferedReader / BufferedWriter object
-
-    if mode not in (
-        raise ValueError(
+    """
+    if mode not in ("rb", "wb"):
+        raise ValueError("unacceptable mode: %r" % mode)

-    if mode[0] ==
-        raise S3FileNotFoundError(
+    if mode[0] == "r" and not S3Path(s3_url).is_file():
+        raise S3FileNotFoundError("No such file: %r" % s3_url)

     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
@@ -715,25 +793,26 @@ def s3_pipe_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PipeHandler(
         bucket,
         key,
         mode,
         s3_client=client,
         join_thread=join_thread,
-        profile_name=s3_url._profile_name
+        profile_name=s3_url._profile_name,
+    )


 @_s3_binary_mode
 def s3_cached_open(
-
-
-
-
-
-
+    s3_url: PathLike,
+    mode: str,
+    followlinks: bool = False,
+    *,
+    cache_path: Optional[str] = None,
+) -> S3CachedHandler:
+    """Open a local-cache file reader / writer, for frequent random read / write

     .. note ::

@@ -741,14 +820,15 @@ def s3_cached_open(

         Supports context manager

-        cache_path can specify the path of cache file. Performance could be better
+        cache_path can specify the path of cache file. Performance could be better
+        if cache file path is on ssd or tmpfs

     :param mode: Mode to open file, could be one of "rb", "wb" or "ab"
     :param cache_path: cache file path
     :returns: An opened BufferedReader / BufferedWriter object
-
-    if mode not in (
-        raise ValueError(
+    """
+    if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
+        raise ValueError("unacceptable mode: %r" % mode)
     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
     if followlinks:
@@ -759,34 +839,35 @@ def s3_cached_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3CachedHandler(
         bucket,
         key,
         mode,
         s3_client=client,
         cache_path=cache_path,
-        profile_name=s3_url._profile_name
+        profile_name=s3_url._profile_name,
+    )


 @_s3_binary_mode
 def s3_buffered_open(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    s3_url: PathLike,
+    mode: str,
+    followlinks: bool = False,
+    *,
+    max_concurrency: Optional[int] = None,
+    max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
+    forward_ratio: Optional[float] = None,
+    block_size: Optional[int] = None,
+    limited_seekable: bool = False,
+    buffered: bool = False,
+    share_cache_key: Optional[str] = None,
+    cache_path: Optional[str] = None,
+    min_block_size: Optional[int] = None,
+    max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
+) -> IO:
+    """Open an asynchronous prefetch reader, to support fast sequential read

     .. note ::

@@ -794,19 +875,26 @@ def s3_buffered_open(

         Supports context manager

-        Some parameter setting may perform well: max_concurrency=10 or 20,
+        Some parameter setting may perform well: max_concurrency=10 or 20,
+        max_block_size=8 or 16 MB, default value None means using global thread pool

     :param max_concurrency: Max download thread number, None by default
     :param max_buffer_size: Max cached buffer size in memory, 128MB by default
-    :param min_block_size: Min size of single block, default is same as block_size.
-
-    :param
-
+    :param min_block_size: Min size of single block, default is same as block_size.
+        Each block will be downloaded by single thread.
+    :param max_block_size: Max size of single block, 128MB by default.
+        Each block will be downloaded by single thread.
+    :param block_size: Size of single block, 8MB by default.
+        Each block will be uploaded by single thread.
+    :param limited_seekable: If write-handle supports limited seek
+        (both file head part and tail part can seek block_size).
+        Notes: This parameter are valid only for write-handle.
+        Read-handle support arbitrary seek
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
-
-    if mode not in (
-        raise ValueError(
+    """
+    if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
+        raise ValueError("unacceptable mode: %r" % mode)
     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
     if followlinks:
@@ -819,26 +907,23 @@ def s3_buffered_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)

-    if
+    if "a" in mode or "+" in mode:
         if cache_path is None:
             return S3MemoryHandler(
-                bucket,
-
-                mode,
-                s3_client=client,
-                profile_name=s3_url._profile_name)
+                bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
+            )
         return S3CachedHandler(
             bucket,
             key,
             mode,
             s3_client=client,
             cache_path=cache_path,
-            profile_name=s3_url._profile_name
+            profile_name=s3_url._profile_name,
+        )

-    if mode ==
+    if mode == "rb":
         # A rough conversion algorithm to align 2 types of Reader / Writer parameters
         # TODO: Optimize the conversion algorithm
         block_capacity = max_buffer_size // block_size
@@ -856,7 +941,8 @@ def s3_buffered_open(
                 max_workers=max_concurrency,
                 block_size=block_size,
                 block_forward=block_forward,
-                profile_name=s3_url._profile_name
+                profile_name=s3_url._profile_name,
+            )
         else:
             reader = S3PrefetchReader(
                 bucket,
@@ -867,7 +953,8 @@ def s3_buffered_open(
                 block_capacity=block_capacity,
                 block_forward=block_forward,
                 block_size=block_size,
-                profile_name=s3_url._profile_name
+                profile_name=s3_url._profile_name,
+            )
         if buffered or _is_pickle(reader):
             reader = io.BufferedReader(reader)  # type: ignore
         return reader
@@ -881,7 +968,8 @@ def s3_buffered_open(
             block_size=min_block_size,
             max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
-            profile_name=s3_url._profile_name
+            profile_name=s3_url._profile_name,
+        )
     else:
         writer = S3BufferedWriter(
             bucket,
@@ -891,7 +979,8 @@ def s3_buffered_open(
             block_size=min_block_size,
             max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
-            profile_name=s3_url._profile_name
+            profile_name=s3_url._profile_name,
+        )
     if buffered or _is_pickle(writer):
         writer = io.BufferedWriter(writer)  # type: ignore
     return writer
@@ -899,10 +988,9 @@ def s3_buffered_open(

 @_s3_binary_mode
 def s3_memory_open(
-
-
-
-    '''Open a memory-cache file reader / writer, for frequent random read / write
+    s3_url: PathLike, mode: str, followlinks: bool = False
+) -> S3MemoryHandler:
+    """Open a memory-cache file reader / writer, for frequent random read / write

     .. note ::

@@ -910,11 +998,12 @@ def s3_memory_open(

         Supports context manager

-    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+",
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+",
+        "wb+" or "ab+"
     :returns: An opened BufferedReader / BufferedWriter object
-
-    if mode not in (
-        raise ValueError(
+    """
+    if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
+        raise ValueError("unacceptable mode: %r" % mode)
     if not isinstance(s3_url, S3Path):
         s3_url = S3Path(s3_url)
     if followlinks:
@@ -925,37 +1014,40 @@ def s3_memory_open(

     bucket, key = parse_s3_url(s3_url.path_with_protocol)
     config = botocore.config.Config(max_pool_connections=max_pool_connections)
-    client = get_s3_client_with_cache(
-        config=config, profile_name=s3_url._profile_name)
+    client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3MemoryHandler(
-        bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
+        bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
+    )


 s3_open = s3_buffered_open


 def s3_download(
-
-
-
-
-
-
+    src_url: PathLike,
+    dst_url: PathLike,
+    callback: Optional[Callable[[int], None]] = None,
+    followlinks: bool = False,
+    overwrite: bool = True,
+) -> None:
+    """
     Downloads a file from s3 to local filesystem.
+
     :param src_url: source s3 path
     :param dst_url: target fs path
-    :param callback: Called periodically during copy, and the input parameter is
+    :param callback: Called periodically during copy, and the input parameter is
+        the data size (in bytes) of copy since the last call
     :param followlinks: False if regard symlink as file, else True
     :param overwrite: whether or not overwrite file when exists, default is True
-
+    """
     from megfile.fs import is_fs
     from megfile.fs_path import FSPath

     dst_url = fspath(dst_url)
     if not is_fs(dst_url):
-        raise OSError(f
-    if not dst_url or dst_url.endswith(
-        raise S3IsADirectoryError(
+        raise OSError(f"dst_url is not fs path: {dst_url}")
+    if not dst_url or dst_url.endswith("/"):
+        raise S3IsADirectoryError("Is a directory: %r" % dst_url)

     dst_path = FSPath(dst_url)
     if not overwrite and dst_path.exists():
@@ -971,104 +1063,97 @@ def s3_download(
     src_bucket, src_key = parse_s3_url(src_url.path_with_protocol)
     if not src_bucket:
         raise S3BucketNotFoundError(
-
+            "Empty bucket name: %r" % src_url.path_with_protocol
+        )

     if not src_url.exists():
-        raise S3FileNotFoundError(
-            'File not found: %r' % src_url.path_with_protocol)
+        raise S3FileNotFoundError("File not found: %r" % src_url.path_with_protocol)

     if not src_url.is_file():
-        raise S3IsADirectoryError(
-            'Is a directory: %r' % src_url.path_with_protocol)
+        raise S3IsADirectoryError("Is a directory: %r" % src_url.path_with_protocol)

     dst_directory = os.path.dirname(dst_path.path_without_protocol)
-    if dst_directory !=
+    if dst_directory != "":
         os.makedirs(dst_directory, exist_ok=True)

     client = get_s3_client_with_cache(profile_name=src_url._profile_name)
     download_file = patch_method(
-        client.download_file,
-        max_retries=max_retries,
-        should_retry=s3_should_retry,
+        client.download_file, max_retries=max_retries, should_retry=s3_should_retry
     )
     try:
         download_file(
-            src_bucket,
-
-            dst_path.path_without_protocol,
-            Callback=callback)
+            src_bucket, src_key, dst_path.path_without_protocol, Callback=callback
+        )
     except Exception as error:
         error = translate_fs_error(error, dst_url)
         error = translate_s3_error(error, src_url.path_with_protocol)
         raise error

     src_stat = src_url.stat()
-    os.utime(
-        dst_path.path_without_protocol, (src_stat.st_mtime, src_stat.st_mtime))
+    os.utime(dst_path.path_without_protocol, (src_stat.st_mtime, src_stat.st_mtime))


 def s3_upload(
-
-
-
-
-
-
+    src_url: PathLike,
+    dst_url: PathLike,
+    callback: Optional[Callable[[int], None]] = None,
+    followlinks: bool = False,
+    overwrite: bool = True,
+) -> None:
+    """
     Uploads a file from local filesystem to s3.
+
     :param src_url: source fs path
     :param dst_url: target s3 path
-    :param callback: Called periodically during copy, and the input parameter is
+    :param callback: Called periodically during copy, and the input parameter is
+        the data size (in bytes) of copy since the last call
     :param followlinks: False if regard symlink as file, else True
     :param overwrite: whether or not overwrite file when exists, default is True
-
+    """
     from megfile.fs import is_fs
     from megfile.fs_path import FSPath

     if not is_fs(src_url):
-        raise OSError(f
+        raise OSError(f"src_url is not fs path: {src_url}")
     src_path = FSPath(src_url)
     if followlinks and src_path.is_symlink():
         src_path = src_path.readlink()

     dst_bucket, dst_key = parse_s3_url(dst_url)
     if not dst_bucket:
-        raise S3BucketNotFoundError(
-    if not dst_key or dst_key.endswith(
-        raise S3IsADirectoryError(
+        raise S3BucketNotFoundError("Empty bucket name: %r" % dst_url)
+    if not dst_key or dst_key.endswith("/"):
+        raise S3IsADirectoryError("Is a directory: %r" % dst_url)

     if not overwrite and S3Path(dst_url).is_file():
         return

-    client = get_s3_client_with_cache(
-        profile_name=S3Path(dst_url)._profile_name)
+    client = get_s3_client_with_cache(profile_name=S3Path(dst_url)._profile_name)
     upload_fileobj = patch_method(
-        client.upload_fileobj,
-        max_retries=max_retries,
-        should_retry=s3_should_retry,
+        client.upload_fileobj, max_retries=max_retries, should_retry=s3_should_retry
     )

-    with open(src_path.path_without_protocol,
-              'rb') as src, raise_s3_error(dst_url):
+    with open(src_path.path_without_protocol, "rb") as src, raise_s3_error(dst_url):
         upload_fileobj(src, Bucket=dst_bucket, Key=dst_key, Callback=callback)


 def s3_load_content(
-
-
-
-
-
+    s3_url,
+    start: Optional[int] = None,
+    stop: Optional[int] = None,
+    followlinks: bool = False,
+) -> bytes:
+    """
     Get specified file from [start, stop) in bytes

     :param s3_url: Specified path
     :param start: start index
     :param stop: stop index
     :returns: bytes content in range [start, stop)
-
+    """

     def _get_object(client, bucket, key, range_str):
-        return client.get_object(
-            Bucket=bucket, Key=key, Range=range_str)['Body'].read()
+        return client.get_object(Bucket=bucket, Key=key, Range=range_str)["Body"].read()

     s3_url = S3Path(s3_url)
     if followlinks:
@@ -1079,65 +1164,60 @@ def s3_load_content(
|
|
|
1079
1164
|
|
|
1080
1165
|
bucket, key = parse_s3_url(s3_url.path_with_protocol)
|
|
1081
1166
|
if not bucket:
|
|
1082
|
-
raise S3BucketNotFoundError(
|
|
1083
|
-
if not key or key.endswith(
|
|
1084
|
-
raise S3IsADirectoryError(
|
|
1167
|
+
raise S3BucketNotFoundError("Empty bucket name: %r" % s3_url)
|
|
1168
|
+
if not key or key.endswith("/"):
|
|
1169
|
+
raise S3IsADirectoryError("Is a directory: %r" % s3_url)
|
|
1085
1170
|
|
|
1086
|
-
start, stop = get_content_offset(
|
|
1087
|
-
start, stop, s3_url.getsize(follow_symlinks=False))
|
|
1171
|
+
start, stop = get_content_offset(start, stop, s3_url.getsize(follow_symlinks=False))
|
|
1088
1172
|
if start == 0 and stop == 0:
|
|
1089
|
-
return b
|
|
1090
|
-
range_str =
|
|
1173
|
+
return b""
|
|
1174
|
+
range_str = "bytes=%d-%d" % (start, stop - 1)
|
|
1091
1175
|
|
|
1092
1176
|
client = get_s3_client_with_cache(profile_name=s3_url._profile_name)
|
|
1093
1177
|
with raise_s3_error(s3_url.path):
|
|
1094
1178
|
return patch_method(
|
|
1095
|
-
_get_object,
|
|
1096
|
-
max_retries=max_retries,
|
|
1097
|
-
should_retry=s3_should_retry,
|
|
1179
|
+
_get_object, max_retries=max_retries, should_retry=s3_should_retry
|
|
1098
1180
|
)(client, bucket, key, range_str)
|
|
1099
1181
|
|
|
1100
1182
|
|
|
1101
1183
|
def s3_readlink(path) -> str:
|
|
1102
|
-
|
|
1184
|
+
"""
|
|
1103
1185
|
Return a string representing the path to which the symbolic link points.
|
|
1104
1186
|
|
|
1105
1187
|
:returns: Return a string representing the path to which the symbolic link points.
|
|
1106
|
-
:raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
|
|
1107
|
-
|
|
1188
|
+
:raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
|
|
1189
|
+
S3NotALinkError
|
|
1190
|
+
"""
|
|
1108
1191
|
return S3Path(path).readlink().path_with_protocol
|
|
1109
1192
|
|
|
1110
1193
|
|
|
1111
|
-
def s3_rename(
|
|
1112
|
-
|
|
1113
|
-
'''
|
|
1194
|
+
def s3_rename(src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
|
|
1195
|
+
"""
|
|
1114
1196
|
Move s3 file path from src_url to dst_url
|
|
1115
1197
|
|
|
1116
1198
|
:param dst_url: Given destination path
|
|
1117
1199
|
:param overwrite: whether or not overwrite file when exists
|
|
1118
|
-
|
|
1200
|
+
"""
|
|
1119
1201
|
S3Path(src_url).rename(dst_url, overwrite)
|
|
1120
1202
|
|
|
1121
1203
|
|
|
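
`s3_readlink` resolves a megfile-style S3 symlink to the path it points at, and `s3_rename` moves a single object by copy-then-delete. A small sketch; the paths are hypothetical and `s3://my-bucket/link` is assumed to have been created earlier as a symlink via megfile:

```python
from megfile.s3_path import s3_readlink, s3_rename

# Resolve a megfile symlink object to its target path.
target = s3_readlink("s3://my-bucket/link")
print(target)  # e.g. "s3://my-bucket/real-object"

# Move an object from one key to another; with overwrite=False an
# existing destination object is not overwritten.
s3_rename("s3://my-bucket/tmp/part.csv", "s3://my-bucket/final/part.csv")
```
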
1122
1204
|
class S3Cacher(FileCacher):
|
|
1123
1205
|
cache_path = None
|
|
1124
1206
|
|
|
1125
|
-
def __init__(
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
raise ValueError('unacceptable mode: %r' % mode)
|
|
1207
|
+
def __init__(self, path: str, cache_path: Optional[str] = None, mode: str = "r"):
|
|
1208
|
+
if mode not in ("r", "w", "a"):
|
|
1209
|
+
raise ValueError("unacceptable mode: %r" % mode)
|
|
1129
1210
|
if cache_path is None:
|
|
1130
1211
|
cache_path = generate_cache_path(path)
|
|
1131
|
-
if mode in (
|
|
1212
|
+
if mode in ("r", "a"):
|
|
1132
1213
|
s3_download(path, cache_path)
|
|
1133
1214
|
self.name = path
|
|
1134
1215
|
self.mode = mode
|
|
1135
1216
|
self.cache_path = cache_path
|
|
1136
1217
|
|
|
1137
1218
|
def _close(self):
|
|
1138
|
-
if self.cache_path is not None and
|
|
1139
|
-
|
|
1140
|
-
if self.mode in ('w', 'a'):
|
|
1219
|
+
if self.cache_path is not None and os.path.exists(self.cache_path):
|
|
1220
|
+
if self.mode in ("w", "a"):
|
|
1141
1221
|
s3_upload(self.cache_path, self.name)
|
|
1142
1222
|
os.unlink(self.cache_path)
|
|
1143
1223
|
|
|
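
`S3Cacher` above downloads the object to a local cache file for `"r"`/`"a"` modes and uploads the cache file back on close for `"w"`/`"a"`. A usage sketch, assuming `FileCacher` yields the local cache path as the context-manager value (bucket and keys are hypothetical):

```python
from megfile.s3_path import S3Cacher

# Work on a local copy of an S3 object; the temp file is removed on exit.
with S3Cacher("s3://my-bucket/model.ckpt", mode="r") as local_path:
    with open(local_path, "rb") as f:
        blob = f.read(16)
print(len(blob))

# mode="w": nothing is downloaded; whatever is written to the local file
# inside the block is uploaded to the S3 path when the cacher closes.
with S3Cacher("s3://my-bucket/output.txt", mode="w") as local_path:
    with open(local_path, "w") as f:
        f.write("hello")
```
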
@@ -1148,40 +1228,50 @@ def s3_glob(
|
|
|
1148
1228
|
missing_ok: bool = True,
|
|
1149
1229
|
followlinks: bool = False,
|
|
1150
1230
|
) -> List[str]:
|
|
1151
|
-
|
|
1152
|
-
|
|
1231
|
+
"""Return s3 path list in ascending alphabetical order,
|
|
1232
|
+
in which path matches glob pattern
|
|
1233
|
+
|
|
1234
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1235
|
+
raise UnsupportedError
|
|
1153
1236
|
|
|
1154
1237
|
:param recursive: If False, `**` will not search directory recursively
|
|
1155
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1238
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1239
|
+
raise FileNotFoundError
|
|
1156
1240
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1157
1241
|
:returns: A list contains paths match `s3_pathname`
|
|
1158
|
-
|
|
1242
|
+
"""
|
|
1159
1243
|
return list(
|
|
1160
1244
|
s3_iglob(
|
|
1161
1245
|
path=path,
|
|
1162
1246
|
recursive=recursive,
|
|
1163
1247
|
missing_ok=missing_ok,
|
|
1164
|
-
followlinks=followlinks
|
|
1248
|
+
followlinks=followlinks,
|
|
1249
|
+
)
|
|
1250
|
+
)
|
|
1165
1251
|
|
|
1166
1252
|
|
|
1167
1253
|
def s3_glob_stat(
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1254
|
+
path: PathLike,
|
|
1255
|
+
recursive: bool = True,
|
|
1256
|
+
missing_ok: bool = True,
|
|
1257
|
+
followlinks: bool = False,
|
|
1258
|
+
) -> Iterator[FileEntry]:
|
|
1259
|
+
"""Return a generator contains tuples of path and file stat,
|
|
1260
|
+
in ascending alphabetical order, in which path matches glob pattern
|
|
1261
|
+
|
|
1262
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1263
|
+
raise UnsupportedError
|
|
1174
1264
|
|
|
1175
1265
|
:param recursive: If False, `**` will not search directory recursively
|
|
1176
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1266
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1267
|
+
raise FileNotFoundError
|
|
1177
1268
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1178
|
-
:returns: A generator contains tuples of path and file stat,
|
|
1179
|
-
|
|
1269
|
+
:returns: A generator contains tuples of path and file stat,
|
|
1270
|
+
in which paths match `s3_pathname`
|
|
1271
|
+
"""
|
|
1180
1272
|
return S3Path(path).glob_stat(
|
|
1181
|
-
pattern="",
|
|
1182
|
-
|
|
1183
|
-
missing_ok=missing_ok,
|
|
1184
|
-
followlinks=followlinks)
|
|
1273
|
+
pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
|
|
1274
|
+
)
|
|
1185
1275
|
|
|
1186
1276
|
|
|
1187
1277
|
def s3_iglob(
|
|
@@ -1190,22 +1280,26 @@ def s3_iglob(
|
|
|
1190
1280
|
missing_ok: bool = True,
|
|
1191
1281
|
followlinks: bool = False,
|
|
1192
1282
|
) -> Iterator[str]:
|
|
1193
|
-
|
|
1194
|
-
|
|
1283
|
+
"""Return s3 path iterator in ascending alphabetical order,
|
|
1284
|
+
in which path matches glob pattern
|
|
1285
|
+
|
|
1286
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1287
|
+
raise UnsupportedError
|
|
1195
1288
|
|
|
1196
1289
|
:param recursive: If False, `**` will not search directory recursively
|
|
1197
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1290
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1291
|
+
raise FileNotFoundError
|
|
1198
1292
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1199
1293
|
:returns: An iterator contains paths match `s3_pathname`
|
|
1200
|
-
|
|
1201
|
-
for path_obj in S3Path(path).iglob(
|
|
1202
|
-
|
|
1203
|
-
|
|
1294
|
+
"""
|
|
1295
|
+
for path_obj in S3Path(path).iglob(
|
|
1296
|
+
pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
|
|
1297
|
+
):
|
|
1204
1298
|
yield path_obj.path_with_protocol
|
|
1205
1299
|
|
|
1206
1300
|
|
|
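
The three glob helpers above differ only in what they yield: `s3_glob` returns an eager list of path strings, `s3_iglob` a lazy iterator, and `s3_glob_stat` an iterator of `FileEntry` items carrying the stat. A sketch with a hypothetical bucket; wildcards may only appear in the key part, never in the bucket name:

```python
from megfile.s3_path import s3_glob, s3_glob_stat, s3_iglob

# Eager list of matching paths.
paths = s3_glob("s3://my-bucket/logs/**/*.json", recursive=True)

# Lazy iteration; useful for large result sets.
for path in s3_iglob("s3://my-bucket/logs/**/*.json"):
    print(path)

# Path plus stat in one pass (FileEntry has .name, .path and .stat).
for entry in s3_glob_stat("s3://my-bucket/logs/**/*.json", missing_ok=False):
    print(entry.path, entry.stat.size, entry.stat.mtime)
```
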
1207
1301
|
def s3_makedirs(path: PathLike, exist_ok: bool = False):
|
|
1208
|
-
|
|
1302
|
+
"""
|
|
1209
1303
|
Create an s3 directory.
|
|
1210
1304
|
Purely creating directory is invalid because it's unavailable on OSS.
|
|
1211
1305
|
This function is to test the target bucket have WRITE access.
|
|
@@ -1213,13 +1307,12 @@ def s3_makedirs(path: PathLike, exist_ok: bool = False):
|
|
|
1213
1307
|
:param path: Given path
|
|
1214
1308
|
:param exist_ok: If False and target directory exists, raise S3FileExistsError
|
|
1215
1309
|
:raises: S3BucketNotFoundError, S3FileExistsError
|
|
1216
|
-
|
|
1310
|
+
"""
|
|
1217
1311
|
return S3Path(path).mkdir(parents=True, exist_ok=exist_ok)
|
|
1218
1312
|
|
|
1219
1313
|
|
|
1220
1314
|
def _group_src_paths_by_block(
|
|
1221
|
-
src_paths: List[PathLike],
|
|
1222
|
-
block_size: int = DEFAULT_BLOCK_SIZE
|
|
1315
|
+
src_paths: List[PathLike], block_size: int = DEFAULT_BLOCK_SIZE
|
|
1223
1316
|
) -> List[List[Tuple[PathLike, Optional[str]]]]:
|
|
1224
1317
|
groups = []
|
|
1225
1318
|
current_group, current_group_size = [], 0
|
|
@@ -1232,18 +1325,18 @@ def _group_src_paths_by_block(
|
|
|
1232
1325
|
if len(groups) == 0:
|
|
1233
1326
|
if current_group_size + current_file_size > 2 * block_size:
|
|
1234
1327
|
group_lack_size = block_size - current_group_size
|
|
1235
|
-
current_group.append(
|
|
1236
|
-
(src_path, f'bytes=0-{group_lack_size-1}'))
|
|
1328
|
+
current_group.append((src_path, f"bytes=0-{group_lack_size-1}"))
|
|
1237
1329
|
groups.extend(
|
|
1238
1330
|
[
|
|
1239
1331
|
current_group,
|
|
1240
1332
|
[
|
|
1241
1333
|
(
|
|
1242
1334
|
src_path,
|
|
1243
|
-
f
|
|
1335
|
+
f"bytes={group_lack_size}-{current_file_size-1}",
|
|
1244
1336
|
)
|
|
1245
|
-
]
|
|
1246
|
-
]
|
|
1337
|
+
],
|
|
1338
|
+
]
|
|
1339
|
+
)
|
|
1247
1340
|
else:
|
|
1248
1341
|
current_group.append((src_path, None))
|
|
1249
1342
|
groups.append(current_group)
|
|
@@ -1263,15 +1356,16 @@ def _group_src_paths_by_block(
|
|
|
1263
1356
|
|
|
1264
1357
|
|
|
1265
1358
|
def s3_concat(
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1359
|
+
src_paths: List[PathLike],
|
|
1360
|
+
dst_path: PathLike,
|
|
1361
|
+
block_size: int = DEFAULT_BLOCK_SIZE,
|
|
1362
|
+
max_workers: int = GLOBAL_MAX_WORKERS,
|
|
1363
|
+
) -> None:
|
|
1364
|
+
"""Concatenate s3 files to one file.
|
|
1271
1365
|
|
|
1272
1366
|
:param src_paths: Given source paths
|
|
1273
1367
|
:param dst_path: Given destination path
|
|
1274
|
-
|
|
1368
|
+
"""
|
|
1275
1369
|
client = S3Path(dst_path)._client
|
|
1276
1370
|
with raise_s3_error(dst_path):
|
|
1277
1371
|
if block_size == 0:
|
|
@@ -1280,24 +1374,27 @@ def s3_concat(
|
|
|
1280
1374
|
groups = _group_src_paths_by_block(src_paths, block_size=block_size)
|
|
1281
1375
|
|
|
1282
1376
|
with MultiPartWriter(client, dst_path) as writer, ThreadPoolExecutor(
|
|
1283
|
-
|
|
1377
|
+
max_workers=max_workers
|
|
1378
|
+
) as executor:
|
|
1284
1379
|
for index, group in enumerate(groups, start=1):
|
|
1285
1380
|
if len(group) == 1:
|
|
1286
1381
|
executor.submit(
|
|
1287
|
-
writer.upload_part_copy, index, group[0][0],
|
|
1288
|
-
|
|
1382
|
+
writer.upload_part_copy, index, group[0][0], group[0][1]
|
|
1383
|
+
)
|
|
1289
1384
|
else:
|
|
1290
1385
|
executor.submit(writer.upload_part_by_paths, index, group)
|
|
1291
1386
|
|
|
1292
1387
|
|
|
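
`s3_concat` stitches several source objects into one destination object through a multipart upload, grouping small sources into blocks of roughly `block_size` bytes and submitting the parts to a thread pool. A sketch with hypothetical paths:

```python
from megfile.s3_path import s3_concat

# Concatenate three hypothetical shards into a single object.
s3_concat(
    [
        "s3://my-bucket/shards/part-000",
        "s3://my-bucket/shards/part-001",
        "s3://my-bucket/shards/part-002",
    ],
    "s3://my-bucket/merged/all-parts",
)
```
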
1293
1388
|
def s3_lstat(path: PathLike) -> StatResult:
|
|
1294
|
-
|
|
1389
|
+
"""
|
|
1390
|
+
Like Path.stat() but, if the path points to a symbolic link,
|
|
1391
|
+
return the symbolic link’s information rather than its target’s.
|
|
1392
|
+
"""
|
|
1295
1393
|
return S3Path(path).lstat()
|
|
1296
1394
|
|
|
1297
1395
|
|
|
1298
1396
|
@SmartPath.register
|
|
1299
1397
|
class S3Path(URIPath):
|
|
1300
|
-
|
|
1301
1398
|
protocol = "s3"
|
|
1302
1399
|
|
|
1303
1400
|
def __init__(self, path: "PathLike", *other_paths: "PathLike"):
|
|
@@ -1305,7 +1402,7 @@ class S3Path(URIPath):
|
|
|
1305
1402
|
protocol = get_url_scheme(self.path)
|
|
1306
1403
|
self._protocol_with_profile = self.protocol
|
|
1307
1404
|
self._profile_name = None
|
|
1308
|
-
if protocol.startswith(
|
|
1405
|
+
if protocol.startswith("s3+"):
|
|
1309
1406
|
self._protocol_with_profile = protocol
|
|
1310
1407
|
self._profile_name = protocol[3:]
|
|
1311
1408
|
self._s3_path = f"s3://{self.path[len(protocol)+3:]}"
|
|
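
The constructor logic above shows that a protocol of the form `s3+<profile>://` selects a named configuration while `_s3_path` keeps the plain `s3://` form internally. A sketch, assuming a profile called `my-profile` has been configured for megfile/boto3 (profile and bucket names are hypothetical, and `_profile_name` is an internal attribute shown only for illustration):

```python
from megfile.s3_path import S3Path

p = S3Path("s3+my-profile://my-bucket/data/train.txt")
print(p.protocol)            # "s3" (class-level protocol stays "s3")
print(p.path_with_protocol)  # "s3+my-profile://my-bucket/data/train.txt"
print(p._profile_name)       # "my-profile"
```
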
@@ -1316,30 +1413,33 @@ class S3Path(URIPath):
|
|
|
1316
1413
|
|
|
1317
1414
|
@cached_property
|
|
1318
1415
|
def path_with_protocol(self) -> str:
|
|
1319
|
-
|
|
1416
|
+
"""Return path with protocol, like file:///root, s3://bucket/key"""
|
|
1320
1417
|
path = self.path
|
|
1321
1418
|
protocol_prefix = self._protocol_with_profile + "://"
|
|
1322
1419
|
if path.startswith(protocol_prefix):
|
|
1323
1420
|
return path
|
|
1324
|
-
return protocol_prefix + path.lstrip(
|
|
1421
|
+
return protocol_prefix + path.lstrip("/")
|
|
1325
1422
|
|
|
1326
1423
|
@cached_property
|
|
1327
1424
|
def path_without_protocol(self) -> str:
|
|
1328
|
-
|
|
1425
|
+
"""
|
|
1426
|
+
Return path without protocol, example: if path is s3://bucket/key,
|
|
1427
|
+
return bucket/key
|
|
1428
|
+
"""
|
|
1329
1429
|
path = self.path
|
|
1330
1430
|
protocol_prefix = self._protocol_with_profile + "://"
|
|
1331
1431
|
if path.startswith(protocol_prefix):
|
|
1332
|
-
path = path[len(protocol_prefix):]
|
|
1432
|
+
path = path[len(protocol_prefix) :]
|
|
1333
1433
|
return path
|
|
1334
1434
|
|
|
1335
1435
|
@cached_property
|
|
1336
1436
|
def parts(self) -> Tuple[str, ...]:
|
|
1337
|
-
|
|
1437
|
+
"""A tuple giving access to the path’s various components"""
|
|
1338
1438
|
parts = [f"{self._protocol_with_profile}://"]
|
|
1339
1439
|
path = self.path_without_protocol
|
|
1340
|
-
path = path.lstrip(
|
|
1341
|
-
if path !=
|
|
1342
|
-
parts.extend(path.split(
|
|
1440
|
+
path = path.lstrip("/")
|
|
1441
|
+
if path != "":
|
|
1442
|
+
parts.extend(path.split("/"))
|
|
1343
1443
|
return tuple(parts)
|
|
1344
1444
|
|
|
1345
1445
|
@cached_property
|
|
@@ -1347,38 +1447,33 @@ class S3Path(URIPath):
|
|
|
1347
1447
|
return get_s3_client_with_cache(profile_name=self._profile_name)
|
|
1348
1448
|
|
|
1349
1449
|
def _s3_get_metadata(self) -> dict:
|
|
1350
|
-
|
|
1450
|
+
"""
|
|
1351
1451
|
Get object metadata
|
|
1352
1452
|
|
|
1353
1453
|
:param path: Object path
|
|
1354
1454
|
:returns: Object metadata
|
|
1355
|
-
|
|
1455
|
+
"""
|
|
1356
1456
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1357
1457
|
if not bucket:
|
|
1358
1458
|
return {}
|
|
1359
|
-
if not key or key.endswith(
|
|
1459
|
+
if not key or key.endswith("/"):
|
|
1360
1460
|
return {}
|
|
1361
1461
|
try:
|
|
1362
1462
|
with raise_s3_error(self.path_with_protocol):
|
|
1363
1463
|
resp = self._client.head_object(Bucket=bucket, Key=key)
|
|
1364
|
-
return dict(
|
|
1365
|
-
(key.lower(), value) for key, value in resp['Metadata'].items())
|
|
1464
|
+
return dict((key.lower(), value) for key, value in resp["Metadata"].items())
|
|
1366
1465
|
except Exception as error:
|
|
1367
|
-
if isinstance(error,
|
|
1368
|
-
(S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1466
|
+
if isinstance(error, (S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1369
1467
|
raise error
|
|
1370
1468
|
return {}
|
|
1371
1469
|
|
|
1372
|
-
def access(
|
|
1373
|
-
|
|
1374
|
-
mode: Access = Access.READ,
|
|
1375
|
-
followlinks: bool = False) -> bool:
|
|
1376
|
-
'''
|
|
1470
|
+
def access(self, mode: Access = Access.READ, followlinks: bool = False) -> bool:
|
|
1471
|
+
"""
|
|
1377
1472
|
Test if path has access permission described by mode
|
|
1378
1473
|
|
|
1379
1474
|
:param mode: access mode
|
|
1380
1475
|
:returns: bool, if the bucket of s3_url has read/write access.
|
|
1381
|
-
|
|
1476
|
+
"""
|
|
1382
1477
|
s3_url = self.path_with_protocol
|
|
1383
1478
|
if followlinks:
|
|
1384
1479
|
try:
|
|
@@ -1390,11 +1485,13 @@ class S3Path(URIPath):
|
|
|
1390
1485
|
raise Exception("No available bucket")
|
|
1391
1486
|
if not isinstance(mode, Access):
|
|
1392
1487
|
raise TypeError(
|
|
1393
|
-
|
|
1394
|
-
|
|
1488
|
+
"Unsupported mode: {} -- Mode should use one of "
|
|
1489
|
+
"the enums belonging to: {}".format(
|
|
1490
|
+
mode, ", ".join([str(a) for a in Access])
|
|
1491
|
+
)
|
|
1492
|
+
)
|
|
1395
1493
|
if mode not in (Access.READ, Access.WRITE):
|
|
1396
|
-
raise TypeError(
|
|
1397
|
-
|
|
1494
|
+
raise TypeError("Unsupported mode: {}".format(mode))
|
|
1398
1495
|
try:
|
|
1399
1496
|
if not self.exists():
|
|
1400
1497
|
return False
|
|
@@ -1408,17 +1505,14 @@ class S3Path(URIPath):
|
|
|
1408
1505
|
return True
|
|
1409
1506
|
try:
|
|
1410
1507
|
if not key:
|
|
1411
|
-
key =
|
|
1412
|
-
elif key.endswith(
|
|
1508
|
+
key = "test"
|
|
1509
|
+
elif key.endswith("/"):
|
|
1413
1510
|
key = key[:-1]
|
|
1414
|
-
upload_id = self._client.create_multipart_upload(
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
)['UploadId']
|
|
1511
|
+
upload_id = self._client.create_multipart_upload(Bucket=bucket, Key=key)[
|
|
1512
|
+
"UploadId"
|
|
1513
|
+
]
|
|
1418
1514
|
self._client.abort_multipart_upload(
|
|
1419
|
-
Bucket=bucket,
|
|
1420
|
-
Key=key,
|
|
1421
|
-
UploadId=upload_id,
|
|
1515
|
+
Bucket=bucket, Key=key, UploadId=upload_id
|
|
1422
1516
|
)
|
|
1423
1517
|
return True
|
|
1424
1518
|
except Exception as error:
|
|
@@ -1428,13 +1522,13 @@ class S3Path(URIPath):
|
|
|
1428
1522
|
raise error
|
|
1429
1523
|
|
|
1430
1524
|
def exists(self, followlinks: bool = False) -> bool:
|
|
1431
|
-
|
|
1525
|
+
"""
|
|
1432
1526
|
Test if s3_url exists
|
|
1433
1527
|
|
|
1434
1528
|
If the bucket of s3_url are not permitted to read, return False
|
|
1435
1529
|
|
|
1436
1530
|
:returns: True if s3_url exists, else False
|
|
1437
|
-
|
|
1531
|
+
"""
|
|
1438
1532
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1439
1533
|
if not bucket: # s3:// => True, s3:///key => False
|
|
1440
1534
|
return not key
|
|
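
`exists` and `access` give quick presence and permission checks; the `Access.WRITE` probe, for instance, creates and immediately aborts a multipart upload. A sketch, assuming `Access` is importable from `megfile.pathlike` and the bucket is hypothetical:

```python
from megfile.pathlike import Access
from megfile.s3_path import S3Path

p = S3Path("s3://my-bucket/data/train.txt")
print(p.exists())              # True if the key (or a prefix under it) exists
print(p.access(Access.READ))   # False when the path does not exist or is unreadable
print(p.access(Access.WRITE))  # True if the bucket accepts writes
```
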
@@ -1442,28 +1536,37 @@ class S3Path(URIPath):
|
|
|
1442
1536
|
return self.is_file(followlinks) or self.is_dir()
|
|
1443
1537
|
|
|
1444
1538
|
def getmtime(self, follow_symlinks: bool = False) -> float:
|
|
1445
|
-
|
|
1446
|
-
Get last-modified time of the file on the given s3_url path
|
|
1447
|
-
|
|
1539
|
+
"""
|
|
1540
|
+
Get last-modified time of the file on the given s3_url path
|
|
1541
|
+
(in Unix timestamp format).
|
|
1542
|
+
|
|
1543
|
+
If the path is an existent directory, return the latest modified time of
|
|
1544
|
+
all file in it. The mtime of empty directory is 1970-01-01 00:00:00
|
|
1448
1545
|
|
|
1449
|
-
If s3_url is not an existent path, which means s3_exist(s3_url) returns False,
|
|
1546
|
+
If s3_url is not an existent path, which means s3_exist(s3_url) returns False,
|
|
1547
|
+
then raise S3FileNotFoundError
|
|
1450
1548
|
|
|
1451
1549
|
:returns: Last-modified time
|
|
1452
1550
|
:raises: S3FileNotFoundError, UnsupportedError
|
|
1453
|
-
|
|
1551
|
+
"""
|
|
1454
1552
|
return self.stat(follow_symlinks=follow_symlinks).mtime
|
|
1455
1553
|
|
|
1456
1554
|
def getsize(self, follow_symlinks: bool = False) -> int:
|
|
1457
|
-
|
|
1555
|
+
"""
|
|
1458
1556
|
Get file size on the given s3_url path (in bytes).
|
|
1459
|
-
If the path in a directory, return the sum of all file size in it, including file in subdirectories (if exist).
|
|
1460
|
-
The result excludes the size of directory itself. In other words, return 0 Byte on an empty directory path.
|
|
1461
1557
|
|
|
1462
|
-
If
|
|
1558
|
+
If the path in a directory, return the sum of all file size in it,
|
|
1559
|
+
including file in subdirectories (if exist).
|
|
1560
|
+
|
|
1561
|
+
The result excludes the size of directory itself.
|
|
1562
|
+
In other words, return 0 Byte on an empty directory path.
|
|
1563
|
+
|
|
1564
|
+
If s3_url is not an existent path, which means s3_exist(s3_url) returns False,
|
|
1565
|
+
then raise S3FileNotFoundError
|
|
1463
1566
|
|
|
1464
1567
|
:returns: File size
|
|
1465
1568
|
:raises: S3FileNotFoundError, UnsupportedError
|
|
1466
|
-
|
|
1569
|
+
"""
|
|
1467
1570
|
return self.stat(follow_symlinks=follow_symlinks).size
|
|
1468
1571
|
|
|
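
As the docstrings above state, `getsize` and `getmtime` also work on prefixes, where they aggregate over every object underneath. A small sketch with hypothetical paths:

```python
from megfile.s3_path import S3Path

f = S3Path("s3://my-bucket/data/train.txt")
print(f.getsize())   # object size in bytes
print(f.getmtime())  # last-modified time as a Unix timestamp

d = S3Path("s3://my-bucket/data/")
print(d.getsize())   # sum of the sizes of every object under the prefix
print(d.getmtime())  # latest mtime among those objects
```
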
1469
1572
|
def glob(
|
|
@@ -1472,38 +1575,52 @@ class S3Path(URIPath):
|
|
|
1472
1575
|
recursive: bool = True,
|
|
1473
1576
|
missing_ok: bool = True,
|
|
1474
1577
|
followlinks: bool = False,
|
|
1475
|
-
) -> List[
|
|
1476
|
-
|
|
1477
|
-
|
|
1578
|
+
) -> List["S3Path"]:
|
|
1579
|
+
"""Return s3 path list in ascending alphabetical order,
|
|
1580
|
+
in which path matches glob pattern
|
|
1581
|
+
|
|
1582
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1583
|
+
raise UnsupportedError
|
|
1478
1584
|
|
|
1479
|
-
:param pattern: Glob the given relative pattern in the directory represented
|
|
1585
|
+
:param pattern: Glob the given relative pattern in the directory represented
|
|
1586
|
+
by this path
|
|
1480
1587
|
:param recursive: If False, `**` will not search directory recursively
|
|
1481
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1588
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1589
|
+
raise FileNotFoundError
|
|
1482
1590
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1483
1591
|
:returns: A list contains paths match `s3_pathname`
|
|
1484
|
-
|
|
1592
|
+
"""
|
|
1485
1593
|
return list(
|
|
1486
1594
|
self.iglob(
|
|
1487
1595
|
pattern=pattern,
|
|
1488
1596
|
recursive=recursive,
|
|
1489
1597
|
missing_ok=missing_ok,
|
|
1490
|
-
followlinks=followlinks
|
|
1598
|
+
followlinks=followlinks,
|
|
1599
|
+
)
|
|
1600
|
+
)
|
|
1491
1601
|
|
|
1492
1602
|
def glob_stat(
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1603
|
+
self,
|
|
1604
|
+
pattern,
|
|
1605
|
+
recursive: bool = True,
|
|
1606
|
+
missing_ok: bool = True,
|
|
1607
|
+
followlinks: bool = False,
|
|
1608
|
+
) -> Iterator[FileEntry]:
|
|
1609
|
+
"""Return a generator contains tuples of path and file stat,
|
|
1610
|
+
in ascending alphabetical order, in which path matches glob pattern
|
|
1611
|
+
|
|
1612
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1613
|
+
raise UnsupportedError
|
|
1614
|
+
|
|
1615
|
+
:param pattern: Glob the given relative pattern in the directory represented
|
|
1616
|
+
by this path
|
|
1502
1617
|
:param recursive: If False, `**` will not search directory recursively
|
|
1503
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1618
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1619
|
+
raise FileNotFoundError
|
|
1504
1620
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1505
|
-
:returns: A generator contains tuples of path and file stat,
|
|
1506
|
-
|
|
1621
|
+
:returns: A generator contains tuples of path and file stat,
|
|
1622
|
+
in which paths match `s3_pathname`
|
|
1623
|
+
"""
|
|
1507
1624
|
glob_path = self._s3_path
|
|
1508
1625
|
if pattern:
|
|
1509
1626
|
glob_path = self.joinpath(pattern)._s3_path
|
|
@@ -1511,23 +1628,27 @@ class S3Path(URIPath):
|
|
|
1511
1628
|
|
|
1512
1629
|
def create_generator():
|
|
1513
1630
|
for group_s3_pathname_1 in _group_s3path_by_bucket(
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1631
|
+
s3_pathname, self._profile_name
|
|
1632
|
+
):
|
|
1633
|
+
for group_s3_pathname_2 in _group_s3path_by_prefix(group_s3_pathname_1):
|
|
1517
1634
|
for file_entry in _s3_glob_stat_single_path(
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
|
|
1635
|
+
group_s3_pathname_2,
|
|
1636
|
+
recursive,
|
|
1637
|
+
missing_ok,
|
|
1638
|
+
followlinks=followlinks,
|
|
1639
|
+
profile_name=self._profile_name,
|
|
1640
|
+
):
|
|
1521
1641
|
if self._profile_name:
|
|
1522
1642
|
file_entry = file_entry._replace(
|
|
1523
|
-
path=
|
|
1524
|
-
f"{self._protocol_with_profile}://{file_entry.path[5:]}"
|
|
1643
|
+
path=f"{self._protocol_with_profile}://{file_entry.path[5:]}"
|
|
1525
1644
|
)
|
|
1526
1645
|
yield file_entry
|
|
1527
1646
|
|
|
1528
1647
|
return _create_missing_ok_generator(
|
|
1529
|
-
create_generator(),
|
|
1530
|
-
|
|
1648
|
+
create_generator(),
|
|
1649
|
+
missing_ok,
|
|
1650
|
+
S3FileNotFoundError("No match any file: %r" % s3_pathname),
|
|
1651
|
+
)
|
|
1531
1652
|
|
|
1532
1653
|
def iglob(
|
|
1533
1654
|
self,
|
|
@@ -1535,60 +1656,70 @@ class S3Path(URIPath):
|
|
|
1535
1656
|
recursive: bool = True,
|
|
1536
1657
|
missing_ok: bool = True,
|
|
1537
1658
|
followlinks: bool = False,
|
|
1538
|
-
) -> Iterator[
|
|
1539
|
-
|
|
1540
|
-
|
|
1659
|
+
) -> Iterator["S3Path"]:
|
|
1660
|
+
"""Return s3 path iterator in ascending alphabetical order,
|
|
1661
|
+
in which path matches glob pattern
|
|
1541
1662
|
|
|
1542
|
-
:
|
|
1663
|
+
Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
|
|
1664
|
+
raise UnsupportedError
|
|
1665
|
+
|
|
1666
|
+
:param pattern: Glob the given relative pattern in the directory represented
|
|
1667
|
+
by this path
|
|
1543
1668
|
:param recursive: If False, `**` will not search directory recursively
|
|
1544
|
-
:param missing_ok: If False and target path doesn't match any file,
|
|
1669
|
+
:param missing_ok: If False and target path doesn't match any file,
|
|
1670
|
+
raise FileNotFoundError
|
|
1545
1671
|
:raises: UnsupportedError, when bucket part contains wildcard characters
|
|
1546
1672
|
:returns: An iterator contains paths match `s3_pathname`
|
|
1547
|
-
|
|
1548
|
-
for file_entry in self.glob_stat(
|
|
1549
|
-
|
|
1550
|
-
|
|
1673
|
+
"""
|
|
1674
|
+
for file_entry in self.glob_stat(
|
|
1675
|
+
pattern=pattern,
|
|
1676
|
+
recursive=recursive,
|
|
1677
|
+
missing_ok=missing_ok,
|
|
1678
|
+
followlinks=followlinks,
|
|
1679
|
+
):
|
|
1551
1680
|
yield self.from_path(file_entry.path)
|
|
1552
1681
|
|
|
1553
1682
|
def is_dir(self, followlinks: bool = False) -> bool:
|
|
1554
|
-
|
|
1683
|
+
"""
|
|
1555
1684
|
Test if an s3 url is directory
|
|
1556
1685
|
Specific procedures are as follows:
|
|
1557
1686
|
If there exists a suffix, of which ``os.path.join(s3_url, suffix)`` is a file
|
|
1558
1687
|
If the url is empty bucket or s3://
|
|
1559
1688
|
|
|
1560
|
-
:param followlinks: whether followlinks is True or False, result is the same.
|
|
1689
|
+
:param followlinks: whether followlinks is True or False, result is the same.
|
|
1690
|
+
Because s3 symlink not support dir.
|
|
1561
1691
|
:returns: True if path is s3 directory, else False
|
|
1562
|
-
|
|
1692
|
+
"""
|
|
1563
1693
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1564
1694
|
if not bucket: # s3:// => True, s3:///key => False
|
|
1565
1695
|
return not key
|
|
1566
1696
|
prefix = _become_prefix(key)
|
|
1567
1697
|
try:
|
|
1568
1698
|
resp = self._client.list_objects_v2(
|
|
1569
|
-
Bucket=bucket, Prefix=prefix, Delimiter=
|
|
1699
|
+
Bucket=bucket, Prefix=prefix, Delimiter="/", MaxKeys=1
|
|
1700
|
+
)
|
|
1570
1701
|
except Exception as error:
|
|
1571
1702
|
error = translate_s3_error(error, self.path_with_protocol)
|
|
1572
|
-
if isinstance(error,
|
|
1573
|
-
(S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1703
|
+
if isinstance(error, (S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1574
1704
|
raise error
|
|
1575
1705
|
return False
|
|
1576
1706
|
|
|
1577
1707
|
if not key: # bucket is accessible
|
|
1578
1708
|
return True
|
|
1579
1709
|
|
|
1580
|
-
if
|
|
1581
|
-
return resp[
|
|
1710
|
+
if "KeyCount" in resp:
|
|
1711
|
+
return resp["KeyCount"] > 0
|
|
1582
1712
|
|
|
1583
|
-
return
|
|
1584
|
-
len(resp.get(
|
|
1713
|
+
return (
|
|
1714
|
+
len(resp.get("Contents", [])) > 0 or len(resp.get("CommonPrefixes", [])) > 0
|
|
1715
|
+
)
|
|
1585
1716
|
|
|
1586
1717
|
def is_file(self, followlinks: bool = False) -> bool:
|
|
1587
|
-
|
|
1718
|
+
"""
|
|
1588
1719
|
Test if an s3_url is file
|
|
1589
1720
|
|
|
1590
1721
|
:returns: True if path is s3 file, else False
|
|
1591
|
-
|
|
1722
|
+
"""
|
|
1592
1723
|
s3_url = self.path_with_protocol
|
|
1593
1724
|
if followlinks:
|
|
1594
1725
|
try:
|
|
@@ -1596,46 +1727,45 @@ class S3Path(URIPath):
|
|
|
1596
1727
|
except S3NotALinkError:
|
|
1597
1728
|
pass
|
|
1598
1729
|
bucket, key = parse_s3_url(s3_url)
|
|
1599
|
-
if not bucket or not key or key.endswith(
|
|
1730
|
+
if not bucket or not key or key.endswith("/"):
|
|
1600
1731
|
# s3://, s3:///key, s3://bucket, s3://bucket/prefix/
|
|
1601
1732
|
return False
|
|
1602
1733
|
try:
|
|
1603
1734
|
self._client.head_object(Bucket=bucket, Key=key)
|
|
1604
1735
|
except Exception as error:
|
|
1605
1736
|
error = translate_s3_error(error, s3_url)
|
|
1606
|
-
if isinstance(error,
|
|
1607
|
-
(S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1737
|
+
if isinstance(error, (S3UnknownError, S3ConfigError, S3PermissionError)):
|
|
1608
1738
|
raise error
|
|
1609
1739
|
return False
|
|
1610
1740
|
return True
|
|
1611
1741
|
|
|
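
Because S3 has no real directories, `is_dir` is answered by listing at most one key under the prefix, while `is_file` does a `head_object`; as the code notes elsewhere, the same name can therefore be a file and a directory at the same time. A sketch:

```python
from megfile.s3_path import S3Path

p = S3Path("s3://my-bucket/data")
# True if any object exists under the prefix "data/".
print(p.is_dir())
# True if an object literally named "data" exists.
print(p.is_file())
```
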
1612
1742
|
def listdir(self, followlinks: bool = False) -> List[str]:
|
|
1613
|
-
|
|
1743
|
+
"""
|
|
1614
1744
|
Get all contents of given s3_url. The result is in ascending alphabetical order.
|
|
1615
1745
|
|
|
1616
1746
|
:returns: All contents have prefix of s3_url in ascending alphabetical order
|
|
1617
1747
|
:raises: S3FileNotFoundError, S3NotADirectoryError
|
|
1618
|
-
|
|
1748
|
+
"""
|
|
1619
1749
|
entries = list(self.scandir(followlinks=followlinks))
|
|
1620
1750
|
return sorted([entry.name for entry in entries])
|
|
1621
1751
|
|
|
1622
|
-
def iterdir(self, followlinks: bool = False) -> Iterator[
|
|
1623
|
-
|
|
1752
|
+
def iterdir(self, followlinks: bool = False) -> Iterator["S3Path"]:
|
|
1753
|
+
"""
|
|
1624
1754
|
Get all contents of given s3_url. The result is in ascending alphabetical order.
|
|
1625
1755
|
|
|
1626
1756
|
:returns: All contents have prefix of s3_url in ascending alphabetical order
|
|
1627
1757
|
:raises: S3FileNotFoundError, S3NotADirectoryError
|
|
1628
|
-
|
|
1758
|
+
"""
|
|
1629
1759
|
for path in self.listdir(followlinks=followlinks):
|
|
1630
1760
|
yield self.joinpath(path)
|
|
1631
1761
|
|
|
1632
1762
|
def load(self, followlinks: bool = False) -> BinaryIO:
|
|
1633
|
-
|
|
1763
|
+
"""Read all content in binary on specified path and write into memory
|
|
1634
1764
|
|
|
1635
1765
|
User should close the BinaryIO manually
|
|
1636
1766
|
|
|
1637
1767
|
:returns: BinaryIO
|
|
1638
|
-
|
|
1768
|
+
"""
|
|
1639
1769
|
s3_url = self.path_with_protocol
|
|
1640
1770
|
if followlinks:
|
|
1641
1771
|
try:
|
|
@@ -1644,9 +1774,9 @@ class S3Path(URIPath):
|
|
|
1644
1774
|
pass
|
|
1645
1775
|
bucket, key = parse_s3_url(s3_url)
|
|
1646
1776
|
if not bucket:
|
|
1647
|
-
raise S3BucketNotFoundError(
|
|
1648
|
-
if not key or key.endswith(
|
|
1649
|
-
raise S3IsADirectoryError(
|
|
1777
|
+
raise S3BucketNotFoundError("Empty bucket name: %r" % s3_url)
|
|
1778
|
+
if not key or key.endswith("/"):
|
|
1779
|
+
raise S3IsADirectoryError("Is a directory: %r" % s3_url)
|
|
1650
1780
|
|
|
1651
1781
|
buffer = io.BytesIO()
|
|
1652
1782
|
with raise_s3_error(s3_url):
|
|
@@ -1655,11 +1785,11 @@ class S3Path(URIPath):
|
|
|
1655
1785
|
return buffer
|
|
1656
1786
|
|
|
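
`load` reads the whole object into an in-memory `BytesIO` which, as the docstring says, the caller must close. A sketch with a hypothetical key:

```python
from megfile.s3_path import S3Path

buffer = S3Path("s3://my-bucket/config.yaml").load()
try:
    text = buffer.read().decode("utf-8")
    print(text[:80])
finally:
    buffer.close()  # the docstring above asks the caller to close it
```
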
1657
1787
|
def hasbucket(self) -> bool:
|
|
1658
|
-
|
|
1788
|
+
"""
|
|
1659
1789
|
Test if the bucket of s3_url exists
|
|
1660
1790
|
|
|
1661
1791
|
:returns: True if bucket of s3_url exists, else False
|
|
1662
|
-
|
|
1792
|
+
"""
|
|
1663
1793
|
bucket, _ = parse_s3_url(self.path_with_protocol)
|
|
1664
1794
|
if not bucket:
|
|
1665
1795
|
return False
|
|
@@ -1669,15 +1799,16 @@ class S3Path(URIPath):
|
|
|
1669
1799
|
except Exception as error:
|
|
1670
1800
|
error = translate_s3_error(error, self.path_with_protocol)
|
|
1671
1801
|
if isinstance(error, S3PermissionError):
|
|
1672
|
-
# Aliyun OSS doesn't give bucket api permission when you only have read
|
|
1802
|
+
# Aliyun OSS doesn't give bucket api permission when you only have read
|
|
1803
|
+
# and write permission
|
|
1673
1804
|
try:
|
|
1674
1805
|
self._client.list_objects_v2(Bucket=bucket, MaxKeys=1)
|
|
1675
1806
|
return True
|
|
1676
1807
|
except Exception as error2:
|
|
1677
1808
|
error2 = translate_s3_error(error2, self.path_with_protocol)
|
|
1678
1809
|
if isinstance(
|
|
1679
|
-
|
|
1680
|
-
|
|
1810
|
+
error2, (S3UnknownError, S3ConfigError, S3PermissionError)
|
|
1811
|
+
):
|
|
1681
1812
|
raise error2
|
|
1682
1813
|
return False
|
|
1683
1814
|
elif isinstance(error, (S3UnknownError, S3ConfigError)):
|
|
@@ -1688,7 +1819,7 @@ class S3Path(URIPath):
|
|
|
1688
1819
|
return True
|
|
1689
1820
|
|
|
1690
1821
|
def mkdir(self, mode=0o777, parents: bool = False, exist_ok: bool = False):
|
|
1691
|
-
|
|
1822
|
+
"""
|
|
1692
1823
|
Create an s3 directory.
|
|
1693
1824
|
Purely creating directory is invalid because it's unavailable on OSS.
|
|
1694
1825
|
This function is to test the target bucket have WRITE access.
|
|
@@ -1697,54 +1828,57 @@ class S3Path(URIPath):
|
|
|
1697
1828
|
:param parents: parents is ignored, only be compatible with pathlib.Path
|
|
1698
1829
|
:param exist_ok: If False and target directory exists, raise S3FileExistsError
|
|
1699
1830
|
:raises: S3BucketNotFoundError, S3FileExistsError
|
|
1700
|
-
|
|
1831
|
+
"""
|
|
1701
1832
|
bucket, _ = parse_s3_url(self.path_with_protocol)
|
|
1702
1833
|
if not bucket:
|
|
1703
1834
|
raise S3BucketNotFoundError(
|
|
1704
|
-
|
|
1835
|
+
"Empty bucket name: %r" % self.path_with_protocol
|
|
1836
|
+
)
|
|
1705
1837
|
if not self.hasbucket():
|
|
1706
|
-
raise S3BucketNotFoundError(
|
|
1707
|
-
'No such bucket: %r' % self.path_with_protocol)
|
|
1838
|
+
raise S3BucketNotFoundError("No such bucket: %r" % self.path_with_protocol)
|
|
1708
1839
|
if exist_ok:
|
|
1709
1840
|
if self.is_file():
|
|
1710
|
-
raise S3FileExistsError(
|
|
1711
|
-
'File exists: %r' % self.path_with_protocol)
|
|
1841
|
+
raise S3FileExistsError("File exists: %r" % self.path_with_protocol)
|
|
1712
1842
|
return
|
|
1713
1843
|
if self.exists():
|
|
1714
|
-
raise S3FileExistsError(
|
|
1844
|
+
raise S3FileExistsError("File exists: %r" % self.path_with_protocol)
|
|
1715
1845
|
|
|
1716
1846
|
def move(self, dst_url: PathLike, overwrite: bool = True) -> None:
|
|
1717
|
-
|
|
1847
|
+
"""
|
|
1718
1848
|
Move file/directory path from src_url to dst_url
|
|
1719
1849
|
|
|
1720
1850
|
:param dst_url: Given destination path
|
|
1721
1851
|
:param overwrite: whether or not overwrite file when exists
|
|
1722
|
-
|
|
1852
|
+
"""
|
|
1723
1853
|
for src_file_path, dst_file_path in _s3_scan_pairs(
|
|
1724
|
-
|
|
1854
|
+
self.path_with_protocol, dst_url
|
|
1855
|
+
):
|
|
1725
1856
|
S3Path(src_file_path).rename(dst_file_path, overwrite)
|
|
1726
1857
|
|
|
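
`move` walks all file pairs under the source and renames them one by one, so it covers both a single object and a whole "directory" prefix. A sketch with hypothetical prefixes:

```python
from megfile.s3_path import S3Path

# Move every object under checkpoints/v1/ to checkpoints/v2/.
S3Path("s3://my-bucket/checkpoints/v1/").move("s3://my-bucket/checkpoints/v2/")

# Rename a single object; with overwrite=False an existing destination
# object is left untouched.
S3Path("s3://my-bucket/a.txt").rename("s3://my-bucket/b.txt", overwrite=False)
```
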
1727
1858
|
def remove(self, missing_ok: bool = False) -> None:
|
|
1728
|
-
|
|
1729
|
-
Remove the file or directory on s3, `s3://` and `s3://bucket`
|
|
1859
|
+
"""
|
|
1860
|
+
Remove the file or directory on s3, `s3://` and `s3://bucket`
|
|
1861
|
+
are not permitted to remove
|
|
1730
1862
|
|
|
1731
|
-
:param missing_ok: if False and target file/directory not exists,
|
|
1863
|
+
:param missing_ok: if False and target file/directory not exists,
|
|
1864
|
+
raise S3FileNotFoundError
|
|
1732
1865
|
:raises: S3PermissionError, S3FileNotFoundError, UnsupportedError
|
|
1733
|
-
|
|
1866
|
+
"""
|
|
1734
1867
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1735
1868
|
if not bucket:
|
|
1736
1869
|
if not key:
|
|
1737
|
-
raise UnsupportedError(
|
|
1738
|
-
'Remove whole s3', self.path_with_protocol)
|
|
1870
|
+
raise UnsupportedError("Remove whole s3", self.path_with_protocol)
|
|
1739
1871
|
raise S3BucketNotFoundError(
|
|
1740
|
-
|
|
1872
|
+
"Empty bucket name: %r" % self.path_with_protocol
|
|
1873
|
+
)
|
|
1741
1874
|
if not key:
|
|
1742
|
-
raise UnsupportedError(
|
|
1875
|
+
raise UnsupportedError("Remove bucket", self.path_with_protocol)
|
|
1743
1876
|
if not self.exists():
|
|
1744
1877
|
if missing_ok:
|
|
1745
1878
|
return
|
|
1746
1879
|
raise S3FileNotFoundError(
|
|
1747
|
-
|
|
1880
|
+
"No such file or directory: %r" % self.path_with_protocol
|
|
1881
|
+
)
|
|
1748
1882
|
|
|
1749
1883
|
client = self._client
|
|
1750
1884
|
with raise_s3_error(self.path_with_protocol):
|
|
@@ -1754,55 +1888,61 @@ class S3Path(URIPath):
|
|
|
1754
1888
|
prefix = _become_prefix(key)
|
|
1755
1889
|
total_count, error_count = 0, 0
|
|
1756
1890
|
for resp in _list_objects_recursive(client, bucket, prefix):
|
|
1757
|
-
if
|
|
1758
|
-
keys = [
|
|
1759
|
-
{
|
|
1760
|
-
'Key': content['Key']
|
|
1761
|
-
} for content in resp['Contents']
|
|
1762
|
-
]
|
|
1891
|
+
if "Contents" in resp:
|
|
1892
|
+
keys = [{"Key": content["Key"]} for content in resp["Contents"]]
|
|
1763
1893
|
total_count += len(keys)
|
|
1764
1894
|
errors = []
|
|
1765
1895
|
retries = 2
|
|
1766
1896
|
retry_interval = min(0.1 * 2**retries, 30)
|
|
1767
1897
|
for i in range(retries):
|
|
1768
|
-
# doc:
|
|
1898
|
+
# doc:
|
|
1899
|
+
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.delete_objects
|
|
1769
1900
|
if not keys:
|
|
1770
1901
|
break
|
|
1771
1902
|
response = client.delete_objects(
|
|
1772
|
-
Bucket=bucket, Delete={
|
|
1903
|
+
Bucket=bucket, Delete={"Objects": keys}
|
|
1904
|
+
)
|
|
1773
1905
|
keys = []
|
|
1774
|
-
for error_info in response.get(
|
|
1775
|
-
if s3_error_code_should_retry(
|
|
1776
|
-
error_info.get('Code')):
|
|
1906
|
+
for error_info in response.get("Errors", []):
|
|
1907
|
+
if s3_error_code_should_retry(error_info.get("Code")):
|
|
1777
1908
|
error_logger.warning(
|
|
1778
|
-
"retry %s times, removing file: %s,
|
|
1909
|
+
"retry %s times, removing file: %s, "
|
|
1910
|
+
"with error %s: %s"
|
|
1779
1911
|
% (
|
|
1780
|
-
i + 1,
|
|
1781
|
-
error_info[
|
|
1782
|
-
error_info[
|
|
1783
|
-
|
|
1912
|
+
i + 1,
|
|
1913
|
+
error_info["Key"],
|
|
1914
|
+
error_info["Code"],
|
|
1915
|
+
error_info["Message"],
|
|
1916
|
+
)
|
|
1917
|
+
)
|
|
1918
|
+
keys.append({"Key": error_info["Key"]})
|
|
1784
1919
|
else:
|
|
1785
1920
|
errors.append(error_info)
|
|
1786
1921
|
time.sleep(retry_interval)
|
|
1787
1922
|
for error_info in errors:
|
|
1788
1923
|
error_logger.error(
|
|
1789
|
-
"failed remove file: %s, with error %s: %s"
|
|
1790
|
-
|
|
1791
|
-
error_info[
|
|
1924
|
+
"failed remove file: %s, with error %s: %s"
|
|
1925
|
+
% (
|
|
1926
|
+
error_info["Key"],
|
|
1927
|
+
error_info["Code"],
|
|
1928
|
+
error_info["Message"],
|
|
1929
|
+
)
|
|
1930
|
+
)
|
|
1792
1931
|
error_count += len(errors)
|
|
1793
1932
|
if error_count > 0:
|
|
1794
|
-
error_msg =
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1933
|
+
error_msg = (
|
|
1934
|
+
"failed remove path: %s, total file count: %s, failed count: %s"
|
|
1935
|
+
% (self.path_with_protocol, total_count, error_count)
|
|
1936
|
+
)
|
|
1937
|
+
raise S3UnknownError(Exception(error_msg), self.path_with_protocol)
|
|
1938
|
+
|
|
1939
|
+
def rename(self, dst_path: PathLike, overwrite: bool = True) -> "S3Path":
|
|
1940
|
+
"""
|
|
1801
1941
|
Move s3 file path from src_url to dst_url
|
|
1802
1942
|
|
|
1803
1943
|
:param dst_path: Given destination path
|
|
1804
1944
|
:param overwrite: whether or not overwrite file when exists
|
|
1805
|
-
|
|
1945
|
+
"""
|
|
1806
1946
|
if self.is_file():
|
|
1807
1947
|
self.copy(dst_path, overwrite=overwrite)
|
|
1808
1948
|
else:
|
|
@@ -1810,25 +1950,28 @@ class S3Path(URIPath):
|
|
|
1810
1950
|
self.remove(missing_ok=True)
|
|
1811
1951
|
return self.from_path(dst_path)
|
|
1812
1952
|
|
|
1813
|
-
def scan(self,
|
|
1814
|
-
|
|
1815
|
-
followlinks: bool = False) -> Iterator[str]:
|
|
1816
|
-
'''
|
|
1953
|
+
def scan(self, missing_ok: bool = True, followlinks: bool = False) -> Iterator[str]:
|
|
1954
|
+
"""
|
|
1817
1955
|
Iteratively traverse only files in given s3 directory, in alphabetical order.
|
|
1818
1956
|
Every iteration on generator yields a path string.
|
|
1819
1957
|
|
|
1820
1958
|
If s3_url is a file path, yields the file only
|
|
1959
|
+
|
|
1821
1960
|
If s3_url is a non-existent path, return an empty generator
|
|
1961
|
+
|
|
1822
1962
|
If s3_url is a bucket path, return all file paths in the bucket
|
|
1963
|
+
|
|
1823
1964
|
If s3_url is an empty bucket, return an empty generator
|
|
1824
|
-
If s3_url doesn't contain any bucket, which is s3_url == 's3://', raise UnsupportedError. walk() on complete s3 is not supported in megfile
|
|
1825
1965
|
|
|
1826
|
-
|
|
1966
|
+
If s3_url doesn't contain any bucket, which is s3_url == 's3://',
|
|
1967
|
+
raise UnsupportedError. walk() on complete s3 is not supported in megfile
|
|
1968
|
+
|
|
1969
|
+
:param missing_ok: If False and there's no file in the directory,
|
|
1970
|
+
raise FileNotFoundError
|
|
1827
1971
|
:raises: UnsupportedError
|
|
1828
1972
|
:returns: A file path generator
|
|
1829
|
-
|
|
1830
|
-
scan_stat_iter = self.scan_stat(
|
|
1831
|
-
missing_ok=missing_ok, followlinks=followlinks)
|
|
1973
|
+
"""
|
|
1974
|
+
scan_stat_iter = self.scan_stat(missing_ok=missing_ok, followlinks=followlinks)
|
|
1832
1975
|
|
|
1833
1976
|
def create_generator() -> Iterator[str]:
|
|
1834
1977
|
for file_entry in scan_stat_iter:
|
|
@@ -1836,83 +1979,89 @@ class S3Path(URIPath):
|
|
|
1836
1979
|
|
|
1837
1980
|
return create_generator()
|
|
1838
1981
|
|
|
1839
|
-
def scan_stat(
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1982
|
+
def scan_stat(
|
|
1983
|
+
self, missing_ok: bool = True, followlinks: bool = False
|
|
1984
|
+
) -> Iterator[FileEntry]:
|
|
1985
|
+
"""
|
|
1843
1986
|
Iteratively traverse only files in given directory, in alphabetical order.
|
|
1844
1987
|
Every iteration on generator yields a tuple of path string and file stat
|
|
1845
1988
|
|
|
1846
|
-
:param missing_ok: If False and there's no file in the directory,
|
|
1989
|
+
:param missing_ok: If False and there's no file in the directory,
|
|
1990
|
+
raise FileNotFoundError
|
|
1847
1991
|
:raises: UnsupportedError
|
|
1848
1992
|
:returns: A file path generator
|
|
1849
|
-
|
|
1993
|
+
"""
|
|
1850
1994
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1851
1995
|
if not bucket:
|
|
1852
|
-
raise UnsupportedError(
|
|
1996
|
+
raise UnsupportedError("Scan whole s3", self.path_with_protocol)
|
|
1853
1997
|
|
|
1854
1998
|
def create_generator() -> Iterator[FileEntry]:
|
|
1855
1999
|
if not self.is_dir():
|
|
1856
2000
|
if self.is_file():
|
|
1857
|
-
# On s3, file and directory may be of same name and level, so need
|
|
2001
|
+
# On s3, file and directory may be of same name and level, so need
|
|
2002
|
+
# to test the path is file or directory
|
|
1858
2003
|
yield FileEntry(
|
|
1859
|
-
self.name,
|
|
1860
|
-
self.
|
|
2004
|
+
self.name,
|
|
2005
|
+
fspath(self.path_with_protocol),
|
|
2006
|
+
self.stat(follow_symlinks=followlinks),
|
|
2007
|
+
)
|
|
1861
2008
|
return
|
|
1862
2009
|
|
|
1863
|
-
if not key.endswith(
|
|
2010
|
+
if not key.endswith("/") and self.is_file():
|
|
1864
2011
|
yield FileEntry(
|
|
1865
|
-
self.name,
|
|
1866
|
-
self.
|
|
2012
|
+
self.name,
|
|
2013
|
+
fspath(self.path_with_protocol),
|
|
2014
|
+
self.stat(follow_symlinks=followlinks),
|
|
2015
|
+
)
|
|
1867
2016
|
|
|
1868
2017
|
prefix = _become_prefix(key)
|
|
1869
2018
|
client = self._client
|
|
1870
2019
|
with raise_s3_error(self.path_with_protocol):
|
|
1871
2020
|
for resp in _list_objects_recursive(client, bucket, prefix):
|
|
1872
|
-
for content in resp.get(
|
|
2021
|
+
for content in resp.get("Contents", []):
|
|
1873
2022
|
full_path = s3_path_join(
|
|
1874
|
-
f
|
|
1875
|
-
|
|
2023
|
+
f"{self._protocol_with_profile}://", bucket, content["Key"]
|
|
2024
|
+
)
|
|
1876
2025
|
|
|
1877
2026
|
if followlinks:
|
|
1878
2027
|
try:
|
|
1879
|
-
origin_path = self.from_path(
|
|
1880
|
-
full_path).readlink()
|
|
2028
|
+
origin_path = self.from_path(full_path).readlink()
|
|
1881
2029
|
yield FileEntry(
|
|
1882
2030
|
origin_path.name,
|
|
1883
2031
|
origin_path.path_with_protocol,
|
|
1884
|
-
origin_path.lstat()
|
|
2032
|
+
origin_path.lstat(),
|
|
2033
|
+
)
|
|
1885
2034
|
continue
|
|
1886
2035
|
except S3NotALinkError:
|
|
1887
2036
|
pass
|
|
1888
2037
|
|
|
1889
2038
|
yield FileEntry(
|
|
1890
|
-
S3Path(full_path).name, full_path,
|
|
1891
|
-
|
|
2039
|
+
S3Path(full_path).name, full_path, _make_stat(content)
|
|
2040
|
+
)
|
|
1892
2041
|
|
|
1893
2042
|
return _create_missing_ok_generator(
|
|
1894
|
-
create_generator(),
|
|
1895
|
-
|
|
1896
|
-
|
|
2043
|
+
create_generator(),
|
|
2044
|
+
missing_ok,
|
|
2045
|
+
S3FileNotFoundError("No match any file in: %r" % self.path_with_protocol),
|
|
2046
|
+
)
|
|
1897
2047
|
|
|
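
`scan` yields every file path under a prefix recursively, in alphabetical order, and `scan_stat` yields `FileEntry` items with the stat taken from the listing. A sketch:

```python
from megfile.s3_path import S3Path

root = S3Path("s3://my-bucket/dataset/")

# All file paths under the prefix, recursively.
for path in root.scan():
    print(path)

# Paths together with size/mtime from the same listing.
total = sum(entry.stat.size for entry in root.scan_stat())
print("total bytes:", total)
```
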
1898
2048
|
def scandir(self, followlinks: bool = False) -> Iterator[FileEntry]:
|
|
1899
|
-
|
|
2049
|
+
"""
|
|
1900
2050
|
Get all contents of given s3_url, the order of result is not guaranteed.
|
|
1901
2051
|
|
|
1902
2052
|
:returns: All contents have prefix of s3_url
|
|
1903
2053
|
:raises: S3FileNotFoundError, S3NotADirectoryError
|
|
1904
|
-
|
|
2054
|
+
"""
|
|
1905
2055
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1906
2056
|
if not bucket and key:
|
|
1907
2057
|
raise S3BucketNotFoundError(
|
|
1908
|
-
|
|
2058
|
+
"Empty bucket name: %r" % self.path_with_protocol
|
|
2059
|
+
)
|
|
1909
2060
|
|
|
1910
2061
|
if self.is_file():
|
|
1911
|
-
raise S3NotADirectoryError(
|
|
1912
|
-
'Not a directory: %r' % self.path_with_protocol)
|
|
2062
|
+
raise S3NotADirectoryError("Not a directory: %r" % self.path_with_protocol)
|
|
1913
2063
|
elif not self.is_dir():
|
|
1914
|
-
raise S3FileNotFoundError(
|
|
1915
|
-
'No such directory: %r' % self.path_with_protocol)
|
|
2064
|
+
raise S3FileNotFoundError("No such directory: %r" % self.path_with_protocol)
|
|
1916
2065
|
prefix = _become_prefix(key)
|
|
1917
2066
|
client = self._client
|
|
1918
2067
|
|
|
@@ -1921,34 +2070,38 @@ class S3Path(URIPath):
|
|
|
1921
2070
|
def create_generator() -> Iterator[FileEntry]:
|
|
1922
2071
|
with raise_s3_error(self.path_with_protocol):
|
|
1923
2072
|
|
|
1924
|
-
def generate_s3_path(
|
|
1925
|
-
protocol: str, bucket: str, key: str) -> str:
|
|
2073
|
+
def generate_s3_path(protocol: str, bucket: str, key: str) -> str:
|
|
1926
2074
|
return "%s://%s/%s" % (protocol, bucket, key)
|
|
1927
2075
|
|
|
1928
2076
|
if not bucket and not key: # list buckets
|
|
1929
2077
|
response = client.list_buckets()
|
|
1930
|
-
for content in response[
|
|
2078
|
+
for content in response["Buckets"]:
|
|
1931
2079
|
yield FileEntry(
|
|
1932
|
-
content[
|
|
2080
|
+
content["Name"],
|
|
2081
|
+
f"s3://{content['Name']}",
|
|
1933
2082
|
StatResult(
|
|
1934
|
-
ctime=content[
|
|
2083
|
+
ctime=content["CreationDate"].timestamp(),
|
|
1935
2084
|
isdir=True,
|
|
1936
2085
|
extra=content,
|
|
1937
|
-
)
|
|
2086
|
+
),
|
|
2087
|
+
)
|
|
1938
2088
|
return
|
|
1939
2089
|
|
|
1940
|
-
for resp in _list_objects_recursive(client, bucket, prefix,
|
|
1941
|
-
|
|
1942
|
-
for common_prefix in resp.get('CommonPrefixes', []):
|
|
2090
|
+
for resp in _list_objects_recursive(client, bucket, prefix, "/"):
|
|
2091
|
+
for common_prefix in resp.get("CommonPrefixes", []):
|
|
1943
2092
|
yield FileEntry(
|
|
1944
|
-
common_prefix[
|
|
2093
|
+
common_prefix["Prefix"][len(prefix) : -1],
|
|
1945
2094
|
generate_s3_path(
|
|
1946
|
-
self._protocol_with_profile,
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
2095
|
+
self._protocol_with_profile,
|
|
2096
|
+
bucket,
|
|
2097
|
+
common_prefix["Prefix"],
|
|
2098
|
+
),
|
|
2099
|
+
StatResult(isdir=True, extra=common_prefix),
|
|
2100
|
+
)
|
|
2101
|
+
for content in resp.get("Contents", []):
|
|
1950
2102
|
src_url = generate_s3_path(
|
|
1951
|
-
self._protocol_with_profile, bucket, content[
|
|
2103
|
+
self._protocol_with_profile, bucket, content["Key"]
|
|
2104
|
+
)
|
|
1952
2105
|
|
|
1953
2106
|
if followlinks:
|
|
1954
2107
|
try:
|
|
@@ -1956,30 +2109,35 @@ class S3Path(URIPath):
|
|
|
1956
2109
|
yield FileEntry(
|
|
1957
2110
|
origin_path.name,
|
|
1958
2111
|
origin_path.path_with_protocol,
|
|
1959
|
-
origin_path.lstat()
|
|
2112
|
+
origin_path.lstat(),
|
|
2113
|
+
)
|
|
1960
2114
|
continue
|
|
1961
2115
|
except S3NotALinkError:
|
|
1962
2116
|
pass
|
|
1963
2117
|
|
|
1964
2118
|
yield FileEntry(
|
|
1965
|
-
content[
|
|
1966
|
-
|
|
2119
|
+
content["Key"][len(prefix) :], src_url, _make_stat(content)
|
|
2120
|
+
)
|
|
1967
2121
|
|
|
1968
2122
|
return ContextIterator(create_generator())
|
|
1969
2123
|
|
|
1970
2124
|
def _get_dir_stat(self) -> StatResult:
|
|
1971
|
-
|
|
1972
|
-
Return StatResult of given s3_url directory, including:
|
|
2125
|
+
"""
|
|
2126
|
+
Return StatResult of given s3_url directory, including:
|
|
1973
2127
|
|
|
1974
|
-
1. Directory size: the sum of all file size in it,
|
|
1975
|
-
|
|
1976
|
-
|
|
2128
|
+
1. Directory size: the sum of all file size in it,
|
|
2129
|
+
including file in subdirectories (if exist).
|
|
2130
|
+
The result excludes the size of directory itself.
|
|
2131
|
+
In other words, return 0 Byte on an empty directory path
|
|
2132
|
+
2. Last-modified time of directory: return the latest modified time
|
|
2133
|
+
of all file in it. The mtime of empty directory is 1970-01-01 00:00:00
|
|
1977
2134
|
|
|
1978
2135
|
:returns: An int indicates size in Bytes
|
|
1979
|
-
|
|
2136
|
+
"""
|
|
1980
2137
|
if not self.is_dir():
|
|
1981
2138
|
raise S3FileNotFoundError(
|
|
1982
|
-
|
|
2139
|
+
"No such file or directory: %r" % self.path_with_protocol
|
|
2140
|
+
)
|
|
1983
2141
|
|
|
1984
2142
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
1985
2143
|
prefix = _become_prefix(key)
|
|
@@ -1988,29 +2146,34 @@ class S3Path(URIPath):
|
|
|
1988
2146
|
mtime = 0.0
|
|
1989
2147
|
with raise_s3_error(self.path_with_protocol):
|
|
1990
2148
|
for resp in _list_objects_recursive(client, bucket, prefix):
|
|
1991
|
-
for content in resp.get(
|
|
1992
|
-
size += content[
|
|
1993
|
-
last_modified = content[
|
|
2149
|
+
for content in resp.get("Contents", []):
|
|
2150
|
+
size += content["Size"]
|
|
2151
|
+
last_modified = content["LastModified"].timestamp()
|
|
1994
2152
|
if mtime < last_modified:
|
|
1995
2153
|
mtime = last_modified
|
|
1996
2154
|
|
|
1997
2155
|
return StatResult(size=size, mtime=mtime, isdir=True)
|
|
1998
2156
|
|
|
1999
2157
|
def stat(self, follow_symlinks=True) -> StatResult:
|
|
2000
|
-
|
|
2001
|
-
Get StatResult of s3_url file, including file size and mtime,
|
|
2158
|
+
"""
|
|
2159
|
+
Get StatResult of s3_url file, including file size and mtime,
|
|
2160
|
+
referring to s3_getsize and s3_getmtime
|
|
2161
|
+
|
|
2162
|
+
If s3_url is not an existent path, which means s3_exist(s3_url) returns False,
|
|
2163
|
+
then raise S3FileNotFoundError
|
|
2002
2164
|
|
|
2003
|
-
If
|
|
2004
|
-
|
|
2165
|
+
If attempt to get StatResult of complete s3, such as s3_dir_url == 's3://',
|
|
2166
|
+
raise S3BucketNotFoundError
|
|
2005
2167
|
|
|
2006
2168
|
:returns: StatResult
|
|
2007
2169
|
:raises: S3FileNotFoundError, S3BucketNotFoundError
|
|
2008
|
-
|
|
2170
|
+
"""
|
|
2009
2171
|
islnk = False
|
|
2010
2172
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
2011
2173
|
if not bucket:
|
|
2012
2174
|
raise S3BucketNotFoundError(
|
|
2013
|
-
|
|
2175
|
+
"Empty bucket name: %r" % self.path_with_protocol
|
|
2176
|
+
)
|
|
2014
2177
|
|
|
2015
2178
|
if not self.is_file():
|
|
2016
2179
|
return self._get_dir_stat()
|
|
@@ -2018,68 +2181,80 @@ class S3Path(URIPath):
|
|
|
2018
2181
|
client = self._client
|
|
2019
2182
|
with raise_s3_error(self.path_with_protocol):
|
|
2020
2183
|
content = client.head_object(Bucket=bucket, Key=key)
|
|
2021
|
-
if
|
|
2184
|
+
if "Metadata" in content:
|
|
2022
2185
|
metadata = dict(
|
|
2023
|
-
(key.lower(), value)
|
|
2024
|
-
|
|
2025
|
-
if metadata and
|
|
2186
|
+
(key.lower(), value) for key, value in content["Metadata"].items()
|
|
2187
|
+
)
|
|
2188
|
+
if metadata and "symlink_to" in metadata:
|
|
2026
2189
|
islnk = True
|
|
2027
2190
|
if islnk and follow_symlinks:
|
|
2028
|
-
s3_url = metadata[
|
|
2191
|
+
s3_url = metadata["symlink_to"]
|
|
2029
2192
|
bucket, key = parse_s3_url(s3_url)
|
|
2030
2193
|
content = client.head_object(Bucket=bucket, Key=key)
|
|
2031
2194
|
stat_record = StatResult(
|
|
2032
2195
|
islnk=islnk,
|
|
2033
|
-
size=content[
|
|
2034
|
-
mtime=content[
|
|
2035
|
-
extra=content
|
|
2196
|
+
size=content["ContentLength"],
|
|
2197
|
+
mtime=content["LastModified"].timestamp(),
|
|
2198
|
+
extra=content,
|
|
2199
|
+
)
|
|
2036
2200
|
return stat_record
|
|
2037
2201
|
|
|
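
`stat` combines a `head_object` with megfile's symlink convention: when the object metadata contains `symlink_to` and `follow_symlinks=True`, the target is stat-ed instead and `islnk` is set. A sketch:

```python
from megfile.s3_path import S3Path

st = S3Path("s3://my-bucket/data/train.txt").stat()
print(st.size)   # ContentLength from head_object
print(st.mtime)  # LastModified as a Unix timestamp
print(st.islnk)  # True only if the object carries symlink metadata
print(st.isdir)  # True when the path resolved to a prefix instead of an object
```
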
2038
2202
|
def unlink(self, missing_ok: bool = False) -> None:
|
|
2039
|
-
|
|
2203
|
+
"""
|
|
2040
2204
|
Remove the file on s3
|
|
2041
2205
|
|
|
2042
|
-
:param missing_ok: if False and target file not exists,
|
|
2206
|
+
:param missing_ok: if False and target file not exists,
|
|
2207
|
+
raise S3FileNotFoundError
|
|
2043
2208
|
:raises: S3PermissionError, S3FileNotFoundError, S3IsADirectoryError
|
|
2044
|
-
|
|
2209
|
+
"""
|
|
2045
2210
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
2046
|
-
if not bucket or not key or key.endswith(
|
|
2047
|
-
raise S3IsADirectoryError(
|
|
2048
|
-
'Is a directory: %r' % self.path_with_protocol)
|
|
2211
|
+
if not bucket or not key or key.endswith("/"):
|
|
2212
|
+
raise S3IsADirectoryError("Is a directory: %r" % self.path_with_protocol)
|
|
2049
2213
|
if not self.is_file():
|
|
2050
2214
|
if missing_ok:
|
|
2051
2215
|
return
|
|
2052
|
-
raise S3FileNotFoundError(
|
|
2053
|
-
'No such file: %r' % self.path_with_protocol)
|
|
2216
|
+
raise S3FileNotFoundError("No such file: %r" % self.path_with_protocol)
|
|
2054
2217
|
|
|
2055
2218
|
with raise_s3_error(self.path_with_protocol):
|
|
2056
2219
|
self._client.delete_object(Bucket=bucket, Key=key)
|
|
2057
2220
|
|
|
2058
2221
|
def walk(
|
|
2059
|
-
self,
|
|
2060
|
-
followlinks: bool = False
|
|
2222
|
+
self, followlinks: bool = False
|
|
2061
2223
|
) -> Iterator[Tuple[str, List[str], List[str]]]:
|
|
2062
|
-
|
|
2063
|
-
Iteratively traverse the given s3 directory, in top-bottom order.
|
|
2224
|
+
"""
|
|
2225
|
+
Iteratively traverse the given s3 directory, in top-bottom order.
|
|
2226
|
+
In other words, firstly traverse parent directory, if subdirectories exist,
|
|
2227
|
+
traverse the subdirectories in alphabetical order.
|
|
2228
|
+
|
|
2064
2229
|
Every iteration on generator yields a 3-tuple: (root, dirs, files)
|
|
2065
2230
|
|
|
2066
2231
|
- root: Current s3 path;
|
|
2067
|
-
- dirs: Name list of subdirectories in current directory.
|
|
2068
|
-
|
|
2232
|
+
- dirs: Name list of subdirectories in current directory.
|
|
2233
|
+
The list is sorted by name in ascending alphabetical order;
|
|
2234
|
+
- files: Name list of files in current directory.
|
|
2235
|
+
The list is sorted by name in ascending alphabetical order;
|
|
2069
2236
|
|
|
2070
2237
|
If s3_url is a file path, return an empty generator
|
|
2238
|
+
|
|
2071
2239
|
If s3_url is a non-existent path, return an empty generator
|
|
2072
|
-
If s3_url is a bucket path, bucket will be the top directory, and will be returned at first iteration of generator
|
|
2073
|
-
If s3_url is an empty bucket, only yield one 3-tuple (notes: s3 doesn't have empty directory)
|
|
2074
|
-
If s3_url doesn't contain any bucket, which is s3_url == 's3://', raise UnsupportedError. walk() on complete s3 is not supported in megfile
|
|
2075
2240
|
|
|
2076
|
-
|
|
2241
|
+
If s3_url is a bucket path, bucket will be the top directory,
|
|
2242
|
+
and will be returned at first iteration of generator
|
|
2243
|
+
|
|
2244
|
+
If s3_url is an empty bucket, only yield one 3-tuple
|
|
2245
|
+
(notes: s3 doesn't have empty directory)
|
|
2246
|
+
|
|
2247
|
+
If s3_url doesn't contain any bucket, which is s3_url == 's3://',
|
|
2248
|
+
raise UnsupportedError. walk() on complete s3 is not supported in megfile
|
|
2249
|
+
|
|
2250
|
+
:param followlinks: whether followlinks is True or False, result is the same.
|
|
2251
|
+
Because s3 symlink not support dir.
|
|
2077
2252
|
:raises: UnsupportedError
|
|
2078
2253
|
:returns: A 3-tuple generator
|
|
2079
|
-
|
|
2254
|
+
"""
|
|
2080
2255
|
bucket, key = parse_s3_url(self.path_with_protocol)
|
|
2081
2256
|
if not bucket:
|
|
2082
|
-
raise UnsupportedError(
|
|
2257
|
+
raise UnsupportedError("Walk whole s3", self.path_with_protocol)
|
|
2083
2258
|
|
|
2084
2259
|
if not self.is_dir():
|
|
2085
2260
|
return
|
|
@@ -2089,23 +2264,24 @@ class S3Path(URIPath):
|
|
|
2089
2264
|
while len(stack) > 0:
|
|
2090
2265
|
current = _become_prefix(stack.pop())
|
|
2091
2266
|
dirs, files = [], []
|
|
2092
|
-
for resp in _list_objects_recursive(client, bucket, current,
|
|
2093
|
-
for common_prefix in resp.get(
|
|
2094
|
-
dirs.append(common_prefix[
|
|
2095
|
-
for content in resp.get(
|
|
2096
|
-
files.append(content[
|
|
2267
|
+
for resp in _list_objects_recursive(client, bucket, current, "/"):
|
|
2268
|
+
for common_prefix in resp.get("CommonPrefixes", []):
|
|
2269
|
+
dirs.append(common_prefix["Prefix"][:-1])
|
|
2270
|
+
for content in resp.get("Contents", []):
|
|
2271
|
+
files.append(content["Key"])
|
|
2097
2272
|
|
|
2098
2273
|
dirs = sorted(dirs)
|
|
2099
2274
|
stack.extend(reversed(dirs))
|
|
2100
2275
|
|
|
2101
|
-
root = s3_path_join(
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2276
|
+
root = s3_path_join(f"{self._protocol_with_profile}://", bucket, current)[
|
|
2277
|
+
:-1
|
|
2278
|
+
]
|
|
2279
|
+
dirs = [path[len(current) :] for path in dirs]
|
|
2280
|
+
files = sorted(path[len(current) :] for path in files)
|
|
2105
2281
|
yield root, dirs, files
|
|
2106
2282
|
|
|
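
`walk` mirrors `os.walk` on top of `list_objects_v2` with a `/` delimiter: each iteration yields the current prefix plus the sorted names of its sub-prefixes and objects. A sketch:

```python
from megfile.s3_path import S3Path

for root, dirs, files in S3Path("s3://my-bucket/dataset").walk():
    # root is an s3:// path; dirs and files are plain names in ascending order
    print(root, len(dirs), len(files))
```
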
2107
2283
|
def md5(self, recalculate: bool = False, followlinks: bool = False) -> str:
|
|
2108
|
-
|
|
2284
|
+
"""
|
|
2109
2285
|
Get md5 meta info in files that uploaded/copied via megfile
|
|
2110
2286
|
|
|
2111
2287
|
If meta info is lost or non-existent, return None
|
|
@@ -2113,19 +2289,21 @@ class S3Path(URIPath):
|
|
|
2113
2289
|
:param recalculate: calculate md5 in real-time or return s3 etag
|
|
2114
2290
|
:param followlinks: If is True, calculate md5 for real file
|
|
2115
2291
|
:returns: md5 meta info
|
|
2116
|
-
|
|
2292
|
+
"""
|
|
2117
2293
|
bucket, _ = parse_s3_url(self.path_with_protocol)
|
|
2118
2294
|
if not bucket:
|
|
2119
2295
|
raise S3BucketNotFoundError(
|
|
2120
|
-
|
|
2296
|
+
"Empty bucket name: %r" % self.path_with_protocol
|
|
2297
|
+
)
|
|
2121
2298
|
stat = self.stat(follow_symlinks=followlinks)
|
|
2122
2299
|
if stat.isdir:
|
|
2123
2300
|
hash_md5 = hashlib.md5() # nosec
|
|
2124
2301
|
for file_name in self.listdir():
|
|
2125
|
-
chunk =
|
|
2126
|
-
s3_path_join(
|
|
2127
|
-
|
|
2128
|
-
|
|
2302
|
+
chunk = (
|
|
2303
|
+
S3Path(s3_path_join(self.path_with_protocol, file_name))
|
|
2304
|
+
.md5(recalculate=recalculate)
|
|
2305
|
+
.encode()
|
|
2306
|
+
)
|
|
2129
2307
|
hash_md5.update(chunk)
|
|
2130
2308
|
return hash_md5.hexdigest()
|
|
2131
2309
|
if recalculate:
|
|
@@ -2135,45 +2313,45 @@ class S3Path(URIPath):
                     path_instance = self.readlink()
                 except S3NotALinkError:
                     pass
-            with path_instance.open(
+            with path_instance.open("rb") as f:
                 return calculate_md5(f)
-        return stat.extra.get(
+        return stat.extra.get("ETag", "")[1:-1]

     def copy(
-
-
-
-
-
-
+        self,
+        dst_url: PathLike,
+        callback: Optional[Callable[[int], None]] = None,
+        followlinks: bool = False,
+        overwrite: bool = True,
+    ) -> None:
+        """File copy on S3
         Copy content of file on `src_path` to `dst_path`.
-        It's caller's responsibility to ensure the s3_isfile(src_url)
+        It's caller's responsibility to ensure the s3_isfile(src_url) is True

         :param dst_path: Target file path
-        :param callback: Called periodically during copy, and the input parameter is
+        :param callback: Called periodically during copy, and the input parameter is
+            the data size (in bytes) of copy since the last call
         :param followlinks: False if regard symlink as file, else True
         :param overwrite: whether or not overwrite file when exists, default is True
-
+        """
         if not overwrite and self.from_path(dst_url).is_file():
             return

         src_url = self.path_with_protocol
         src_bucket, src_key = parse_s3_url(src_url)
         dst_bucket, dst_key = parse_s3_url(dst_url)
-        if dst_bucket == src_bucket and src_key.rstrip(
-
-            raise SameFileError(
-                f"'{src_url}' and '{dst_url}' are the same file")
+        if dst_bucket == src_bucket and src_key.rstrip("/") == dst_key.rstrip("/"):
+            raise SameFileError(f"'{src_url}' and '{dst_url}' are the same file")

         if not src_bucket:
-            raise S3BucketNotFoundError(
+            raise S3BucketNotFoundError("Empty bucket name: %r" % src_url)
         if self.is_dir():
-            raise S3IsADirectoryError(
+            raise S3IsADirectoryError("Is a directory: %r" % src_url)

         if not dst_bucket:
-            raise S3BucketNotFoundError(
-        if not dst_key or dst_key.endswith(
-            raise S3IsADirectoryError(
+            raise S3BucketNotFoundError("Empty bucket name: %r" % dst_url)
+        if not dst_key or dst_key.endswith("/"):
+            raise S3IsADirectoryError("Is a directory: %r" % dst_url)

         if followlinks:
             try:
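A hedged sketch of calling the single-file copy() declared above with a progress callback; the source and destination URLs are hypothetical:

```python
from megfile.s3_path import S3Path

def on_progress(nbytes: int) -> None:
    # invoked periodically with the bytes copied since the previous call
    print(f"copied {nbytes} more bytes")

S3Path("s3://example-bucket/raw/a.bin").copy(
    "s3://example-bucket/backup/a.bin", callback=on_progress
)
```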
@@ -2184,30 +2362,31 @@ class S3Path(URIPath):

         with raise_s3_error(f"'{src_url}' or '{dst_url}'"):
             self._client.copy(
-                {
-                    'Bucket': src_bucket,
-                    'Key': src_key,
-                },
+                {"Bucket": src_bucket, "Key": src_key},
                 Bucket=dst_bucket,
                 Key=dst_key,
-                Callback=callback
+                Callback=callback,
+            )

     def sync(
-
-
-
-
-
-
+        self,
+        dst_url: PathLike,
+        followlinks: bool = False,
+        force: bool = False,
+        overwrite: bool = True,
+    ) -> None:
+        """
         Copy file/directory on src_url to dst_url

         :param dst_url: Given destination path
         :param followlinks: False if regard symlink as file, else True
-        :param force: Sync file forcible, do not ignore same files,
+        :param force: Sync file forcible, do not ignore same files,
+            priority is higher than 'overwrite', default is False
         :param overwrite: whether or not overwrite file when exists, default is True
-
+        """
         for src_file_path, dst_file_path in _s3_scan_pairs(
-
+            self.path_with_protocol, dst_url
+        ):
             src_file_path = self.from_path(src_file_path)
             dst_file_path = self.from_path(dst_file_path)

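A minimal sketch of the sync() signature shown above, mirroring one hypothetical prefix into another; with overwrite=False, files that already exist at the destination are skipped, while force=True re-copies everything:

```python
from megfile.s3_path import S3Path

S3Path("s3://example-bucket/dataset/").sync(
    "s3://example-bucket/dataset-copy/", overwrite=False
)
```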
@@ -2216,29 +2395,30 @@ class S3Path(URIPath):
             elif not overwrite and dst_file_path.exists():
                 continue
             elif dst_file_path.exists() and is_same_file(
-
+                src_file_path.stat(), dst_file_path.stat(), "copy"
+            ):
                 continue

             src_file_path.copy(dst_file_path, followlinks=followlinks)

     def symlink(self, dst_path: PathLike) -> None:
-
+        """
         Create a symbolic link pointing to src_path named dst_path.

         :param dst_path: Destination path
         :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError
-
+        """
         if len(fspath(self._s3_path).encode()) > 1024:
-            raise S3NameTooLongError(
+            raise S3NameTooLongError("File name too long: %r" % dst_path)
         src_bucket, src_key = parse_s3_url(self.path_with_protocol)
         dst_bucket, dst_key = parse_s3_url(dst_path)

         if not src_bucket:
-            raise S3BucketNotFoundError(
+            raise S3BucketNotFoundError("Empty bucket name: %r" % self.path)
         if not dst_bucket:
-            raise S3BucketNotFoundError(
-        if not dst_key or dst_key.endswith(
-            raise S3IsADirectoryError(
+            raise S3BucketNotFoundError("Empty bucket name: %r" % dst_path)
+        if not dst_key or dst_key.endswith("/"):
+            raise S3IsADirectoryError("Is a directory: %r" % dst_path)

         src_path = self._s3_path
         try:
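A short sketch of creating a link with the symlink() method documented above; per the code that follows, the link is stored as an object carrying a `symlink_to` metadata entry (paths hypothetical):

```python
from megfile.s3_path import S3Path

# Make s3://example-bucket/data/latest-link point at an existing object.
S3Path("s3://example-bucket/data/v2/model.bin").symlink(
    "s3://example-bucket/data/latest-link"
)
```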
@@ -2247,93 +2427,97 @@ class S3Path(URIPath):
             pass
         with raise_s3_error(dst_path):
             self._client.put_object(
-                Bucket=dst_bucket,
-
-                Metadata={"symlink_to": src_path})
+                Bucket=dst_bucket, Key=dst_key, Metadata={"symlink_to": src_path}
+            )

-    def readlink(self) ->
-
-        Return a S3Path instance representing the path to which the symbolic link points
+    def readlink(self) -> "S3Path":
+        """
+        Return a S3Path instance representing the path to which the symbolic link points

-        :returns: Return a S3Path instance representing the path to
-
-
+        :returns: Return a S3Path instance representing the path to
+            which the symbolic link points.
+        :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
+            S3NotALinkError
+        """
         bucket, key = parse_s3_url(self.path_with_protocol)
         if not bucket:
             raise S3BucketNotFoundError(
-
-
-
-
+                "Empty bucket name: %r" % self.path_with_protocol
+            )
+        if not key or key.endswith("/"):
+            raise S3IsADirectoryError("Is a directory: %r" % self.path_with_protocol)
         metadata = self._s3_get_metadata()

-        if not
-            raise S3NotALinkError(
+        if "symlink_to" not in metadata:
+            raise S3NotALinkError("Not a link: %r" % self.path_with_protocol)
         else:
-            return self.from_path(metadata[
+            return self.from_path(metadata["symlink_to"])

     def is_symlink(self) -> bool:
-
+        """
         Test whether a path is link

         :returns: True if a path is link, else False
         :raises: S3NotALinkError
-
+        """
         bucket, key = parse_s3_url(self.path_with_protocol)
         if not bucket:
             return False
-        if not key or key.endswith(
+        if not key or key.endswith("/"):
             return False
         metadata = self._s3_get_metadata()
-        return
+        return "symlink_to" in metadata

     def save(self, file_object: BinaryIO):
-
+        """Write the opened binary stream to specified path,
+        but the stream won't be closed

         :param file_object: Stream to be read
-
+        """
         bucket, key = parse_s3_url(self.path_with_protocol)
         if not bucket:
             raise S3BucketNotFoundError(
-
-
-
-
+                "Empty bucket name: %r" % self.path_with_protocol
+            )
+        if not key or key.endswith("/"):
+            raise S3IsADirectoryError("Is a directory: %r" % self.path_with_protocol)

         with raise_s3_error(self.path_with_protocol):
             self._client.upload_fileobj(file_object, Bucket=bucket, Key=key)

     def open(
-
-
-
-
-
-
-
+        self,
+        mode: str = "r",
+        *,
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        s3_open_func: Callable = s3_open,
+        **kwargs,
+    ) -> IO:
         return s3_open_func(
             self,
             mode,
             encoding=encoding,
             errors=errors,
-            **necessary_params(s3_open_func, **kwargs)
+            **necessary_params(s3_open_func, **kwargs),
+        )

-    def absolute(self) ->
-
-        Make the path absolute, without normalization or resolving symlinks.
-
+    def absolute(self) -> "S3Path":
+        """
+        Make the path absolute, without normalization or resolving symlinks.
+        Returns a new path object
+        """
         return self

-    def cwd(self) ->
-
+    def cwd(self) -> "S3Path":
+        """Return current working directory

         returns: Current working directory
-
+        """
         return self.from_path(self.path_with_protocol)


 class MultiPartWriter:
-
     def __init__(self, client, path: PathLike) -> None:
         self._client = client
         self._multipart_upload_info = []
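A hedged sketch of the save() and open() entry points reformatted above: save() uploads an already-open binary stream without closing it, and open() goes through the pluggable s3_open_func; the key is hypothetical:

```python
import io

from megfile.s3_path import S3Path

path = S3Path("s3://example-bucket/notes/hello.txt")  # hypothetical key

buffer = io.BytesIO(b"hello s3")
path.save(buffer)           # upload the stream; the caller keeps ownership of it

with path.open("rb") as f:  # read it back through s3_open
    assert f.read() == b"hello s3"
```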
@@ -2342,7 +2526,8 @@ class MultiPartWriter:
         self._bucket = bucket
         self._key = key
         self._upload_id = self._client.create_multipart_upload(
-            Bucket=self._bucket, Key=self._key
+            Bucket=self._bucket, Key=self._key
+        )["UploadId"]

     def upload_part(self, part_num: int, file_obj: io.BytesIO) -> None:
         response = self._client.upload_part(
@@ -2353,70 +2538,60 @@ class MultiPartWriter:
             Key=self._key,
         )
         self._multipart_upload_info.append(
-            {
-
-                'ETag': response['ETag']
-            })
+            {"PartNumber": part_num, "ETag": response["ETag"]}
+        )

     def upload_part_by_paths(
-
+        self, part_num: int, paths: List[Tuple[PathLike, str]]
+    ) -> None:
         file_obj = io.BytesIO()

-        def get_object(
-                client, bucket, key, range_str: Optional[str] = None) -> bytes:
+        def get_object(client, bucket, key, range_str: Optional[str] = None) -> bytes:
             if range_str:
-                return client.get_object(
-
+                return client.get_object(Bucket=bucket, Key=key, Range=range_str)[
+                    "Body"
+                ].read()
             else:
-                return client.get_object(Bucket=bucket, Key=key)[
+                return client.get_object(Bucket=bucket, Key=key)["Body"].read()

         get_object = patch_method(
-            get_object,
-            max_retries=max_retries,
-            should_retry=s3_should_retry,
+            get_object, max_retries=max_retries, should_retry=s3_should_retry
         )
         for path, bytes_range in paths:
             bucket, key = parse_s3_url(path)
             if bytes_range:
-                file_obj.write(
-                    get_object(self._client, bucket, key, bytes_range))
+                file_obj.write(get_object(self._client, bucket, key, bytes_range))
             else:
                 file_obj.write(get_object(self._client, bucket, key))
         file_obj.seek(0, os.SEEK_SET)
         self.upload_part(part_num, file_obj)

     def upload_part_copy(
-
-
-            path: PathLike,
-            copy_source_range: Optional[str] = None) -> None:
+        self, part_num: int, path: PathLike, copy_source_range: Optional[str] = None
+    ) -> None:
         bucket, key = parse_s3_url(path)
         params = dict(
             UploadId=self._upload_id,
             PartNumber=part_num,
-            CopySource={
-                'Bucket': bucket,
-                'Key': key
-            },
+            CopySource={"Bucket": bucket, "Key": key},
             Bucket=self._bucket,
             Key=self._key,
         )
         if copy_source_range:
-            params[
+            params["CopySourceRange"] = copy_source_range
         response = self._client.upload_part_copy(**params)
         self._multipart_upload_info.append(
-            {
-
-                'ETag': response['CopyPartResult']['ETag']
-            })
+            {"PartNumber": part_num, "ETag": response["CopyPartResult"]["ETag"]}
+        )

     def close(self):
-        self._multipart_upload_info.sort(key=lambda t: t[
+        self._multipart_upload_info.sort(key=lambda t: t["PartNumber"])
         self._client.complete_multipart_upload(
             UploadId=self._upload_id,
             Bucket=self._bucket,
             Key=self._key,
-            MultipartUpload={
+            MultipartUpload={"Parts": self._multipart_upload_info},
+        )

     def __enter__(self):
         return self