megfile 3.0.6.post1__py3-none-any.whl → 3.1.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/conf.py +67 -0
- megfile/cli.py +16 -16
- megfile/config.py +37 -6
- megfile/errors.py +26 -20
- megfile/fs.py +13 -8
- megfile/fs_path.py +69 -49
- megfile/hdfs.py +13 -8
- megfile/hdfs_path.py +49 -41
- megfile/http.py +1 -1
- megfile/http_path.py +35 -28
- megfile/interfaces.py +119 -48
- megfile/lib/base_prefetch_reader.py +9 -8
- megfile/lib/combine_reader.py +7 -7
- megfile/lib/fnmatch.py +2 -2
- megfile/lib/glob.py +3 -3
- megfile/lib/hdfs_prefetch_reader.py +2 -1
- megfile/lib/http_prefetch_reader.py +3 -2
- megfile/lib/lazy_handler.py +6 -5
- megfile/lib/s3_buffered_writer.py +8 -7
- megfile/lib/s3_cached_handler.py +3 -4
- megfile/lib/s3_limited_seekable_writer.py +5 -3
- megfile/lib/s3_memory_handler.py +10 -6
- megfile/lib/s3_pipe_handler.py +1 -1
- megfile/lib/s3_prefetch_reader.py +7 -5
- megfile/lib/s3_share_cache_reader.py +2 -2
- megfile/lib/shadow_handler.py +5 -5
- megfile/lib/stdio_handler.py +3 -3
- megfile/pathlike.py +156 -170
- megfile/s3.py +19 -13
- megfile/s3_path.py +98 -83
- megfile/sftp.py +25 -16
- megfile/sftp_path.py +109 -94
- megfile/smart.py +38 -28
- megfile/smart_path.py +6 -6
- megfile/stdio.py +3 -3
- megfile/stdio_path.py +5 -5
- megfile/utils/__init__.py +8 -27
- megfile/version.py +1 -1
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/METADATA +4 -5
- megfile-3.1.0.post1.dist-info/RECORD +55 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/WHEEL +1 -1
- megfile-3.1.0.post1.dist-info/top_level.txt +7 -0
- scripts/convert_results_to_sarif.py +124 -0
- scripts/generate_file.py +268 -0
- megfile-3.0.6.post1.dist-info/RECORD +0 -52
- megfile-3.0.6.post1.dist-info/top_level.txt +0 -1
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/LICENSE +0 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/LICENSE.pyre +0 -0
- {megfile-3.0.6.post1.dist-info → megfile-3.1.0.post1.dist-info}/entry_points.txt +0 -0
megfile/s3.py
CHANGED
@@ -60,7 +60,8 @@ __all__ = [
 
 
 def s3_access(
-        path: PathLike,
+        path: PathLike,
+        mode: Access = Access.READ,
         followlinks: bool = False) -> bool:
     '''
     Test if path has access permission described by mode
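
The hunk above adds a `mode` argument to s3_access. A minimal usage sketch, assuming Access is exposed by megfile.pathlike and using a placeholder bucket:

    from megfile.pathlike import Access
    from megfile.s3 import s3_access

    # Check write permission explicitly; READ is the default mode.
    if s3_access('s3://my-bucket/data.txt', mode=Access.WRITE):
        print('writable')
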
@@ -79,7 +80,7 @@ def s3_exists(path: PathLike, followlinks: bool = False) -> bool:
     If the bucket of s3_url are not permitted to read, return False
 
     :param path: Given path
-    :returns: True if s3_url
+    :returns: True if s3_url exists, else False
     '''
     return S3Path(path).exists(followlinks)
 
@@ -139,10 +140,10 @@ def s3_isfile(path: PathLike, followlinks: bool = False) -> bool:
 
 def s3_listdir(path: PathLike, followlinks: bool = False) -> List[str]:
     '''
-    Get all contents of given s3_url. The result is in
+    Get all contents of given s3_url. The result is in ascending alphabetical order.
 
     :param path: Given path
-    :returns: All contents have prefix of s3_url in
+    :returns: All contents have prefix of s3_url in ascending alphabetical order
     :raises: S3FileNotFoundError, S3NotADirectoryError
     '''
     return S3Path(path).listdir(followlinks)
@@ -164,7 +165,7 @@ def s3_hasbucket(path: PathLike) -> bool:
     Test if the bucket of s3_url exists
 
     :param path: Given path
-    :returns: True if bucket of s3_url
+    :returns: True if bucket of s3_url exists, else False
     '''
     return S3Path(path).hasbucket()
 
@@ -192,7 +193,8 @@ def s3_remove(path: PathLike, missing_ok: bool = False) -> None:
     return S3Path(path).remove(missing_ok)
 
 
-def s3_scan(path: PathLike,
+def s3_scan(path: PathLike,
+            missing_ok: bool = True,
             followlinks: bool = False) -> Iterator[str]:
     '''
     Iteratively traverse only files in given s3 directory, in alphabetical order.
@@ -213,7 +215,8 @@ def s3_scan(path: PathLike, missing_ok: bool = True,
 
 
 def s3_scan_stat(
-        path: PathLike,
+        path: PathLike,
+        missing_ok: bool = True,
         followlinks: bool = False) -> Iterator[FileEntry]:
     '''
     Iteratively traverse only files in given directory, in alphabetical order.
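
Both s3_scan and s3_scan_stat now take missing_ok. A hedged sketch with a placeholder prefix; with missing_ok=True a non-existent directory yields nothing instead of raising:

    from megfile.s3 import s3_scan, s3_scan_stat

    # Iterate file paths only, in alphabetical order.
    for path in s3_scan('s3://my-bucket/logs/', missing_ok=True):
        print(path)

    # Same traversal, but yielding FileEntry objects.
    for entry in s3_scan_stat('s3://my-bucket/logs/', missing_ok=True):
        print(entry)
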
@@ -264,8 +267,10 @@ def s3_unlink(path: PathLike, missing_ok: bool = False) -> None:
     return S3Path(path).unlink(missing_ok)
 
 
-def s3_walk(
-
+def s3_walk(
+        path: PathLike,
+        followlinks: bool = False
+) -> Iterator[Tuple[str, List[str], List[str]]]:
     '''
     Iteratively traverse the given s3 directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories in alphabetical order.
     Every iteration on generator yields a 3-tuple: (root, dirs, files)
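
s3_walk now spells out its signature and return type. A small sketch of the os.walk-style iteration, using a placeholder prefix:

    from megfile.s3 import s3_walk

    # Each iteration yields (root, dirs, files), top-down and alphabetical.
    for root, dirs, files in s3_walk('s3://my-bucket/dataset/'):
        print(root, len(dirs), len(files))
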
@@ -289,7 +294,8 @@ def s3_walk(path: PathLike, followlinks: bool = False
 
 
 def s3_getmd5(
-        path: PathLike,
+        path: PathLike,
+        recalculate: bool = False,
         followlinks: bool = False) -> str:
     '''
     Get md5 meta info in files that uploaded/copied via megfile
@@ -312,7 +318,7 @@ def s3_copy(
         overwrite: bool = True) -> None:
     ''' File copy on S3
     Copy content of file on `src_path` to `dst_path`.
-    It's caller's
+    It's caller's responsibility to ensure the s3_isfile(src_url) == True
 
     :param src_url: Given path
     :param dst_path: Target file path
@@ -335,7 +341,7 @@ def s3_sync(
     :param src_url: Given path
     :param dst_url: Given destination path
     :param followlinks: False if regard symlink as file, else True
-    :param force: Sync file
+    :param force: Sync file forcible, do not ignore same files, priority is higher than 'overwrite', default is False
     :param overwrite: whether or not overwrite file when exists, default is True
     '''
     return S3Path(src_url).sync(dst_url, followlinks, force, overwrite)
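
The reworded docstring says force re-copies files even when they look unchanged and takes priority over overwrite. An illustrative call with placeholder URLs:

    from megfile.s3 import s3_sync

    # force=True: do not skip files that appear identical on both sides.
    s3_sync('s3://src-bucket/data/', 's3://dst-bucket/data/', force=True)
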
@@ -346,7 +352,7 @@ def s3_symlink(src_path: PathLike, dst_path: PathLike) -> None:
     Create a symbolic link pointing to src_path named dst_path.
 
     :param src_path: Given path
-    :param dst_path:
+    :param dst_path: Destination path
     :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError
     '''
     return S3Path(src_path).symlink(dst_path)
megfile/s3_path.py
CHANGED
@@ -4,15 +4,15 @@ import os
 import re
 import time
 from concurrent.futures import ThreadPoolExecutor
-from functools import lru_cache, wraps
+from functools import cached_property, lru_cache, wraps
 from logging import getLogger as get_logger
-from typing import IO, Any,
+from typing import IO, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple
 
 import boto3
 import botocore
 from botocore.awsrequest import AWSResponse
 
-from megfile.config import DEFAULT_BLOCK_SIZE, GLOBAL_MAX_WORKERS, S3_CLIENT_CACHE_MODE, S3_MAX_RETRY_TIMES
+from megfile.config import DEFAULT_BLOCK_SIZE, DEFAULT_MAX_BLOCK_SIZE, DEFAULT_MIN_BLOCK_SIZE, GLOBAL_MAX_WORKERS, S3_CLIENT_CACHE_MODE, S3_MAX_RETRY_TIMES
 from megfile.errors import S3BucketNotFoundError, S3ConfigError, S3FileExistsError, S3FileNotFoundError, S3IsADirectoryError, S3NameTooLongError, S3NotADirectoryError, S3NotALinkError, S3PermissionError, S3UnknownError, SameFileError, UnsupportedError, _create_missing_ok_generator
 from megfile.errors import _logger as error_logger
 from megfile.errors import patch_method, raise_s3_error, s3_error_code_should_retry, s3_should_retry, translate_fs_error, translate_s3_error
@@ -31,7 +31,7 @@ from megfile.lib.s3_prefetch_reader import S3PrefetchReader
 from megfile.lib.s3_share_cache_reader import S3ShareCacheReader
 from megfile.lib.url import get_url_scheme
 from megfile.smart_path import SmartPath
-from megfile.utils import _is_pickle,
+from megfile.utils import _is_pickle, calculate_md5, generate_cache_path, get_binary_mode, get_content_offset, is_readable, necessary_params, process_local, thread_local
 
 __all__ = [
     'S3Path',
@@ -112,14 +112,14 @@ def parse_s3_url(s3_url: PathLike) -> Tuple[str, str]:
     s3_url = fspath(s3_url)
     if not is_s3(s3_url):
         raise ValueError('Not a s3 url: %r' % s3_url)
-
-
-    if
-    bucket =
+    right_part = s3_url.split('://', maxsplit=1)[1]
+    bucket_pattern = re.match('(.*?)/', right_part)
+    if bucket_pattern is None:
+        bucket = right_part
         path = ''
     else:
-        bucket =
-        path =
+        bucket = bucket_pattern.group(1)
+        path = right_part[len(bucket) + 1:]
     return bucket, path
 
 
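
The rewritten parse_s3_url treats everything between 's3://' and the first '/' as the bucket and the remainder as the key. Expected behaviour, following the code above (bucket name is a placeholder):

    from megfile.s3_path import parse_s3_url

    parse_s3_url('s3://my-bucket/dir/file.txt')  # ('my-bucket', 'dir/file.txt')
    parse_s3_url('s3://my-bucket')               # ('my-bucket', '')
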
@@ -276,18 +276,18 @@ def _list_all_buckets(profile_name: Optional[str] = None) -> List[str]:
 
 def _parse_s3_url_ignore_brace(s3_url: str) -> Tuple[str, str]:
     s3_url = fspath(s3_url)
-    s3_scheme,
+    s3_scheme, right_part = s3_url[:5], s3_url[5:]
     if s3_scheme != 's3://':
         raise ValueError('Not a s3 url: %r' % s3_url)
     left_brace = False
-    for current_index, current_character in enumerate(
+    for current_index, current_character in enumerate(right_part):
         if current_character == "/" and left_brace is False:
-            return
+            return right_part[:current_index], right_part[current_index + 1:]
         elif current_character == "{":
             left_brace = True
         elif current_character == "}":
             left_brace = False
-    return
+    return right_part, ""
 
 
 def _group_s3path_by_bucket(
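
_parse_s3_url_ignore_brace is an internal helper; per the logic above it splits at the first '/' that is not inside a brace group, so brace-expansion patterns stay in the bucket part. An illustrative call with placeholder bucket names:

    from megfile.s3_path import _parse_s3_url_ignore_brace

    _parse_s3_url_ignore_brace('s3://{bucket-a,bucket-b}/dir/file.txt')
    # ('{bucket-a,bucket-b}', 'dir/file.txt')
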
@@ -306,13 +306,13 @@ def _group_s3path_by_bucket(
         return "s3://%s%s" % (bucket, "/" if s3_pathname.endswith("/") else "")
 
     all_bucket = lru_cache(maxsize=1)(_list_all_buckets)
-    for
-    if has_magic(
-
+    for bucket_name in ungloblize(bucket):
+        if has_magic(bucket_name):
+            split_bucket_name = bucket_name.split("/", 1)
             path_part = None
-    if len(
-
-    pattern = re.compile(translate(re.sub(r'\*{2,}', '*',
+            if len(split_bucket_name) == 2:
+                bucket_name, path_part = split_bucket_name
+            pattern = re.compile(translate(re.sub(r'\*{2,}', '*', bucket_name)))
 
             for bucket in all_bucket(profile_name):
                 if pattern.fullmatch(bucket) is not None:
@@ -320,7 +320,7 @@ def _group_s3path_by_bucket(
                         bucket = "%s/%s" % (bucket, path_part)
                     grouped_path.append(generate_s3_path(bucket, key))
         else:
-            grouped_path.append(generate_s3_path(
+            grouped_path.append(generate_s3_path(bucket_name, key))
 
     return grouped_path
 
@@ -450,6 +450,7 @@ def _s3_glob_stat_single_path(
         missing_ok: bool = True,
         followlinks: bool = False,
         profile_name: Optional[str] = None) -> Iterator[FileEntry]:
+    s3_pathname = fspath(s3_pathname)
     if not recursive:
         # If not recursive, replace ** with *
         s3_pathname = re.sub(r'\*{2,}', '*', s3_pathname)
@@ -571,8 +572,8 @@ def _s3_binary_mode(s3_open_func):
         fileobj = s3_open_func(s3_url, get_binary_mode(mode), **kwargs)
         if 'b' not in mode:
             fileobj = io.TextIOWrapper(
-                fileobj, encoding=encoding, errors=errors)  #
-            fileobj.mode = mode
+                fileobj, encoding=encoding, errors=errors)  # type: ignore
+            fileobj.mode = mode  # pyre-ignore[41]
         return fileobj
 
     return wrapper
@@ -692,7 +693,7 @@ def s3_pipe_open(
 
     When join_thread is False, while the file handle are closing, this function will not wait until the asynchronous writing finishes;
     False doesn't affect read-handle, but this can speed up write-handle because file will be written asynchronously.
-    But asynchronous
+    But asynchronous behavior can guarantee the file are successfully written, and frequent execution may cause thread and file handle exhaustion
 
     :param mode: Mode to open file, either "rb" or "wb"
     :param join_thread: If wait after function execution until s3 finishes writing
@@ -782,9 +783,9 @@ def s3_buffered_open(
         limited_seekable: bool = False,
         buffered: bool = False,
         share_cache_key: Optional[str] = None,
-        cache_path: Optional[str] = None
-
-
+        cache_path: Optional[str] = None,
+        min_block_size: int = DEFAULT_MIN_BLOCK_SIZE,
+        max_block_size: int = DEFAULT_MAX_BLOCK_SIZE) -> IO:
     '''Open an asynchronous prefetch reader, to support fast sequential read
 
     .. note ::
@@ -797,7 +798,9 @@ def s3_buffered_open(
 
     :param max_concurrency: Max download thread number, None by default
    :param max_buffer_size: Max cached buffer size in memory, 128MB by default
-    :param
+    :param min_block_size: Min size of single block, default is same as block_size. Each block will be downloaded by single thread.
+    :param max_block_size: Max size of single block, 128MB by default. Each block will be downloaded by single thread.
+    :param block_size: Size of single block, 8MB by default. Each block will be uploaded by single thread.
     :param limited_seekable: If write-handle supports limited seek (both file head part and tail part can seek block_size). Notes: This parameter are valid only for write-handle. Read-handle support arbitrary seek
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
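
The new min_block_size / max_block_size parameters let callers tune how s3_buffered_open sizes its per-thread blocks. A hedged sketch with a placeholder key; the exact defaults come from megfile.config, and the keyword names follow the signature shown above:

    from megfile.s3_path import s3_buffered_open

    # Write with 8 MB starting blocks, letting blocks grow up to 128 MB.
    with s3_buffered_open('s3://my-bucket/big.bin', 'wb',
                          min_block_size=8 * 2**20,
                          max_block_size=128 * 2**20) as f:
        f.write(b'...')
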
@@ -863,8 +866,8 @@ def s3_buffered_open(
             block_forward=block_forward,
             block_size=block_size,
             profile_name=s3_url._profile_name)
-        if buffered or _is_pickle(reader):
-            reader = io.BufferedReader(reader)  #
+        if buffered or _is_pickle(reader):
+            reader = io.BufferedReader(reader)  # type: ignore
         return reader
 
     if limited_seekable:
@@ -873,8 +876,9 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_concurrency,
+            block_size=min_block_size,
+            max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
-            block_size=block_size,
             profile_name=s3_url._profile_name)
     else:
         writer = S3BufferedWriter(
@@ -882,17 +886,19 @@ def s3_buffered_open(
             key,
             s3_client=client,
             max_workers=max_concurrency,
+            block_size=min_block_size,
+            max_block_size=max_block_size,
             max_buffer_size=max_buffer_size,
-            block_size=block_size,
             profile_name=s3_url._profile_name)
-    if buffered or _is_pickle(writer):
-        writer = io.BufferedWriter(writer)  #
+    if buffered or _is_pickle(writer):
+        writer = io.BufferedWriter(writer)  # type: ignore
     return writer
 
 
 @_s3_binary_mode
 def s3_memory_open(
-        s3_url: PathLike,
+        s3_url: PathLike,
+        mode: str,
         followlinks: bool = False) -> S3MemoryHandler:
     '''Open a memory-cache file reader / writer, for frequent random read / write
 
@@ -929,8 +935,8 @@ s3_open = s3_buffered_open
 def s3_download(
         src_url: PathLike,
         dst_url: PathLike,
-        followlinks: bool = False,
         callback: Optional[Callable[[int], None]] = None,
+        followlinks: bool = False,
         overwrite: bool = True) -> None:
     '''
     Downloads a file from s3 to local filesystem.
@@ -1003,13 +1009,14 @@ def s3_upload(
         src_url: PathLike,
         dst_url: PathLike,
         callback: Optional[Callable[[int], None]] = None,
-
-
+        followlinks: bool = False,
+        overwrite: bool = True) -> None:
     '''
     Uploads a file from local filesystem to s3.
     :param src_url: source fs path
     :param dst_url: target s3 path
     :param callback: Called periodically during copy, and the input parameter is the data size (in bytes) of copy since the last call
+    :param followlinks: False if regard symlink as file, else True
     :param overwrite: whether or not overwrite file when exists, default is True
     '''
     from megfile.fs import is_fs
@@ -1018,6 +1025,8 @@ def s3_upload(
     if not is_fs(src_url):
         raise OSError(f'src_url is not fs path: {src_url}')
     src_path = FSPath(src_url)
+    if followlinks and src_path.is_symlink():
+        src_path = src_path.readlink()
 
     dst_bucket, dst_key = parse_s3_url(dst_url)
     if not dst_bucket:
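
Per the added lines, s3_upload now resolves the local source with readlink() when followlinks=True and the source is a symlink. An illustrative call with placeholder paths:

    from megfile.s3_path import s3_upload

    # If /data/current.csv is a symlink, upload the file it points to.
    s3_upload('/data/current.csv', 's3://my-bucket/current.csv', followlinks=True)
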
@@ -1132,10 +1141,10 @@ class S3Cacher(FileCacher):
 
 
 def s3_glob(
-
-
-
-
+        path: PathLike,
+        recursive: bool = True,
+        missing_ok: bool = True,
+        followlinks: bool = False,
 ) -> List[str]:
     '''Return s3 path list in ascending alphabetical order, in which path matches glob pattern
     Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
@@ -1174,10 +1183,10 @@ def s3_glob_stat(
 
 
 def s3_iglob(
-
-
-
-
+        path: PathLike,
+        recursive: bool = True,
+        missing_ok: bool = True,
+        followlinks: bool = False,
 ) -> Iterator[str]:
     '''Return s3 path iterator in ascending alphabetical order, in which path matches glob pattern
     Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
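
s3_glob and s3_iglob now spell out the same keyword set. A sketch with placeholder patterns; wildcards are only matched inside a bucket:

    from megfile.s3_path import s3_glob, s3_iglob

    paths = s3_glob('s3://my-bucket/images/**/*.png', recursive=True)

    # The lazy variant, tolerating an empty match set.
    for path in s3_iglob('s3://my-bucket/images/*.png', missing_ok=True):
        print(path)
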
@@ -1207,7 +1216,8 @@ def s3_makedirs(path: PathLike, exist_ok: bool = False):
 
 
 def _group_src_paths_by_block(
-
+        src_paths: List[PathLike],
+        block_size: int = DEFAULT_BLOCK_SIZE
 ) -> List[List[Tuple[PathLike, Optional[str]]]]:
     groups = []
     current_group, current_group_size = [], 0
@@ -1302,7 +1312,7 @@ class S3Path(URIPath):
         else:
             self._s3_path = self.path
 
-    @
+    @cached_property
     def path_with_protocol(self) -> str:
         '''Return path with protocol, like file:///root, s3://bucket/key'''
         path = self.path
@@ -1311,7 +1321,7 @@ class S3Path(URIPath):
             return path
         return protocol_prefix + path.lstrip('/')
 
-    @
+    @cached_property
     def path_without_protocol(self) -> str:
         '''Return path without protocol, example: if path is s3://bucket/key, return bucket/key'''
         path = self.path
@@ -1320,8 +1330,8 @@ class S3Path(URIPath):
             path = path[len(protocol_prefix):]
         return path
 
-    @
-    def parts(self) -> Tuple[str]:
+    @cached_property
+    def parts(self) -> Tuple[str, ...]:
         '''A tuple giving access to the path’s various components'''
         parts = [f"{self._protocol_with_profile}://"]
         path = self.path_without_protocol
@@ -1330,7 +1340,7 @@ class S3Path(URIPath):
         parts.extend(path.split('/'))
         return tuple(parts)
 
-    @
+    @cached_property
     def _client(self):
         return get_s3_client_with_cache(profile_name=self._profile_name)
 
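
Several S3Path properties above switch from @property to functools.cached_property, so the value is computed once per instance and then reused. A minimal stdlib sketch of that behaviour:

    from functools import cached_property

    class Demo:
        @cached_property
        def value(self):
            print('computed')  # runs only on first access
            return 42

    d = Demo()
    d.value  # prints 'computed'
    d.value  # served from the per-instance cache
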
@@ -1358,7 +1368,8 @@ class S3Path(URIPath):
         return {}
 
     def access(
-            self,
+            self,
+            mode: Access = Access.READ,
             followlinks: bool = False) -> bool:
         '''
         Test if path has access permission described by mode
@@ -1420,7 +1431,7 @@ class S3Path(URIPath):
 
         If the bucket of s3_url are not permitted to read, return False
 
-        :returns: True if s3_url
+        :returns: True if s3_url exists, else False
         '''
         bucket, key = parse_s3_url(self.path_with_protocol)
         if not bucket:  # s3:// => True, s3:///key => False
@@ -1454,11 +1465,11 @@ class S3Path(URIPath):
         return self.stat(follow_symlinks=follow_symlinks).size
 
     def glob(
-
-
-
-
-
+            self,
+            pattern,
+            recursive: bool = True,
+            missing_ok: bool = True,
+            followlinks: bool = False,
     ) -> List['S3Path']:
         '''Return s3 path list in ascending alphabetical order, in which path matches glob pattern
         Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
@@ -1493,7 +1504,7 @@ class S3Path(URIPath):
         '''
         glob_path = self._s3_path
         if pattern:
-            glob_path = self.joinpath(pattern)._s3_path
+            glob_path = self.joinpath(pattern)._s3_path
         s3_pathname = fspath(glob_path)
 
         def create_generator():
@@ -1517,11 +1528,11 @@ class S3Path(URIPath):
                 S3FileNotFoundError('No match any file: %r' % s3_pathname))
 
     def iglob(
-
-
-
-
-
+            self,
+            pattern,
+            recursive: bool = True,
+            missing_ok: bool = True,
+            followlinks: bool = False,
     ) -> Iterator['S3Path']:
         '''Return s3 path iterator in ascending alphabetical order, in which path matches glob pattern
         Notes: Only glob in bucket. If trying to match bucket with wildcard characters, raise UnsupportedError
@@ -1598,9 +1609,9 @@ class S3Path(URIPath):
 
     def listdir(self, followlinks: bool = False) -> List[str]:
         '''
-        Get all contents of given s3_url. The result is in
+        Get all contents of given s3_url. The result is in ascending alphabetical order.
 
-        :returns: All contents have prefix of s3_url in
+        :returns: All contents have prefix of s3_url in ascending alphabetical order
         :raises: S3FileNotFoundError, S3NotADirectoryError
         '''
         entries = list(self.scandir(followlinks=followlinks))
@@ -1608,13 +1619,13 @@ class S3Path(URIPath):
 
     def iterdir(self, followlinks: bool = False) -> Iterator['S3Path']:
         '''
-        Get all contents of given s3_url. The result is in
+        Get all contents of given s3_url. The result is in ascending alphabetical order.
 
-        :returns: All contents have prefix of s3_url in
+        :returns: All contents have prefix of s3_url in ascending alphabetical order
         :raises: S3FileNotFoundError, S3NotADirectoryError
         '''
         for path in self.listdir(followlinks=followlinks):
-            yield self.joinpath(path)
+            yield self.joinpath(path)
 
     def load(self, followlinks: bool = False) -> BinaryIO:
         '''Read all content in binary on specified path and write into memory
@@ -1645,7 +1656,7 @@ class S3Path(URIPath):
         '''
         Test if the bucket of s3_url exists
 
-        :returns: True if bucket of s3_url
+        :returns: True if bucket of s3_url exists, else False
         '''
         bucket, _ = parse_s3_url(self.path_with_protocol)
         if not bucket:
@@ -1797,7 +1808,8 @@ class S3Path(URIPath):
         self.remove(missing_ok=True)
         return self.from_path(dst_path)
 
-    def scan(self,
+    def scan(self,
+             missing_ok: bool = True,
              followlinks: bool = False) -> Iterator[str]:
         '''
         Iteratively traverse only files in given s3 directory, in alphabetical order.
@@ -1822,7 +1834,8 @@ class S3Path(URIPath):
 
         return create_generator()
 
-    def scan_stat(self,
+    def scan_stat(self,
+                  missing_ok: bool = True,
                   followlinks: bool = False) -> Iterator[FileEntry]:
         '''
         Iteratively traverse only files in given directory, in alphabetical order.
@@ -1952,7 +1965,7 @@ class S3Path(URIPath):
 
         return ContextIterator(create_generator())
 
-    def
+    def _get_dir_stat(self) -> StatResult:
         '''
         Return StatResult of given s3_url directory, including:
 
@@ -1998,7 +2011,7 @@ class S3Path(URIPath):
                 'Empty bucket name: %r' % self.path_with_protocol)
 
         if not self.is_file():
-            return self.
+            return self._get_dir_stat()
 
         client = self._client
         with raise_s3_error(self.path_with_protocol):
@@ -2040,8 +2053,10 @@ class S3Path(URIPath):
         with raise_s3_error(self.path_with_protocol):
             self._client.delete_object(Bucket=bucket, Key=key)
 
-    def walk(
-
+    def walk(
+            self,
+            followlinks: bool = False
+    ) -> Iterator[Tuple[str, List[str], List[str]]]:
         '''
         Iteratively traverse the given s3 directory, in top-bottom order. In other words, firstly traverse parent directory, if subdirectories exist, traverse the subdirectories in alphabetical order.
         Every iteration on generator yields a 3-tuple: (root, dirs, files)
@@ -2130,7 +2145,7 @@ class S3Path(URIPath):
             overwrite: bool = True) -> None:
         ''' File copy on S3
         Copy content of file on `src_path` to `dst_path`.
-        It's caller's
+        It's caller's responsibility to ensure the s3_isfile(src_url) == True
 
         :param dst_path: Target file path
         :param callback: Called periodically during copy, and the input parameter is the data size (in bytes) of copy since the last call
@@ -2186,7 +2201,7 @@ class S3Path(URIPath):
 
         :param dst_url: Given destination path
         :param followlinks: False if regard symlink as file, else True
-        :param force: Sync file
+        :param force: Sync file forcible, do not ignore same files, priority is higher than 'overwrite', default is False
         :param overwrite: whether or not overwrite file when exists, default is True
         '''
         for src_file_path, dst_file_path in _s3_scan_pairs(
@@ -2208,7 +2223,7 @@ class S3Path(URIPath):
         '''
         Create a symbolic link pointing to src_path named dst_path.
 
-        :param dst_path:
+        :param dst_path: Destination path
         :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError
         '''
         if len(fspath(self._s3_path).encode()) > 1024:
@@ -2292,9 +2307,9 @@ class S3Path(URIPath):
             *,
             encoding: Optional[str] = None,
             errors: Optional[str] = None,
-            s3_open_func: Callable
-            **kwargs) -> IO
-        return s3_open_func(
+            s3_open_func: Callable = s3_open,
+            **kwargs) -> IO:
+        return s3_open_func(
             self,
             mode,
             encoding=encoding,