megfile 3.1.6__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective registries. It is provided for informational purposes only.
Files changed (43)
  1. megfile/cli.py +12 -7
  2. megfile/config.py +34 -44
  3. megfile/fs.py +169 -11
  4. megfile/fs_path.py +183 -259
  5. megfile/hdfs.py +106 -5
  6. megfile/hdfs_path.py +34 -90
  7. megfile/http.py +50 -1
  8. megfile/http_path.py +27 -65
  9. megfile/interfaces.py +1 -8
  10. megfile/lib/base_prefetch_reader.py +62 -78
  11. megfile/lib/combine_reader.py +5 -0
  12. megfile/lib/glob.py +3 -6
  13. megfile/lib/hdfs_prefetch_reader.py +7 -7
  14. megfile/lib/http_prefetch_reader.py +6 -6
  15. megfile/lib/s3_buffered_writer.py +67 -64
  16. megfile/lib/s3_cached_handler.py +1 -2
  17. megfile/lib/s3_limited_seekable_writer.py +3 -7
  18. megfile/lib/s3_memory_handler.py +1 -2
  19. megfile/lib/s3_pipe_handler.py +1 -2
  20. megfile/lib/s3_prefetch_reader.py +15 -20
  21. megfile/lib/s3_share_cache_reader.py +8 -5
  22. megfile/pathlike.py +397 -401
  23. megfile/s3.py +118 -17
  24. megfile/s3_path.py +150 -224
  25. megfile/sftp.py +300 -10
  26. megfile/sftp_path.py +46 -322
  27. megfile/smart.py +33 -27
  28. megfile/smart_path.py +9 -14
  29. megfile/stdio.py +1 -1
  30. megfile/stdio_path.py +2 -2
  31. megfile/utils/__init__.py +11 -4
  32. megfile/version.py +1 -1
  33. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/METADATA +7 -7
  34. megfile-4.0.0.dist-info/RECORD +52 -0
  35. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/WHEEL +1 -1
  36. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/top_level.txt +0 -2
  37. docs/conf.py +0 -65
  38. megfile-3.1.6.dist-info/RECORD +0 -55
  39. scripts/convert_results_to_sarif.py +0 -91
  40. scripts/generate_file.py +0 -344
  41. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE +0 -0
  42. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/LICENSE.pyre +0 -0
  43. {megfile-3.1.6.dist-info → megfile-4.0.0.dist-info}/entry_points.txt +0 -0
megfile/s3_path.py CHANGED
@@ -7,18 +7,23 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property, lru_cache, wraps
 from logging import getLogger as get_logger
 from typing import IO, Any, BinaryIO, Callable, Dict, Iterator, List, Optional, Tuple
+from urllib.parse import urlparse
 
 import boto3
 import botocore
-from botocore.awsrequest import AWSResponse
+from boto3.s3.transfer import TransferConfig
+from botocore.awsrequest import AWSPreparedRequest, AWSResponse
 
 from megfile.config import (
-    DEFAULT_BLOCK_SIZE,
-    DEFAULT_MAX_BLOCK_SIZE,
-    DEFAULT_MIN_BLOCK_SIZE,
     GLOBAL_MAX_WORKERS,
+    HTTP_AUTH_HEADERS,
+    READER_BLOCK_SIZE,
+    READER_MAX_BUFFER_SIZE,
     S3_CLIENT_CACHE_MODE,
     S3_MAX_RETRY_TIMES,
+    WRITER_BLOCK_SIZE,
+    WRITER_MAX_BUFFER_SIZE,
+    to_boolean,
 )
 from megfile.errors import (
     S3BucketNotFoundError,
@@ -59,7 +64,6 @@ from megfile.lib.fnmatch import translate
 from megfile.lib.glob import has_magic, has_magic_ignore_brace, ungloblize
 from megfile.lib.joinpath import uri_join
 from megfile.lib.s3_buffered_writer import (
-    DEFAULT_MAX_BUFFER_SIZE,
     S3BufferedWriter,
 )
 from megfile.lib.s3_cached_handler import S3CachedHandler
@@ -76,6 +80,7 @@ from megfile.utils import (
     generate_cache_path,
     get_binary_mode,
     get_content_offset,
+    is_domain_or_subdomain,
     is_readable,
     necessary_params,
     process_local,
@@ -98,26 +103,14 @@ __all__ = [
     "s3_share_cache_open",
     "s3_open",
     "S3Cacher",
-    "S3BufferedWriter",
-    "S3LimitedSeekableWriter",
-    "S3PrefetchReader",
-    "S3ShareCacheReader",
     "s3_upload",
     "s3_download",
     "s3_load_content",
-    "s3_readlink",
-    "s3_glob",
-    "s3_glob_stat",
-    "s3_iglob",
-    "s3_rename",
-    "s3_makedirs",
     "s3_concat",
-    "s3_lstat",
 ]
 _logger = get_logger(__name__)
 content_md5_header = "megfile-content-md5"
 endpoint_url = "https://s3.amazonaws.com"
-max_pool_connections = GLOBAL_MAX_WORKERS  # for compatibility
 max_retries = S3_MAX_RETRY_TIMES
 max_keys = 1000
 
@@ -162,24 +155,30 @@ def _patch_make_request(client: botocore.client.BaseClient, redirect: bool = Fal
         retry_callback=retry_callback,
     )
 
-    def patch_send_request(send_request):
-        def patched_send_request(request_dict, operation_model):
-            http, parsed_response = send_request(request_dict, operation_model)
+    def patch_send(send):
+        def patched_send(request: AWSPreparedRequest) -> AWSResponse:
+            response: AWSResponse = send(request)
             if (
-                request_dict["method"] == "GET"  # only support GET method for now
-                and http.status_code in (301, 302, 307, 308)
-                and "Location" in http.headers
+                request.method == "GET"  # only support GET method for now
+                and response.status_code in (301, 302, 307, 308)
+                and "Location" in response.headers
             ):
-                request_dict["url"] = http.headers["Location"]
-                http, parsed_response = send_request(request_dict, operation_model)
-            return http, parsed_response
-
-        return patched_send_request
+                # Permit sending auth/cookie headers from "foo.com" to "sub.foo.com".
+                # See also: https://go.dev/src/net/http/client.go#L980
+                location = response.headers["Location"]
+                ihost = urlparse(request.url).hostname
+                dhost = urlparse(location).hostname
+                if not is_domain_or_subdomain(dhost, ihost):
+                    for name in HTTP_AUTH_HEADERS:
+                        request.headers.pop(name, None)
+                request.url = location
+                response = send(request)
+            return response
+
+        return patched_send
 
     if redirect:
-        client._endpoint._send_request = patch_send_request(
-            client._endpoint._send_request
-        )
+        client._endpoint._send = patch_send(client._endpoint._send)
 
     return client
 
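Note on the redirect handling above: the patched `_send` hook now follows GET redirects itself, and it only keeps the headers listed in `HTTP_AUTH_HEADERS` when the redirect target is the same host or a subdomain of the original host. The real helper `is_domain_or_subdomain` comes from `megfile.utils` (also changed in this release) and is not shown here; the snippet below is only a minimal sketch of that kind of same-site check, mirroring the Go `net/http` rule referenced in the code comment, with assumed parameter names.

    from typing import Optional

    def is_domain_or_subdomain(sub: Optional[str], parent: Optional[str]) -> bool:
        # Sketch of the same-site rule: "sub.foo.com" may receive auth headers
        # issued for "foo.com", but an unrelated host such as "evilfoo.com" may not.
        if not sub or not parent:
            return False
        if sub == parent:
            return True
        return sub.endswith("." + parent)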
@@ -257,7 +256,7 @@ def get_env_var(env_name: str, profile_name=None):
 def parse_boolean(value: Optional[str], default: bool = False) -> bool:
     if value is None:
         return default
-    return value.lower() in ("true", "yes", "1")
+    return to_boolean(value)
 
 
 def get_access_token(profile_name=None):
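`parse_boolean` now delegates to `to_boolean` from `megfile.config`, whose body is not shown in this diff. A purely hypothetical stand-in, generalizing the expression it replaces, might look like the following; the real implementation in megfile/config.py may accept a different set of values.

    def to_boolean(value: str) -> bool:
        # Hypothetical sketch only; not megfile's actual to_boolean.
        return value.strip().lower() in ("true", "yes", "on", "1")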
@@ -629,7 +628,7 @@ def _s3_scan_pairs(
     src_url: PathLike, dst_url: PathLike
 ) -> Iterator[Tuple[PathLike, PathLike]]:
     for src_file_path in S3Path(src_url).scan():
-        content_path = src_file_path[len(src_url) :]
+        content_path = src_file_path[len(fspath(src_url)) :]
         if len(content_path) > 0:
             dst_file_path = s3_path_join(dst_url, content_path)
         else:
@@ -689,8 +688,8 @@ def s3_prefetch_open(
     mode: str = "rb",
     followlinks: bool = False,
     *,
-    max_concurrency: Optional[int] = None,
-    max_block_size: int = DEFAULT_BLOCK_SIZE,
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3PrefetchReader:
     """Open a asynchronous prefetch reader, to support fast sequential
     read and random read
@@ -701,11 +700,18 @@ def s3_prefetch_open(
 
     Supports context manager
 
-    Some parameter setting may perform well: max_concurrency=10 or 20,
-    max_block_size=8 or 16 MB, default value None means using global thread pool
-
-    :param max_concurrency: Max download thread number, None by default
-    :param max_block_size: Max data size downloaded by each thread, in bytes,
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, `None` by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3PrefetchReader object
     :raises: S3FileNotFoundError
@@ -721,15 +727,15 @@ def s3_prefetch_open(
             pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PrefetchReader(
         bucket,
         key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=max_concurrency,
-        block_size=max_block_size,
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )
 
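The hunks above rename `max_concurrency` / `max_block_size` to `max_workers` / `block_size`. A minimal usage sketch of the new keyword arguments follows; the bucket, key, and sizes are placeholders, and passing a plain string path is assumed to keep working as in earlier releases.

    from megfile.s3_path import s3_prefetch_open

    # 10 workers and 16 MB blocks, following the docstring's suggested range.
    with s3_prefetch_open(
        "s3://example-bucket/data.bin", max_workers=10, block_size=16 * 1024 * 1024
    ) as reader:
        head = reader.read(1024)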
@@ -741,8 +747,8 @@ def s3_share_cache_open(
     followlinks: bool = False,
     *,
     cache_key: str = "lru",
-    max_concurrency: Optional[int] = None,
-    max_block_size: int = DEFAULT_BLOCK_SIZE,
+    max_workers: Optional[int] = None,
+    block_size: int = READER_BLOCK_SIZE,
 ) -> S3ShareCacheReader:
     """Open a asynchronous prefetch reader, to support fast sequential read and
     random read
@@ -753,11 +759,18 @@ def s3_share_cache_open(
 
     Supports context manager
 
-    Some parameter setting may perform well: max_concurrency=10 or 20,
-    max_block_size=8 or 16 MB, default value None means using global thread pool
-
-    :param max_concurrency: Max download thread number, None by default
-    :param max_block_size: Max data size downloaded by each thread, in bytes,
+    Some parameter setting may perform well: max_workers=10 or 20,
+    block_size=8 or 16 MB, default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: only support "r" or "rb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download thread number, None by default
+    :param block_size: Max data size downloaded by each thread, in bytes,
         8MB by default
     :returns: An opened S3ShareCacheReader object
     :raises: S3FileNotFoundError
@@ -774,7 +787,7 @@ def s3_share_cache_open(
             pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3ShareCacheReader(
         bucket,
@@ -782,8 +795,8 @@ def s3_share_cache_open(
         cache_key=cache_key,
         s3_client=client,
         max_retries=max_retries,
-        max_workers=max_concurrency,
-        block_size=max_block_size,
+        max_workers=max_workers,
+        block_size=block_size,
         profile_name=s3_url._profile_name,
     )
 
@@ -810,7 +823,13 @@ def s3_pipe_open(
     But asynchronous behavior can guarantee the file are successfully written,
     and frequent execution may cause thread and file handle exhaustion
 
-    :param mode: Mode to open file, either "rb" or "wb"
+    :param s3_url: s3 path
+    :param mode: Mode to open file, either "r", "rb", "w" or "wb"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param join_thread: If wait after function execution until s3 finishes writing
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -829,7 +848,7 @@ def s3_pipe_open(
             pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3PipeHandler(
         bucket,
@@ -860,7 +879,14 @@ def s3_cached_open(
     cache_path can specify the path of cache file. Performance could be better
     if cache file path is on ssd or tmpfs
 
-    :param mode: Mode to open file, could be one of "rb", "wb" or "ab"
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
     :param cache_path: cache file path
     :returns: An opened BufferedReader / BufferedWriter object
     """
@@ -875,7 +901,7 @@ def s3_cached_open(
             pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3CachedHandler(
         bucket,
@@ -893,16 +919,14 @@ def s3_buffered_open(
     mode: str,
     followlinks: bool = False,
     *,
-    max_concurrency: Optional[int] = None,
-    max_buffer_size: int = DEFAULT_MAX_BUFFER_SIZE,
-    forward_ratio: Optional[float] = None,
+    max_workers: Optional[int] = None,
+    max_buffer_size: Optional[int] = None,
+    block_forward: Optional[int] = None,
     block_size: Optional[int] = None,
     limited_seekable: bool = False,
     buffered: bool = False,
     share_cache_key: Optional[str] = None,
     cache_path: Optional[str] = None,
-    min_block_size: Optional[int] = None,
-    max_block_size: int = DEFAULT_MAX_BLOCK_SIZE,
 ) -> IO:
     """Open an asynchronous prefetch reader, to support fast sequential read
 
@@ -912,22 +936,30 @@ def s3_buffered_open(
 
     Supports context manager
 
-    Some parameter setting may perform well: max_concurrency=10 or 20,
-    max_block_size=8 or 16 MB, default value None means using global thread pool
-
-    :param max_concurrency: Max download thread number, None by default
-    :param max_buffer_size: Max cached buffer size in memory, 128MB by default
-    :param min_block_size: Min size of single block, default is same as block_size.
-        Each block will be downloaded by single thread.
-    :param max_block_size: Max size of single block, 128MB by default.
-        Each block will be downloaded by single thread.
-    :param block_size: Size of single block, 8MB by default.
+    Some parameter setting may perform well: max_workers=10 or 20,
+    default value None means using global thread pool
+
+    :param s3_url: s3 path
+    :param mode: Mode to open file, could be one of "rb", "wb", "ab", "rb+", "wb+"
+        or "ab+"
+    :param encoding: encoding is the name of the encoding used to decode or encode
+        the file. This should only be used in text mode.
+    :param errors: errors is an optional string that specifies how encoding and
+        decoding errors are to be handled—this cannot be used in binary mode.
+    :param followlinks: follow symbolic link, default `False`
+    :param max_workers: Max download / upload thread number, `None` by default,
+        will use global thread pool with 8 threads.
+    :param max_buffer_size: Max cached buffer size in memory, 128MB by default.
+        Set to `0` will disable cache.
+    :param block_forward: How many blocks of data cached from offset position, only for
+        read mode.
+    :param block_size: Size of single block.
         Each block will be uploaded by single thread.
     :param limited_seekable: If write-handle supports limited seek
         (both file head part and tail part can seek block_size).
         Notes: This parameter are valid only for write-handle.
         Read-handle support arbitrary seek
-    :returns: An opened S3PrefetchReader object
+    :returns: An opened File object
     :raises: S3FileNotFoundError
     """
     if mode not in ("rb", "wb", "ab", "rb+", "wb+", "ab+"):
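`s3_buffered_open` now takes `max_workers`, an optional `max_buffer_size`, and an explicit `block_forward`, replacing the old `max_concurrency` / `forward_ratio` / `min_block_size` / `max_block_size` conversion shown further down. A sketch of a read-mode call with the new keywords; the path and sizes are illustrative only and every keyword may be left at its default.

    from megfile.s3_path import s3_buffered_open

    with s3_buffered_open(
        "s3://example-bucket/logs.bin",
        "rb",
        max_workers=10,                     # None falls back to the global thread pool
        block_size=8 * 1024 * 1024,         # READER_BLOCK_SIZE is used when None
        max_buffer_size=128 * 1024 * 1024,  # 0 disables the cache
        block_forward=4,                    # read mode only: blocks prefetched past the offset
    ) as f:
        chunk = f.read(4096)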
@@ -939,11 +971,8 @@ def s3_buffered_open(
             s3_url = s3_url.readlink()
         except S3NotALinkError:
             pass
-    min_block_size = min_block_size or block_size or DEFAULT_MIN_BLOCK_SIZE
-    block_size = block_size or DEFAULT_BLOCK_SIZE
-
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
 
     if "a" in mode or "+" in mode:
@@ -961,13 +990,6 @@ def s3_buffered_open(
         )
 
     if mode == "rb":
-        # A rough conversion algorithm to align 2 types of Reader / Writer parameters
-        # TODO: Optimize the conversion algorithm
-        block_capacity = max_buffer_size // block_size
-        if forward_ratio is None:
-            block_forward = None
-        else:
-            block_forward = max(int(block_capacity * forward_ratio), 1)
         if share_cache_key is not None:
             reader = S3ShareCacheReader(
                 bucket,
@@ -975,8 +997,8 @@ def s3_buffered_open(
                 cache_key=share_cache_key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=max_concurrency,
-                block_size=block_size,
+                max_workers=max_workers,
+                block_size=block_size or READER_BLOCK_SIZE,
                 block_forward=block_forward,
                 profile_name=s3_url._profile_name,
             )
@@ -986,10 +1008,10 @@ def s3_buffered_open(
                 key,
                 s3_client=client,
                 max_retries=max_retries,
-                max_workers=max_concurrency,
-                block_capacity=block_capacity,
+                max_workers=max_workers,
+                max_buffer_size=max_buffer_size or READER_MAX_BUFFER_SIZE,
                 block_forward=block_forward,
-                block_size=block_size,
+                block_size=block_size or READER_BLOCK_SIZE,
                 profile_name=s3_url._profile_name,
             )
         if buffered or _is_pickle(reader):
@@ -1001,10 +1023,9 @@ def s3_buffered_open(
             bucket,
             key,
             s3_client=client,
-            max_workers=max_concurrency,
-            block_size=min_block_size,
-            max_block_size=max_block_size,
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
             profile_name=s3_url._profile_name,
         )
     else:
@@ -1012,10 +1033,9 @@ def s3_buffered_open(
             bucket,
             key,
             s3_client=client,
-            max_workers=max_concurrency,
-            block_size=min_block_size,
-            max_block_size=max_block_size,
-            max_buffer_size=max_buffer_size,
+            max_workers=max_workers,
+            block_size=block_size or WRITER_BLOCK_SIZE,
+            max_buffer_size=max_buffer_size or WRITER_MAX_BUFFER_SIZE,
            profile_name=s3_url._profile_name,
         )
     if buffered or _is_pickle(writer):
@@ -1050,7 +1070,7 @@ def s3_memory_open(
             pass
 
     bucket, key = parse_s3_url(s3_url.path_with_protocol)
-    config = botocore.config.Config(max_pool_connections=max_pool_connections)
+    config = botocore.config.Config(max_pool_connections=GLOBAL_MAX_WORKERS)
     client = get_s3_client_with_cache(config=config, profile_name=s3_url._profile_name)
     return S3MemoryHandler(
         bucket, key, mode, s3_client=client, profile_name=s3_url._profile_name
@@ -1109,7 +1129,7 @@ def s3_download(
     if not src_url.is_file():
         raise S3IsADirectoryError("Is a directory: %r" % src_url.path_with_protocol)
 
-    dst_directory = os.path.dirname(dst_path.path_without_protocol)
+    dst_directory = os.path.dirname(dst_path.path_without_protocol)  # pyre-ignore[6]
     if dst_directory != "":
         os.makedirs(dst_directory, exist_ok=True)
 
@@ -1117,9 +1137,21 @@ def s3_download(
     download_file = patch_method(
         client.download_file, max_retries=max_retries, should_retry=s3_should_retry
     )
+
+    transfer_config = TransferConfig(
+        multipart_threshold=READER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=READER_BLOCK_SIZE,
+        num_download_attempts=S3_MAX_RETRY_TIMES,
+        max_io_queue=max(READER_MAX_BUFFER_SIZE // READER_BLOCK_SIZE, 1),
+    )
     try:
         download_file(
-            src_bucket, src_key, dst_path.path_without_protocol, Callback=callback
+            src_bucket,
+            src_key,
+            dst_path.path_without_protocol,
+            Callback=callback,
+            Config=transfer_config,
         )
     except Exception as error:
         error = translate_fs_error(error, dst_url)
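The download path now passes an explicit boto3 `TransferConfig` derived from the reader settings, so `max_io_queue` scales with the buffer: with the 8 MB block size and 128 MB buffer quoted in the docstrings above, it works out to 128 // 8 = 16. A standalone sketch with those assumed values follows; the real constants live in megfile/config.py and may be overridden by configuration.

    from boto3.s3.transfer import TransferConfig

    READER_BLOCK_SIZE = 8 * 1024 * 1024          # assumed default
    READER_MAX_BUFFER_SIZE = 128 * 1024 * 1024   # assumed default
    GLOBAL_MAX_WORKERS = 8                       # assumed default
    S3_MAX_RETRY_TIMES = 10                      # assumed stand-in value

    transfer_config = TransferConfig(
        multipart_threshold=READER_BLOCK_SIZE,
        max_concurrency=GLOBAL_MAX_WORKERS,
        multipart_chunksize=READER_BLOCK_SIZE,
        num_download_attempts=S3_MAX_RETRY_TIMES,
        max_io_queue=max(READER_MAX_BUFFER_SIZE // READER_BLOCK_SIZE, 1),  # -> 16
    )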
@@ -1170,8 +1202,19 @@ def s3_upload(
         client.upload_fileobj, max_retries=max_retries, should_retry=s3_should_retry
     )
 
+    transfer_config = TransferConfig(
+        multipart_threshold=WRITER_BLOCK_SIZE,
+        max_concurrency=GLOBAL_MAX_WORKERS,
+        multipart_chunksize=WRITER_BLOCK_SIZE,
+    )
     with open(src_path.path_without_protocol, "rb") as src, raise_s3_error(dst_url):
-        upload_fileobj(src, Bucket=dst_bucket, Key=dst_key, Callback=callback)
+        upload_fileobj(
+            src,
+            Bucket=dst_bucket,
+            Key=dst_key,
+            Callback=callback,
+            Config=transfer_config,
+        )
 
 
 def s3_load_content(
@@ -1217,27 +1260,6 @@ def s3_load_content(
     )(client, bucket, key, range_str)
 
 
-def s3_readlink(path) -> str:
-    """
-    Return a string representing the path to which the symbolic link points.
-
-    :returns: Return a string representing the path to which the symbolic link points.
-    :raises: S3NameTooLongError, S3BucketNotFoundError, S3IsADirectoryError,
-        S3NotALinkError
-    """
-    return S3Path(path).readlink().path_with_protocol
-
-
-def s3_rename(src_url: PathLike, dst_url: PathLike, overwrite: bool = True) -> None:
-    """
-    Move s3 file path from src_url to dst_url
-
-    :param dst_url: Given destination path
-    :param overwrite: whether or not overwrite file when exists
-    """
-    S3Path(src_url).rename(dst_url, overwrite)
-
-
 class S3Cacher(FileCacher):
     cache_path = None
 
@@ -1259,97 +1281,8 @@ class S3Cacher(FileCacher):
             os.unlink(self.cache_path)
 
 
-def s3_glob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> List[str]:
-    """Return s3 path list in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A list contains paths match `s3_pathname`
-    """
-    return list(
-        s3_iglob(
-            path=path,
-            recursive=recursive,
-            missing_ok=missing_ok,
-            followlinks=followlinks,
-        )
-    )
-
-
-def s3_glob_stat(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[FileEntry]:
-    """Return a generator contains tuples of path and file stat,
-    in ascending alphabetical order, in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: A generator contains tuples of path and file stat,
-        in which paths match `s3_pathname`
-    """
-    return S3Path(path).glob_stat(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    )
-
-
-def s3_iglob(
-    path: PathLike,
-    recursive: bool = True,
-    missing_ok: bool = True,
-    followlinks: bool = False,
-) -> Iterator[str]:
-    """Return s3 path iterator in ascending alphabetical order,
-    in which path matches glob pattern
-
-    Notes: Only glob in bucket. If trying to match bucket with wildcard characters,
-    raise UnsupportedError
-
-    :param recursive: If False, `**` will not search directory recursively
-    :param missing_ok: If False and target path doesn't match any file,
-        raise FileNotFoundError
-    :raises: UnsupportedError, when bucket part contains wildcard characters
-    :returns: An iterator contains paths match `s3_pathname`
-    """
-    for path_obj in S3Path(path).iglob(
-        pattern="", recursive=recursive, missing_ok=missing_ok, followlinks=followlinks
-    ):
-        yield path_obj.path_with_protocol
-
-
-def s3_makedirs(path: PathLike, exist_ok: bool = False):
-    """
-    Create an s3 directory.
-    Purely creating directory is invalid because it's unavailable on OSS.
-    This function is to test the target bucket have WRITE access.
-
-    :param path: Given path
-    :param exist_ok: If False and target directory exists, raise S3FileExistsError
-    :raises: S3BucketNotFoundError, S3FileExistsError
-    """
-    return S3Path(path).mkdir(parents=True, exist_ok=exist_ok)
-
-
 def _group_src_paths_by_block(
-    src_paths: List[PathLike], block_size: int = DEFAULT_BLOCK_SIZE
+    src_paths: List[PathLike], block_size: int = READER_BLOCK_SIZE
 ) -> List[List[Tuple[PathLike, Optional[str]]]]:
     groups = []
     current_group, current_group_size = [], 0
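The module-level wrappers deleted above (s3_readlink, s3_rename, s3_glob, s3_glob_stat, s3_iglob, s3_makedirs) were thin shims over S3Path methods, as their removed bodies show. Whether equivalents are still re-exported elsewhere (for example from megfile/s3.py, which also changes in this release) is not visible in this file; the calls below simply restate the deleted bodies against S3Path, with placeholder paths.

    from megfile.s3_path import S3Path

    glob_path = S3Path("s3://example-bucket/dir/*.txt")                          # placeholder pattern
    paths = [p.path_with_protocol for p in glob_path.iglob(pattern="")]          # was s3_iglob / s3_glob
    entries = glob_path.glob_stat(pattern="")                                    # was s3_glob_stat
    S3Path("s3://example-bucket/new-dir/").mkdir(parents=True, exist_ok=True)    # was s3_makedirs
    link = S3Path("s3://example-bucket/link").readlink().path_with_protocol      # was s3_readlink
    S3Path("s3://example-bucket/src").rename("s3://example-bucket/dst", True)    # was s3_rename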
@@ -1395,7 +1328,7 @@ def _group_src_paths_by_block(
 def s3_concat(
     src_paths: List[PathLike],
     dst_path: PathLike,
-    block_size: int = DEFAULT_BLOCK_SIZE,
+    block_size: int = READER_BLOCK_SIZE,
     max_workers: int = GLOBAL_MAX_WORKERS,
 ) -> None:
     """Concatenate s3 files to one file.
@@ -1410,9 +1343,10 @@ def s3_concat(
     else:
         groups = _group_src_paths_by_block(src_paths, block_size=block_size)
 
-    with MultiPartWriter(client, dst_path) as writer, ThreadPoolExecutor(
-        max_workers=max_workers
-    ) as executor:
+    with (
+        MultiPartWriter(client, dst_path) as writer,
+        ThreadPoolExecutor(max_workers=max_workers) as executor,
+    ):
         for index, group in enumerate(groups, start=1):
             if len(group) == 1:
                 executor.submit(
@@ -1422,14 +1356,6 @@ def s3_concat(
                 executor.submit(writer.upload_part_by_paths, index, group)
 
 
-def s3_lstat(path: PathLike) -> StatResult:
-    """
-    Like Path.stat() but, if the path points to a symbolic link,
-    return the symbolic link’s information rather than its target’s.
-    """
-    return S3Path(path).lstat()
-
-
 @SmartPath.register
 class S3Path(URIPath):
     protocol = "s3"