PyS3Uploader 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of PyS3Uploader might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: PyS3Uploader
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: Python module to upload objects to an S3 bucket.
5
5
  Author-email: Vignesh Rao <svignesh1793@gmail.com>
6
6
  License: MIT License
@@ -115,6 +115,8 @@ if __name__ == '__main__':
115
115
  #### Optional kwargs
116
116
  - **s3_prefix** - S3 object prefix for each file. Defaults to ``None``
117
117
  - **exclude_path** - Path in ``upload_dir`` that has to be excluded from object keys. Defaults to ``None``
118
+ - **skip_dot_files** - Boolean flag to skip dot files. Defaults to ``True``
119
+ - **overwrite** - Boolean flag to overwrite files present in S3. Defaults to ``False``
118
120
  - **logger** - Bring your own custom pre-configured logger. Defaults to on-screen logging.
119
121
  <br><br>
120
122
  - **region_name** - AWS region name. Defaults to the env var `AWS_DEFAULT_REGION`
@@ -0,0 +1,11 @@
1
+ s3/__init__.py,sha256=IqcPR9iWMw0GDBEmKvLzW7P-AhInTkwRklkvYgiT1Xc,66
2
+ s3/exceptions.py,sha256=hH3jlMOe8yjBatQK9EdndWZz4QESU74KSY_iDhQ37SY,2585
3
+ s3/logger.py,sha256=oH540oq8jY723jA4lDWlgfFPLbNgGXTkDwFpB7TLO_o,1196
4
+ s3/tree.py,sha256=DiQ2ekMMaj2m_P3-iKkEqSuJCJZ_UZxcAwHtAoPVa5c,1824
5
+ s3/uploader.py,sha256=v6TGdm8EcAldoIw3GOAwkuoUzi9b9IRj8b94hl1Pkyw,11454
6
+ s3/utils.py,sha256=dd1OeLbswLzFVyjYiXixkJlFsoGWRtRCOHha6wLG5zQ,2485
7
+ pys3uploader-0.2.1.dist-info/LICENSE,sha256=8k-hEraOzyum0GvmmK65YxNRTFXK7eIFHJ0OshJXeTk,1068
8
+ pys3uploader-0.2.1.dist-info/METADATA,sha256=NA7x6YqpWEKvn3XEYzVug7XTl1vPcwiphUzWtMvlzHE,7449
9
+ pys3uploader-0.2.1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
10
+ pys3uploader-0.2.1.dist-info/top_level.txt,sha256=iQp4y1P58Q633gj8M08kHE4mqqT0hixuDWcniDk_RJ4,3
11
+ pys3uploader-0.2.1.dist-info/RECORD,,
s3/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  from s3.uploader import Uploader # noqa: F401
2
2
 
3
- version = "0.1.2"
3
+ version = "0.2.1"
s3/uploader.py CHANGED
@@ -11,7 +11,13 @@ from tqdm import tqdm
11
11
 
12
12
  from s3.exceptions import BucketNotFound
13
13
  from s3.logger import default_logger
14
- from s3.utils import UploadResults, convert_to_folder_structure, getenv, urljoin
14
+ from s3.utils import (
15
+ RETRY_CONFIG,
16
+ UploadResults,
17
+ convert_to_folder_structure,
18
+ getenv,
19
+ urljoin,
20
+ )
15
21
 
16
22
 
17
23
  class Uploader:
@@ -21,18 +27,19 @@ class Uploader:
21
27
 
22
28
  """
23
29
 
24
- RETRY_CONFIG: Config = Config(retries={"max_attempts": 10, "mode": "standard"})
25
-
26
30
  def __init__(
27
31
  self,
28
32
  bucket_name: str,
29
33
  upload_dir: str,
30
34
  s3_prefix: str = None,
31
35
  exclude_path: str = None,
36
+ skip_dot_files: bool = True,
37
+ overwrite: bool = False,
32
38
  region_name: str = None,
33
39
  profile_name: str = None,
34
40
  aws_access_key_id: str = None,
35
41
  aws_secret_access_key: str = None,
42
+ retry_config: Config = RETRY_CONFIG,
36
43
  logger: logging.Logger = None,
37
44
  ):
38
45
  """Initiates all the necessary args and creates a boto3 session with retry logic.
@@ -42,6 +49,8 @@ class Uploader:
42
49
  upload_dir: Full path of the directory to be uploaded.
43
50
  s3_prefix: Particular bucket prefix within which the upload should happen.
44
51
  exclude_path: Full directory path to exclude from S3 object prefix.
52
+ skip_dot_files: Boolean flag to skip dot files.
53
+ overwrite: Boolean flag to overwrite files in S3.
45
54
  region_name: Name of the AWS region.
46
55
  profile_name: AWS profile name.
47
56
  aws_access_key_id: AWS access key ID.
@@ -49,18 +58,18 @@ class Uploader:
49
58
  logger: Bring your own logger.
50
59
 
51
60
  See Also:
61
+ s3_prefix:
62
+ If provided, ``s3_prefix`` will always be attached to each object.
63
+
64
+ If ``s3_prefix`` is set to: ``2025``, then the file path
65
+ ``/home/ubuntu/Desktop/S3Upload/sub/photo.jpg`` will be uploaded as ``2025/S3Upload/sub/photo.jpg``
66
+
52
67
  exclude_path:
53
68
  When upload directory is "/home/ubuntu/Desktop/S3Upload", each file will naturally have the full prefix.
54
69
  However, this behavior can be avoided by specifying the ``exclude_path`` parameter.
55
70
 
56
71
  If exclude_path is set to: ``/home/ubuntu/Desktop``, then the file path
57
72
  ``/home/ubuntu/Desktop/S3Upload/sub-dir/photo.jpg`` will be uploaded as ``S3Upload/sub-dir/photo.jpg``
58
-
59
- s3_prefix:
60
- If provided, ``s3_prefix`` will always be attached to each object.
61
-
62
- If ``s3_prefix`` is set to: ``2025``, then the file path
63
- ``/home/ubuntu/Desktop/S3Upload/sub/photo.jpg`` will be uploaded as ``2025/S3Upload/sub/photo.jpg``
64
73
  """
65
74
  self.session = boto3.Session(
66
75
  profile_name=profile_name or getenv("PROFILE_NAME"),
@@ -68,17 +77,26 @@ class Uploader:
68
77
  aws_access_key_id=aws_access_key_id or getenv("AWS_ACCESS_KEY_ID"),
69
78
  aws_secret_access_key=aws_secret_access_key or getenv("AWS_SECRET_ACCESS_KEY"),
70
79
  )
71
- self.s3 = self.session.resource(service_name="s3", config=self.RETRY_CONFIG)
80
+ self.s3 = self.session.resource(service_name="s3", config=retry_config)
81
+
72
82
  self.logger = logger or default_logger()
83
+
84
+ self.bucket_name = bucket_name
73
85
  self.upload_dir = upload_dir or getenv("UPLOAD_DIR", "UPLOAD_SOURCE")
74
86
  self.s3_prefix = s3_prefix
75
87
  self.exclude_path = exclude_path
76
- self.bucket_name = bucket_name
77
- # noinspection PyUnresolvedReferences
78
- self.bucket: boto3.resources.factory.s3.Bucket = None
88
+ self.skip_dot_files = skip_dot_files
89
+ self.overwrite = overwrite
90
+
79
91
  self.results = UploadResults()
80
92
  self.start = time.time()
81
93
 
94
+ # noinspection PyUnresolvedReferences
95
+ self.bucket: boto3.resources.factory.s3.Bucket = None
96
+ # noinspection PyUnresolvedReferences
97
+ self.bucket_objects: boto3.resources.factory.s3.ObjectSummary = []
98
+ self.object_size_map: Dict[str, int] = {}
99
+
82
100
  def init(self) -> None:
83
101
  """Instantiates the bucket instance.
84
102
 
@@ -106,6 +124,9 @@ class Uploader:
106
124
  self.upload_dir = os.path.abspath(self.upload_dir)
107
125
  # noinspection PyUnresolvedReferences
108
126
  self.bucket: boto3.resources.factory.s3.Bucket = self.s3.Bucket(self.bucket_name)
127
+ # noinspection PyUnresolvedReferences
128
+ self.bucket_objects: boto3.resources.factory.s3.ObjectSummary = [obj for obj in self.bucket.objects.all()]
129
+ self.object_size_map = {obj.key: obj.size for obj in self.bucket_objects}
109
130
 
110
131
  def exit(self) -> None:
111
132
  """Exits after printing results, and run time."""
@@ -115,25 +136,57 @@ class Uploader:
115
136
  )
116
137
  self.logger.info("Run Time: %.2fs", time.time() - self.start)
117
138
 
118
- def _uploader(self, objectpath: str, filepath: str) -> None:
139
+ def _proceed_to_upload(self, filepath: str, objectpath: str) -> bool:
140
+ """Compares file size if the object already exists in S3.
141
+
142
+ Args:
143
+ filepath: Source filepath.
144
+ objectpath: S3 object path.
145
+
146
+ Returns:
147
+ bool:
148
+ Returns a boolean flag indicating whether the file should be uploaded.
149
+ """
150
+ if self.overwrite:
151
+ return True
152
+ # Indicates that the object path already exists in S3
153
+ if object_size := self.object_size_map.get(objectpath):
154
+ try:
155
+ file_size = os.path.getsize(filepath)
156
+ except (OSError, PermissionError) as error:
157
+ self.logger.error(error)
158
+ return True
159
+ if object_size == file_size:
160
+ self.logger.info("S3 object %s exists, and size [%d] matches, skipping..", objectpath, object_size)
161
+ return False
162
+ self.logger.info(
163
+ "S3 object %s exists, but size mismatch. Local: [%d], S3: [%d]", objectpath, file_size, object_size
164
+ )
165
+ return True
166
+
167
+ def _uploader(self, filepath: str, objectpath: str) -> None:
119
168
  """Uploads the filepath to the specified S3 bucket.
120
169
 
121
170
  Args:
122
- objectpath: Object path ref in S3.
123
171
  filepath: Filepath to upload.
172
+ objectpath: Object path ref in S3.
124
173
  """
125
- self.bucket.upload_file(filepath, objectpath)
174
+ if self._proceed_to_upload(filepath, objectpath):
175
+ self.bucket.upload_file(filepath, objectpath)
126
176
 
127
177
  def _get_files(self) -> Dict[str, str]:
128
178
  """Get a mapping for all the file path and object paths in upload directory.
129
179
 
130
180
  Returns:
131
181
  Dict[str, str]:
132
- Returns a dictionary object path and filepath.
182
+ Returns a key-value pair of filepath and objectpath.
133
183
  """
134
184
  files_to_upload = {}
135
185
  for __path, __directory, __files in os.walk(self.upload_dir):
136
186
  for file_ in __files:
187
+ if self.skip_dot_files and file_.startswith("."):
188
+ self.logger.info("Skipping dot file: %s", file_)
189
+ continue
137
190
  file_path = os.path.join(__path, file_)
138
191
  if self.exclude_path:
139
192
  relative_path = file_path.replace(self.exclude_path, "")
@@ -149,7 +202,7 @@ class Uploader:
149
202
  url_parts.extend(relative_path.split(os.sep))
150
203
  # Remove falsy values using filter - "None", "bool", "len" or "lambda item: item"
151
204
  object_path = urljoin(*filter(None, url_parts))
152
- files_to_upload[object_path] = file_path
205
+ files_to_upload[file_path] = object_path
153
206
  return files_to_upload
154
207
 
155
208
  def run(self) -> None:
@@ -163,7 +216,7 @@ class Uploader:
163
216
  keys.items(), total=len(keys), unit="file", leave=True, desc=f"Uploading files from {self.upload_dir}"
164
217
  ):
165
218
  try:
166
- self._uploader(objectpath=objectpath, filepath=filepath)
219
+ self._uploader(filepath=filepath, objectpath=objectpath)
167
220
  self.results.success += 1
168
221
  except ClientError as error:
169
222
  self.logger.error(error)
@@ -187,7 +240,10 @@ class Uploader:
187
240
  max_workers,
188
241
  )
189
242
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
190
- futures = [executor.submit(self._uploader, *kv) for kv in keys.items()]
243
+ futures = [
244
+ executor.submit(self._uploader, **dict(filepath=filepath, objectpath=objectpath))
245
+ for filepath, objectpath in keys.items()
246
+ ]
191
247
  for future in tqdm(
192
248
  iterable=as_completed(futures),
193
249
  total=len(futures),
@@ -212,7 +268,7 @@ class Uploader:
212
268
  """
213
269
  self.init()
214
270
  # Using list and set will yield the same results but using set we can isolate directories from files
215
- return convert_to_folder_structure(set([obj.key for obj in self.bucket.objects.all()]))
271
+ return convert_to_folder_structure(set(obj.key for obj in self.bucket_objects))
216
272
 
217
273
  def print_bucket_structure(self) -> None:
218
274
  """Prints all the objects in an S3 bucket with a folder like representation."""
s3/utils.py CHANGED
@@ -1,6 +1,8 @@
1
1
  import os
2
2
  from typing import Dict, Set
3
3
 
4
+ from botocore.config import Config
5
+
4
6
 
5
7
  class UploadResults(dict):
6
8
  """Object to store results of S3 upload.
@@ -13,6 +15,18 @@ class UploadResults(dict):
13
15
  failed: int = 0
14
16
 
15
17
 
18
+ RETRY_CONFIG: Config = Config(
19
+ retries={
20
+ "max_attempts": 10,
21
+ "mode": "adaptive", # Adaptive retry mode with jitter
22
+ "total_max_attempts": 20, # Max retries across all requests
23
+ },
24
+ # Adding custom timeouts here:
25
+ connect_timeout=5, # 5 seconds for establishing a connection
26
+ read_timeout=30, # 30 seconds to wait for a response from the server
27
+ )
28
+
29
+
16
30
  def getenv(*args, default: str = None) -> str:
17
31
  """Returns the key-ed environment variable or the default value."""
18
32
  for key in args:
@@ -1,11 +0,0 @@
1
- s3/__init__.py,sha256=qSltnC7r3AjwiYWzsD9JUs8SzeBEV16nrHldiWlrxtY,66
2
- s3/exceptions.py,sha256=hH3jlMOe8yjBatQK9EdndWZz4QESU74KSY_iDhQ37SY,2585
3
- s3/logger.py,sha256=oH540oq8jY723jA4lDWlgfFPLbNgGXTkDwFpB7TLO_o,1196
4
- s3/tree.py,sha256=DiQ2ekMMaj2m_P3-iKkEqSuJCJZ_UZxcAwHtAoPVa5c,1824
5
- s3/uploader.py,sha256=kkv7d2EaMH3OsoIJgTx7yRUd00s0n9PbRbjj6Rm7qdA,9355
6
- s3/utils.py,sha256=0kcG0aE2olHhC8thaUEwx2J8tOI2-2TGCk6E6U-PiKw,2058
7
- pys3uploader-0.1.2.dist-info/LICENSE,sha256=8k-hEraOzyum0GvmmK65YxNRTFXK7eIFHJ0OshJXeTk,1068
8
- pys3uploader-0.1.2.dist-info/METADATA,sha256=GtQq-ZDiZEMpl2CEs4VJw4AQ8tf5rzcfgjDu68oHX6c,7286
9
- pys3uploader-0.1.2.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
10
- pys3uploader-0.1.2.dist-info/top_level.txt,sha256=iQp4y1P58Q633gj8M08kHE4mqqT0hixuDWcniDk_RJ4,3
11
- pys3uploader-0.1.2.dist-info/RECORD,,