PyS3Uploader 0.2.0__py3-none-any.whl → 0.4.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of PyS3Uploader might be problematic.
- pys3uploader/__init__.py +2 -0
- pys3uploader/logger.py +104 -0
- pys3uploader/metadata.py +11 -0
- pys3uploader/progress.py +39 -0
- pys3uploader/timer.py +54 -0
- pys3uploader/uploader.py +432 -0
- pys3uploader/utils.py +194 -0
- pys3uploader/version.py +1 -0
- {pys3uploader-0.2.0.dist-info → pys3uploader-0.4.0a1.dist-info}/METADATA +55 -10
- pys3uploader-0.4.0a1.dist-info/RECORD +15 -0
- pys3uploader-0.4.0a1.dist-info/top_level.txt +1 -0
- pys3uploader-0.2.0.dist-info/RECORD +0 -11
- pys3uploader-0.2.0.dist-info/top_level.txt +0 -1
- s3/__init__.py +0 -3
- s3/logger.py +0 -45
- s3/uploader.py +0 -264
- s3/utils.py +0 -70
- {s3 → pys3uploader}/exceptions.py +0 -0
- {s3 → pys3uploader}/tree.py +0 -0
- {pys3uploader-0.2.0.dist-info → pys3uploader-0.4.0a1.dist-info}/LICENSE +0 -0
- {pys3uploader-0.2.0.dist-info → pys3uploader-0.4.0a1.dist-info}/WHEEL +0 -0
pys3uploader/__init__.py
ADDED
pys3uploader/logger.py
ADDED
@@ -0,0 +1,104 @@
+"""Loads a default logger with StreamHandler set to DEBUG mode.
+
+>>> logging.Logger
+
+"""
+
+import logging
+import os
+from datetime import datetime
+from enum import IntEnum, StrEnum
+
+
+class LogHandler(StrEnum):
+    """Logging handlers to choose from when default logger is used.
+
+    >>> LogHandler
+
+    """
+
+    file = "file"
+    stdout = "stdout"
+
+
+class LogLevel(IntEnum):
+    """Logging levels to choose from when default logger is used.
+
+    >>> LogLevel
+
+    """
+
+    debug = logging.DEBUG
+    info = logging.INFO
+    warning = logging.WARNING
+    error = logging.ERROR
+
+    @classmethod
+    def _missing_(cls, value):
+        """Allow constructing from string names."""
+        if isinstance(value, str):
+            value = value.lower()
+        for member in cls:
+            if member.name == value:
+                return member
+        return None
+
+
+def stream_handler() -> logging.StreamHandler:
+    """Creates a ``StreamHandler`` and assigns a default format to it.
+
+    Returns:
+        logging.StreamHandler:
+        Returns an instance of the ``StreamHandler`` object.
+    """
+    handler = logging.StreamHandler()
+    handler.setFormatter(fmt=default_format())
+    return handler
+
+
+def file_handler() -> logging.FileHandler:
+    """Creates a ``FileHandler`` and assigns a default format to it.
+
+    Returns:
+        logging.FileHandler:
+        Returns an instance of the ``FileHandler`` object.
+    """
+    os.makedirs("logs", exist_ok=True)
+    filename = os.path.join("logs", datetime.now().strftime("PyS3Uploader_%d-%m-%Y_%H:%M.log"))
+    handler = logging.FileHandler(filename, mode="a")
+    handler.setFormatter(fmt=default_format())
+    return handler
+
+
+def default_format() -> logging.Formatter:
+    """Creates a logging ``Formatter`` with a custom message and datetime format.
+
+    Returns:
+        logging.Formatter:
+        Returns an instance of the ``Formatter`` object.
+    """
+    return logging.Formatter(
+        fmt="%(asctime)s - %(levelname)s - [%(module)s:%(lineno)d] - %(funcName)s - %(message)s",
+        datefmt="%b-%d-%Y %I:%M:%S %p",
+    )
+
+
+def setup_logger(handler: LogHandler, level: LogLevel) -> logging.Logger:
+    """Creates a default logger with debug mode enabled.
+
+    Args:
+        handler: Logging handler to use.
+        level: Logging level to use.
+
+    Returns:
+        logging.Logger:
+        Returns an instance of the ``Logger`` object.
+    """
+    logger = logging.getLogger(__name__)
+    if handler == LogHandler.file:
+        logger.addHandler(hdlr=file_handler())
+    elif handler == LogHandler.stdout:
+        logger.addHandler(hdlr=stream_handler())
+
+    logger.setLevel(level)
+    return logger
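For orientation only (not part of the diff), here is a minimal sketch of how the new logger module might be used on its own; the import path assumes the 0.4.0a1 package layout listed above.

# Hypothetical usage sketch, not part of the package diff.
from pys3uploader.logger import LogHandler, LogLevel, setup_logger

# Build a logger that writes to stdout at INFO level.
logger = setup_logger(handler=LogHandler.stdout, level=LogLevel.info)
logger.info("logger configured via pys3uploader.logger")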
pys3uploader/metadata.py
ADDED
pys3uploader/progress.py
ADDED
@@ -0,0 +1,39 @@
+import threading
+
+from alive_progress import alive_bar
+
+
+class ProgressPercentage:
+    """Tracks progress of a file upload to S3 and updates the alive_bar.
+
+    >>> ProgressPercentage
+
+    """
+
+    def __init__(self, filename: str, size: int, bar: alive_bar):
+        """Initializes the progress tracker.
+
+        Args:
+            filename: Name of the file being uploaded.
+            size: Total size of the file in bytes.
+            bar: alive_bar instance to update progress.
+        """
+        self._filename = filename
+        self._size = size
+        self._seen_so_far = 0
+        self._lock = threading.Lock()
+        self._bar = bar
+
+    def __call__(self, bytes_amount: int) -> None:
+        """Callback method to update progress.
+
+        Args:
+            bytes_amount: Number of bytes transferred in the last chunk.
+        """
+        with self._lock:
+            self._seen_so_far += bytes_amount
+            percent = (self._seen_so_far / self._size) * 100
+            bar_len = 20
+            filled = int(bar_len * percent / 100)
+            bar_str = "█" * filled + "." * (bar_len - filled)
+            self._bar.text(f" || {self._filename} [{bar_str}] {percent:.0f}%")
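For context (not part of the diff), a minimal sketch of how ProgressPercentage can plug into boto3's upload callback; the bucket name "my-bucket" and the file "photo.jpg" are placeholders.

# Hypothetical usage sketch; bucket name and file path are placeholders.
import os

import boto3
from alive_progress import alive_bar

from pys3uploader.progress import ProgressPercentage

filepath = "photo.jpg"
bucket = boto3.resource("s3").Bucket("my-bucket")

with alive_bar(1, title="Progress") as bar:
    callback = ProgressPercentage(
        filename=os.path.basename(filepath),
        size=os.path.getsize(filepath),
        bar=bar,
    )
    # boto3 calls the instance with the number of bytes sent per chunk,
    # which updates the bar's text with a per-file percentage.
    bucket.upload_file(filepath, filepath, Callback=callback)
    bar()  # mark the single file as complete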
pys3uploader/timer.py
ADDED
@@ -0,0 +1,54 @@
+from threading import Timer
+from typing import Any, Callable, Dict, Tuple
+
+
+class RepeatedTimer:
+    """Instantiates RepeatedTimer object to kick off the threading.Timer object with custom intervals.
+
+    >>> RepeatedTimer
+
+    """
+
+    def __init__(
+        self,
+        interval: int,
+        function: Callable,
+        args: Tuple = None,
+        kwargs: Dict[str, Any] = None,
+    ):
+        """Repeats the ``Timer`` object from threading.
+
+        Args:
+            interval: Interval in seconds.
+            function: Function to trigger with intervals.
+            args: Arguments for the function.
+            kwargs: Keyword arguments for the function.
+        """
+        self._timer = None
+        self.interval = interval
+        self.function = function
+        self.args = args or ()
+        self.kwargs = kwargs or {}
+        self.is_running = False
+
+    def _run(self):
+        """Triggers the target function."""
+        self.is_running = False
+        self.start()
+        self.function(*self.args, **self.kwargs)
+
+    def start(self):
+        """Trigger target function if timer isn't running already."""
+        if not self.is_running:
+            self._timer = Timer(self.interval, self._run)
+            self._timer.start()
+            self.is_running = True
+
+    def stop(self):
+        """Stop the timer and cancel all futures."""
+        self._timer.cancel()
+        self.is_running = False
+
+    def cancel(self):
+        """Initiate cancellation."""
+        self.stop()
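As a quick illustration (not part of the diff), RepeatedTimer can re-fire any callable at a fixed interval until stopped; heartbeat below is a made-up function.

# Hypothetical usage sketch, not part of the package diff.
import time

from pys3uploader.timer import RepeatedTimer

def heartbeat() -> None:
    print("tick")

# Fire ``heartbeat`` roughly every 2 seconds until stopped.
timer = RepeatedTimer(interval=2, function=heartbeat)
timer.start()
time.sleep(7)  # heartbeat fires about three times
timer.stop()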
pys3uploader/uploader.py
ADDED
@@ -0,0 +1,432 @@
+import json
+import logging
+import os
+import time
+from datetime import datetime, UTC
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, Iterable, NoReturn
+
+import boto3.resources.factory
+import dotenv
+from alive_progress import alive_bar
+from botocore.config import Config
+from botocore.exceptions import ClientError
+
+from pys3uploader.exceptions import BucketNotFound
+from pys3uploader.metadata import Metadata
+from pys3uploader.timer import RepeatedTimer
+from pys3uploader.logger import LogHandler, LogLevel, setup_logger
+from pys3uploader.progress import ProgressPercentage
+from pys3uploader.utils import (
+    RETRY_CONFIG,
+    UploadResults,
+    convert_seconds,
+    convert_to_folder_structure,
+    getenv,
+    size_converter,
+    urljoin,
+)
+
+
+class Uploader:
+    """Initiates Uploader object to upload entire directory to S3.
+
+    >>> Uploader
+
+    """
+
+    def __init__(
+        self,
+        bucket_name: str,
+        upload_dir: str,
+        s3_prefix: str = None,
+        exclude_prefix: str = None,
+        skip_dot_files: bool = True,
+        overwrite: bool = False,
+        file_exclusion: Iterable[str] = None,
+        folder_exclusion: Iterable[str] = None,
+        region_name: str = None,
+        profile_name: str = None,
+        aws_access_key_id: str = None,
+        aws_secret_access_key: str = None,
+        retry_config: Config = RETRY_CONFIG,
+        logger: logging.Logger = None,
+        log_handler: LogHandler = LogHandler.stdout,
+        log_level: LogLevel = LogLevel.debug,
+        env_file: str = None,
+    ):
+        """Initiates all the necessary args and creates a boto3 session with retry logic.
+
+        Args:
+            bucket_name: Name of the bucket.
+            upload_dir: Full path of the directory to be uploaded.
+            s3_prefix: Particular bucket prefix within which the upload should happen.
+            exclude_prefix: Full directory path to exclude from S3 object prefix.
+            skip_dot_files: Boolean flag to skip dot files.
+            overwrite: Boolean flag to overwrite files in S3.
+            file_exclusion: Sequence of files to exclude during upload.
+            folder_exclusion: Sequence of directories to exclude during upload.
+            region_name: Name of the AWS region.
+            profile_name: AWS profile name.
+            aws_access_key_id: AWS access key ID.
+            aws_secret_access_key: AWS secret access key.
+            logger: Bring your own logger.
+            log_handler: Default log handler, can be ``file`` or ``stdout``.
+            log_level: Default log level, can be ``debug``, ``info``, ``warning`` or ``error``.
+            env_file: Dotenv file (.env) filepath to load environment variables.
+
+        See Also:
+            s3_prefix:
+                If provided, ``s3_prefix`` will always be attached to each object.
+
+                If ``s3_prefix`` is set to ``2025``, then the file path
+                ``/home/ubuntu/Desktop/S3Upload/sub/photo.jpg`` will be uploaded as ``2025/S3Upload/sub/photo.jpg``
+
+            exclude_prefix:
+                When the upload directory is ``/home/ubuntu/Desktop/S3Upload``, each file will naturally have the full prefix.
+                However, this behavior can be avoided by specifying the ``exclude_prefix`` parameter.
+
+                If ``exclude_prefix`` is set to ``/home/ubuntu/Desktop``, then the file path
+                ``/home/ubuntu/Desktop/S3Upload/sub-dir/photo.jpg`` will be uploaded as ``S3Upload/sub-dir/photo.jpg``
+
+            env_file:
+                Environment variables can be loaded from a .env file.
+                The filepath can be set as ``env_file`` during object instantiation or as an environment variable.
+                If a filepath is provided, PyS3Uploader loads it directly or searches the root directory for the file.
+                If no filepath is provided, PyS3Uploader searches the current directory for a .env file.
+        """
+        self.logger = logger or setup_logger(handler=LogHandler(log_handler), level=LogLevel(log_level))
+        self.env_file = env_file or getenv("ENV_FILE", default=".env")
+
+        # Check for env_file in current working directory
+        if os.path.isfile(self.env_file):
+            self.logger.debug("Loading env file: %s", self.env_file)
+            dotenv.load_dotenv(dotenv_path=self.env_file, override=True)
+        # Find the env_file from root
+        elif env_file := dotenv.find_dotenv(self.env_file, raise_error_if_not_found=False):
+            self.logger.debug("Loading env file: %s", env_file)
+            dotenv.load_dotenv(dotenv_path=env_file, override=True)
+        else:
+            # Scan current working directory for any .env files
+            for file in os.listdir():
+                if file.endswith(".env"):
+                    self.logger.debug("Loading env file: %s", file)
+                    dotenv.load_dotenv(dotenv_path=file, override=True)
+                    break
+            else:
+                self.logger.debug("No .env files found to load")
+
+        self.session = boto3.Session(
+            profile_name=profile_name or getenv("PROFILE_NAME", "AWS_PROFILE_NAME"),
+            region_name=region_name or getenv("AWS_DEFAULT_REGION"),
+            aws_access_key_id=aws_access_key_id or getenv("AWS_ACCESS_KEY_ID"),
+            aws_secret_access_key=aws_secret_access_key or getenv("AWS_SECRET_ACCESS_KEY"),
+        )
+        self.s3 = self.session.resource(service_name="s3", config=retry_config)
+
+        self.bucket_name = bucket_name
+        self.upload_dir = upload_dir
+        self.s3_prefix = s3_prefix
+        self.exclude_prefix = exclude_prefix
+        self.skip_dot_files = skip_dot_files
+        self.overwrite = overwrite
+        self.file_exclusion = file_exclusion or []
+        self.folder_exclusion = folder_exclusion or []
+
+        self.results = UploadResults()
+        self.start = time.time()
+
+        # noinspection PyUnresolvedReferences
+        self.bucket: boto3.resources.factory.s3.Bucket = None
+        # noinspection PyUnresolvedReferences
+        self.bucket_objects: boto3.resources.factory.s3.ObjectSummary = []
+        self.object_size_map: Dict[str, int] = {}
+
+        self.upload_files: Dict[str, str] = {}
+        self.file_size_map: Dict[str, int] = {}
+
+        self.timer = RepeatedTimer(
+            function=self.metadata_uploader,
+            interval=int(getenv("METADATA_UPLOAD_INTERVAL", 300)),
+        )
+
+    def init(self) -> None | NoReturn:
+        """Instantiates the bucket instance.
+
+        Raises:
+            ValueError: If no bucket name was passed.
+            BucketNotFound: If bucket name was not found.
+        """
+        self.start = time.time()
+        if self.exclude_prefix and self.exclude_prefix not in self.upload_dir:
+            raise ValueError(
+                f"\n\n\tStart folder {self.exclude_prefix!r} is not a part of upload directory {self.upload_dir!r}"
+            )
+        if not self.upload_dir:
+            raise ValueError("\n\n\tCannot proceed without an upload directory.")
+        try:
+            assert os.path.exists(self.upload_dir)
+        except AssertionError:
+            raise ValueError(f"\n\n\tPath not found: {self.upload_dir}")
+        if not self.bucket_name:
+            raise ValueError("\n\n\tCannot proceed without a bucket name.")
+        if (buckets := [bucket.name for bucket in self.s3.buckets.all()]) and self.bucket_name not in buckets:
+            raise BucketNotFound(f"\n\n\t{self.bucket_name} was not found.\n\tAvailable: {buckets}")
+        self.upload_dir = os.path.abspath(self.upload_dir)
+        self.load_bucket_state()
+
+    def load_bucket_state(self):
+        """Loads the bucket's current state."""
+        # noinspection PyUnresolvedReferences
+        self.bucket: boto3.resources.factory.s3.Bucket = self.s3.Bucket(self.bucket_name)
+        # noinspection PyUnresolvedReferences
+        self.bucket_objects: boto3.resources.factory.s3.ObjectSummary = [obj for obj in self.bucket.objects.all()]
+        self.object_size_map = {obj.key: obj.size for obj in self.bucket_objects}
+
+    def load_local_state(self):
+        """Loads the local file queue."""
+        self.upload_files = self._get_files()
+        self.file_size_map = {file: self.filesize(file) for file in self.upload_files}
+
+    def exit(self) -> None:
+        """Exits after printing results, and run time."""
+        success = len(self.results.success)
+        failed = len(self.results.failed)
+        total = success + failed
+        self.logger.info(
+            "Total number of uploads: %d, success: %d, failed: %d", total, success, failed
+        )
+        # Stop the timer and upload the final state as metadata file
+        self.timer.stop()
+        self.metadata_uploader()
+        self.logger.info("Run time: %s", convert_seconds(time.time() - self.start))
+
+    def filesize(self, filepath: str) -> int:
+        """Gets the file size of a given filepath.
+
+        Args:
+            filepath: Full path of the file.
+
+        Returns:
+            int:
+            Returns the file size in bytes.
+        """
+        try:
+            return os.path.getsize(filepath)
+        except (OSError, PermissionError) as error:
+            self.logger.error(error)
+            return 0
+
+    def size_it(self) -> None:
+        """Calculates and logs the total size of files in S3 and local."""
+        files_in_s3 = len(self.object_size_map)
+        files_local = len(self.upload_files)
+
+        total_size_s3 = sum(self.object_size_map.values())
+        total_size_local = sum(self.file_size_map.values())
+
+        self.logger.info("Files in S3: [#%d]: %s (%d bytes)", files_in_s3, size_converter(total_size_s3), total_size_s3)
+        self.logger.info(
+            "Files local: [#%d]: %s (%d bytes)", files_local, size_converter(total_size_local), total_size_local
+        )
+
+    def _proceed_to_upload(self, filepath: str, objectpath: str) -> bool:
+        """Compares file size if the object already exists in S3.
+
+        Args:
+            filepath: Source filepath.
+            objectpath: S3 object path.
+
+        Returns:
+            bool:
+            Returns a boolean flag to indicate whether the file should be uploaded.
+        """
+        if self.overwrite:
+            return True
+        file_size = self.filesize(filepath)
+        # Indicates that the object path already exists in S3
+        if object_size := self.object_size_map.get(objectpath):
+            if object_size == file_size:
+                self.logger.info(
+                    "S3 object %s exists, and size [%d bytes / %s] matches, skipping..",
+                    objectpath,
+                    object_size,
+                    size_converter(object_size),
+                )
+                return False
+            self.logger.info(
+                "S3 object %s exists, but size mismatch. Local: [%d bytes / %s], S3: [%d bytes / %s]",
+                objectpath,
+                file_size,
+                size_converter(file_size),
+                object_size,
+                size_converter(object_size),
+            )
+        else:
+            self.logger.debug(
+                "S3 object '%s' of size [%d bytes / %s] doesn't exist, uploading..",
+                objectpath,
+                file_size,
+                size_converter(file_size),
+            )
+        return True
+
+    def _uploader(self, filepath: str, objectpath: str, callback: ProgressPercentage) -> None:
+        """Uploads the filepath to the specified S3 bucket.
+
+        Args:
+            filepath: Filepath to upload.
+            objectpath: Object path ref in S3.
+            callback: ProgressPercentage callback to track upload progress.
+        """
+        if self._proceed_to_upload(filepath, objectpath):
+            self.bucket.upload_file(filepath, objectpath, Callback=callback)
+
+    def _get_files(self) -> Dict[str, str]:
+        """Gets a mapping of all the file paths and object paths in the upload directory.
+
+        Returns:
+            Dict[str, str]:
+            Returns a key-value pair of filepath and objectpath.
+        """
+        files_to_upload = {}
+        for __path, __directory, __files in os.walk(self.upload_dir):
+            scan_dir = os.path.split(__path)[-1]
+            if scan_dir in self.folder_exclusion:
+                self.logger.info("Skipping '%s' honoring folder exclusion", scan_dir)
+                continue
+            for file_ in __files:
+                if file_ in self.file_exclusion:
+                    self.logger.info("Skipping '%s' honoring file exclusion", file_)
+                    continue
+                if self.skip_dot_files and file_.startswith("."):
+                    self.logger.info("Skipping dot file: %s", file_)
+                    continue
+                file_path = os.path.join(__path, file_)
+                if self.exclude_prefix:
+                    relative_path = file_path.replace(self.exclude_prefix, "")
+                else:
+                    relative_path = file_path
+                # Lists in python are ordered, so s3 prefix will get loaded first when provided
+                url_parts = []
+                if self.s3_prefix:
+                    url_parts.extend(
+                        self.s3_prefix.split(os.sep) if os.sep in self.s3_prefix else self.s3_prefix.split("/")
+                    )
+                # Add rest of the file path to parts before normalizing as an S3 object URL
+                url_parts.extend(relative_path.split(os.sep))
+                # Remove falsy values using filter - "None", "bool", "len" or "lambda item: item"
+                object_path = urljoin(*filter(None, url_parts))
+                files_to_upload[file_path] = object_path
+        return files_to_upload
+
+    def run(self) -> None:
+        """Initiates object upload in a traditional loop."""
+        self.init()
+        self.load_local_state()
+        self.size_it()
+        self.timer.start()
+        total_files = len(self.upload_files)
+
+        self.logger.info(
+            "%d files from '%s' will be uploaded to '%s' sequentially",
+            total_files,
+            self.upload_dir,
+            self.bucket_name,
+        )
+        with alive_bar(total_files, title="Progress", bar="smooth", spinner="dots") as overall_bar:
+            for filepath, objectpath in self.upload_files.items():
+                progress_callback = ProgressPercentage(
+                    filename=os.path.basename(filepath), size=self.filesize(filepath), bar=overall_bar
+                )
+                try:
+                    self._uploader(filepath, objectpath, progress_callback)
+                    self.results.success.append(filepath)
+                except ClientError as error:
+                    self.logger.error("Upload failed: %s", error)
+                    self.results.failed.append(filepath)
+                except KeyboardInterrupt:
+                    self.logger.warning("Upload interrupted by user")
+                    break
+                overall_bar()  # increment overall progress bar
+        self.exit()
+
+    def run_in_parallel(self, max_workers: int = 5) -> None:
+        """Initiates upload in multi-threading.
+
+        Args:
+            max_workers: Number of maximum threads to use.
+        """
+        # Verify and initiate bucket state
+        self.init()
+        # Verify and initiate local state
+        self.load_local_state()
+        self.size_it()
+        self.timer.start()
+        total_files = len(self.upload_files)
+
+        self.logger.info(
+            "%d files from '%s' will be uploaded to '%s' with maximum concurrency of: %d",
+            total_files,
+            self.upload_dir,
+            self.bucket_name,
+            max_workers,
+        )
+        with alive_bar(total_files, title="Progress", bar="smooth", spinner="dots") as overall_bar:
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Map each future back to its filepath so results are attributed correctly
+                futures = {}
+                for filepath, objectpath in self.upload_files.items():
+                    progress_callback = ProgressPercentage(
+                        filename=os.path.basename(filepath), size=self.filesize(filepath), bar=overall_bar
+                    )
+                    futures[executor.submit(self._uploader, filepath, objectpath, callback=progress_callback)] = filepath
+
+                for future in as_completed(futures):
+                    filepath = futures[future]
+                    try:
+                        future.result()
+                        self.results.success.append(filepath)
+                    except ClientError as error:
+                        self.logger.error("Upload failed: %s", error)
+                        self.results.failed.append(filepath)
+                    overall_bar()  # Increment overall bar after each upload finishes
+        self.exit()
+
+    def metadata_uploader(self) -> None:
+        """Uploads a metadata file with the current upload state to S3."""
+        filename = objectpath = getenv("METADATA_FILENAME", "METADATA.json")
+        self.load_bucket_state()
+        objects_uploaded = len(self.results.success)
+        size_uploaded = sum([self.filesize(file) for file in self.results.success])
+
+        pending_files = self.upload_files.keys() - self.results.success
+        objects_pending = len(pending_files)
+        size_pending = sum([self.filesize(file) for file in pending_files])
+
+        metadata = Metadata(
+            timestamp=datetime.now(tz=UTC).strftime("%A %B %d, %Y %H:%M:%S"),
+            objects_uploaded=objects_uploaded,
+            objects_pending=objects_pending,
+            size_uploaded=size_converter(size_uploaded),
+            size_pending=size_converter(size_pending),
+        )
+        self.logger.debug("\n" + json.dumps(metadata.__dict__, indent=2) + "\n")
+        self.logger.debug("Uploading metadata to S3")
+        filepath = os.path.join(os.getcwd(), filename)
+        with open(filepath, "w") as file:
+            json.dump(metadata.__dict__, file, indent=2)
+            file.flush()
+        self.bucket.upload_file(filepath, objectpath)
+
+    def get_bucket_structure(self) -> str:
+        """Gets all the objects in an S3 bucket and forms it into a hierarchical folder like representation.
+
+        Returns:
+            str:
+            Returns a hierarchical folder like representation of the chosen bucket.
+        """
+        self.init()
+        # Using list and set will yield the same results but using set we can isolate directories from files
+        return convert_to_folder_structure(set(obj.key for obj in self.bucket_objects))
+
+    def print_bucket_structure(self) -> None:
+        """Prints all the objects in an S3 bucket with a folder like representation."""
+        print(self.get_bucket_structure())
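To tie the new module together (not part of the diff), a minimal usage sketch of the Uploader class; the bucket name and directory paths are placeholders taken from the docstring examples, and valid AWS credentials are assumed to be available via the environment or a .env file.

# Hypothetical usage sketch, not part of the package diff.
from pys3uploader.uploader import Uploader

uploader = Uploader(
    bucket_name="my-bucket",
    upload_dir="/home/ubuntu/Desktop/S3Upload",
    exclude_prefix="/home/ubuntu/Desktop",  # strip this prefix from object keys
    skip_dot_files=True,
)
uploader.run_in_parallel(max_workers=5)  # or uploader.run() for a sequential upload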