rclone-api 1.0.88__py2.py3-none-any.whl → 1.0.90__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rclone_api/__init__.py +5 -1
- rclone_api/cmd/copy_large_s3.py +99 -0
- rclone_api/config.py +80 -1
- rclone_api/group_files.py +4 -1
- rclone_api/rclone.py +238 -49
- rclone_api/s3/api.py +73 -0
- rclone_api/s3/basic_ops.py +61 -0
- rclone_api/s3/chunk_uploader.py +538 -0
- rclone_api/s3/create.py +69 -0
- rclone_api/s3/types.py +58 -0
- rclone_api/types.py +5 -3
- rclone_api/util.py +32 -4
- {rclone_api-1.0.88.dist-info → rclone_api-1.0.90.dist-info}/METADATA +2 -3
- rclone_api-1.0.90.dist-info/RECORD +35 -0
- {rclone_api-1.0.88.dist-info → rclone_api-1.0.90.dist-info}/WHEEL +1 -1
- {rclone_api-1.0.88.dist-info → rclone_api-1.0.90.dist-info}/entry_points.txt +1 -0
- rclone_api-1.0.88.dist-info/RECORD +0 -29
- {rclone_api-1.0.88.dist-info → rclone_api-1.0.90.dist-info}/LICENSE +0 -0
- {rclone_api-1.0.88.dist-info → rclone_api-1.0.90.dist-info}/top_level.txt +0 -0
rclone_api/s3/api.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from botocore.client import BaseClient
|
|
5
|
+
|
|
6
|
+
from rclone_api.s3.basic_ops import (
|
|
7
|
+
download_file,
|
|
8
|
+
head,
|
|
9
|
+
list_bucket_contents,
|
|
10
|
+
upload_file,
|
|
11
|
+
)
|
|
12
|
+
from rclone_api.s3.chunk_uploader import MultiUploadResult, upload_file_multipart
|
|
13
|
+
from rclone_api.s3.create import create_s3_client
|
|
14
|
+
from rclone_api.s3.types import S3Credentials, S3MutliPartUploadConfig, S3UploadTarget
|
|
15
|
+
|
|
16
|
+
_MIN_THRESHOLD_FOR_CHUNKING = 5 * 1024 * 1024
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class S3Client:
    """Thin convenience wrapper binding one S3 connection to the helper ops."""

    def __init__(self, credentials: S3Credentials):
        # Keep the credentials and build a concrete boto client from them.
        self.credentials: S3Credentials = credentials
        self.client: BaseClient = create_s3_client(credentials)

    def list_bucket_contents(self, bucket_name: str) -> None:
        """Print every object in *bucket_name* (delegates to the basic op)."""
        list_bucket_contents(self.client, bucket_name)

    def upload_file(self, target: S3UploadTarget) -> Exception | None:
        """Single-shot upload of *target*; returns the exception on failure, else None."""
        return upload_file(
            s3_client=self.client,
            bucket_name=target.bucket_name,
            file_path=target.src_file,
            object_name=target.s3_key,
        )

    def download_file(self, bucket_name: str, object_name: str, file_path: str) -> None:
        """Fetch one object into a local file."""
        download_file(self.client, bucket_name, object_name, file_path)

    def head(self, bucket_name: str, object_name: str) -> dict | None:
        """HEAD the object and return its metadata dict, or None on error."""
        return head(self.client, bucket_name, object_name)

    def upload_file_multipart(
        self,
        upload_target: S3UploadTarget,
        upload_config: S3MutliPartUploadConfig,
    ) -> MultiUploadResult:
        """Chunked, resumable upload; falls back to a single PUT for small files.

        Raises the underlying exception if the single-shot fallback fails.
        """
        size = upload_target.src_file.stat().st_size
        if size < _MIN_THRESHOLD_FOR_CHUNKING:
            # S3 multipart parts must be at least 5MB, so small files take
            # the simple single-request path instead.
            warnings.warn(
                f"File size {size} is less than the minimum threshold for chunking ({_MIN_THRESHOLD_FOR_CHUNKING}), switching to single threaded upload."
            )
            maybe_err = self.upload_file(upload_target)
            if maybe_err:
                raise maybe_err
            return MultiUploadResult.UPLOADED_FRESH
        # Note: this resolves to the module-level upload_file_multipart import,
        # not this method (method names are not in scope inside the body).
        return upload_file_multipart(
            s3_client=self.client,
            bucket_name=upload_target.bucket_name,
            file_path=upload_target.src_file,
            object_name=upload_target.s3_key,
            resumable_info_path=upload_config.resume_path_json,
            chunk_size=upload_config.chunk_size,
            retries=upload_config.retries,
            max_chunks_before_suspension=upload_config.max_chunks_before_suspension,
        )
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from botocore.client import BaseClient
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def list_bucket_contents(s3_client: BaseClient, bucket_name: str) -> None:
    """Print each object key and size in *bucket_name*, or a note if it is empty.

    Errors are printed rather than raised (best-effort listing).
    NOTE(review): list_objects_v2 returns at most one page of keys; pagination
    would be needed for large buckets — confirm intended scope.
    """
    try:
        listing = s3_client.list_objects_v2(Bucket=bucket_name)
        if "Contents" not in listing:
            print(f"The bucket '{bucket_name}' is empty.")
            return
        for entry in listing["Contents"]:
            print(f"File: {entry['Key']} | Size: {entry['Size']} bytes")
    except Exception as e:
        print(f"Error listing bucket contents: {e}")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def upload_file(
    s3_client: BaseClient,
    bucket_name: str,
    file_path: Path,
    object_name: str,
) -> Exception | None:
    """Upload *file_path* to *bucket_name*/*object_name* in one request.

    Returns None on success, or the caught exception on failure (the error is
    printed so callers can decide whether to raise).
    """
    try:
        s3_client.upload_file(str(file_path), bucket_name, object_name)
    except Exception as e:
        print(f"Error uploading file: {e}")
        return e
    print(f"Uploaded {file_path} to {bucket_name}/{object_name}")
    return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def download_file(
    s3_client: BaseClient, bucket_name: str, object_name: str, file_path: str
) -> None:
    """Download one object into a local file; errors are printed, not raised."""
    try:
        s3_client.download_file(bucket_name, object_name, file_path)
    except Exception as e:
        print(f"Error downloading file: {e}")
        return
    print(f"Downloaded {object_name} from {bucket_name} to {file_path}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def head(s3_client: BaseClient, bucket_name: str, object_name: str) -> dict | None:
    """Retrieve metadata for the specified object using a HEAD operation.

    :param s3_client: The S3 client to use.
    :param bucket_name: The name of the bucket containing the object.
    :param object_name: The key of the object.
    :return: The object's metadata dict on success, otherwise None.
    """
    try:
        meta = s3_client.head_object(Bucket=bucket_name, Key=object_name)
    except Exception as e:
        print(f"Error retrieving metadata for {object_name}: {e}")
        return None
    print(f"Metadata for {object_name} in {bucket_name}: {meta}")
    return meta
|
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import time
|
|
4
|
+
import warnings
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
+
from dataclasses import dataclass, field, fields
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from queue import Queue
|
|
9
|
+
from threading import Lock, Thread
|
|
10
|
+
|
|
11
|
+
from botocore.client import BaseClient
|
|
12
|
+
|
|
13
|
+
from rclone_api.s3.types import MultiUploadResult
|
|
14
|
+
|
|
15
|
+
_MIN_UPLOAD_CHUNK_SIZE = 5 * 1024 * 1024 # 5MB
|
|
16
|
+
_SAVE_STATE_LOCK = Lock()
|
|
17
|
+
|
|
18
|
+
_PRINT_LOCK = Lock()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def locked_print(*args, **kwargs):
    """Thread-safe print: hold the shared lock so concurrent lines never interleave."""
    _PRINT_LOCK.acquire()
    try:
        print(*args, **kwargs)
    finally:
        _PRINT_LOCK.release()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FileChunk:
    """One part of a multipart upload, spooled to a temp file to bound memory use."""

    def __init__(self, src: Path, upload_id: str, part_number: int, data: bytes):
        assert data is not None, f"{src}: Data must not be None"
        self.upload_id = upload_id
        self.src = src
        self.part_number = part_number
        name = src.name
        self.tmpdir = _get_chunk_tmpdir()
        # Spool the chunk to disk immediately so the raw bytes do not stay
        # resident while the part waits in the upload queue.
        self.filepart = self.tmpdir / f"{name}_{upload_id}.part_{part_number}.tmp"
        self.filepart.write_bytes(data)
        del data  # free up memory

    @property
    def data(self) -> bytes:
        """Read the chunk back from its spool file."""
        assert self.filepart is not None
        # Fix: the original had an unreachable `return b""` after the read.
        return self.filepart.read_bytes()

    def close(self):
        """Delete the spool file; safe to call more than once."""
        if self.filepart.exists():
            self.filepart.unlink()

    def __del__(self):
        # Best-effort cleanup if the owner never called close().
        self.close()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class UploadInfo:
    """Description of one in-flight multipart upload, shared by all workers."""

    s3_client: BaseClient
    bucket_name: str
    object_name: str
    src_file_path: Path
    upload_id: str
    retries: int
    chunk_size: int
    file_size: int
    # Cached part count; filled in by __post_init__ when not supplied.
    _total_chunks: int | None = None

    def total_chunks(self) -> int:
        """Number of parts needed to cover file_size at chunk_size granularity."""
        whole, remainder = divmod(self.file_size, self.chunk_size)
        return whole + 1 if remainder else whole

    def __post_init__(self):
        if self._total_chunks is None:
            self._total_chunks = self.total_chunks()

    def to_json(self) -> dict:
        """Serialize all fields; the live client object becomes a marker string."""
        out: dict = {}
        for f in fields(self):
            if f.name == "s3_client":
                # Convert non-serializable objects (like s3_client) to a
                # string representation.
                out[f.name] = "RUNTIME OBJECT"
                continue
            value = getattr(self, f.name)
            out[f.name] = str(value) if isinstance(value, Path) else value
        return out

    @staticmethod
    def from_json(s3_client: BaseClient, json_dict: dict) -> "UploadInfo":
        """Rebuild an UploadInfo, swapping the serialized marker for a live client.

        NOTE(review): src_file_path comes back as a str, not a Path — callers
        appear to re-wrap it with Path(); confirm.
        """
        json_dict.pop("s3_client")  # Remove the placeholder string
        return UploadInfo(s3_client=s3_client, **json_dict)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
|
|
96
|
+
class FinishedPiece:
|
|
97
|
+
part_number: int
|
|
98
|
+
etag: str
|
|
99
|
+
|
|
100
|
+
def to_json(self) -> dict:
|
|
101
|
+
return {"part_number": self.part_number, "etag": self.etag}
|
|
102
|
+
|
|
103
|
+
def to_json_str(self) -> str:
|
|
104
|
+
return json.dumps(self.to_json(), indent=0)
|
|
105
|
+
|
|
106
|
+
@staticmethod
|
|
107
|
+
def to_json_array(parts: list["FinishedPiece | None"]) -> list[dict | None]:
|
|
108
|
+
non_none: list[FinishedPiece] = [p for p in parts if p is not None]
|
|
109
|
+
non_none.sort(key=lambda x: x.part_number)
|
|
110
|
+
all_nones: list[None] = [None for p in parts if p is None]
|
|
111
|
+
assert len(all_nones) <= 1, "Only one None should be present"
|
|
112
|
+
return [p.to_json() for p in non_none]
|
|
113
|
+
|
|
114
|
+
@staticmethod
|
|
115
|
+
def from_json(json: dict | None) -> "FinishedPiece | None":
|
|
116
|
+
if json is None:
|
|
117
|
+
return None
|
|
118
|
+
return FinishedPiece(**json)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class UploadState:
    """Mutable, persistable progress of one multipart upload (supports resume)."""

    upload_info: UploadInfo
    # Path of the JSON progress file; derived in __post_init__ when None.
    # (Field name "peristant" is a historical typo kept for compatibility.)
    peristant: Path | None
    # Bug fix: the original declared `lock: Lock = Lock()`, which created ONE
    # lock shared by every UploadState instance (dataclass defaults are
    # evaluated once at class creation). default_factory gives each instance
    # its own lock.
    lock: Lock = field(default_factory=Lock)
    # Finished pieces, plus at most one None sentinel marking end-of-run.
    parts: list[FinishedPiece | None] = field(default_factory=list)

    def is_done(self) -> bool:
        return self.remaining() == 0

    def count(self) -> tuple[int, int]:
        """Return (finished part count, total part count)."""
        num_chunks = self.upload_info.total_chunks()
        count = sum(1 for p in self.parts if p is not None)
        return count, num_chunks

    def finished(self) -> int:
        """Number of parts already uploaded."""
        count, _ = self.count()
        return count

    def remaining(self) -> int:
        """Number of parts still to upload."""
        count, num_chunks = self.count()
        assert (
            count <= num_chunks
        ), f"Count {count} is greater than num_chunks {num_chunks}"
        return num_chunks - count

    def add_finished(self, part: FinishedPiece | None) -> None:
        """Record a finished part (or the None sentinel) and persist immediately."""
        with self.lock:
            self.parts.append(part)
            self._save_no_lock()

    def __post_init__(self):
        if self.peristant is None:
            object_name = self.upload_info.object_name
            chunk_size = self.upload_info.chunk_size
            parent = _get_chunk_tmpdir()
            # NOTE(review): object_name may contain "/" characters, which
            # would make this a nested path — confirm keys are flat.
            self.peristant = parent / f"{object_name}_chunk_size_{chunk_size}_.json"

    def save(self) -> None:
        with _SAVE_STATE_LOCK:
            self._save_no_lock()

    def _save_no_lock(self) -> None:
        # Caller must hold either self.lock or _SAVE_STATE_LOCK.
        assert self.peristant is not None, "No path to save to"
        self.peristant.write_text(self.to_json_str(), encoding="utf-8")

    @staticmethod
    def load(s3_client: BaseClient, path: Path) -> "UploadState":
        """Read a saved progress file under the global save lock."""
        with _SAVE_STATE_LOCK:
            return UploadState.from_json(s3_client, path)

    def to_json(self) -> dict:
        """Build the progress snapshot that gets written to disk."""
        parts: list[FinishedPiece | None] = list(self.parts)
        finished_count, total = self.count()
        return {
            "upload_info": self.upload_info.to_json(),
            "finished_parts": FinishedPiece.to_json_array(parts),
            "is_done": self.is_done(),
            "finished_count": finished_count,
            "total_parts": total,
        }

    def to_json_str(self) -> str:
        return json.dumps(self.to_json(), indent=4)

    @staticmethod
    def from_json(s3_client: BaseClient, json_file: Path) -> "UploadState":
        """Load a snapshot from *json_file* and reattach the live S3 client."""
        data = json.loads(json_file.read_text(encoding="utf-8"))
        upload_info = UploadInfo.from_json(s3_client, data["upload_info"])
        finished_parts = [FinishedPiece.from_json(p) for p in data["finished_parts"]]
        return UploadState(
            peristant=json_file, upload_info=upload_info, parts=finished_parts
        )
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# lock
|
|
222
|
+
|
|
223
|
+
_TMP_DIR_ACCESS_LOCK = Lock()
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def clean_old_files(out: Path) -> None:
    """Delete files under *out* older than one day, then prune empty directories."""
    now = time.time()
    seconds_per_day = 60 * 60 * 24
    # Pass 1: remove stale files.
    for root, _dirs, files in os.walk(out):
        base = Path(root)
        for name in files:
            candidate = base / name
            age_days = (now - candidate.stat().st_mtime) / seconds_per_day
            if age_days > 1:
                locked_print(f"Removing old file: {candidate}")
                candidate.unlink()
    # Pass 2: drop directories left empty by pass 1.
    for root, dirs, _ in os.walk(out):
        for dir in dirs:
            child = Path(root) / dir
            if not any(child.iterdir()):
                locked_print(f"Removing empty directory: {child}")
                child.rmdir()
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _get_chunk_tmpdir() -> Path:
    """Return (and lazily create) the shared spool directory for chunk files.

    The result is memoized on the function object itself; stale spool files
    are purged once, on the first access of the process.
    """
    with _TMP_DIR_ACCESS_LOCK:
        cache = _get_chunk_tmpdir.__dict__
        if "out" in cache:
            return cache["out"]  # Folder already validated.
        out = Path("chunk_store")
        if out.exists():
            # first access, clean up directory
            clean_old_files(out)
        out.mkdir(exist_ok=True, parents=True)
        cache["out"] = out
        return out
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def file_chunker(
    upload_state: UploadState, max_chunks: int | None, output: Queue[FileChunk | None]
) -> None:
    """Producer thread body: cut the source file into FileChunks and queue them.

    Parts already recorded in *upload_state* are skipped (resume support), an
    optional budget of *max_chunks* per run enables suspend/resume, and a final
    None sentinel is always enqueued so the consumer knows the stream ended.
    """
    emitted = 0

    def budget_exhausted() -> bool:
        # max_chunks None means "no budget": chunk until the file is covered.
        nonlocal emitted
        if max_chunks is None:
            return False
        if emitted >= max_chunks:
            return True
        emitted += 1
        return False

    upload_info = upload_state.upload_info
    file_path = upload_info.src_file_path
    chunk_size = upload_info.chunk_size
    src = Path(file_path)
    file_size = os.path.getsize(file_path)
    done_part_numbers: set[int] = {
        p.part_number for p in upload_state.parts if p is not None
    }
    num_parts = upload_info.total_chunks()
    part_number = 1

    def next_part_number() -> int | None:
        # Advance past any parts finished in a previous run.
        nonlocal part_number
        while part_number in done_part_numbers:
            part_number += 1
        if part_number > num_parts:
            return None
        return part_number

    try:
        while not budget_exhausted():
            if next_part_number() is None:
                locked_print(f"File {file_path} has completed chunking all parts")
                break
            offset = (part_number - 1) * chunk_size
            assert offset < file_size, f"Offset {offset} is greater than file size"
            # Open the file, seek, read the chunk, and close immediately.
            with open(file_path, "rb") as f:
                f.seek(offset)
                data = f.read(chunk_size)
            if not data:
                warnings.warn(f"Empty data for part {part_number} of {file_path}")
            chunk = FileChunk(
                src,
                upload_id=upload_info.upload_id,
                part_number=part_number,
                data=data,  # After this, data should not be reused.
            )
            done_part_numbers.add(part_number)
            output.put(chunk)
            part_number += 1
    except Exception as e:
        warnings.warn(f"Error reading file: {e}")
    finally:
        # End-of-stream sentinel: the consumer stops on None.
        output.put(None)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def upload_task(
    info: UploadInfo, chunk: bytes, part_number: int, retries: int
) -> FinishedPiece:
    """Upload one part with retry; return its FinishedPiece or raise the last error.

    :param retries: number of retries AFTER the initial attempt.
    """
    attempts = retries + 1  # Add one for the initial attempt
    for attempt in range(attempts):
        try:
            if attempt > 0:
                locked_print(f"Retrying part {part_number} for {info.src_file_path}")
            locked_print(
                f"Uploading part {part_number} for {info.src_file_path} of size {len(chunk)}"
            )
            response = info.s3_client.upload_part(
                Bucket=info.bucket_name,
                Key=info.object_name,
                PartNumber=part_number,
                UploadId=info.upload_id,
                Body=chunk,
            )
            return FinishedPiece(etag=response["ETag"], part_number=part_number)
        except Exception as e:
            if attempt == attempts - 1:
                # Out of retries: surface the failure to the caller.
                locked_print(f"Error uploading part {part_number}: {e}")
                raise e
            locked_print(f"Error uploading part {part_number}: {e}, retrying")
    raise Exception("Should not reach here")
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def handle_upload(
    upload_info: UploadInfo, file_chunk: FileChunk | None
) -> FinishedPiece | None:
    """Consumer-side wrapper: upload one chunk, then release its spool file.

    A None chunk is the end-of-stream sentinel and is passed through unchanged.
    """
    if file_chunk is None:
        return None
    piece: FinishedPiece = upload_task(
        info=upload_info,
        chunk=file_chunk.data,
        part_number=file_chunk.part_number,
        retries=upload_info.retries,
    )
    file_chunk.close()
    return piece
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def prepare_upload_file_multipart(
    s3_client: BaseClient,
    bucket_name: str,
    file_path: Path,
    object_name: str,
    chunk_size: int,
    retries: int,
) -> UploadInfo:
    """Initiate a multipart upload and bundle everything workers need into UploadInfo."""
    locked_print(
        f"Creating multipart upload for {file_path} to {bucket_name}/{object_name}"
    )
    mpu = s3_client.create_multipart_upload(Bucket=bucket_name, Key=object_name)
    return UploadInfo(
        s3_client=s3_client,
        bucket_name=bucket_name,
        object_name=object_name,
        src_file_path=file_path,
        upload_id=mpu["UploadId"],
        retries=retries,
        chunk_size=chunk_size,
        file_size=os.path.getsize(file_path),
    )
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def upload_file_multipart(
    s3_client: BaseClient,
    bucket_name: str,
    file_path: Path,
    object_name: str,
    resumable_info_path: Path | None,
    chunk_size: int = 16 * 1024 * 1024,  # Default chunk size is 16MB; can be overridden
    retries: int = 20,
    max_chunks_before_suspension: int | None = None,
) -> MultiUploadResult:
    """Resumable multipart upload of *file_path* to *bucket_name*/*object_name*.

    Progress is checkpointed to *resumable_info_path* (when given) so an
    interrupted or suspended run can pick up where it left off.  A chunker
    thread feeds a bounded queue consumed by a thread pool of uploaders.

    Returns ALREADY_DONE, SUSPENDED (chunk budget hit), UPLOADED_FRESH, or
    UPLOADED_RESUME.  On error the server-side multipart upload is aborted
    and the exception re-raised.
    """
    file_size = os.path.getsize(str(file_path))
    if chunk_size > file_size:
        warnings.warn(
            f"Chunk size {chunk_size} is greater than file size {file_size}, using file size"
        )
        chunk_size = file_size

    if chunk_size < _MIN_UPLOAD_CHUNK_SIZE:
        # S3 rejects multipart parts smaller than 5MB.
        raise ValueError(
            f"Chunk size {chunk_size} is less than minimum upload chunk size {_MIN_UPLOAD_CHUNK_SIZE}"
        )

    def load_prior_state() -> UploadState | None:
        # Resume only when a checkpoint path was given and the file exists.
        if resumable_info_path is None:
            locked_print(f"No resumable info path provided for {file_path}")
            return None
        if not resumable_info_path.exists():
            locked_print(
                f"Resumable info path {resumable_info_path} does not exist for {file_path}"
            )
            return None
        return UploadState.load(s3_client=s3_client, path=resumable_info_path)

    def start_fresh_state() -> UploadState:
        locked_print(f"Creating new upload state for {file_path}")
        info = prepare_upload_file_multipart(
            s3_client=s3_client,
            bucket_name=bucket_name,
            file_path=file_path,
            object_name=object_name,
            chunk_size=chunk_size,
            retries=retries,
        )
        return UploadState(
            upload_info=info,
            parts=[],
            peristant=resumable_info_path,
        )

    chunk_queue: Queue[FileChunk | None] = Queue(10)
    upload_state = load_prior_state() or start_fresh_state()
    if upload_state.is_done():
        return MultiUploadResult.ALREADY_DONE
    finished = upload_state.finished()
    if finished > 0:
        locked_print(
            f"Resuming upload for {file_path}, {finished} parts already uploaded"
        )
    started_new_upload = finished == 0
    upload_info = upload_state.upload_info
    max_workers = 8

    def chunker_task(
        upload_state=upload_state,
        output=chunk_queue,
        max_chunks=max_chunks_before_suspension,
    ) -> None:
        file_chunker(upload_state=upload_state, output=output, max_chunks=max_chunks)

    try:
        producer = Thread(target=chunker_task, daemon=True)
        producer.start()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            while True:
                file_chunk: FileChunk | None = chunk_queue.get()
                if file_chunk is None:
                    break  # Chunker finished (or hit its suspension budget).

                # Bind loop variables as defaults to avoid late-binding closures.
                def task(upload_info=upload_info, file_chunk=file_chunk):
                    return handle_upload(upload_info, file_chunk)

                fut = executor.submit(task)

                def record_result(fut=fut):
                    # Persist each finished piece as soon as it completes.
                    upload_state.add_finished(fut.result())

                fut.add_done_callback(record_result)
        # Sentinel marks the end of this run in the persisted state.
        upload_state.add_finished(None)
        producer.join()
        if not upload_state.is_done():
            upload_state.save()
            return MultiUploadResult.SUSPENDED
        parts: list[FinishedPiece] = [p for p in upload_state.parts if p is not None]
        locked_print(f"Upload complete, sorting {len(parts)} parts to complete upload")
        parts.sort(key=lambda x: x.part_number)  # Some backends need this.
        parts_s3: list[dict] = [
            {"ETag": p.etag, "PartNumber": p.part_number} for p in parts
        ]
        locked_print(f"Sending multi part completion message for {file_path}")
        s3_client.complete_multipart_upload(
            Bucket=bucket_name,
            Key=object_name,
            UploadId=upload_info.upload_id,
            MultipartUpload={"Parts": parts_s3},
        )
        locked_print(
            f"Multipart upload completed: {file_path} to {bucket_name}/{object_name}"
        )
    except Exception:
        # Abort server-side so incomplete parts are not retained; then re-raise.
        if upload_info.upload_id:
            try:
                s3_client.abort_multipart_upload(
                    Bucket=bucket_name, Key=object_name, UploadId=upload_info.upload_id
                )
            except Exception:
                pass
        raise
    if started_new_upload:
        return MultiUploadResult.UPLOADED_FRESH
    return MultiUploadResult.UPLOADED_RESUME
|