rclone-api 1.5.38__py3-none-any.whl → 1.5.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,546 +1,546 @@
- """
- https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/upload_part_copy.html
- * client.upload_part_copy
-
- This module provides functionality for S3 multipart uploads, including copying parts
- from existing S3 objects using upload_part_copy.
- """
-
- import json
- import os
- import time
- import warnings
- from concurrent.futures import Future, ThreadPoolExecutor
- from queue import Queue
- from threading import Semaphore, Thread
- from typing import Any, Callable
-
- from rclone_api.rclone_impl import RcloneImpl
- from rclone_api.s3.create import (
-     BaseClient,
-     S3Config,
-     create_s3_client,
- )
- from rclone_api.s3.multipart.finished_piece import FinishedPiece
- from rclone_api.s3.multipart.info_json import InfoJson
- from rclone_api.s3.multipart.merge_state import MergeState, Part
- from rclone_api.types import EndOfStream
- from rclone_api.util import locked_print
-
- DEFAULT_MAX_WORKERS = 5  # Backblaze can do 10 with exponential backoff, so let's try 5
-
- _TIMEOUT_READ = 900
- _TIMEOUT_CONNECTION = 900
-
-
- def _upload_part_copy_task(
-     s3_client: BaseClient,
-     state: MergeState,
-     source_bucket: str,
-     source_key: str,
-     part_number: int,
- ) -> FinishedPiece | Exception:
-     """
-     Upload a part by copying from an existing S3 object.
-
-     Args:
-         info: Upload information
-         source_bucket: Source bucket name
-         source_key: Source object key
-         part_number: Part number (1-10000)
-         byte_range: Optional byte range in format 'bytes=start-end'
-         retries: Number of retry attempts
-
-     Returns:
-         FinishedPiece with ETag and part number
-     """
-     copy_source = {"Bucket": source_bucket, "Key": source_key}
-
-     # from botocore.exceptions import NoSuchKey
-     default_retries = 9
-     retries = default_retries + 1  # Add one for the initial attempt
-     for retry in range(retries):
-         params: dict = {}
-         try:
-             if retry > 0:
-                 locked_print(f"Retrying part copy {part_number} for {state.dst_key}")
-
-             locked_print(
-                 f"Copying part {part_number} for {state.dst_key} from {source_bucket}/{source_key}"
-             )
-
-             # Prepare the upload_part_copy parameters
-             params = {
-                 "Bucket": state.bucket,
-                 "CopySource": copy_source,
-                 "Key": state.dst_key,
-                 "PartNumber": part_number,
-                 "UploadId": state.upload_id,
-             }
-
-             # Execute the copy operation
-             part = s3_client.upload_part_copy(**params)
-
-             # Extract ETag from the response
-             etag = part["CopyPartResult"]["ETag"]
-             out = FinishedPiece(etag=etag, part_number=part_number)
-             locked_print(f"Finished part {part_number} for {state.dst_key}")
-             return out
-
-         except Exception as e:
-             msg = (
-                 f"Error copying {copy_source} -> {state.dst_key}: {e}, params={params}"
-             )
-             if "An error occurred (InternalError)" in str(e):
-                 locked_print(msg)
-             elif "NoSuchKey" in str(e):
-                 locked_print(msg)
-             if retry == retries - 1:
-                 locked_print(msg)
-                 return e
-             else:
-                 locked_print(f"{msg}, retrying")
-                 # sleep
-                 sleep_time = 2**retry
-                 locked_print(f"Sleeping for {sleep_time} seconds")
-                 continue
-
-     return Exception("Should not reach here")
-
-
- def _complete_multipart_upload_from_parts(
-     s3_client: BaseClient, state: MergeState, finished_parts: list[FinishedPiece]
- ) -> Exception | None:
-     """
-     Complete a multipart upload using the provided parts.
-
-     Args:
-         info: Upload information
-         parts: List of finished pieces with ETags
-
-     Returns:
-         The URL of the completed object
-     """
-     # Sort parts by part number to ensure correct order
-     finished_parts.sort(key=lambda x: x.part_number)
-     multipart_parts = FinishedPiece.to_json_array(finished_parts)
-     multipart_upload: dict = {
-         "Parts": multipart_parts,
-     }
-     response: Any = None
-     try:
-         # Complete the multipart upload
-         response = s3_client.complete_multipart_upload(
-             Bucket=state.bucket,
-             Key=state.dst_key,
-             UploadId=state.upload_id,
-             MultipartUpload=multipart_upload,
-         )
-     except Exception as e:
-         import traceback
-
-         stacktrace = traceback.format_exc()
-         warnings.warn(
-             f"Error completing multipart upload: {e}\n\n{response}\n\n{stacktrace}"
-         )
-         return e
-
-     return None
-
-
- def _do_upload_task(
-     s3_client: BaseClient,
-     max_workers: int,
-     merge_state: MergeState,
-     on_finished: Callable[[FinishedPiece | EndOfStream], None],
- ) -> Exception | None:
-     futures: list[Future[FinishedPiece | Exception]] = []
-     parts = merge_state.remaining_parts()
-     source_bucket = merge_state.bucket
-     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-         semaphore = Semaphore(max_workers)
-         for part in parts:
-             part_number, s3_key = part.part_number, part.s3_key
-
-             def task(
-                 s3_client=s3_client,
-                 state=merge_state,
-                 source_bucket=source_bucket,
-                 s3_key=s3_key,
-                 part_number=part_number,
-             ):
-                 out = _upload_part_copy_task(
-                     s3_client=s3_client,
-                     state=state,
-                     source_bucket=source_bucket,
-                     source_key=s3_key,
-                     part_number=part_number,
-                 )
-                 if isinstance(out, Exception):
-                     return out
-                 # merge_state.on_finished(out)
-                 on_finished(out)
-                 return out
-
-             fut = executor.submit(task)
-             fut.add_done_callback(lambda x: semaphore.release())
-             futures.append(fut)
-
-             while not semaphore.acquire(blocking=False):
-                 time.sleep(0.1)
-
-         final_fut = executor.submit(lambda: on_finished(EndOfStream()))
-
-         for fut in futures:
-             finished_part = fut.result()
-             if isinstance(finished_part, Exception):
-                 executor.shutdown(wait=True, cancel_futures=True)
-                 return finished_part
-         final_fut.result()
-
-     finished_parts = merge_state.finished
-     try:
-         assert len(finished_parts) == len(merge_state.all_parts)
-     except Exception:
-         return ValueError(
-             f"Finished parts mismatch: {len(finished_parts)} != {len(parts)}"
-         )
-
-     try:
-         # Complete the multipart upload
-         _complete_multipart_upload_from_parts(
-             s3_client=s3_client, state=merge_state, finished_parts=finished_parts
-         )
-     except Exception as e:
-         warnings.warn(f"Error completing multipart upload: {e}")
-         return e
-     return None
-
-
- def _begin_upload(
-     s3_client: BaseClient,
-     parts: list[Part],
-     bucket: str,
-     dst_key: str,
-     verbose: bool,
- ) -> str:
-     """
-     Finish a multipart upload by copying parts from existing S3 objects.
-
-     Args:
-         s3_client: Boto3 S3 client
-         source_bucket: Source bucket name
-         source_keys: List of source object keys to copy from
-         bucket: Destination bucket name
-         dst_key: Destination object key
-         retries: Number of retry attempts
-         byte_ranges: Optional list of byte ranges corresponding to source_keys
-
-     Returns:
-         The upload id of the multipart upload
-     """
-
-     # Initiate multipart upload
-     if verbose:
-         locked_print(
-             f"Creating multipart upload for {bucket}/{dst_key} from {len(parts)} source objects"
-         )
-     create_params: dict[str, str] = {
-         "Bucket": bucket,
-         "Key": dst_key,
-     }
-     if verbose:
-         locked_print(f"Creating multipart upload with {create_params}")
-     mpu = s3_client.create_multipart_upload(**create_params)
-     if verbose:
-         locked_print(f"Created multipart upload: {mpu}")
-     upload_id = mpu["UploadId"]
-     return upload_id
-
-
- class WriteMergeStateThread(Thread):
-     def __init__(self, rclone_impl: RcloneImpl, merge_state: MergeState, verbose: bool):
-         super().__init__(daemon=True)
-         assert isinstance(merge_state, MergeState)
-         self.verbose = verbose
-         self.merge_state = merge_state
-         self.merge_path = merge_state.merge_path
-         self.rclone_impl = rclone_impl
-         self.queue: Queue[FinishedPiece | EndOfStream] = Queue()
-         self.start()
-
-     def _get_next(self) -> FinishedPiece | EndOfStream:
-         item = self.queue.get()
-         if isinstance(item, EndOfStream):
-             return item
-         # see if there are more items in the queue, only write the last one
-         while not self.queue.empty():
-             item = self.queue.get()
-             if isinstance(item, EndOfStream):
-                 # put it back in for next time
-                 self.queue.put(item)
-                 return item
-         return item
-
-     def verbose_print(self, msg: str) -> None:
-         if self.verbose:
-             locked_print(msg)
-
-     def run(self):
-         while True:
-             item = self._get_next()
-             if isinstance(item, EndOfStream):
-                 self.verbose_print("WriteMergeStateThread: End of stream")
-                 break
-
-             assert isinstance(item, FinishedPiece)
-             # piece: FinishedPiece = item
-             # at this point just write out the whole json str
-             json_str = self.merge_state.to_json_str()
-             err = self.rclone_impl.write_text(self.merge_path, json_str)
-             if isinstance(err, Exception):
-                 warnings.warn(f"Error writing merge state: {err}")
-                 break
-
-     def add_finished(self, finished: FinishedPiece) -> None:
-         self.queue.put(finished)
-
-     def add_eos(self) -> None:
-         self.queue.put(EndOfStream())
-
-
- def _cleanup_merge(rclone: RcloneImpl, info: InfoJson) -> Exception | None:
-     size = info.size
-     dst = info.dst
-     parts_dir = info.parts_dir
-     if not rclone.exists(dst):
-         return FileNotFoundError(f"Destination file not found: {dst}")
-
-     write_size = rclone.size_file(dst)
-     if write_size != size:
-         return ValueError(f"Size mismatch: {write_size} != {size}")
-
-     print(f"Upload complete: {dst}")
-     cp = rclone.purge(parts_dir)
-     if cp.failed():
-         return Exception(f"Failed to purge parts dir: {cp}")
-     return None
-
-
- def _get_merge_path(info_path: str) -> str:
-     par_dir = os.path.dirname(info_path)
-     merge_path = f"{par_dir}/merge.json"
-     return merge_path
-
-
- def _begin_or_resume_merge(
-     rclone: RcloneImpl,
-     info: InfoJson,
-     verbose: bool = False,
-     max_workers: int = DEFAULT_MAX_WORKERS,
- ) -> "S3MultiPartMerger | Exception":
-     try:
-         merger: S3MultiPartMerger = S3MultiPartMerger(
-             rclone_impl=rclone,
-             info=info,
-             verbose=verbose,
-             max_workers=max_workers,
-         )
-
-         s3_bucket = merger.bucket
-         is_done = info.fetch_is_done()
-         assert is_done, f"Upload is not done: {info}"
-
-         merge_path = _get_merge_path(info_path=info.src_info)
-         merge_json_text = rclone.read_text(merge_path)
-         if isinstance(merge_json_text, str):
-             # Attempt to do a resume
-             merge_data = json.loads(merge_json_text)
-             merge_state = MergeState.from_json(rclone_impl=rclone, json=merge_data)
-             if isinstance(merge_state, MergeState):
-                 merger._begin_resume_merge(merge_state=merge_state)
-                 return merger
-             warnings.warn(f"Failed to resume merge: {merge_state}, starting new merge")
-
-         parts_dir = info.parts_dir
-         source_keys = info.fetch_all_finished()
-
-         parts_path = parts_dir.split(s3_bucket)[1]
-         if parts_path.startswith("/"):
-             parts_path = parts_path[1:]
-
-         first_part: int | None = info.first_part
-         last_part: int | None = info.last_part
-
-         assert first_part is not None
-         assert last_part is not None
-
-         def _to_s3_key(name: str | None) -> str:
-             if name:
-                 out = f"{parts_path}/{name}"
-                 return out
-             out = f"{parts_path}"
-             return out
-
-         parts: list[Part] = []
-         part_num = first_part
-         for part_key in source_keys:
-             assert part_num <= last_part and part_num >= first_part
-             s3_key = _to_s3_key(name=part_key)
-             part = Part(part_number=part_num, s3_key=s3_key)
-             parts.append(part)
-             part_num += 1
-
-         dst_name = info.dst_name
-         dst_dir = os.path.dirname(parts_path)
-         dst_key = f"{dst_dir}/{dst_name}"
-
-         err = merger._begin_new_merge(
-             merge_path=merge_path,
-             parts=parts,
-             bucket=merger.bucket,
-             dst_key=dst_key,
-         )
-         if isinstance(err, Exception):
-             return err
-         return merger
-     except Exception as e:
-         return e
-
-
- class S3MultiPartMerger:
-     def __init__(
-         self,
-         rclone_impl: RcloneImpl,
-         info: InfoJson,
-         s3_config: S3Config | None = None,
-         verbose: bool = False,
-         max_workers: int = DEFAULT_MAX_WORKERS,
-     ) -> None:
-         self.rclone_impl: RcloneImpl = rclone_impl
-         self.info = info
-         self.s3_creds = rclone_impl.get_s3_credentials(remote=info.dst)
-         self.verbose = verbose
-         s3_config = s3_config or S3Config(
-             verbose=verbose,
-             timeout_read=_TIMEOUT_READ,
-             timeout_connection=_TIMEOUT_CONNECTION,
-             max_pool_connections=max_workers,
-         )
-         self.max_workers = s3_config.max_pool_connections or DEFAULT_MAX_WORKERS
-         self.client = create_s3_client(s3_creds=self.s3_creds, s3_config=s3_config)
-         self.state: MergeState | None = None
-         self.write_thread: WriteMergeStateThread | None = None
-
-     @staticmethod
-     def create(
-         rclone: RcloneImpl, info: InfoJson, max_workers: int, verbose: bool
-     ) -> "S3MultiPartMerger | Exception":
-         return _begin_or_resume_merge(
-             rclone=rclone, info=info, max_workers=max_workers, verbose=verbose
-         )
-
-     @property
-     def bucket(self) -> str:
-         return self.s3_creds.bucket_name
-
-     def start_write_thread(self) -> None:
-         assert self.state is not None
-         assert self.write_thread is None
-         self.write_thread = WriteMergeStateThread(
-             rclone_impl=self.rclone_impl,
-             merge_state=self.state,
-             verbose=self.verbose,
-         )
-
-     def _begin_new_merge(
-         self,
-         parts: list[Part],
-         merge_path: str,
-         bucket: str,
-         dst_key: str,
-     ) -> Exception | None:
-         try:
-             upload_id: str = _begin_upload(
-                 s3_client=self.client,
-                 parts=parts,
-                 bucket=bucket,
-                 dst_key=dst_key,
-                 verbose=self.verbose,
-             )
-             merge_state = MergeState(
-                 rclone_impl=self.rclone_impl,
-                 merge_path=merge_path,
-                 upload_id=upload_id,
-                 bucket=bucket,
-                 dst_key=dst_key,
-                 finished=[],
-                 all_parts=parts,
-             )
-             self.state = merge_state
-             return None
-         except Exception as e:
-             return e
-
-     def _begin_resume_merge(
-         self,
-         merge_state: MergeState,
-     ) -> None:
-         self.state = merge_state
-
-     def _on_piece_finished(self, finished_piece: FinishedPiece | EndOfStream) -> None:
-         assert self.write_thread is not None
-         assert self.state is not None
-         if isinstance(finished_piece, EndOfStream):
-             self.write_thread.add_eos()
-         else:
-             self.state.on_finished(finished_piece)
-             self.write_thread.add_finished(finished_piece)
-
-     def merge(
-         self,
-     ) -> Exception | None:
-         state = self.state
-         if state is None:
-             return Exception("No merge state loaded")
-         self.start_write_thread()
-         err = _do_upload_task(
-             s3_client=self.client,
-             merge_state=state,
-             max_workers=self.max_workers,
-             on_finished=self._on_piece_finished,
-         )
-         if isinstance(err, Exception):
-             return err
-         return None
-
-     def cleanup(self) -> Exception | None:
-         return _cleanup_merge(rclone=self.rclone_impl, info=self.info)
-
-
- def s3_server_side_multi_part_merge(
-     rclone: RcloneImpl,
-     info_path: str,
-     max_workers: int = DEFAULT_MAX_WORKERS,
-     verbose: bool = False,
- ) -> Exception | None:
-     info = InfoJson(rclone, src=None, src_info=info_path)
-     loaded = info.load()
-     if not loaded:
-         return FileNotFoundError(
-             f"Info file not found, has the upload finished? {info_path}"
-         )
-     merger: S3MultiPartMerger | Exception = S3MultiPartMerger.create(
-         rclone=rclone, info=info, max_workers=max_workers, verbose=verbose
-     )
-     if isinstance(merger, Exception):
-         return merger
-
-     err = merger.merge()
-     if isinstance(err, Exception):
-         return err
-
-     err = merger.cleanup()
-     if isinstance(err, Exception):
-         return err
-     return None
+ """
+ https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/upload_part_copy.html
+ * client.upload_part_copy
+
+ This module provides functionality for S3 multipart uploads, including copying parts
+ from existing S3 objects using upload_part_copy.
+ """
+
+ import json
+ import os
+ import time
+ import warnings
+ from concurrent.futures import Future, ThreadPoolExecutor
+ from queue import Queue
+ from threading import Semaphore, Thread
+ from typing import Any, Callable
+
+ from rclone_api.rclone_impl import RcloneImpl
+ from rclone_api.s3.create import (
+     BaseClient,
+     S3Config,
+     create_s3_client,
+ )
+ from rclone_api.s3.multipart.finished_piece import FinishedPiece
+ from rclone_api.s3.multipart.info_json import InfoJson
+ from rclone_api.s3.multipart.merge_state import MergeState, Part
+ from rclone_api.types import EndOfStream
+ from rclone_api.util import locked_print
+
+ DEFAULT_MAX_WORKERS = 5  # Backblaze can do 10 with exponential backoff, so let's try 5
+
+ _TIMEOUT_READ = 900
+ _TIMEOUT_CONNECTION = 900
+
+
+ def _upload_part_copy_task(
+     s3_client: BaseClient,
+     state: MergeState,
+     source_bucket: str,
+     source_key: str,
+     part_number: int,
+ ) -> FinishedPiece | Exception:
+     """
+     Upload a part by copying from an existing S3 object.
+
+     Args:
+         info: Upload information
+         source_bucket: Source bucket name
+         source_key: Source object key
+         part_number: Part number (1-10000)
+         byte_range: Optional byte range in format 'bytes=start-end'
+         retries: Number of retry attempts
+
+     Returns:
+         FinishedPiece with ETag and part number
+     """
+     copy_source = {"Bucket": source_bucket, "Key": source_key}
+
+     # from botocore.exceptions import NoSuchKey
+     default_retries = 9
+     retries = default_retries + 1  # Add one for the initial attempt
+     for retry in range(retries):
+         params: dict = {}
+         try:
+             if retry > 0:
+                 locked_print(f"Retrying part copy {part_number} for {state.dst_key}")
+
+             locked_print(
+                 f"Copying part {part_number} for {state.dst_key} from {source_bucket}/{source_key}"
+             )
+
+             # Prepare the upload_part_copy parameters
+             params = {
+                 "Bucket": state.bucket,
+                 "CopySource": copy_source,
+                 "Key": state.dst_key,
+                 "PartNumber": part_number,
+                 "UploadId": state.upload_id,
+             }
+
+             # Execute the copy operation
+             part = s3_client.upload_part_copy(**params)
+
+             # Extract ETag from the response
+             etag = part["CopyPartResult"]["ETag"]
+             out = FinishedPiece(etag=etag, part_number=part_number)
+             locked_print(f"Finished part {part_number} for {state.dst_key}")
+             return out
+
+         except Exception as e:
+             msg = (
+                 f"Error copying {copy_source} -> {state.dst_key}: {e}, params={params}"
+             )
+             if "An error occurred (InternalError)" in str(e):
+                 locked_print(msg)
+             elif "NoSuchKey" in str(e):
+                 locked_print(msg)
+             if retry == retries - 1:
+                 locked_print(msg)
+                 return e
+             else:
+                 locked_print(f"{msg}, retrying")
+                 # sleep
+                 sleep_time = 2**retry
+                 locked_print(f"Sleeping for {sleep_time} seconds")
+                 continue
+
+     return Exception("Should not reach here")
+
+
+ def _complete_multipart_upload_from_parts(
+     s3_client: BaseClient, state: MergeState, finished_parts: list[FinishedPiece]
+ ) -> Exception | None:
+     """
+     Complete a multipart upload using the provided parts.
+
+     Args:
+         info: Upload information
+         parts: List of finished pieces with ETags
+
+     Returns:
+         The URL of the completed object
+     """
+     # Sort parts by part number to ensure correct order
+     finished_parts.sort(key=lambda x: x.part_number)
+     multipart_parts = FinishedPiece.to_json_array(finished_parts)
+     multipart_upload: dict = {
+         "Parts": multipart_parts,
+     }
+     response: Any = None
+     try:
+         # Complete the multipart upload
+         response = s3_client.complete_multipart_upload(
+             Bucket=state.bucket,
+             Key=state.dst_key,
+             UploadId=state.upload_id,
+             MultipartUpload=multipart_upload,
+         )
+     except Exception as e:
+         import traceback
+
+         stacktrace = traceback.format_exc()
+         warnings.warn(
+             f"Error completing multipart upload: {e}\n\n{response}\n\n{stacktrace}"
+         )
+         return e
+
+     return None
+
+
+ def _do_upload_task(
+     s3_client: BaseClient,
+     max_workers: int,
+     merge_state: MergeState,
+     on_finished: Callable[[FinishedPiece | EndOfStream], None],
+ ) -> Exception | None:
+     futures: list[Future[FinishedPiece | Exception]] = []
+     parts = merge_state.remaining_parts()
+     source_bucket = merge_state.bucket
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         semaphore = Semaphore(max_workers)
+         for part in parts:
+             part_number, s3_key = part.part_number, part.s3_key
+
+             def task(
+                 s3_client=s3_client,
+                 state=merge_state,
+                 source_bucket=source_bucket,
+                 s3_key=s3_key,
+                 part_number=part_number,
+             ):
+                 out = _upload_part_copy_task(
+                     s3_client=s3_client,
+                     state=state,
+                     source_bucket=source_bucket,
+                     source_key=s3_key,
+                     part_number=part_number,
+                 )
+                 if isinstance(out, Exception):
+                     return out
+                 # merge_state.on_finished(out)
+                 on_finished(out)
+                 return out
+
+             fut = executor.submit(task)
+             fut.add_done_callback(lambda x: semaphore.release())
+             futures.append(fut)
+
+             while not semaphore.acquire(blocking=False):
+                 time.sleep(0.1)
+
+         final_fut = executor.submit(lambda: on_finished(EndOfStream()))
+
+         for fut in futures:
+             finished_part = fut.result()
+             if isinstance(finished_part, Exception):
+                 executor.shutdown(wait=True, cancel_futures=True)
+                 return finished_part
+         final_fut.result()
+
+     finished_parts = merge_state.finished
+     try:
+         assert len(finished_parts) == len(merge_state.all_parts)
+     except Exception:
+         return ValueError(
+             f"Finished parts mismatch: {len(finished_parts)} != {len(parts)}"
+         )
+
+     try:
+         # Complete the multipart upload
+         _complete_multipart_upload_from_parts(
+             s3_client=s3_client, state=merge_state, finished_parts=finished_parts
+         )
+     except Exception as e:
+         warnings.warn(f"Error completing multipart upload: {e}")
+         return e
+     return None
+
+
+ def _begin_upload(
+     s3_client: BaseClient,
+     parts: list[Part],
+     bucket: str,
+     dst_key: str,
+     verbose: bool,
+ ) -> str:
+     """
+     Finish a multipart upload by copying parts from existing S3 objects.
+
+     Args:
+         s3_client: Boto3 S3 client
+         source_bucket: Source bucket name
+         source_keys: List of source object keys to copy from
+         bucket: Destination bucket name
+         dst_key: Destination object key
+         retries: Number of retry attempts
+         byte_ranges: Optional list of byte ranges corresponding to source_keys
+
+     Returns:
+         The upload id of the multipart upload
+     """
+
+     # Initiate multipart upload
+     if verbose:
+         locked_print(
+             f"Creating multipart upload for {bucket}/{dst_key} from {len(parts)} source objects"
+         )
+     create_params: dict[str, str] = {
+         "Bucket": bucket,
+         "Key": dst_key,
+     }
+     if verbose:
+         locked_print(f"Creating multipart upload with {create_params}")
+     mpu = s3_client.create_multipart_upload(**create_params)
+     if verbose:
+         locked_print(f"Created multipart upload: {mpu}")
+     upload_id = mpu["UploadId"]
+     return upload_id
+
+
+ class WriteMergeStateThread(Thread):
+     def __init__(self, rclone_impl: RcloneImpl, merge_state: MergeState, verbose: bool):
+         super().__init__(daemon=True)
+         assert isinstance(merge_state, MergeState)
+         self.verbose = verbose
+         self.merge_state = merge_state
+         self.merge_path = merge_state.merge_path
+         self.rclone_impl = rclone_impl
+         self.queue: Queue[FinishedPiece | EndOfStream] = Queue()
+         self.start()
+
+     def _get_next(self) -> FinishedPiece | EndOfStream:
+         item = self.queue.get()
+         if isinstance(item, EndOfStream):
+             return item
+         # see if there are more items in the queue, only write the last one
+         while not self.queue.empty():
+             item = self.queue.get()
+             if isinstance(item, EndOfStream):
+                 # put it back in for next time
+                 self.queue.put(item)
+                 return item
+         return item
+
+     def verbose_print(self, msg: str) -> None:
+         if self.verbose:
+             locked_print(msg)
+
+     def run(self):
+         while True:
+             item = self._get_next()
+             if isinstance(item, EndOfStream):
+                 self.verbose_print("WriteMergeStateThread: End of stream")
+                 break
+
+             assert isinstance(item, FinishedPiece)
+             # piece: FinishedPiece = item
+             # at this point just write out the whole json str
+             json_str = self.merge_state.to_json_str()
+             err = self.rclone_impl.write_text(self.merge_path, json_str)
+             if isinstance(err, Exception):
+                 warnings.warn(f"Error writing merge state: {err}")
+                 break
+
+     def add_finished(self, finished: FinishedPiece) -> None:
+         self.queue.put(finished)
+
+     def add_eos(self) -> None:
+         self.queue.put(EndOfStream())
+
+
+ def _cleanup_merge(rclone: RcloneImpl, info: InfoJson) -> Exception | None:
+     size = info.size
+     dst = info.dst
+     parts_dir = info.parts_dir
+     if not rclone.exists(dst):
+         return FileNotFoundError(f"Destination file not found: {dst}")
+
+     write_size = rclone.size_file(dst)
+     if write_size != size:
+         return ValueError(f"Size mismatch: {write_size} != {size}")
+
+     print(f"Upload complete: {dst}")
+     cp = rclone.purge(parts_dir)
+     if cp.failed():
+         return Exception(f"Failed to purge parts dir: {cp}")
+     return None
+
+
+ def _get_merge_path(info_path: str) -> str:
+     par_dir = os.path.dirname(info_path)
+     merge_path = f"{par_dir}/merge.json"
+     return merge_path
+
+
+ def _begin_or_resume_merge(
+     rclone: RcloneImpl,
+     info: InfoJson,
+     verbose: bool = False,
+     max_workers: int = DEFAULT_MAX_WORKERS,
+ ) -> "S3MultiPartMerger | Exception":
+     try:
+         merger: S3MultiPartMerger = S3MultiPartMerger(
+             rclone_impl=rclone,
+             info=info,
+             verbose=verbose,
+             max_workers=max_workers,
+         )
+
+         s3_bucket = merger.bucket
+         is_done = info.fetch_is_done()
+         assert is_done, f"Upload is not done: {info}"
+
+         merge_path = _get_merge_path(info_path=info.src_info)
+         merge_json_text = rclone.read_text(merge_path)
+         if isinstance(merge_json_text, str):
+             # Attempt to do a resume
+             merge_data = json.loads(merge_json_text)
+             merge_state = MergeState.from_json(rclone_impl=rclone, json=merge_data)
+             if isinstance(merge_state, MergeState):
+                 merger._begin_resume_merge(merge_state=merge_state)
+                 return merger
+             warnings.warn(f"Failed to resume merge: {merge_state}, starting new merge")
+
+         parts_dir = info.parts_dir
+         source_keys = info.fetch_all_finished()
+
+         parts_path = parts_dir.split(s3_bucket)[1]
+         if parts_path.startswith("/"):
+             parts_path = parts_path[1:]
+
+         first_part: int | None = info.first_part
+         last_part: int | None = info.last_part
+
+         assert first_part is not None
+         assert last_part is not None
+
+         def _to_s3_key(name: str | None) -> str:
+             if name:
+                 out = f"{parts_path}/{name}"
+                 return out
+             out = f"{parts_path}"
+             return out
+
+         parts: list[Part] = []
+         part_num = first_part
+         for part_key in source_keys:
+             assert part_num <= last_part and part_num >= first_part
+             s3_key = _to_s3_key(name=part_key)
+             part = Part(part_number=part_num, s3_key=s3_key)
+             parts.append(part)
+             part_num += 1
+
+         dst_name = info.dst_name
+         dst_dir = os.path.dirname(parts_path)
+         dst_key = f"{dst_dir}/{dst_name}"
+
+         err = merger._begin_new_merge(
+             merge_path=merge_path,
+             parts=parts,
+             bucket=merger.bucket,
+             dst_key=dst_key,
+         )
+         if isinstance(err, Exception):
+             return err
+         return merger
+     except Exception as e:
+         return e
+
+
+ class S3MultiPartMerger:
+     def __init__(
+         self,
+         rclone_impl: RcloneImpl,
+         info: InfoJson,
+         s3_config: S3Config | None = None,
+         verbose: bool = False,
+         max_workers: int = DEFAULT_MAX_WORKERS,
+     ) -> None:
+         self.rclone_impl: RcloneImpl = rclone_impl
+         self.info = info
+         self.s3_creds = rclone_impl.get_s3_credentials(remote=info.dst)
+         self.verbose = verbose
+         s3_config = s3_config or S3Config(
+             verbose=verbose,
+             timeout_read=_TIMEOUT_READ,
+             timeout_connection=_TIMEOUT_CONNECTION,
+             max_pool_connections=max_workers,
+         )
+         self.max_workers = s3_config.max_pool_connections or DEFAULT_MAX_WORKERS
+         self.client = create_s3_client(s3_creds=self.s3_creds, s3_config=s3_config)
+         self.state: MergeState | None = None
+         self.write_thread: WriteMergeStateThread | None = None
+
+     @staticmethod
+     def create(
+         rclone: RcloneImpl, info: InfoJson, max_workers: int, verbose: bool
+     ) -> "S3MultiPartMerger | Exception":
+         return _begin_or_resume_merge(
+             rclone=rclone, info=info, max_workers=max_workers, verbose=verbose
+         )
+
+     @property
+     def bucket(self) -> str:
+         return self.s3_creds.bucket_name
+
+     def start_write_thread(self) -> None:
+         assert self.state is not None
+         assert self.write_thread is None
+         self.write_thread = WriteMergeStateThread(
+             rclone_impl=self.rclone_impl,
+             merge_state=self.state,
+             verbose=self.verbose,
+         )
+
+     def _begin_new_merge(
+         self,
+         parts: list[Part],
+         merge_path: str,
+         bucket: str,
+         dst_key: str,
+     ) -> Exception | None:
+         try:
+             upload_id: str = _begin_upload(
+                 s3_client=self.client,
+                 parts=parts,
+                 bucket=bucket,
+                 dst_key=dst_key,
+                 verbose=self.verbose,
+             )
+             merge_state = MergeState(
+                 rclone_impl=self.rclone_impl,
+                 merge_path=merge_path,
+                 upload_id=upload_id,
+                 bucket=bucket,
+                 dst_key=dst_key,
+                 finished=[],
+                 all_parts=parts,
+             )
+             self.state = merge_state
+             return None
+         except Exception as e:
+             return e
+
+     def _begin_resume_merge(
+         self,
+         merge_state: MergeState,
+     ) -> None:
+         self.state = merge_state
+
+     def _on_piece_finished(self, finished_piece: FinishedPiece | EndOfStream) -> None:
+         assert self.write_thread is not None
+         assert self.state is not None
+         if isinstance(finished_piece, EndOfStream):
+             self.write_thread.add_eos()
+         else:
+             self.state.on_finished(finished_piece)
+             self.write_thread.add_finished(finished_piece)
+
+     def merge(
+         self,
+     ) -> Exception | None:
+         state = self.state
+         if state is None:
+             return Exception("No merge state loaded")
+         self.start_write_thread()
+         err = _do_upload_task(
+             s3_client=self.client,
+             merge_state=state,
+             max_workers=self.max_workers,
+             on_finished=self._on_piece_finished,
+         )
+         if isinstance(err, Exception):
+             return err
+         return None
+
+     def cleanup(self) -> Exception | None:
+         return _cleanup_merge(rclone=self.rclone_impl, info=self.info)
+
+
+ def s3_server_side_multi_part_merge(
+     rclone: RcloneImpl,
+     info_path: str,
+     max_workers: int = DEFAULT_MAX_WORKERS,
+     verbose: bool = False,
+ ) -> Exception | None:
+     info = InfoJson(rclone, src=None, src_info=info_path)
+     loaded = info.load()
+     if not loaded:
+         return FileNotFoundError(
+             f"Info file not found, has the upload finished? {info_path}"
+         )
+     merger: S3MultiPartMerger | Exception = S3MultiPartMerger.create(
+         rclone=rclone, info=info, max_workers=max_workers, verbose=verbose
+     )
+     if isinstance(merger, Exception):
+         return merger
+
+     err = merger.merge()
+     if isinstance(err, Exception):
+         return err
+
+     err = merger.cleanup()
+     if isinstance(err, Exception):
+         return err
+     return None
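
For context, the module shown in this diff performs a server-side merge: it creates a multipart upload on the destination key, fills each part with client.upload_part_copy from an already-uploaded part object, and finishes with client.complete_multipart_upload using the collected ETags (see _begin_upload, _upload_part_copy_task, and _complete_multipart_upload_from_parts above). The sketch below is not the package's code; it is a minimal illustration of the underlying boto3 call sequence, with placeholder bucket and key names, and it omits the retry, resume, and merge-state bookkeeping the module adds.

import boto3

# Placeholder names, not taken from the package.
BUCKET = "my-bucket"
PART_KEYS = ["parts/part.00001", "parts/part.00002"]  # already-uploaded part objects
DST_KEY = "merged/output.bin"

s3 = boto3.client("s3")

# 1. Start a multipart upload on the destination key.
mpu = s3.create_multipart_upload(Bucket=BUCKET, Key=DST_KEY)
upload_id = mpu["UploadId"]

# 2. Fill each part by server-side copy from an existing object; no object data
#    passes through the client. Every part except the last must be >= 5 MiB.
parts = []
for part_number, src_key in enumerate(PART_KEYS, start=1):
    resp = s3.upload_part_copy(
        Bucket=BUCKET,
        Key=DST_KEY,
        PartNumber=part_number,
        UploadId=upload_id,
        CopySource={"Bucket": BUCKET, "Key": src_key},
    )
    parts.append({"ETag": resp["CopyPartResult"]["ETag"], "PartNumber": part_number})

# 3. Complete the upload with the collected ETags, in part-number order.
s3.complete_multipart_upload(
    Bucket=BUCKET,
    Key=DST_KEY,
    UploadId=upload_id,
    MultipartUpload={"Parts": parts},
)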