rclone-api 1.4.15__py2.py3-none-any.whl → 1.4.19__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,38 +6,39 @@ This module provides functionality for S3 multipart uploads, including copying p
 from existing S3 objects using upload_part_copy.
 """
 
+import json
+import os
+import time
+import warnings
 from concurrent.futures import Future, ThreadPoolExecutor
-from dataclasses import dataclass
-from pathlib import Path
-from threading import Semaphore
-from typing import Optional
-
-from botocore.client import BaseClient
-
+from queue import Queue
+from threading import Semaphore, Thread
+from typing import Callable
+
+from rclone_api.detail.copy_file_parts import InfoJson
+from rclone_api.rclone_impl import RcloneImpl
+from rclone_api.s3.create import (
+    BaseClient,
+    S3Config,
+    create_s3_client,
+)
+from rclone_api.s3.merge_state import MergeState, Part
 from rclone_api.s3.multipart.finished_piece import FinishedPiece
+from rclone_api.types import EndOfStream
 from rclone_api.util import locked_print
 
+DEFAULT_MAX_WORKERS = 5  # Backblaze can do 10 with exponential backoff, so let's try 5
 
-@dataclass
-class MultipartUploadInfo:
-    """Simplified upload information for multipart uploads."""
-
-    s3_client: BaseClient
-    bucket_name: str
-    object_name: str
-    upload_id: str
-    chunk_size: int
-    retries: int
-    file_size: Optional[int] = None
-    src_file_path: Optional[Path] = None
+_TIMEOUT_READ = 900
+_TIMEOUT_CONNECTION = 900
 
 
-def upload_part_copy_task(
-    info: MultipartUploadInfo,
+def _upload_part_copy_task(
+    s3_client: BaseClient,
+    state: MergeState,
     source_bucket: str,
     source_key: str,
     part_number: int,
-    retries: int = 3,
 ) -> FinishedPiece | Exception:
     """
     Upload a part by copying from an existing S3 object.
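The refactor drops the local MultipartUploadInfo dataclass in favor of MergeState and Part from rclone_api.s3.merge_state, whose definitions are not part of this diff. A minimal, hypothetical sketch of the shape the new code appears to rely on, with field and method names inferred from how they are used in the hunks below (the real classes may differ):

```python
# Hypothetical sketch only -- inferred from usage in this diff,
# not the actual rclone_api.s3.merge_state definitions.
from dataclasses import dataclass, field


@dataclass
class Part:
    part_number: int  # 1-based S3 part number
    s3_key: str       # key of the already-uploaded part object


@dataclass
class MergeState:
    bucket: str      # destination bucket
    dst_key: str     # destination object key
    upload_id: str   # multipart upload id returned by S3
    merge_path: str  # rclone path where progress JSON is persisted
    all_parts: list[Part] = field(default_factory=list)
    finished: list = field(default_factory=list)  # FinishedPiece entries

    def remaining_parts(self) -> list[Part]:
        # Parts not yet copied, so a resumed merge can skip finished work.
        done = {p.part_number for p in self.finished}
        return [p for p in self.all_parts if p.part_number not in done]
```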
@@ -56,51 +57,59 @@ def upload_part_copy_task(
     copy_source = {"Bucket": source_bucket, "Key": source_key}
 
     # from botocore.exceptions import NoSuchKey
-
-    retries = retries + 1  # Add one for the initial attempt
+    default_retries = 9
+    retries = default_retries + 1  # Add one for the initial attempt
     for retry in range(retries):
         params: dict = {}
         try:
             if retry > 0:
-                locked_print(f"Retrying part copy {part_number} for {info.object_name}")
+                locked_print(f"Retrying part copy {part_number} for {state.dst_key}")
 
             locked_print(
-                f"Copying part {part_number} for {info.object_name} from {source_bucket}/{source_key}"
+                f"Copying part {part_number} for {state.dst_key} from {source_bucket}/{source_key}"
             )
 
             # Prepare the upload_part_copy parameters
             params = {
-                "Bucket": info.bucket_name,
+                "Bucket": state.bucket,
                 "CopySource": copy_source,
-                "Key": info.object_name,
+                "Key": state.dst_key,
                 "PartNumber": part_number,
-                "UploadId": info.upload_id,
+                "UploadId": state.upload_id,
             }
 
             # Execute the copy operation
-            part = info.s3_client.upload_part_copy(**params)
+            part = s3_client.upload_part_copy(**params)
 
             # Extract ETag from the response
             etag = part["CopyPartResult"]["ETag"]
-            return FinishedPiece(etag=etag, part_number=part_number)
+            out = FinishedPiece(etag=etag, part_number=part_number)
+            locked_print(f"Finished part {part_number} for {state.dst_key}")
+            return out
 
         except Exception as e:
-            msg = f"Error copying {copy_source} -> {info.object_name}: {e}, params={params}"
-            if "NoSuchKey" in str(e):
+            msg = (
+                f"Error copying {copy_source} -> {state.dst_key}: {e}, params={params}"
+            )
+            if "An error occurred (InternalError)" in str(e):
+                locked_print(msg)
+            elif "NoSuchKey" in str(e):
                 locked_print(msg)
-                return e
             if retry == retries - 1:
                 locked_print(msg)
                 return e
             else:
                 locked_print(f"{msg}, retrying")
+                # sleep
+                sleep_time = 2**retry
+                locked_print(f"Sleeping for {sleep_time} seconds")
                 continue
 
     return Exception("Should not reach here")
 
 
-def complete_multipart_upload_from_parts(
-    info: MultipartUploadInfo, parts: list[FinishedPiece]
+def _complete_multipart_upload_from_parts(
+    s3_client: BaseClient, state: MergeState, finished_parts: list[FinishedPiece]
 ) -> str:
     """
     Complete a multipart upload using the provided parts.
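The retry loop now makes up to ten attempts (one initial attempt plus nine retries) and computes an exponential backoff delay of 2**retry seconds between attempts, returning the exception only once the attempts are exhausted. A standalone sketch of that pattern, with a hypothetical do_copy callable standing in for the S3 call (not the package's code):

```python
import time


def copy_with_backoff(do_copy, max_retries: int = 9):
    """Retry an operation with exponential backoff (sketch only)."""
    attempts = max_retries + 1  # one initial attempt plus the retries
    for retry in range(attempts):
        try:
            return do_copy()
        except Exception as e:
            if retry == attempts - 1:
                return e  # out of attempts, surface the error to the caller
            sleep_time = 2**retry  # 1s, 2s, 4s, ... between attempts
            time.sleep(sleep_time)
    return Exception("Should not reach here")
```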
@@ -113,105 +122,61 @@ def complete_multipart_upload_from_parts(
         The URL of the completed object
     """
     # Sort parts by part number to ensure correct order
-    parts.sort(key=lambda x: x.part_number)
-
-    # Prepare the parts list for the complete_multipart_upload call
-    multipart_parts = [
-        {"ETag": part.etag, "PartNumber": part.part_number} for part in parts
-    ]
+    finished_parts.sort(key=lambda x: x.part_number)
+    multipart_parts = FinishedPiece.to_json_array(finished_parts)
 
     # Complete the multipart upload
-    response = info.s3_client.complete_multipart_upload(
-        Bucket=info.bucket_name,
-        Key=info.object_name,
-        UploadId=info.upload_id,
+    response = s3_client.complete_multipart_upload(
+        Bucket=state.bucket,
+        Key=state.dst_key,
+        UploadId=state.upload_id,
         MultipartUpload={"Parts": multipart_parts},
     )
 
     # Return the URL of the completed object
-    return response.get("Location", f"s3://{info.bucket_name}/{info.object_name}")
+    return response.get("Location", f"s3://{state.bucket}/{state.dst_key}")
 
 
-def finish_multipart_upload_from_keys(
+def _do_upload_task(
     s3_client: BaseClient,
-    source_bucket: str,
-    parts: list[tuple[int, str]],
-    final_size: int,
-    destination_bucket: str,
-    destination_key: str,
-    chunk_size: int,  # 5MB default
-    max_workers: int = 100,
-    retries: int = 3,
-) -> str:
-    """
-    Finish a multipart upload by copying parts from existing S3 objects.
-
-    Args:
-        s3_client: Boto3 S3 client
-        source_bucket: Source bucket name
-        source_keys: List of source object keys to copy from
-        destination_bucket: Destination bucket name
-        destination_key: Destination object key
-        chunk_size: Size of each part in bytes
-        retries: Number of retry attempts
-        byte_ranges: Optional list of byte ranges corresponding to source_keys
-
-    Returns:
-        The URL of the completed object
-    """
-
-    # Initiate multipart upload
-    locked_print(
-        f"Creating multipart upload for {destination_bucket}/{destination_key} from {len(parts)} source objects"
-    )
-
-    create_params: dict[str, str] = {
-        "Bucket": destination_bucket,
-        "Key": destination_key,
-    }
-    print(f"Creating multipart upload with {create_params}")
-    mpu = s3_client.create_multipart_upload(**create_params)
-    print(f"Created multipart upload: {mpu}")
-    upload_id = mpu["UploadId"]
-
-    # Create upload info
-    upload_info = MultipartUploadInfo(
-        s3_client=s3_client,
-        bucket_name=destination_bucket,
-        object_name=destination_key,
-        upload_id=upload_id,
-        retries=retries,
-        chunk_size=chunk_size,
-        file_size=final_size,
-    )
-
+    max_workers: int,
+    merge_state: MergeState,
+    on_finished: Callable[[FinishedPiece | EndOfStream], None],
+) -> Exception | None:
     futures: list[Future[FinishedPiece | Exception]] = []
-
+    parts = merge_state.remaining_parts()
+    source_bucket = merge_state.bucket
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # semaphore
-
-        semaphore = Semaphore(max_workers * 2)
-        for part_number, source_key in parts:
+        semaphore = Semaphore(max_workers)
+        for part in parts:
+            part_number, s3_key = part.part_number, part.s3_key
 
             def task(
-                info=upload_info,
+                s3_client=s3_client,
+                state=merge_state,
                 source_bucket=source_bucket,
-                source_key=source_key,
+                s3_key=s3_key,
                 part_number=part_number,
-                retries=retries,
             ):
-                return upload_part_copy_task(
-                    info=info,
+                out = _upload_part_copy_task(
+                    s3_client=s3_client,
+                    state=state,
                     source_bucket=source_bucket,
-                    source_key=source_key,
+                    source_key=s3_key,
                     part_number=part_number,
-                    retries=retries,
                 )
+                if isinstance(out, Exception):
+                    return out
+                # merge_state.on_finished(out)
+                on_finished(out)
+                return out
 
             fut = executor.submit(task)
             fut.add_done_callback(lambda x: semaphore.release())
             futures.append(fut)
-            semaphore.acquire()
+
+            while not semaphore.acquire(blocking=False):
+                time.sleep(0.1)
 
         # Upload parts by copying from source objects
        finished_parts: list[FinishedPiece] = []
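The new _do_upload_task throttles submissions with a Semaphore sized to max_workers: each completed future releases a permit, and the submit loop polls acquire(blocking=False) so the number of in-flight copies stays bounded to roughly max_workers. A self-contained sketch of that throttling pattern, using a dummy work function rather than the package's copy task:

```python
import time
from concurrent.futures import Future, ThreadPoolExecutor
from threading import Semaphore


def run_throttled(items: list[int], max_workers: int = 5) -> list[int]:
    """Submit one task per item, bounding in-flight work to about max_workers (sketch)."""
    semaphore = Semaphore(max_workers)
    futures: list[Future[int]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for item in items:

            def task(item=item):
                return item * 2  # placeholder for the real copy work

            fut = executor.submit(task)
            # Release a permit whenever a task finishes, freeing a slot.
            fut.add_done_callback(lambda _: semaphore.release())
            futures.append(fut)
            # Wait for a free slot before submitting the next task.
            while not semaphore.acquire(blocking=False):
                time.sleep(0.1)
    return [f.result() for f in futures]
```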
@@ -220,50 +185,334 @@ def finish_multipart_upload_from_keys(
             finished_part = fut.result()
             if isinstance(finished_part, Exception):
                 executor.shutdown(wait=True, cancel_futures=True)
-                raise finished_part
+                return finished_part
             finished_parts.append(finished_part)
 
-    # Complete the multipart upload
-    return complete_multipart_upload_from_parts(upload_info, finished_parts)
+    on_finished(EndOfStream())
+
+    try:
+        # Complete the multipart upload
+        _complete_multipart_upload_from_parts(
+            s3_client=s3_client, state=merge_state, finished_parts=finished_parts
+        )
+    except Exception as e:
+        warnings.warn(f"Error completing multipart upload: {e}")
+        return e
+    return None
+
+
+def _begin_upload(
+    s3_client: BaseClient,
+    parts: list[Part],
+    bucket: str,
+    dst_key: str,
+    verbose: bool,
+) -> str:
+    """
+    Finish a multipart upload by copying parts from existing S3 objects.
+
+    Args:
+        s3_client: Boto3 S3 client
+        source_bucket: Source bucket name
+        source_keys: List of source object keys to copy from
+        bucket: Destination bucket name
+        dst_key: Destination object key
+        retries: Number of retry attempts
+        byte_ranges: Optional list of byte ranges corresponding to source_keys
+
+    Returns:
+        The upload id of the multipart upload
+    """
+
+    # Initiate multipart upload
+    if verbose:
+        locked_print(
+            f"Creating multipart upload for {bucket}/{dst_key} from {len(parts)} source objects"
+        )
+    create_params: dict[str, str] = {
+        "Bucket": bucket,
+        "Key": dst_key,
+    }
+    if verbose:
+        locked_print(f"Creating multipart upload with {create_params}")
+    mpu = s3_client.create_multipart_upload(**create_params)
+    if verbose:
+        locked_print(f"Created multipart upload: {mpu}")
+    upload_id = mpu["UploadId"]
+    return upload_id
+
+
+class WriteMergeStateThread(Thread):
+    def __init__(self, rclone_impl: RcloneImpl, merge_state: MergeState):
+        super().__init__(daemon=True)
+        assert isinstance(merge_state, MergeState)
+        self.merge_state = merge_state
+        self.merge_path = merge_state.merge_path
+        self.rclone_impl = rclone_impl
+        self.queue: Queue[FinishedPiece | EndOfStream] = Queue()
+        self.start()
+
+    def _get_next(self) -> FinishedPiece | EndOfStream:
+        item = self.queue.get()
+        if isinstance(item, EndOfStream):
+            return item
+        # see if there are more items in the queue, only write the last one
+        while not self.queue.empty():
+            item = self.queue.get()
+            if isinstance(item, EndOfStream):
+                # put it back in for next time
+                self.queue.put(item)
+                return item
+        return item
+
+    def run(self):
+        while True:
+            item = self._get_next()
+            if isinstance(item, EndOfStream):
+                warnings.warn("End of stream")
+                break
+
+            assert isinstance(item, FinishedPiece)
+            # piece: FinishedPiece = item
+            # at this point just write out the whole json str
+            json_str = self.merge_state.to_json_str()
+            err = self.rclone_impl.write_text(self.merge_path, json_str)
+            if isinstance(err, Exception):
+                warnings.warn(f"Error writing merge state: {err}")
+                break
+
+    def add_finished(self, finished: FinishedPiece) -> None:
+        self.queue.put(finished)
+
+    def add_eos(self) -> None:
+        self.queue.put(EndOfStream())
+
+
+def _cleanup_merge(rclone: RcloneImpl, info: InfoJson) -> Exception | None:
+    size = info.size
+    dst = info.dst
+    parts_dir = info.parts_dir
+    if not rclone.exists(dst):
+        return FileNotFoundError(f"Destination file not found: {dst}")
+
+    write_size = rclone.size_file(dst)
+    if write_size != size:
+        return ValueError(f"Size mismatch: {write_size} != {size}")
+
+    print(f"Upload complete: {dst}")
+    cp = rclone.purge(parts_dir)
+    if cp.failed():
+        return Exception(f"Failed to purge parts dir: {cp}")
+    return None
+
+
+def _get_merge_path(info_path: str) -> str:
+    par_dir = os.path.dirname(info_path)
+    merge_path = f"{par_dir}/merge.json"
+    return merge_path
+
+
+def _begin_or_resume_merge(
+    rclone: RcloneImpl,
+    info: InfoJson,
+    max_workers: int = DEFAULT_MAX_WORKERS,
+) -> "S3MultiPartMerger | Exception":
+    try:
+        merger: S3MultiPartMerger = S3MultiPartMerger(
+            rclone_impl=rclone,
+            info=info,
+            verbose=True,
+            max_workers=max_workers,
+        )
+
+        s3_bucket = merger.bucket
+        is_done = info.fetch_is_done()
+        assert is_done, f"Upload is not done: {info}"
+
+        merge_path = _get_merge_path(info_path=info.src_info)
+        merge_json_text = rclone.read_text(merge_path)
+        if isinstance(merge_json_text, str):
+            # Attempt to do a resume
+            merge_data = json.loads(merge_json_text)
+            merge_state = MergeState.from_json(rclone_impl=rclone, json=merge_data)
+            if isinstance(merge_state, MergeState):
+                merger._begin_resume_merge(merge_state=merge_state)
+                return merger
+            warnings.warn(f"Failed to resume merge: {merge_state}, starting new merge")
+
+        parts_dir = info.parts_dir
+        source_keys = info.fetch_all_finished()
+
+        parts_path = parts_dir.split(s3_bucket)[1]
+        if parts_path.startswith("/"):
+            parts_path = parts_path[1:]
+
+        first_part: int | None = info.first_part
+        last_part: int | None = info.last_part
+
+        assert first_part is not None
+        assert last_part is not None
+
+        def _to_s3_key(name: str | None) -> str:
+            if name:
+                out = f"{parts_path}/{name}"
+                return out
+            out = f"{parts_path}"
+            return out
+
+        parts: list[Part] = []
+        part_num = first_part
+        for part_key in source_keys:
+            assert part_num <= last_part and part_num >= first_part
+            s3_key = _to_s3_key(name=part_key)
+            part = Part(part_number=part_num, s3_key=s3_key)
+            parts.append(part)
+            part_num += 1
+
+        dst_name = info.dst_name
+        dst_dir = os.path.dirname(parts_path)
+        dst_key = f"{dst_dir}/{dst_name}"
+
+        err = merger._begin_new_merge(
+            merge_path=merge_path,
+            parts=parts,
+            bucket=merger.bucket,
+            dst_key=dst_key,
+        )
+        if isinstance(err, Exception):
+            return err
+        return merger
+    except Exception as e:
+        return e
 
 
-class S3MultiPartUploader:
-    def __init__(self, s3_client: BaseClient, verbose: bool) -> None:
-        self.s3_client = s3_client
+class S3MultiPartMerger:
+    def __init__(
+        self,
+        rclone_impl: RcloneImpl,
+        info: InfoJson,
+        s3_config: S3Config | None = None,
+        verbose: bool = False,
+        max_workers: int = DEFAULT_MAX_WORKERS,
+    ) -> None:
+        self.rclone_impl: RcloneImpl = rclone_impl
+        self.info = info
+        self.s3_creds = rclone_impl.get_s3_credentials(remote=info.dst)
         self.verbose = verbose
+        s3_config = s3_config or S3Config(
+            verbose=verbose,
+            timeout_read=_TIMEOUT_READ,
+            timeout_connection=_TIMEOUT_CONNECTION,
+            max_pool_connections=max_workers,
+        )
+        self.max_workers = s3_config.max_pool_connections or DEFAULT_MAX_WORKERS
+        self.client = create_s3_client(s3_creds=self.s3_creds, s3_config=s3_config)
+        self.state: MergeState | None = None
+        self.write_thread: WriteMergeStateThread | None = None
+
+    @staticmethod
+    def create(
+        rclone: RcloneImpl, info: InfoJson, max_workers: int
+    ) -> "S3MultiPartMerger | Exception":
+        return _begin_or_resume_merge(rclone=rclone, info=info, max_workers=max_workers)
+
+    @property
+    def bucket(self) -> str:
+        return self.s3_creds.bucket_name
+
+    def start_write_thread(self) -> None:
+        assert self.state is not None
+        assert self.write_thread is None
+        self.write_thread = WriteMergeStateThread(
+            rclone_impl=self.rclone_impl,
+            merge_state=self.state,
+        )
 
-    def finish_from_keys(
+    def _begin_new_merge(
         self,
-        source_bucket: str,
-        parts: list[tuple[int, str]],
-        destination_bucket: str,
-        destination_key: str,
-        chunk_size: int,
-        final_size: int,
-        retries: int = 100,
-    ) -> str:
-        """
-        Finish a multipart upload by copying parts from existing S3 objects.
-
-        Args:
-            source_bucket: Source bucket name
-            source_keys: List of source object keys to copy from
-            destination_bucket: Destination bucket name
-            destination_key: Destination object key
-            chunk_size: Size of each part in bytes
-            retries: Number of retry attempts
-            byte_ranges: Optional list of byte ranges corresponding to source_keys
-
-        Returns:
-            The URL of the completed object
-        """
-        return finish_multipart_upload_from_keys(
-            s3_client=self.s3_client,
-            source_bucket=source_bucket,
-            parts=parts,
-            destination_bucket=destination_bucket,
-            destination_key=destination_key,
-            chunk_size=chunk_size,
-            final_size=final_size,
-            retries=retries,
+        parts: list[Part],
+        merge_path: str,
+        bucket: str,
+        dst_key: str,
+    ) -> Exception | None:
+        try:
+            upload_id: str = _begin_upload(
+                s3_client=self.client,
+                parts=parts,
+                bucket=bucket,
+                dst_key=dst_key,
+                verbose=self.verbose,
+            )
+            merge_state = MergeState(
+                rclone_impl=self.rclone_impl,
+                merge_path=merge_path,
+                upload_id=upload_id,
+                bucket=bucket,
+                dst_key=dst_key,
+                finished=[],
+                all_parts=parts,
+            )
+            self.state = merge_state
+            return None
+        except Exception as e:
+            return e
+
+    def _begin_resume_merge(
+        self,
+        merge_state: MergeState,
+    ) -> None:
+        self.state = merge_state
+
+    def _on_piece_finished(self, finished_piece: FinishedPiece | EndOfStream) -> None:
+        assert self.write_thread is not None
+        assert self.state is not None
+        if isinstance(finished_piece, EndOfStream):
+            self.write_thread.add_eos()
+        else:
+            self.state.on_finished(finished_piece)
+            self.write_thread.add_finished(finished_piece)
+
+    def merge(
+        self,
+    ) -> Exception | None:
+        state = self.state
+        if state is None:
+            return Exception("No merge state loaded")
+        self.start_write_thread()
+        err = _do_upload_task(
+            s3_client=self.client,
+            merge_state=state,
+            max_workers=self.max_workers,
+            on_finished=self._on_piece_finished,
+        )
+        if isinstance(err, Exception):
+            return err
+        return None
+
+    def cleanup(self) -> Exception | None:
+        return _cleanup_merge(rclone=self.rclone_impl, info=self.info)
+
+
+def s3_server_side_multi_part_merge(
+    rclone: RcloneImpl, info_path: str, max_workers: int = DEFAULT_MAX_WORKERS
+) -> Exception | None:
+    info = InfoJson(rclone, src=None, src_info=info_path)
+    loaded = info.load()
+    if not loaded:
+        return FileNotFoundError(
+            f"Info file not found, has the upload finished? {info_path}"
         )
+    merger: S3MultiPartMerger | Exception = S3MultiPartMerger.create(
+        rclone=rclone, info=info, max_workers=max_workers
+    )
+    if isinstance(merger, Exception):
+        return merger
+
+    err = merger.merge()
+    if isinstance(err, Exception):
+        return err
+
+    err = merger.cleanup()
+    if isinstance(err, Exception):
+        err
+    return None
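
Taken together, the new s3_server_side_multi_part_merge entry point loads the upload's info JSON, creates or resumes an S3MultiPartMerger, copies every part server-side, and finally purges the parts directory. A hedged usage sketch: how an RcloneImpl instance is constructed is outside this diff, and the info_path value below is a hypothetical example, not a path from the package.

```python
# Usage sketch only. "rclone" stands for an already-configured RcloneImpl,
# and s3_server_side_multi_part_merge / DEFAULT_MAX_WORKERS are the names
# defined in the module shown in this diff.
def merge_uploaded_parts(rclone: "RcloneImpl") -> None:
    info_path = "dst:my-bucket/path/to/file/info.json"  # hypothetical path
    err = s3_server_side_multi_part_merge(
        rclone=rclone,
        info_path=info_path,
        max_workers=DEFAULT_MAX_WORKERS,
    )
    if isinstance(err, Exception):
        raise err
```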