divbase-cli 0.1.0.dev2__py3-none-any.whl → 0.1.0.dev3__py3-none-any.whl

This diff shows the content changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -0,0 +1,446 @@
+ """
+ Module responsible for downloading and uploading files via pre-signed URLs.
+
+ Grouped into 4 sections:
+ - Downloading singlepart files
+ - Downloading multipart files (based on size threshold)
+ - Uploading singlepart files
+ - Uploading multipart files (based on size threshold)
+
+ Size thresholds are defined in divbase_lib.divbase_constants; the caller decides which function to call.
+
+ Retry logic here uses the stamina library.
+
+ TODO: Consider adding progress bars
+ """
+
+ import logging
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import httpx
+ import stamina
+
+ from divbase_cli.cli_exceptions import DivBaseAPIConnectionError, DivBaseAPIError
+ from divbase_cli.retries import retry_only_on_retryable_http_errors
+ from divbase_cli.user_auth import make_authenticated_request
+ from divbase_lib.api_schemas.s3 import (
+     AbortMultipartUploadRequest,
+     CompleteMultipartUploadRequest,
+     CompleteMultipartUploadResponse,
+     CreateMultipartUploadRequest,
+     CreateMultipartUploadResponse,
+     GetPresignedPartUrlsRequest,
+     PreSignedDownloadResponse,
+     PreSignedSinglePartUploadResponse,
+     PresignedUploadPartUrlResponse,
+     UploadedPart,
+ )
+ from divbase_lib.divbase_constants import (
+     MAX_S3_API_BATCH_SIZE,
+     S3_MULTIPART_CHUNK_SIZE,
+ )
+ from divbase_lib.exceptions import ChecksumVerificationError
+ from divbase_lib.s3_checksums import calculate_md5_checksum_for_chunk, verify_downloaded_checksum
+
+ logger = logging.getLogger(__name__)
+
+ # Used for multipart file transfers
+ MAX_CONCURRENCY = 10
+
+ MB = 1024 * 1024
+ # Downloads can use any chunk size, but checksum validation must use the matching chunk size.
+ # Uploads must use the same threshold and chunk size as divbase-api,
+ # so those are defined in the shared lib (divbase_lib.divbase_constants).
+ MULTIPART_DOWNLOAD_THRESHOLD = 32 * MB
+ DOWNLOAD_CHUNK_SIZE = 8 * MB
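+ # e.g. a 40 MB object is above MULTIPART_DOWNLOAD_THRESHOLD (32 MB),
+ # so it is downloaded as five 8 MB range requests.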
+
+
+ @dataclass
+ class SuccessfulDownload:
+     """Represents a successfully downloaded file."""
+
+     file_path: Path
+     object_name: str
+
+
+ @dataclass
+ class FailedDownload:
+     """Represents a failed download attempt."""
+
+     object_name: str
+     file_path: Path
+     exception: Exception
+
+
+ @dataclass
+ class DownloadOutcome:
+     """Outcome of attempting to download multiple files."""
+
+     successful: list[SuccessfulDownload]
+     failed: list[FailedDownload]
+
+
+ def download_multiple_pre_signed_urls(
+     pre_signed_urls: list[PreSignedDownloadResponse], verify_checksums: bool, download_dir: Path
+ ) -> tuple[list[SuccessfulDownload], list[FailedDownload]]:
+     """
+     Download files using pre-signed URLs.
+     Returns a tuple of both the successful and failed downloads.
+     """
+     successful_downloads, failed_downloads = [], []
+     with httpx.Client() as client:
+         for obj in pre_signed_urls:
+             output_file_path = download_dir / obj.name
+             object_name = obj.name
+             try:
+                 result = _download_single_pre_signed_url(
+                     httpx_client=client,
+                     pre_signed_url=obj.pre_signed_url,
+                     verify_checksums=verify_checksums,
+                     output_file_path=output_file_path,
+                     object_name=object_name,
+                 )
+             except httpx.HTTPError as err:
+                 output_file_path.unlink(missing_ok=True)  # Clean up possible partial file
+                 result = FailedDownload(object_name=object_name, file_path=output_file_path, exception=err)
+
+             if isinstance(result, SuccessfulDownload):
+                 successful_downloads.append(result)
+             else:
+                 failed_downloads.append(result)
+
+     return successful_downloads, failed_downloads
+
+
+ def _download_single_pre_signed_url(
+     httpx_client: httpx.Client, pre_signed_url: str, verify_checksums: bool, output_file_path: Path, object_name: str
+ ) -> SuccessfulDownload | FailedDownload:
+     """
+     Download a single file using a pre-signed URL.
+     If the file is large enough, we download in chunks.
+     Helper function, do not call directly from outside this module.
+     """
+     content_length, server_checksum = _get_content_length_and_checksum(
+         httpx_client=httpx_client, pre_signed_url=pre_signed_url
+     )
+
+     if content_length < MULTIPART_DOWNLOAD_THRESHOLD:
+         _perform_singlepart_download(
+             httpx_client=httpx_client,
+             pre_signed_url=pre_signed_url,
+             output_file_path=output_file_path,
+         )
+
+     else:
+         logger.info(f"Starting multipart download for large file '{object_name}' of size {content_length} bytes.")
+         _perform_multipart_download(httpx_client, pre_signed_url, output_file_path, content_length)
+
+     if verify_checksums:
+         try:
+             verify_downloaded_checksum(file_path=output_file_path, expected_checksum=server_checksum)
+         except ChecksumVerificationError as err:
+             output_file_path.unlink()
+             return FailedDownload(object_name=object_name, file_path=output_file_path, exception=err)
+
+     return SuccessfulDownload(file_path=output_file_path, object_name=object_name)
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _get_content_length_and_checksum(httpx_client: httpx.Client, pre_signed_url: str) -> tuple[int, str]:
+     """
+     "HEAD" a pre-signed download URL to get its content length and checksum.
+
+     As you can't HEAD a pre-signed GET URL, we do a GET with a Range header that requests only the first byte.
+     Otherwise we would have to be given a separate pre-signed HEAD URL to do this.
+     """
+     with httpx_client.stream("GET", pre_signed_url, headers={"Range": "bytes=0-0"}) as head_response:
+         head_response.raise_for_status()
+         content_range = head_response.headers["Content-Range"]
+         # format is "bytes 0-0/12345", the part after "/" is the total size
+         content_length = int(content_range.split("/")[-1])
+         server_checksum = head_response.headers["ETag"].strip('"')
+     return content_length, server_checksum
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _perform_singlepart_download(httpx_client: httpx.Client, pre_signed_url: str, output_file_path: Path) -> None:
+     """Used for objects smaller than the multipart download threshold."""
+     with httpx_client.stream("GET", pre_signed_url) as response:
+         response.raise_for_status()
+
+         with open(output_file_path, "wb") as file:
+             for chunk in response.iter_bytes(chunk_size=8192):
+                 file.write(chunk)
+
+
+ def _perform_multipart_download(
+     httpx_client: httpx.Client, pre_signed_url: str, output_file_path: Path, content_length: int
+ ) -> None:
+     """
+     Download a large file in multiple chunks using range requests.
+
+     As we write to the file concurrently, the file is first created with the correct size,
+     and then each chunk is written to the correct position in the file.
+     """
+     with open(output_file_path, "wb") as f:
+         f.seek(content_length - 1)
+         f.write(b"\0")
+     with ThreadPoolExecutor(max_workers=MAX_CONCURRENCY) as executor:
+         futures = []
+         for i in range(0, content_length, DOWNLOAD_CHUNK_SIZE):
+             start = i
+             end = min(i + DOWNLOAD_CHUNK_SIZE, content_length)
+             futures.append(
+                 executor.submit(
+                     _download_chunk,
+                     httpx_client,
+                     pre_signed_url,
+                     start,
+                     end,
+                     output_file_path,
+                 )
+             )
+
+         for future in futures:
+             future.result()
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _download_chunk(client: httpx.Client, url: str, start: int, end: int, output_file_path: Path) -> None:
+     """
+     Downloads a specific byte range of a file (aka chunk),
+     and writes it to the correct position in the file.
+     """
+     headers = {"Range": f"bytes={start}-{end - 1}"}  # HTTP byte ranges are inclusive
+     with client.stream("GET", url, headers=headers) as response:
+         response.raise_for_status()
+         with open(output_file_path, "rb+") as f:
+             f.seek(start)
+             for chunk in response.iter_bytes():
+                 f.write(chunk)
+
+
+ @dataclass
+ class SuccessfulUpload:
+     """Represents a successfully uploaded file."""
+
+     file_path: Path
+     object_name: str
+
+
+ @dataclass
+ class FailedUpload:
+     """Represents a failed upload attempt."""
+
+     object_name: str
+     file_path: Path
+     exception: Exception
+
+
+ @dataclass
+ class UploadOutcome:
+     """Outcome of attempting to upload multiple files."""
+
+     successful: list[SuccessfulUpload]
+     failed: list[FailedUpload]
+
+
+ def upload_multiple_singlepart_pre_signed_urls(
+     pre_signed_urls: list[PreSignedSinglePartUploadResponse], all_files: list[Path]
+ ) -> UploadOutcome:
+     """
+     Upload singlepart files using pre-signed PUT URLs.
+     Returns an UploadOutcome object containing the results of the upload attempts.
+     """
+     file_map = {file.name: file for file in all_files}
+
+     successful_uploads, failed_uploads = [], []
+     with httpx.Client() as client:
+         for obj in pre_signed_urls:
+             result = _upload_one_singlepart_pre_signed_url(
+                 httpx_client=client,
+                 pre_signed_url=obj.pre_signed_url,
+                 file_path=file_map[obj.name],
+                 object_name=obj.name,
+                 headers=obj.put_headers,
+             )
+
+             if isinstance(result, SuccessfulUpload):
+                 successful_uploads.append(result)
+             else:
+                 failed_uploads.append(result)
+
+     return UploadOutcome(successful=successful_uploads, failed=failed_uploads)
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _upload_one_singlepart_pre_signed_url(
+     httpx_client: httpx.Client,
+     pre_signed_url: str,
+     file_path: Path,
+     object_name: str,
+     headers: dict[str, str],
+ ) -> SuccessfulUpload | FailedUpload:
+     """
+     Upload one singlepart file to S3 using a pre-signed PUT URL.
+     Helper function, do not call directly from outside this module.
+     """
+     with open(file_path, "rb") as file:
+         try:
+             response = httpx_client.put(pre_signed_url, content=file, headers=headers)
+             response.raise_for_status()
+         except httpx.HTTPError as err:
+             return FailedUpload(object_name=object_name, file_path=file_path, exception=err)
+
+     return SuccessfulUpload(file_path=file_path, object_name=object_name)
+
+
+ ### multipart upload logic below ###
+
+
+ def perform_multipart_upload(
+     project_name: str,
+     divbase_base_url: str,
+     file_path: Path,
+     safe_mode: bool,
+ ) -> SuccessfulUpload | FailedUpload:
+     """
+     Manages the entire multipart upload process for a single file.
+     See docs/development/s3_transfers.md for a high-level overview of the process.
+     """
+     object_name = file_path.name
+     file_size = file_path.stat().st_size
+
+     # 1. Create multipart upload
+     create_request = CreateMultipartUploadRequest(name=object_name, content_length=file_size)
+     response = make_authenticated_request(
+         method="POST",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/s3/upload/multi-part/create?project_name={project_name}",
+         json=create_request.model_dump(),
+     )
+     object_data = CreateMultipartUploadResponse(**response.json())
+
+     # 2. Upload the parts in batches, as the DivBase server limits how many part URLs it returns at once
+     parts_to_request = list(range(1, object_data.number_of_parts + 1))
+     try:
+         uploaded_parts: list[UploadedPart] = []
+         for i in range(0, len(parts_to_request), MAX_S3_API_BATCH_SIZE):
+             part_batch_numbers = parts_to_request[i : i + MAX_S3_API_BATCH_SIZE]
+             part_urls = _get_part_urls(
+                 project_name=project_name,
+                 divbase_base_url=divbase_base_url,
+                 object_name=object_name,
+                 upload_id=object_data.upload_id,
+                 part_numbers=part_batch_numbers,
+                 file_path=file_path,
+                 safe_mode=safe_mode,
+             )
+             batch_uploads = _upload_parts(part_urls=part_urls, file_path=file_path)
+             uploaded_parts.extend(batch_uploads)
+
+         # 3. Complete multipart upload
+         complete_request_body = CompleteMultipartUploadRequest(
+             name=object_name,
+             upload_id=object_data.upload_id,
+             parts=uploaded_parts,
+         )
+         response = make_authenticated_request(
+             method="POST",
+             divbase_base_url=divbase_base_url,
+             api_route=f"v1/s3/upload/multi-part/complete?project_name={project_name}",
+             json=complete_request_body.model_dump(),
+         )
+         completed_upload = CompleteMultipartUploadResponse(**response.json())
+         return SuccessfulUpload(file_path=file_path, object_name=completed_upload.name)
+
+     # 4. If any unexpected error occurs, abort the multipart upload
+     # to avoid leaving incomplete uploads in S3
+     except Exception as e:
+         try:
+             abort_request = AbortMultipartUploadRequest(name=object_name, upload_id=object_data.upload_id)
+             make_authenticated_request(
+                 method="DELETE",
+                 divbase_base_url=divbase_base_url,
+                 api_route=f"v1/s3/upload/multi-part/abort?project_name={project_name}",
+                 json=abort_request.model_dump(),
+             )
+         except (DivBaseAPIConnectionError, DivBaseAPIError):
+             logger.error(f"Failed to abort multipart upload for object '{object_name}' after an upload error.")
+
+         return FailedUpload(object_name=object_name, file_path=file_path, exception=e)
+
+
+ def _get_part_urls(
+     project_name: str,
+     divbase_base_url: str,
+     object_name: str,
+     upload_id: str,
+     part_numbers: list[int],
+     file_path: Path,
+     safe_mode: bool,
+ ) -> list[PresignedUploadPartUrlResponse]:
+     """
+     Gets a batch of up to 100 pre-signed URLs from the DivBase server for uploading parts of a large file to S3.
+
+     Not responsible for uploading the parts, just getting the URLs.
+     """
+     md5_checksums = None
+     if safe_mode:
+         md5_checksums = []
+         for part_num in part_numbers:
+             checksum = calculate_md5_checksum_for_chunk(
+                 file_path=file_path,
+                 start_byte=(part_num - 1) * S3_MULTIPART_CHUNK_SIZE,
+                 chunk_size=S3_MULTIPART_CHUNK_SIZE,
+             )
+             md5_checksums.append(checksum)
+
+     request_body = GetPresignedPartUrlsRequest(
+         name=object_name,
+         upload_id=upload_id,
+         parts_range_start=part_numbers[0],
+         parts_range_end=part_numbers[-1],
+         md5_checksums=md5_checksums,
+     )
+     response = make_authenticated_request(
+         method="POST",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/s3/upload/multi-part/part-urls?project_name={project_name}",
+         json=request_body.model_dump(),
+     )
+     return [PresignedUploadPartUrlResponse(**item) for item in response.json()]
+
+
+ def _upload_parts(part_urls: list[PresignedUploadPartUrlResponse], file_path: Path) -> list[UploadedPart]:
+     """Uploads a batch of parts in parallel and returns their ETag info."""
+     completed_parts = []
+     with ThreadPoolExecutor(max_workers=MAX_CONCURRENCY) as executor:
+         future_to_part = {executor.submit(_upload_chunk, part=part, file_path=file_path): part for part in part_urls}
+         for future in as_completed(future_to_part):
+             part_number, etag = future.result()
+             completed_parts.append(UploadedPart(part_number=part_number, etag=etag))
+     return completed_parts
+
+
+ @stamina.retry(on=retry_only_on_retryable_http_errors, attempts=3)
+ def _upload_chunk(part: PresignedUploadPartUrlResponse, file_path: Path) -> tuple[int, str]:
+     """Uploads a single chunk of a file to a pre-signed URL and returns its part number and ETag."""
+
+     start_byte = (part.part_number - 1) * S3_MULTIPART_CHUNK_SIZE
+     with open(file_path, "rb") as f:
+         f.seek(start_byte)
+         data_to_upload = f.read(S3_MULTIPART_CHUNK_SIZE)
+
+     with httpx.Client() as client:
+         response = client.put(
+             part.pre_signed_url,
+             content=data_to_upload,
+             headers=part.headers,
+             timeout=httpx.Timeout(5.0, write=30.0),
+         )
+         response.raise_for_status()
+         # ETag is returned with quotes, which must be stripped prior to comparison
+         etag = response.headers["ETag"].strip('"')
+     return part.part_number, etag
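
A minimal sketch of how a caller might drive this module, assuming the pre-signed URL responses have already been fetched from the DivBase API. The pre_signed list and the SINGLEPART_UPLOAD_LIMIT constant below are hypothetical stand-ins for illustration; the real upload threshold lives in divbase_lib.divbase_constants and the dispatch itself happens outside this module:

    from pathlib import Path

    SINGLEPART_UPLOAD_LIMIT = 32 * 1024 * 1024  # assumed value, for illustration only

    # pre_signed: list[PreSignedDownloadResponse], obtained from the DivBase API beforehand
    ok, failed = download_multiple_pre_signed_urls(
        pre_signed_urls=pre_signed, verify_checksums=True, download_dir=Path("downloads")
    )
    for failure in failed:
        print(f"could not download {failure.object_name}: {failure.exception}")

    # The caller decides between singlepart and multipart upload based on file size.
    for file_path in (Path("small.vcf"), Path("large.vcf.gz")):
        if file_path.stat().st_size >= SINGLEPART_UPLOAD_LIMIT:
            perform_multipart_upload(
                project_name="my-project",
                divbase_base_url="https://divbase.example.org",
                file_path=file_path,
                safe_mode=True,
            )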
@@ -0,0 +1,77 @@
+ """
+ Service layer for DivBase CLI project version operations.
+ """
+
+ from divbase_cli.user_auth import make_authenticated_request
+ from divbase_lib.api_schemas.project_versions import (
+     AddVersionRequest,
+     AddVersionResponse,
+     DeleteVersionRequest,
+     DeleteVersionResponse,
+     ProjectVersionDetailResponse,
+     ProjectVersionInfo,
+ )
+
+
+ def add_version_command(project_name: str, divbase_base_url: str, name: str, description: str) -> AddVersionResponse:
+     """Add a new version to the project versions table stored on the DivBase server."""
+     request_data = AddVersionRequest(name=name, description=description)
+
+     response = make_authenticated_request(
+         method="PATCH",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/add?project_name={project_name}",
+         json=request_data.model_dump(),
+     )
+
+     return AddVersionResponse(**response.json())
+
+
+ def list_versions_command(project_name: str, include_deleted: bool, divbase_base_url: str) -> list[ProjectVersionInfo]:
+     """
+     List all versions in the project versions table stored on the DivBase server.
+     Returns a list of ProjectVersionInfo objects, one per version.
+     """
+     response = make_authenticated_request(
+         method="GET",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/list?project_name={project_name}&include_deleted={str(include_deleted).lower()}",
+     )
+
+     return [ProjectVersionInfo(**version) for version in response.json()]
+
+
+ def get_version_details_command(
+     project_name: str, divbase_base_url: str, version_name: str
+ ) -> ProjectVersionDetailResponse:
+     """Get details about a specific project version, including all files and their version IDs at that version."""
+     response = make_authenticated_request(
+         method="GET",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/version_details?project_name={project_name}&version_name={version_name}",
+     )
+
+     return ProjectVersionDetailResponse(**response.json())
+
+
+ def delete_version_command(project_name: str, divbase_base_url: str, version_name: str) -> DeleteVersionResponse:
+     """
+     Delete a version from the project versions table stored on the DivBase server.
+     This soft-deletes the version server-side; it will be permanently deleted after some grace period.
+     """
+     request_data = DeleteVersionRequest(version_name=version_name)
+
+     response = make_authenticated_request(
+         method="DELETE",
+         divbase_base_url=divbase_base_url,
+         api_route=f"v1/project-versions/delete?project_name={project_name}",
+         json=request_data.model_dump(),
+     )
+
+     return DeleteVersionResponse(**response.json())
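
A minimal usage sketch for this service layer, assuming an authenticated session and a reachable DivBase server; the base URL and project name below are placeholders:

    base_url = "https://divbase.example.org"

    add_version_command(
        project_name="my-project",
        divbase_base_url=base_url,
        name="v1",
        description="First tagged state of the project",
    )
    for version in list_versions_command(
        project_name="my-project", include_deleted=False, divbase_base_url=base_url
    ):
        print(version.name)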