kleinkram 0.38.1.dev20241120100707__py3-none-any.whl → 0.38.1.dev20241125112529__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kleinkram might be problematic. Click here for more details.

@@ -1,13 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- import os
5
4
  import sys
5
+ from concurrent.futures import as_completed
6
+ from concurrent.futures import Future
6
7
  from concurrent.futures import ThreadPoolExecutor
8
+ from enum import Enum
7
9
  from pathlib import Path
8
10
  from time import monotonic
9
11
  from typing import Dict
10
- from typing import List
11
12
  from typing import NamedTuple
12
13
  from typing import Optional
13
14
  from typing import Tuple
@@ -19,11 +20,9 @@ import httpx
19
20
  from kleinkram.api.client import AuthenticatedClient
20
21
  from kleinkram.config import Config
21
22
  from kleinkram.config import LOCAL_S3
22
- from kleinkram.errors import AccessDeniedException
23
- from kleinkram.errors import CorruptedFile
24
- from kleinkram.errors import NotValidUUID
25
- from kleinkram.errors import UploadCredentialsFailed
26
- from kleinkram.errors import UploadFailed
23
+ from kleinkram.errors import AccessDenied
24
+ from kleinkram.models import File
25
+ from kleinkram.models import FileState
27
26
  from kleinkram.utils import b64_md5
28
27
  from kleinkram.utils import format_error
29
28
  from kleinkram.utils import format_traceback
@@ -53,12 +52,6 @@ class UploadCredentials(NamedTuple):
53
52
  bucket: str
54
53
 
55
54
 
56
- class FileUploadJob(NamedTuple):
57
- mission_id: UUID
58
- name: str
59
- path: Path
60
-
61
-
62
55
  def _get_s3_endpoint() -> str:
63
56
  config = Config()
64
57
  endpoint = config.endpoint
@@ -77,9 +70,6 @@ def _confirm_file_upload(
77
70
  "md5": file_hash,
78
71
  }
79
72
  resp = client.post(UPLOAD_CONFIRM, json=data)
80
-
81
- if 400 <= resp.status_code < 500:
82
- raise CorruptedFile("failed to confirm upload")
83
73
  resp.raise_for_status()
84
74
 
85
75
 
@@ -95,66 +85,47 @@ def _cancel_file_upload(
95
85
  return
96
86
 
97
87
 
98
- def _get_file_download(client: AuthenticatedClient, id: UUID) -> str:
99
- """\
100
- get the download url for a file by file id
101
- """
102
- resp = client.get(DOWNLOAD_URL, params={"uuid": str(id), "expires": True})
88
# value of the `error` field that the upload-credentials endpoint uses
# to signal that the requested file is already present
FILE_EXISTS_ERROR = "File already exists"

# keys of a single record in the upload-credentials response
BUCKET_FIELD = "bucket"
FILE_ID_FIELD = "fileUUID"
CREDENTIALS_FIELD = "accessCredentials"

# keys of the object stored under `accessCredentials`
ACCESS_KEY_FIELD = "accessKey"
SECRET_KEY_FIELD = "secretKey"
SESSION_TOKEN_FIELD = "sessionToken"
113
97
 
114
98
 
115
99
def _get_upload_creditials(
    client: AuthenticatedClient, internal_filename: str, mission_id: UUID
) -> Optional[UploadCredentials]:
    """\
    request temporary upload credentials for a single file

    returns `None` when the server reports that the file already exists

    raises `RuntimeError` when the response contains no record or the
    record carries any other error, http errors are raised by
    `raise_for_status`
    """
    dct = {
        "filenames": [internal_filename],
        "missionUUID": str(mission_id),
    }
    resp = client.post(UPLOAD_CREDS, json=dct)
    resp.raise_for_status()

    records = resp.json()
    if not records:
        raise RuntimeError(
            f"no upload credentials returned for file {internal_filename}"
        )

    # exactly one filename was requested, so only the first record is used
    data = records[0]

    error = data.get("error")
    if error == FILE_EXISTS_ERROR:
        return None
    if error:
        # surface the server message instead of failing below with an
        # opaque KeyError on the missing credential fields
        raise RuntimeError(
            f"failed to get upload credentials for file {internal_filename}: {error}"
        )

    creds = data[CREDENTIALS_FIELD]
    return UploadCredentials(
        access_key=creds[ACCESS_KEY_FIELD],
        secret_key=creds[SECRET_KEY_FIELD],
        session_token=creds[SESSION_TOKEN_FIELD],
        file_id=UUID(data[FILE_ID_FIELD], version=4),
        bucket=data[BUCKET_FIELD],
    )
158
129
 
159
130
 
160
131
  def _s3_upload(
@@ -163,184 +134,333 @@ def _s3_upload(
163
134
  endpoint: str,
164
135
  credentials: UploadCredentials,
165
136
  pbar: tqdm,
166
- ) -> bool:
137
+ ) -> None:
167
138
  # configure boto3
168
- try:
169
- config = botocore.config.Config(
170
- retries={"max_attempts": S3_MAX_RETRIES},
171
- read_timeout=S3_READ_TIMEOUT,
172
- )
173
- client = boto3.client(
174
- "s3",
175
- endpoint_url=endpoint,
176
- aws_access_key_id=credentials.access_key,
177
- aws_secret_access_key=credentials.secret_key,
178
- aws_session_token=credentials.session_token,
179
- config=config,
180
- )
181
- client.upload_file(
182
- str(local_path),
183
- credentials.bucket,
184
- str(credentials.file_id),
185
- Callback=pbar.update,
186
- )
187
- except Exception as e:
188
- logger.error(format_traceback(e))
189
- pbar.write(format_error(f"error uploading file {local_path}", e))
190
- return False
191
- return True
139
+ config = botocore.config.Config(
140
+ retries={"max_attempts": S3_MAX_RETRIES},
141
+ read_timeout=S3_READ_TIMEOUT,
142
+ )
143
+ client = boto3.client(
144
+ "s3",
145
+ endpoint_url=endpoint,
146
+ aws_access_key_id=credentials.access_key,
147
+ aws_secret_access_key=credentials.secret_key,
148
+ aws_session_token=credentials.session_token,
149
+ config=config,
150
+ )
151
+ client.upload_file(
152
+ str(local_path),
153
+ credentials.bucket,
154
+ str(credentials.file_id),
155
+ Callback=pbar.update,
156
+ )
157
+
192
158
 
159
class UploadState(Enum):
    """\
    outcome of a single file upload
    """

    # the file was transferred and the upload was confirmed
    UPLOADED = 1
    # no credentials were issued because the file already exists
    EXISTS = 2
    # the transfer raised and the upload was cancelled
    CANCELED = 3
193
163
 
194
- def _upload_file(
164
+
165
+ # TODO: i dont want to handle errors at this level
166
def upload_file(
    client: AuthenticatedClient,
    *,
    mission_id: UUID,
    filename: str,
    path: Path,
    verbose: bool = False,
) -> UploadState:
    """\
    upload the file at `path` under the name `filename` to the mission

    returns `UploadState.EXISTS` when no credentials were issued,
    `UploadState.CANCELED` when the s3 transfer raised (the error is
    logged and the upload is cancelled) and `UploadState.UPLOADED` once
    the upload has been confirmed

    errors from fetching credentials, cancelling or confirming propagate
    """
    bar = tqdm(
        total=path.stat().st_size,
        unit="B",
        unit_scale=True,
        desc=f"uploading {path}...",
        leave=False,
        disable=not verbose,
    )
    with bar:
        s3_endpoint = _get_s3_endpoint()

        # credentials are requested per file
        credentials = _get_upload_creditials(
            client, internal_filename=filename, mission_id=mission_id
        )
        if credentials is None:
            return UploadState.EXISTS

        try:
            _s3_upload(path, endpoint=s3_endpoint, credentials=credentials, pbar=bar)
        except Exception as exc:
            logger.error(format_traceback(exc))
            _cancel_file_upload(client, credentials.file_id, mission_id)
            return UploadState.CANCELED

        # only reached when the transfer did not raise
        _confirm_file_upload(client, credentials.file_id, b64_md5(path))
        return UploadState.UPLOADED
296
206
 
297
- if not verbose and size > 0:
298
- print(path.absolte())
299
207
 
300
- total_size += size
301
- except Exception as e:
302
- logger.error(format_traceback(e))
303
- errors.append(e)
208
def _get_file_download(client: AuthenticatedClient, id: UUID) -> str:
    """\
    get the download url for a file by file id

    raises `AccessDenied` for every 4xx response, any other error status
    is raised by `raise_for_status`
    """
    resp = client.get(DOWNLOAD_URL, params={"uuid": str(id), "expires": True})

    if 400 <= resp.status_code < 500:
        # the error body is not guaranteed to be json with a `message` key
        try:
            reason = resp.json()["message"]
        except (ValueError, KeyError, TypeError):
            reason = resp.text

        # keep an explicit separator, adjacent f-strings are concatenated
        raise AccessDenied(
            f"Failed to download file: {reason}, "
            f"Status Code: {resp.status_code}"
        )

    resp.raise_for_status()

    return resp.text
315
223
 
316
224
 
317
def _url_download(
    url: str, *, path: Path, size: int, overwrite: bool = False, verbose: bool = False
) -> None:
    """\
    stream the content at `url` into the file at `path`

    `size` is the total shown by the progress bar, the bar is only
    displayed when `verbose` is set

    raises `FileExistsError` when `path` exists and `overwrite` is not set
    and `httpx.HTTPStatusError` when the server answers with an error status
    """
    if path.exists() and not overwrite:
        raise FileExistsError(f"file already exists: {path}")

    with httpx.stream("GET", url) as response:
        # fail before the destination is opened, otherwise the body of an
        # error response would be written to disk as if it were the file
        response.raise_for_status()

        with open(path, "wb") as f, tqdm(
            total=size,
            desc=f"downloading {path.name}",
            unit="B",
            unit_scale=True,
            leave=False,
            disable=not verbose,
        ) as pbar:
            for chunk in response.iter_bytes(chunk_size=DOWNLOAD_CHUNK_SIZE):
                f.write(chunk)
                pbar.update(len(chunk))
329
244
 
330
245
 
246
class DownloadState(Enum):
    """\
    outcome of a single file download
    """

    # downloaded, hash matched or no remote hash to compare against
    DOWNLOADED_OK = 1
    # local file already present with a matching hash
    SKIPPED_OK = 2
    # downloaded but the hash differs from the remote hash
    DOWNLOADED_INVALID_HASH = 3
    # local file present with a different hash and overwriting not requested
    SKIPPED_INVALID_HASH = 4
    # the remote file is not in the ok state
    SKIPPED_INVALID_REMOTE_STATE = 5
251
+ SKIPPED_INVALID_REMOTE_STATE = 5
252
+
253
+
331
254
def download_file(
    client: AuthenticatedClient,
    *,
    file: File,
    path: Path,
    overwrite: bool = False,
    verbose: bool = False,
) -> DownloadState:
    """\
    download `file` to `path` and report the outcome as a `DownloadState`

    an existing local file is kept when its hash matches the remote hash,
    and is only replaced when `overwrite` is set

    raises `FileExistsError` (from `_url_download`) when the local file
    exists, the remote hash is unknown and `overwrite` is not set
    """
    # skip files that are not ok on remote
    if file.state != FileState.OK:
        return DownloadState.SKIPPED_INVALID_REMOTE_STATE

    # skip existing files depending on flags set
    if path.exists():
        local_hash = b64_md5(path)
        if local_hash == file.hash:
            return DownloadState.SKIPPED_OK
        if not overwrite and file.hash is not None:
            return DownloadState.SKIPPED_INVALID_HASH

        # only announce an overwrite that will really happen, without
        # `overwrite` the download below refuses to replace the file
        if overwrite and verbose:
            tqdm.write(
                styled_string(f"overwriting {path}, hash missmatch", style="yellow")
            )

    # request a download url
    download_url = _get_file_download(client, file.id)

    # create parent directories
    path.parent.mkdir(parents=True, exist_ok=True)

    # download the file and check the hash
    _url_download(
        download_url, path=path, size=file.size, overwrite=overwrite, verbose=verbose
    )
    observed_hash = b64_md5(path)
    if file.hash is not None and observed_hash != file.hash:
        return DownloadState.DOWNLOADED_INVALID_HASH
    return DownloadState.DOWNLOADED_OK
295
+
296
+
297
# style passed to `styled_string` when reporting each upload outcome
UPLOAD_STATE_COLOR: Dict[UploadState, str] = {
    UploadState.UPLOADED: "green",
    UploadState.EXISTS: "yellow",
    UploadState.CANCELED: "red",
}
302
+
303
+
304
def _upload_handler(
    future: Future[UploadState], path: Path, *, verbose: bool = False
) -> int:
    """\
    report the result of one finished upload and return the bytes uploaded

    in verbose mode a styled message is written through tqdm, otherwise
    the absolute path is printed to stdout for uploaded files and to
    stderr for everything else

    returns the size of the file at `path` when it was uploaded, else 0
    """
    try:
        outcome = future.result()
    except Exception as exc:
        logger.error(format_traceback(exc))
        if not verbose:
            print(path.absolute(), file=sys.stderr)
        else:
            tqdm.write(format_error(f"error uploading {path}", exc))
        return 0

    uploaded = outcome == UploadState.UPLOADED

    # every outcome without its own entry is reported as canceled
    messages = {
        UploadState.UPLOADED: f"uploaded {path}",
        UploadState.EXISTS: f"skipped {path} already uploaded",
    }
    msg = messages.get(outcome, f"canceled {path} upload")

    if verbose:
        tqdm.write(styled_string(msg, style=UPLOAD_STATE_COLOR[outcome]))
    elif uploaded:
        print(path.absolute(), file=sys.stdout)
    else:
        print(path.absolute(), file=sys.stderr)

    if not uploaded:
        return 0
    return path.stat().st_size
344
331
 
345
- if observed_hash != hash:
346
- raise CorruptedFile(f"file hash does not match: {dest}")
332
+
333
# style passed to `styled_string` when reporting each download outcome
DOWNLOAD_STATE_COLOR: Dict[DownloadState, str] = {
    DownloadState.DOWNLOADED_OK: "green",
    DownloadState.SKIPPED_OK: "green",
    DownloadState.DOWNLOADED_INVALID_HASH: "red",
    DownloadState.SKIPPED_INVALID_HASH: "yellow",
    DownloadState.SKIPPED_INVALID_REMOTE_STATE: "purple",
}
340
+
341
+
342
def _download_handler(
    future: Future[DownloadState], file: File, path: Path, *, verbose: bool = False
) -> int:
    """\
    report the result of one finished download and return the bytes downloaded

    in verbose mode a styled message is written through tqdm, otherwise
    the absolute path is printed to stdout for `DOWNLOADED_OK` and
    `SKIPPED_OK` and to stderr for everything else

    returns `file.size` when the state is `DOWNLOADED_OK`, else 0
    """
    try:
        state = future.result()
    except Exception as e:
        logger.error(format_traceback(e))
        if verbose:
            # this is the download path, the message must not say "uploading"
            tqdm.write(format_error(f"error downloading {path}", e))
        else:
            print(path.absolute(), file=sys.stderr)
        return 0

    # every state without its own entry is reported as invalid remote state
    messages = {
        DownloadState.DOWNLOADED_OK: f"downloaded {path}",
        DownloadState.DOWNLOADED_INVALID_HASH: f"downloaded {path} failed hash check",
        DownloadState.SKIPPED_OK: f"skipped {path} already downloaded",
        DownloadState.SKIPPED_INVALID_HASH: (
            f"skipped {path} already downloaded, hash missmatch, cosider using `--overwrite`"
        ),
    }
    msg = messages.get(state, f"skipped {path} remote file has invalid state")

    if verbose:
        tqdm.write(styled_string(msg, style=DOWNLOAD_STATE_COLOR[state]))
    else:
        ok = state in (DownloadState.DOWNLOADED_OK, DownloadState.SKIPPED_OK)
        print(path.absolute(), file=sys.stdout if ok else sys.stderr)

    # number of bytes downloaded
    return file.size if state == DownloadState.DOWNLOADED_OK else 0
378
+
379
+
380
def upload_files(
    client: AuthenticatedClient,
    files_map: Dict[str, Path],
    mission_id: UUID,
    *,
    verbose: bool = False,
    n_workers: int = 2,
) -> None:
    """\
    upload all files in `files_map` (remote filename -> local path) to the
    mission using `n_workers` threads

    per file results are reported by `_upload_handler`, afterwards a
    summary with duration, size and average speed is printed to stderr
    """
    with tqdm(
        total=len(files_map),
        unit="files",
        desc="uploading files",
        disable=not verbose,
        leave=False,
    ) as pbar:
        start = monotonic()
        futures: Dict[Future[UploadState], Path] = {}
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            # NOTE(review): the same `client` object is handed to every
            # worker thread, confirm `AuthenticatedClient` is safe to share
            for name, path in files_map.items():
                future = executor.submit(
                    upload_file,
                    client=client,
                    mission_id=mission_id,
                    filename=name,
                    path=path,
                    verbose=verbose,
                )
                futures[future] = path

            total_size = 0.0
            for future in as_completed(futures):
                size = _upload_handler(future, futures[future], verbose=verbose)
                total_size += size / 1024 / 1024  # bytes -> MB

                pbar.update()
                pbar.refresh()

        t = monotonic() - start
        # an empty or tiny batch can finish within one clock tick
        speed = total_size / t if t > 0 else 0.0

        c = Console(file=sys.stderr)
        c.print(f"upload took {t:.2f} seconds")
        c.print(f"total size: {int(total_size)} MB")
        c.print(f"average speed: {speed:.2f} MB/s")
422
+
423
+
424
def download_files(
    client: AuthenticatedClient,
    files: Dict[Path, File],
    *,
    verbose: bool = False,
    overwrite: bool = False,
    n_workers: int = 2,
) -> None:
    """\
    download all files in `files` (local path -> remote file) using
    `n_workers` threads

    per file results are reported by `_download_handler`, afterwards a
    summary with duration, size and average speed is printed to stderr
    """
    with tqdm(
        total=len(files),
        unit="files",
        desc="downloading files",
        disable=not verbose,
        leave=False,
    ) as pbar:
        start = monotonic()
        futures: Dict[Future[DownloadState], Tuple[File, Path]] = {}
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            # NOTE(review): the same `client` object is handed to every
            # worker thread, confirm `AuthenticatedClient` is safe to share
            for path, file in files.items():
                future = executor.submit(
                    download_file,
                    client=client,
                    file=file,
                    path=path,
                    overwrite=overwrite,
                    verbose=verbose,
                )
                futures[future] = (file, path)

            total_size = 0.0
            for future in as_completed(futures):
                file, path = futures[future]
                size = _download_handler(future, file, path, verbose=verbose)
                total_size += size / 1024 / 1024  # MB
                pbar.update()
                pbar.refresh()

        elapsed = monotonic() - start
        # an empty or tiny batch can finish within one clock tick
        speed = total_size / elapsed if elapsed > 0 else 0.0

        c = Console(file=sys.stderr)
        c.print(f"download took {elapsed:.2f} seconds")
        c.print(f"total size: {int(total_size)} MB")
        c.print(f"average speed: {speed:.2f} MB/s")