kleinkram 0.38.1.dev20241119134715__py3-none-any.whl → 0.38.1.dev20241125112529__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of kleinkram has been flagged as potentially problematic.

@@ -1,12 +1,14 @@
  from __future__ import annotations
 
- import os
+ import logging
  import sys
+ from concurrent.futures import as_completed
+ from concurrent.futures import Future
  from concurrent.futures import ThreadPoolExecutor
+ from enum import Enum
  from pathlib import Path
  from time import monotonic
  from typing import Dict
- from typing import List
  from typing import NamedTuple
  from typing import Optional
  from typing import Tuple
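The import changes above foreshadow the rest of the diff: `os` and `List` drop out, while `logging`, `Enum`, and the `Future`/`as_completed` pair come in, shifting result handling from submission order to completion order. A minimal, self-contained sketch of that `submit`/`as_completed` pattern (the worker and inputs here are illustrative, not kleinkram code):

    from concurrent.futures import Future, ThreadPoolExecutor, as_completed
    from typing import Dict

    def work(n: int) -> int:  # stand-in worker
        return n * n

    futures: Dict[Future, int] = {}
    with ThreadPoolExecutor(max_workers=2) as executor:
        for n in range(5):
            futures[executor.submit(work, n)] = n

    for future in as_completed(futures):  # yields each future as it finishes
        print(futures[future], future.result())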
@@ -18,14 +20,19 @@ import httpx
  from kleinkram.api.client import AuthenticatedClient
  from kleinkram.config import Config
  from kleinkram.config import LOCAL_S3
- from kleinkram.errors import AccessDeniedException
- from kleinkram.errors import CorruptedFile
- from kleinkram.errors import UploadFailed
+ from kleinkram.errors import AccessDenied
+ from kleinkram.models import File
+ from kleinkram.models import FileState
  from kleinkram.utils import b64_md5
- from kleinkram.utils import raw_rich
- from rich.text import Text
+ from kleinkram.utils import format_error
+ from kleinkram.utils import format_traceback
+ from kleinkram.utils import styled_string
+ from rich.console import Console
  from tqdm import tqdm
 
+
+ logger = logging.getLogger(__name__)
+
 
  UPLOAD_CREDS = "/file/temporaryAccess"
  UPLOAD_CONFIRM = "/queue/confirmUpload"
  UPLOAD_CANCEL = "/file/cancelUpload"
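Error reporting also moves from ad-hoc `rich` output to a module-level logger, leaving it to the consuming application to decide whether tracebacks surface. A hypothetical consumer-side opt-in (the handler and level choices are assumptions, not package defaults):

    import logging

    # the package only calls logging.getLogger(__name__); attaching handlers is up to the caller
    logging.basicConfig(level=logging.ERROR)
    logging.getLogger("kleinkram").setLevel(logging.ERROR)  # parent of this module's logger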
@@ -45,12 +52,6 @@ class UploadCredentials(NamedTuple):
      bucket: str
 
 
- class FileUploadJob(NamedTuple):
-     mission_id: UUID
-     name: str
-     path: Path
-
-
  def _get_s3_endpoint() -> str:
      config = Config()
      endpoint = config.endpoint
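The removed `FileUploadJob` tuple is not replaced by another container: later hunks fold its `mission_id`, `name`, and `path` fields directly into keyword-only parameters of the new `upload_file`. A stub sketch of that calling convention (parameter names mirror the diff; the body and filename are fake):

    from pathlib import Path
    from uuid import UUID, uuid4

    def upload_file_stub(*, mission_id: UUID, filename: str, path: Path) -> None:
        print(mission_id, filename, path)  # placeholder body

    # keyword-only arguments replace the old NamedTuple bundle
    upload_file_stub(mission_id=uuid4(), filename="run.bag", path=Path("data/run.bag"))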
@@ -69,9 +70,6 @@ def _confirm_file_upload(
          "md5": file_hash,
      }
      resp = client.post(UPLOAD_CONFIRM, json=data)
-
-     if 400 <= resp.status_code < 500:
-         raise CorruptedFile()
      resp.raise_for_status()
 
 
@@ -87,67 +85,47 @@ def _cancel_file_upload(
      return
 
 
- def _get_file_download(client: AuthenticatedClient, id: UUID) -> str:
-     """\
-     get the download url for a file by file id
-     """
-     resp = client.get(DOWNLOAD_URL, params={"uuid": str(id), "expires": True})
-
-     if 400 <= resp.status_code < 500:
-         raise AccessDeniedException(
-             f"Failed to download file: {resp.json()['message']}",
-             "Status Code: " + str(resp.status_code),
-         )
-
-     resp.raise_for_status()
+ FILE_EXISTS_ERROR = "File already exists"
 
-     return resp.text
+ # fields for upload credentials
+ ACCESS_KEY_FIELD = "accessKey"
+ SECRET_KEY_FIELD = "secretKey"
+ SESSION_TOKEN_FIELD = "sessionToken"
+ CREDENTIALS_FIELD = "accessCredentials"
+ FILE_ID_FIELD = "fileUUID"
+ BUCKET_FIELD = "bucket"
 
 
  def _get_upload_creditials(
-     client: AuthenticatedClient, internal_filenames: List[str], mission_id: UUID
- ) -> Dict[str, UploadCredentials]:
-     if mission_id.version != 4:
-         raise ValueError("Mission ID must be a UUIDv4")
+     client: AuthenticatedClient, internal_filename: str, mission_id: UUID
+ ) -> Optional[UploadCredentials]:
      dct = {
-         "filenames": internal_filenames,
+         "filenames": [internal_filename],
          "missionUUID": str(mission_id),
      }
      resp = client.post(UPLOAD_CREDS, json=dct)
+     resp.raise_for_status()
 
-     if resp.status_code >= 400:
-         raise ValueError(
-             "Failed to get temporary credentials. Status Code: "
-             f"{resp.status_code}\n{resp.json()['message'][0]}"
-         )
-
-     data = resp.json()
-
-     ret = {}
-     for record in data:
-         if "error" in record:
-             # TODO: handle this better
-             continue
-
-         bucket = record["bucket"]
-         file_id = UUID(record["fileUUID"], version=4)
-         filename = record["fileName"]
+     data = resp.json()[0]
 
-         creds = record["accessCredentials"]
+     if data.get("error") == FILE_EXISTS_ERROR:
+         return None
 
-         access_key = creds["accessKey"]
-         secret_key = creds["secretKey"]
-         session_token = creds["sessionToken"]
+     bucket = data[BUCKET_FIELD]
+     file_id = UUID(data[FILE_ID_FIELD], version=4)
 
-         ret[filename] = UploadCredentials(
-             access_key=access_key,
-             secret_key=secret_key,
-             session_token=session_token,
-             file_id=file_id,
-             bucket=bucket,
-         )
+     creds = data[CREDENTIALS_FIELD]
+     access_key = creds[ACCESS_KEY_FIELD]
+     secret_key = creds[SECRET_KEY_FIELD]
+     session_token = creds[SESSION_TOKEN_FIELD]
 
-     return ret
+     return UploadCredentials(
+         access_key=access_key,
+         secret_key=secret_key,
+         session_token=session_token,
+         file_id=file_id,
+         bucket=bucket,
+     )
 
 
  def _s3_upload(
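The rewritten `_get_upload_creditials` fetches credentials for one file at a time, defers HTTP errors to `httpx.Response.raise_for_status()` (which raises `httpx.HTTPStatusError` on any 4xx/5xx response), and returns `None` when the backend reports the file as already uploaded. A standalone analogue of that `None` contract, with the network fetch faked out:

    from typing import NamedTuple, Optional
    from uuid import UUID

    class UploadCredentials(NamedTuple):  # shape copied from the diff
        access_key: str
        secret_key: str
        session_token: str
        file_id: UUID
        bucket: str

    FILE_EXISTS_ERROR = "File already exists"

    def get_creds(record: dict) -> Optional[UploadCredentials]:
        # stand-in for the parsed resp.json()[0] record
        if record.get("error") == FILE_EXISTS_ERROR:
            return None  # the caller maps this to UploadState.EXISTS
        creds = record["accessCredentials"]
        return UploadCredentials(
            access_key=creds["accessKey"],
            secret_key=creds["secretKey"],
            session_token=creds["sessionToken"],
            file_id=UUID(record["fileUUID"], version=4),
            bucket=record["bucket"],
        )

    assert get_creds({"error": "File already exists"}) is None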
@@ -156,182 +134,333 @@ def _s3_upload(
      endpoint: str,
      credentials: UploadCredentials,
      pbar: tqdm,
- ) -> bool:
+ ) -> None:
      # configure boto3
-     try:
-         config = botocore.config.Config(
-             retries={"max_attempts": S3_MAX_RETRIES},
-             read_timeout=S3_READ_TIMEOUT,
-         )
-         client = boto3.client(
-             "s3",
-             endpoint_url=endpoint,
-             aws_access_key_id=credentials.access_key,
-             aws_secret_access_key=credentials.secret_key,
-             aws_session_token=credentials.session_token,
-             config=config,
-         )
-         client.upload_file(
-             str(local_path),
-             credentials.bucket,
-             str(credentials.file_id),
-             Callback=pbar.update,
-         )
-     except Exception as e:
-         err = f"error uploading file: {local_path}: {type(e).__name__}"
-         pbar.write(raw_rich(Text(err, style="red")))
-         return False
-     return True
+     config = botocore.config.Config(
+         retries={"max_attempts": S3_MAX_RETRIES},
+         read_timeout=S3_READ_TIMEOUT,
+     )
+     client = boto3.client(
+         "s3",
+         endpoint_url=endpoint,
+         aws_access_key_id=credentials.access_key,
+         aws_secret_access_key=credentials.secret_key,
+         aws_session_token=credentials.session_token,
+         config=config,
+     )
+     client.upload_file(
+         str(local_path),
+         credentials.bucket,
+         str(credentials.file_id),
+         Callback=pbar.update,
+     )
 
 
- def _upload_file(
+ class UploadState(Enum):
+     UPLOADED = 1
+     EXISTS = 2
+     CANCELED = 3
+
+
+ # TODO: i dont want to handle errors at this level
+ def upload_file(
      client: AuthenticatedClient,
-     job: FileUploadJob,
-     hide_progress: bool = False,
-     global_pbar: Optional[tqdm] = None,
- ) -> Tuple[int, Path]:
+     *,
+     mission_id: UUID,
+     filename: str,
+     path: Path,
+     verbose: bool = False,
+ ) -> UploadState:
      """\
      returns bytes uploaded
      """
 
-     pbar = tqdm(
-         total=os.path.getsize(job.path),
+     total_size = path.stat().st_size
+     with tqdm(
+         total=total_size,
          unit="B",
          unit_scale=True,
-         desc=f"uploading {job.path.name}...",
+         desc=f"uploading {path}...",
          leave=False,
-         disable=hide_progress,
-     )
+         disable=not verbose,
+     ) as pbar:
+         endpoint = _get_s3_endpoint()
 
-     # get creditials for the upload
-     try:
-         # get upload credentials for a single file
-         access = _get_upload_creditials(
-             client, internal_filenames=[job.name], mission_id=job.mission_id
+         # get per file upload credentials
+         creds = _get_upload_creditials(
+             client, internal_filename=filename, mission_id=mission_id
          )
-         # upload file
-         creds = access[job.name]
-     except Exception as e:
-         pbar.write(f"unable to get upload credentials for file {job.path.name}: {e}")
-         pbar.close()
-         if global_pbar is not None:
-             global_pbar.update()
-         return (0, job.path)
-
-     # do the upload
-     endpoint = _get_s3_endpoint()
-     success = _s3_upload(job.path, endpoint=endpoint, credentials=creds, pbar=pbar)
+         if creds is None:
+             return UploadState.EXISTS
 
-     if not success:
-         try:
-             _cancel_file_upload(client, creds.file_id, job.mission_id)
-         except Exception as e:
-             msg = Text(f"failed to cancel upload: {type(e).__name__}", style="red")
-             pbar.write(raw_rich(msg))
-     else:
-         # tell backend that upload is complete
          try:
-             local_hash = b64_md5(job.path)
-             _confirm_file_upload(client, creds.file_id, local_hash)
-
-             if global_pbar is not None:
-                 msg = Text(f"uploaded {job.path}", style="green")
-                 global_pbar.write(raw_rich(msg))
-                 global_pbar.update()
-
+             _s3_upload(path, endpoint=endpoint, credentials=creds, pbar=pbar)
          except Exception as e:
-             msg = Text(
-                 f"error confirming upload {job.path}: {type(e).__name__}", style="red"
-             )
-             pbar.write(raw_rich(msg))
+             logger.error(format_traceback(e))
+             _cancel_file_upload(client, creds.file_id, mission_id)
+             return UploadState.CANCELED
 
-     pbar.close()
-     return (job.path.stat().st_size, job.path)
+         else:
+             _confirm_file_upload(client, creds.file_id, b64_md5(path))
+             return UploadState.UPLOADED
 
 
- def upload_files(
-     files_map: Dict[str, Path],
-     mission_id: UUID,
-     *,
-     verbose: bool = False,
-     n_workers: int = 2,
- ) -> None:
-     futures = []
-
-     pbar = tqdm(
-         total=len(files_map),
-         unit="files",
-         desc="Uploading files",
-         disable=not verbose,
-     )
-
-     start = monotonic()
-     with ThreadPoolExecutor(max_workers=n_workers) as executor:
-         for name, path in files_map.items():
-             # client is not thread safe
-             client = AuthenticatedClient()
-             job = FileUploadJob(mission_id=mission_id, name=name, path=path)
-             future = executor.submit(
-                 _upload_file,
-                 client=client,
-                 job=job,
-                 hide_progress=not verbose,
-                 global_pbar=pbar,
-             )
-             futures.append(future)
-
-     errors = []
-     total_size = 0
-     for f in futures:
-         try:
-             size, path = f.result()
-             size = size / 1024 / 1024  # convert to MB
-
-             if not verbose and size > 0:
-                 print(path.absolte())
-
-             total_size += size
-         except Exception as e:
-             errors.append(e)
+ def _get_file_download(client: AuthenticatedClient, id: UUID) -> str:
+     """\
+     get the download url for a file by file id
+     """
+     resp = client.get(DOWNLOAD_URL, params={"uuid": str(id), "expires": True})
 
-     pbar.close()
+     if 400 <= resp.status_code < 500:
+         raise AccessDenied(
+             f"Failed to download file: {resp.json()['message']}"
+             f"Status Code: {resp.status_code}",
+         )
 
-     time = monotonic() - start
-     print(f"upload took {time:.2f} seconds", file=sys.stderr)
-     print(f"total size: {int(total_size)} MB", file=sys.stderr)
-     print(f"average speed: {total_size / time:.2f} MB/s", file=sys.stderr)
+     resp.raise_for_status()
 
-     if errors:
-         raise UploadFailed(f"got unhandled errors: {errors} when uploading files")
+     return resp.text
 
 
- def _url_download(url: str, path: Path, size: int, overwrite: bool = False) -> None:
+ def _url_download(
+     url: str, *, path: Path, size: int, overwrite: bool = False, verbose: bool = False
+ ) -> None:
      if path.exists() and not overwrite:
-         raise FileExistsError(f"File already exists: {path}")
+         raise FileExistsError(f"file already exists: {path}")
 
      with httpx.stream("GET", url) as response:
          with open(path, "wb") as f:
              with tqdm(
-                 total=size, desc=f"Downloading {path.name}", unit="B", unit_scale=True
+                 total=size,
+                 desc=f"downloading {path.name}",
+                 unit="B",
+                 unit_scale=True,
+                 leave=False,
+                 disable=not verbose,
              ) as pbar:
                  for chunk in response.iter_bytes(chunk_size=DOWNLOAD_CHUNK_SIZE):
                      f.write(chunk)
                      pbar.update(len(chunk))
 
 
+ class DownloadState(Enum):
+     DOWNLOADED_OK = 1
+     SKIPPED_OK = 2
+     DOWNLOADED_INVALID_HASH = 3
+     SKIPPED_INVALID_HASH = 4
+     SKIPPED_INVALID_REMOTE_STATE = 5
+
+
  def download_file(
      client: AuthenticatedClient,
-     file_id: UUID,
-     name: str,
-     dest: Path,
-     hash: str,
-     size: int,
- ) -> None:
-     download_url = _get_file_download(client, file_id)
+     *,
+     file: File,
+     path: Path,
+     overwrite: bool = False,
+     verbose: bool = False,
+ ) -> DownloadState:
+     # skip files that are not ok on remote
+     if file.state != FileState.OK:
+         return DownloadState.SKIPPED_INVALID_REMOTE_STATE
+
+     # skip existing files depending on flags set
+     if path.exists():
+         local_hash = b64_md5(path)
+         if local_hash != file.hash and not overwrite and file.hash is not None:
+             return DownloadState.SKIPPED_INVALID_HASH
+
+         elif local_hash == file.hash:
+             return DownloadState.SKIPPED_OK
+
+         # this has to be here
+         if verbose:
+             tqdm.write(
+                 styled_string(f"overwriting {path}, hash missmatch", style="yellow")
+             )
+
+     # request a download url
+     download_url = _get_file_download(client, file.id)
+
+     # create parent directories
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+     # download the file and check the hash
+     _url_download(
+         download_url, path=path, size=file.size, overwrite=overwrite, verbose=verbose
+     )
+     observed_hash = b64_md5(path)
+     if file.hash is not None and observed_hash != file.hash:
+         return DownloadState.DOWNLOADED_INVALID_HASH
+     return DownloadState.DOWNLOADED_OK
+
+
+ UPLOAD_STATE_COLOR = {
+     UploadState.UPLOADED: "green",
+     UploadState.EXISTS: "yellow",
+     UploadState.CANCELED: "red",
+ }
+
+
+ def _upload_handler(
+     future: Future[UploadState], path: Path, *, verbose: bool = False
+ ) -> int:
+     try:
+         state = future.result()
+     except Exception as e:
+         logger.error(format_traceback(e))
+         if verbose:
+             tqdm.write(format_error(f"error uploading {path}", e))
+         else:
+             print(path.absolute(), file=sys.stderr)
+         return 0
+
+     if state == UploadState.UPLOADED:
+         msg = f"uploaded {path}"
+     elif state == UploadState.EXISTS:
+         msg = f"skipped {path} already uploaded"
+     else:
+         msg = f"canceled {path} upload"
+
+     if verbose:
+         tqdm.write(styled_string(msg, style=UPLOAD_STATE_COLOR[state]))
+     else:
+         stream = sys.stdout if state == UploadState.UPLOADED else sys.stderr
+         print(path.absolute(), file=stream)
 
-     file_path = dest / name
-     _url_download(download_url, file_path, size)
-     observed_hash = b64_md5(file_path)
+     return path.stat().st_size if state == UploadState.UPLOADED else 0
 
-     if observed_hash != hash:
-         raise CorruptedFile("file hash does not match")
+
+ DOWNLOAD_STATE_COLOR = {
+     DownloadState.DOWNLOADED_OK: "green",
+     DownloadState.SKIPPED_OK: "green",
+     DownloadState.DOWNLOADED_INVALID_HASH: "red",
+     DownloadState.SKIPPED_INVALID_HASH: "yellow",
+     DownloadState.SKIPPED_INVALID_REMOTE_STATE: "purple",
+ }
+
+
+ def _download_handler(
+     future: Future[DownloadState], file: File, path: Path, *, verbose: bool = False
+ ) -> int:
+     try:
+         state = future.result()
+     except Exception as e:
+         logger.error(format_traceback(e))
+         if verbose:
+             tqdm.write(format_error(f"error uploading {path}", e))
+         else:
+             print(path.absolute(), file=sys.stderr)
+         return 0
+
+     if state == DownloadState.DOWNLOADED_OK:
+         msg = f"downloaded {path}"
+     elif state == DownloadState.DOWNLOADED_INVALID_HASH:
+         msg = f"downloaded {path} failed hash check"
+     elif state == DownloadState.SKIPPED_OK:
+         msg = f"skipped {path} already downloaded"
+     elif state == DownloadState.SKIPPED_INVALID_HASH:
+         msg = f"skipped {path} already downloaded, hash missmatch, cosider using `--overwrite`"
+     else:
+         msg = f"skipped {path} remote file has invalid state"
+
+     if verbose:
+         tqdm.write(styled_string(msg, style=DOWNLOAD_STATE_COLOR[state]))
+     else:
+         stream = (
+             sys.stdout
+             if state in (DownloadState.DOWNLOADED_OK, DownloadState.SKIPPED_OK)
+             else sys.stderr
+         )
+         print(path.absolute(), file=stream)
+
+     # number of bytes downloaded
+     return file.size if state == DownloadState.DOWNLOADED_OK else 0
+
+
+ def upload_files(
+     client: AuthenticatedClient,
+     files_map: Dict[str, Path],
+     mission_id: UUID,
+     *,
+     verbose: bool = False,
+     n_workers: int = 2,
+ ) -> None:
+     with tqdm(
+         total=len(files_map),
+         unit="files",
+         desc="uploading files",
+         disable=not verbose,
+         leave=False,
+     ) as pbar:
+         start = monotonic()
+         futures: Dict[Future[UploadState], Path] = {}
+         with ThreadPoolExecutor(max_workers=n_workers) as executor:
+             for name, path in files_map.items():
+                 future = executor.submit(
+                     upload_file,
+                     client=client,
+                     mission_id=mission_id,
+                     filename=name,
+                     path=path,
+                     verbose=verbose,
+                 )
+                 futures[future] = path
+
+         total_size = 0
+         for future in as_completed(futures):
+             size = _upload_handler(future, futures[future], verbose=verbose)
+             total_size += size / 1024 / 1024
+
+             pbar.update()
+             pbar.refresh()
+
+         t = monotonic() - start
+         c = Console(file=sys.stderr)
+         c.print(f"upload took {t:.2f} seconds")
+         c.print(f"total size: {int(total_size)} MB")
+         c.print(f"average speed: {total_size / t:.2f} MB/s")
+
+
+ def download_files(
+     client: AuthenticatedClient,
+     files: Dict[Path, File],
+     *,
+     verbose: bool = False,
+     overwrite: bool = False,
+     n_workers: int = 2,
+ ) -> None:
+     with tqdm(
+         total=len(files),
+         unit="files",
+         desc="downloading files",
+         disable=not verbose,
+         leave=False,
+     ) as pbar:
+
+         start = monotonic()
+         futures: Dict[Future[DownloadState], Tuple[File, Path]] = {}
+         with ThreadPoolExecutor(max_workers=n_workers) as executor:
+             for path, file in files.items():
+                 future = executor.submit(
+                     download_file,
+                     client=client,
+                     file=file,
+                     path=path,
+                     overwrite=overwrite,
+                     verbose=verbose,
+                 )
+                 futures[future] = (file, path)
+
+         total_size = 0
+         for future in as_completed(futures):
+             file, path = futures[future]
+             size = _download_handler(future, file, path, verbose=verbose)
+             total_size += size / 1024 / 1024  # MB
+             pbar.update()
+             pbar.refresh()
+
+         time = monotonic() - start
+         c = Console(file=sys.stderr)
+         c.print(f"download took {time:.2f} seconds")
+         c.print(f"total size: {int(total_size)} MB")
+         c.print(f"average speed: {total_size / time:.2f} MB/s")
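Taken together, the public surface becomes `upload_files`/`download_files`, which now accept a shared `AuthenticatedClient` instead of constructing one per worker as the old code did. A hypothetical call of the new upload path (the module path for `upload_files` is not shown in this diff and is assumed, as are the file name and mission id):

    from pathlib import Path
    from uuid import UUID

    from kleinkram.api.client import AuthenticatedClient
    # from kleinkram... import upload_files  # exact module path not shown in the diff

    client = AuthenticatedClient()  # zero-arg construction, as in the pre-diff code
    upload_files(
        client,
        {"run.bag": Path("data/run.bag")},  # maps upload name -> local path
        UUID("123e4567-e89b-42d3-a456-426614174000"),  # a UUIDv4 mission id
        verbose=True,  # progress bars and styled messages; otherwise paths go to stdout/stderr
        n_workers=2,
    )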