llmboost-hub 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,559 @@
1
+ import click
2
+ import subprocess
3
+ from typing import List, Optional, Dict, Tuple, Any
4
+
5
+ from llmboost_hub.commands.list import do_list
6
+ from llmboost_hub.utils.config import config
7
+ from llmboost_hub.utils import gpu_info
8
+ import pandas as pd
9
+ import os
10
+ import tabulate
11
+ import time
12
+ import math
13
+ import docker
14
+ from huggingface_hub import HfApi, snapshot_download, errors as hf_errors
15
+ from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn, TextColumn
16
+ import threading
17
+ import hashlib
18
+ from huggingface_hub.utils.tqdm import disable_progress_bars, enable_progress_bars
19
+ import shutil
20
+ from llmboost_hub.utils.fs_utils import path_has_files, dir_size_bytes, sha256_file
21
+ import docker.errors
22
+ from llmboost_hub.commands.completions import complete_model_names
23
+
24
+
25
def do_prep(
    model: str, verbose: bool = False, only_verify: bool = False, fresh: bool = False
) -> dict:
    """
    Prepare (pull) the Docker image and Hugging Face model assets for a model.

    Steps:
      1. Validate exactly one local GPU family (avoid ambiguous matches).
      2. Resolve an exact match for the model via `do_list` (local images first, then lookup).
      3. Pull the docker image (Docker SDK); compact or verbose progress.
      4. Manage HF assets under `config.LBH_MODELS/<model>` using `snapshot_download` via staging:
         - Staging path: `config.LBH_MODELS_STAGING/<model>`
         - Move to final on success.
      5. Modes:
         - `only_verify=True`: additionally verify the docker image digest and the HF files
           (size/hash); an existing local model that fails verification is re-downloaded.
         - `fresh=True`: remove local image and HF directories before re-downloading.

    Args:
        model: Target model identifier (also used as the Hugging Face repo id).
        verbose: If True, stream detailed logs and progress.
        only_verify: Verify the image digest and model files in addition to preparing them.
        fresh: Forcefully remove local image and HF dirs before re-preparing.

    Returns:
        Dict with keys:
            status: 'pulled' or 'error'
            image: Chosen docker image (if resolved)
            model_path: Final HF model directory (if prepared)
            error: Error message if any
            lookup_df: Lookup dataframe (for diagnostics)
    """
    # Check GPU families to avoid ambiguity
    local_families = {gpu_info.gpu_name2family(g) for g in gpu_info.get_gpus() if g}
    list_data = do_list(query=model, local_only=False, verbose=verbose)
    images_df: pd.DataFrame = list_data.get("images_df", pd.DataFrame())
    lookup_df: pd.DataFrame = list_data.get("lookup_df", pd.DataFrame())

    def _result(
        status: str, image: Optional[str], model_path: Optional[str], error: Optional[str]
    ) -> dict:
        """Build the uniform result dict returned by every exit path."""
        return {
            "status": status,
            "image": image,
            "model_path": model_path,
            "error": error,
            "lookup_df": lookup_df,
        }

    # Branch: reject when multiple or zero families.
    # Both messages keep the "No exact match" prefix: the CLI uses it to decide
    # whether to print the lookup table.
    if len(local_families) != 1:
        if local_families:
            reason = "ambiguous GPU families detected on this system"
        else:
            reason = "no supported GPU detected on this system"
        return _result("error", None, None, f"No exact match found ({reason}).")

    # Use list to get filtered lookup_df and find exact model match
    exact_df = images_df[images_df["model"] == model] if not images_df.empty else pd.DataFrame()
    if exact_df.empty:
        exact_df = lookup_df[lookup_df["model"] == model] if not lookup_df.empty else pd.DataFrame()
    # Branch: no exact mapping for the given model name
    if exact_df.empty:
        return _result("error", None, None, f"No exact match found for model '{model}'.")

    # A matching row without a usable image would otherwise raise IndexError/KeyError.
    if "docker_image" in exact_df.columns:
        image_candidates = exact_df["docker_image"].dropna().astype(str)
    else:
        image_candidates = pd.Series(dtype=str)
    if image_candidates.empty:
        return _result(
            "error",
            None,
            None,
            f"No exact match found for model '{model}' (no docker image listed).",
        )
    docker_image = image_candidates.iloc[0]

    # Fresh mode: remove the local docker image
    if fresh:
        try:
            click.echo(f"Fresh mode: removing docker image {docker_image}")
            client_tmp = docker.from_env()
            client_tmp.images.remove(docker_image, force=True, noprune=False)
        except docker.errors.ImageNotFound:
            # Acceptable: image wasn't present locally
            pass
        except Exception as e:
            click.echo(f"Warning: failed to remove docker image {docker_image} in fresh mode.\n{e}")
            return _result(
                "error",
                docker_image,
                None,
                f"Failed to remove docker image {docker_image} in fresh mode: {e}",
            )

    # Resolve HF paths (final and staging) and make sure the staging parent exists
    model_prefix = os.path.join(config.LBH_MODELS, model)  # final path
    staging_prefix = os.path.join(config.LBH_MODELS_STAGING, model)  # staging path
    os.makedirs(os.path.dirname(staging_prefix), exist_ok=True)
    if fresh:
        click.echo(
            f"Fresh mode: removing local model directories at {model_prefix} and {staging_prefix}"
        )
        # Remove each directory independently: a missing final directory must not
        # prevent the staging directory from being cleaned up.
        for stale_dir in (model_prefix, staging_prefix):
            try:
                shutil.rmtree(stale_dir, ignore_errors=False)
            except FileNotFoundError:
                pass
            except Exception as e:
                click.echo(f"Warning: failed to remove local model directories in fresh mode.\n{e}")
                return _result(
                    "error",
                    docker_image,
                    None,
                    f"Failed to remove local model directories ({model_prefix}, {staging_prefix}) in fresh mode: {e}",
                )

    # Pull the docker image using Docker SDK
    try:
        client = docker.from_env()
        if verbose:
            click.echo(f"Pulling Docker image: {docker_image}")
            # Stream verbose logs
            for evt in client.api.pull(docker_image, stream=True, decode=True):
                status = evt.get("status") or ""
                eid = evt.get("id") or ""
                prog = evt.get("progress") or ""
                if eid:
                    click.echo(f"[{eid}] {status} {prog}")
                else:
                    click.echo(status)
        else:
            # Non-verbose: show a compact progress bar based on layer completions
            seen_layers = set()
            done_layers = set()
            progress = Progress(
                TextColumn("Docker pull"),
                BarColumn(),
                TextColumn("{task.completed}/{task.total} layers"),
                TimeElapsedColumn(),
                TimeRemainingColumn(),
            )
            with progress:
                task = progress.add_task("pull", total=0)
                for evt in client.api.pull(docker_image, stream=True, decode=True):
                    eid = evt.get("id")
                    if not eid:
                        continue
                    if eid not in seen_layers:
                        # The total grows as new layer ids appear in the event stream
                        seen_layers.add(eid)
                        progress.update(task, total=len(seen_layers))
                    status = (evt.get("status") or "").lower()
                    if any(
                        k in status
                        for k in ["already exists", "download complete", "pull complete"]
                    ):
                        done_layers.add(eid)
                        progress.update(task, completed=len(done_layers))
    except Exception as e:
        # Image pull failure: surface and exit
        return _result(
            "error", docker_image, None, f"Failed to pull image {docker_image}: {e}"
        )

    def _verify_image_digest(client_obj, image_ref: str) -> Tuple[bool, str]:
        """
        Verify local image digest against the registry digest.

        Returns:
            (True, 'image verified') on success; otherwise (False, reason).
        """
        try:
            dist = client_obj.api.inspect_distribution(image_ref)
            desc = dist.get("Descriptor") or {}
            remote_digest = desc.get("digest") or desc.get("Digest")  # tolerate casing
            if not remote_digest:
                return False, "Remote digest unavailable from registry."
            try:
                img = client_obj.images.get(image_ref)
            except Exception:
                return False, "Local image not found after pull."
            repo_digests = img.attrs.get("RepoDigests") or []
            # RepoDigests entries look like "<repo>@<digest>"; keep only the digest part
            local_digests = {rd.split("@", 1)[1] for rd in repo_digests if "@" in rd}
            if remote_digest in local_digests:
                return True, "image verified"
            return (
                False,
                f"Digest mismatch. Remote={remote_digest}, Local={list(local_digests)[:1] or 'none'}",
            )
        except Exception as e:
            return False, f"Failed to verify image digest: {e}"

    # Verify docker image digest vs remote only when requested
    if only_verify:
        ok_img, msg_img = _verify_image_digest(client, docker_image)
        if not ok_img:
            return _result(
                "error", docker_image, None, f"Docker image verification failed: {msg_img}"
            )

    def _verify_repo(
        repo_id: str, repo_type: str, target_dir: str, show_progress: bool = True
    ) -> Tuple[bool, str]:
        """
        Verify local HF files against HF metadata:
          - All listed files exist.
          - Sizes match; and if sha256 is present in LFS metadata, hashes match.

        Returns:
            (True, 'verified') on success; otherwise (False, reason).
        """
        api = HfApi()
        try:
            info = (
                api.dataset_info(repo_id=repo_id, files_metadata=True)
                if repo_type == "dataset"
                else api.model_info(repo_id=repo_id, files_metadata=True)
            )
        except Exception as e:
            return False, f"Failed to fetch {repo_type} metadata for verification: {e}"

        siblings = list(info.siblings or [])
        missing = []
        wrong_size = []
        wrong_hash = []
        total = len(siblings)

        def _verify_one(sib) -> None:
            """Check one remote file entry against its local copy and record any failure."""
            rel = getattr(sib, "rfilename", None)
            if not rel:
                return
            expected_size = int(getattr(sib, "size", 0) or 0)
            expected_sha = None
            lfs = getattr(sib, "lfs", None) or {}
            if isinstance(lfs, dict):
                expected_sha = lfs.get("sha256") or lfs.get("sha")
            expected_sha = expected_sha or getattr(sib, "sha256", None)

            local_path = os.path.join(target_dir, rel)
            if not os.path.exists(local_path):
                missing.append(rel)
                return
            try:
                stat_size = os.path.getsize(local_path)
            except Exception:
                missing.append(rel)
                return
            if expected_size and stat_size != expected_size:
                wrong_size.append(rel)
            if expected_sha:
                try:
                    if sha256_file(local_path) != expected_sha:
                        wrong_hash.append(rel)
                except Exception:
                    # Best effort: a file that cannot be hashed is not counted as a mismatch
                    pass

        if show_progress and total > 0:
            with Progress(
                TextColumn("Verifying model"),
                BarColumn(),
                TextColumn("{task.completed}/{task.total} files"),
                TimeElapsedColumn(),
                TimeRemainingColumn(),
            ) as progress:
                task = progress.add_task("verify", total=total)
                for s in siblings:
                    _verify_one(s)
                    progress.update(task, advance=1)
        else:
            for s in siblings:
                _verify_one(s)

        if missing or wrong_size or wrong_hash:
            parts = []
            if missing:
                parts.append(f"missing={len(missing)}")
            if wrong_size:
                parts.append(f"size_mismatch={len(wrong_size)}")
            if wrong_hash:
                parts.append(f"hash_mismatch={len(wrong_hash)}")
            detail = ", ".join(parts)
            return False, f"Verification failed for {repo_type} ({detail})."
        return True, "verified"

    def _download_repo(repo_id: str, repo_type: str, target_dir: str) -> Tuple[bool, str]:
        """
        Download the Hugging Face repo snapshot to 'target_dir' with an aggregate progress bar.

        Returns:
            (True, 'ok') on success; otherwise (False, reason).
        """
        os.makedirs(target_dir, exist_ok=True)
        api = HfApi()
        total_size = 0
        try:
            if repo_type == "dataset":
                info = api.dataset_info(repo_id=repo_id, files_metadata=True)
            else:
                info = api.model_info(repo_id=repo_id, files_metadata=True)
            siblings = list(info.siblings or [])
            total_size = sum(int(getattr(s, "size", 0) or 0) for s in siblings)
        except hf_errors.HfHubHTTPError:
            msg = f"Warning: No {repo_type} repo found for {repo_id}; skipping."
            click.echo(msg)
            return False, msg
        except Exception:
            msg = f"Warning: Failed to access {repo_type} repo for {repo_id}; skipping."
            click.echo(msg)
            return False, msg

        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")

        # Background thread for snapshot_download to allow progress polling
        exc_holder: Dict[str, Any] = {"err": None}

        def _bg_download():
            try:
                # Suppress HF internal tqdm bars; we render our own Rich progress
                disable_progress_bars()
                snapshot_download(
                    repo_id=repo_id,
                    repo_type=repo_type if repo_type in ("dataset", "model") else "model",
                    local_dir=target_dir,
                    token=token,
                )
            except Exception as e:
                exc_holder["err"] = e
            finally:
                # Restore default behavior for any later operations
                try:
                    enable_progress_bars()
                except Exception:
                    pass

        t = threading.Thread(target=_bg_download, daemon=True)
        t.start()

        def _current_bytes() -> int:
            return dir_size_bytes(target_dir)

        # Initial status snapshot for progress computation
        start_bytes = _current_bytes()
        if verbose:
            click.echo(
                f"Preparing download ({repo_type}): {start_bytes / (1024*1024):.2f} / "
                f"{(total_size or 0) / (1024*1024):.2f} MiB"
            )

        # Aggregate progress bar (non-verbose) or periodic logs (verbose/unknown total)
        if not verbose and total_size > 0:
            with Progress(
                TextColumn(f"Downloading {repo_type}"),
                BarColumn(),
                TextColumn("{task.percentage:>5.1f}%"),
                TimeElapsedColumn(),
                TimeRemainingColumn(),
            ) as progress:
                task = progress.add_task("download", total=total_size)
                progress.update(task, completed=min(start_bytes, total_size))
                while t.is_alive():
                    time.sleep(0.5)
                    done = _current_bytes()
                    progress.update(task, completed=min(done, total_size))
                # Final refresh once the worker has exited
                done = _current_bytes()
                progress.update(task, completed=min(done, total_size))
        else:
            while t.is_alive():
                time.sleep(1.0)
                done = _current_bytes()
                if verbose and total_size > 0:
                    pct = (done / max(total_size, 1)) * 100.0
                    click.echo(
                        f"Downloading {repo_type}: {done / (1024*1024):.2f} / "
                        f"{total_size / (1024*1024):.2f} MiB ({pct:.2f}%)"
                    )
                elif verbose:
                    click.echo(f"Downloading {repo_type}: {done / (1024*1024):.2f} MiB")
            if verbose and total_size > 0:
                done = _current_bytes()
                pct = (done / max(total_size, 1)) * 100.0
                click.echo(
                    f"Completed {repo_type}: {done / (1024*1024):.2f} / "
                    f"{total_size / (1024*1024):.2f} MiB ({pct:.2f}%)"
                )

        # Propagate exceptions from the background thread
        if exc_holder["err"] is not None:
            return False, f"Hugging Face download error: {exc_holder['err']}"

        # Sanity: if we know total size and downloaded too little, likely gated/unauthorized.
        # The threshold is relative to the advertised size; a fixed 100 MiB floor would
        # reject every fully-downloaded repo smaller than 100 MiB.
        done = _current_bytes()
        if total_size > 0 and done < int(total_size * 0.5):
            msg = (
                f"Incomplete {repo_type} download for {repo_id}. "
                "This model likely requires access/acceptance. "
                "Please run `huggingface-cli login` and ensure you have accepted the model's license/terms "
                f"at https://huggingface.co/{repo_id} (or set HF_TOKEN)."
            )
            return False, msg

        return True, "ok"

    # If a model already exists at the final path:
    # - with only_verify: verify; if verified, return success; if not, re-download to staging
    # - otherwise: reuse it as-is
    if path_has_files(model_prefix) and not fresh:
        if not only_verify:
            # Branch: model exists and no verification requested
            return _result("pulled", docker_image, model_prefix, None)
        # Branch: verify existing local copy first
        ok_existing, msg_exist = _verify_repo(model, "model", model_prefix, show_progress=True)
        if ok_existing:
            return _result("pulled", docker_image, model_prefix, None)
        if verbose:
            click.echo("Existing model failed verification; re-downloading to staging...")

    # Download to staging
    os.makedirs(staging_prefix, exist_ok=True)
    ok_m, msg_m = _download_repo(model, "model", staging_prefix)
    if not ok_m:
        return _result("error", docker_image, model_prefix, msg_m)

    # Verify staging only if requested
    if only_verify:
        ok_m, msg_m = _verify_repo(model, "model", staging_prefix, show_progress=True)
        if not ok_m:
            return _result("error", docker_image, model_prefix, msg_m)

    # Move staging -> final (replace if exists). Removal errors are surfaced rather
    # than ignored: if the old directory survived, shutil.move would nest the staging
    # directory inside it instead of replacing it.
    try:
        os.makedirs(os.path.dirname(model_prefix), exist_ok=True)
        if os.path.exists(model_prefix):
            shutil.rmtree(model_prefix, ignore_errors=False)
        shutil.move(staging_prefix, model_prefix)
    except Exception as e:
        return _result(
            "error", docker_image, model_prefix, f"Failed to finalize model directory: {e}"
        )

    return _result("pulled", docker_image, model_prefix, None)
508
+
509
+
510
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("model", required=True, shell_complete=complete_model_names)
@click.option(
    "--only-verify",
    is_flag=True,
    help="Only verify docker image digest and HF model files; skip verification otherwise.",
)
@click.option(
    "-f",
    "--fresh",
    is_flag=True,
    help="Remove local docker image and HF model and re-download everything fresh.",
)
@click.pass_context
def prep(ctx: click.Context, model, only_verify, fresh):
    """
    Prepare the Docker image and local model assets for a given model.
    """
    # ctx.obj is None unless a parent group populated it; fall back to an empty
    # mapping so the command also works when invoked standalone.
    verbose = (ctx.obj or {}).get("VERBOSE", False)
    result = do_prep(model, verbose=verbose, only_verify=only_verify, fresh=fresh)

    if result["status"] == "pulled":
        click.echo(f"Successfully pulled {result['image']}")
        if result.get("model_path"):
            click.echo(f"Model assets available at: {result['model_path']}")
        return

    # On error: print the error string; tabulate only for "No exact match" cases
    err = result["error"] or "Unknown prep error"
    click.echo(err)

    lookup_df = result.get("lookup_df")
    if (
        isinstance(lookup_df, pd.DataFrame)
        and not lookup_df.empty
        and err.startswith("No exact match")
    ):
        # Show the candidate rows with a 1-based index
        df = lookup_df.copy().reset_index(drop=True)
        df.index += 1
        click.echo(
            tabulate.tabulate(
                df.values.tolist(),
                headers=list(df.columns),
                showindex=list(df.index),
                tablefmt="psql",
            )
        )

    # Return the error string (no exception)
    return err