PyPI - hugpy - Versions diffs - 0.1.0__tar.gz - Mend

hugpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (230) hide show

hugpy-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,30 @@
+hugpy — Source-Available License
+Copyright (c) 2026 putkoff (hugpy.ai). All rights reserved.
+Permission is granted, free of charge, to use this software ("hugpy") for
+personal, non-commercial, and evaluation purposes only, subject to the
+following conditions:
+1. Redistribution of this software, in source or binary form, modified or
+   unmodified, is not permitted without prior written permission from the
+   copyright holder.
+2. Commercial use — including offering this software, or any service
+   substantially derived from it, to third parties for a fee — is not
+   permitted without prior written permission from the copyright holder.
+3. Modification for personal or internal evaluation use is permitted;
+   distribution of modified versions is not.
+4. This notice must be retained in all copies or substantial portions of
+   the software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+For commercial licensing or redistribution permission: https://hugpy.ai

hugpy-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,80 @@
+Metadata-Version: 2.4
+Name: hugpy
+Version: 0.1.0
+Summary: Self-hosted LLM console: model registry & downloads, streaming chat, OpenAI-compatible /v1 API with on-site keys, GPU worker fleet with cross-machine RPC sharding
+Author: putkoff
+License: hugpy — Source-Available License
+        Copyright (c) 2026 putkoff (hugpy.ai). All rights reserved.
+        Permission is granted, free of charge, to use this software ("hugpy") for
+        personal, non-commercial, and evaluation purposes only, subject to the
+        following conditions:
+        1. Redistribution of this software, in source or binary form, modified or
+           unmodified, is not permitted without prior written permission from the
+           copyright holder.
+        2. Commercial use — including offering this software, or any service
+           substantially derived from it, to third parties for a fee — is not
+           permitted without prior written permission from the copyright holder.
+        3. Modification for personal or internal evaluation use is permitted;
+           distribution of modified versions is not.
+        4. This notice must be retained in all copies or substantial portions of
+           the software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY ARISING
+        FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+        IN THE SOFTWARE.
+        For commercial licensing or redistribution permission: https://hugpy.ai
+Project-URL: Homepage, https://hugpy.ai
+Keywords: llm,llama.cpp,transformers,self-hosted,openai-compatible
+Classifier: License :: Other/Proprietary License
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: flask
+Requires-Dist: gunicorn
+Requires-Dist: pydantic>=2
+Requires-Dist: httpx
+Requires-Dist: aiohttp
+Requires-Dist: requests
+Requires-Dist: psutil
+Requires-Dist: pyyaml
+Requires-Dist: numpy<2.4
+Requires-Dist: pillow
+Requires-Dist: huggingface_hub
+Requires-Dist: bcrypt
+Requires-Dist: abstract_apis
+Requires-Dist: abstract_flask
+Requires-Dist: abstract_security
+Requires-Dist: abstract_utilities
+Requires-Dist: abstract_webtools
+Requires-Dist: abstract_ocr
+Requires-Dist: opencv-python-headless
+Requires-Dist: PyPDF2
+Requires-Dist: pdfplumber
+Requires-Dist: onnxruntime
+Requires-Dist: pytest
+Provides-Extra: transformers
+Requires-Dist: torch; extra == "transformers"
+Requires-Dist: transformers; extra == "transformers"
+Requires-Dist: accelerate; extra == "transformers"
+Provides-Extra: llama
+Requires-Dist: llama-cpp-python; extra == "llama"
+Provides-Extra: embed
+Requires-Dist: sentence-transformers; extra == "embed"
+Provides-Extra: finetune
+Requires-Dist: peft; extra == "finetune"
+Provides-Extra: all
+Requires-Dist: hugpy[embed,finetune,llama,transformers]; extra == "all"
+Dynamic: license-file
+hugpy — Inference you own. Console + API in one process: `hugpy serve`. Join a GPU box to the fleet: `hugpy worker --central https://your-hugpy/`.

hugpy-0.1.0/hugpy/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .imports import *
+from .managers import *
+from .utils import *

hugpy-0.1.0/hugpy/cli.py ADDED Viewed

@@ -0,0 +1,93 @@
+"""hugpy command line.
+    hugpy serve  [--host 0.0.0.0] [--port 7002] [--auth open|external] ...
+    hugpy worker --central https://your-hugpy/ [worker_agent args...]
+`serve` runs the whole product from one process: the API, the built web
+console (when a ui/dist exists — see flask_app._ui_dist_dir), model downloads,
+chat, and the OpenAI-compatible /v1 surface. No nginx, no node.
+`worker` joins this machine to a hugpy central as a GPU worker (or, with
+--role rpc, lends its GPU to the cross-machine shard pool). All flags after
+the subcommand go straight to the worker agent's own parser.
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+def _serve(args: argparse.Namespace) -> int:
+    """Run the hugpy console + API in this process until the server stops.
+    Publishes the auth mode through the HUGPY_AUTH_MODE environment variable,
+    builds the Flask app, and serves it with gunicorn (one worker,
+    ``args.threads`` threads, 300 s timeout). When gunicorn cannot be
+    imported it falls back to Flask's own ``run()`` server instead.
+    Returns 0 once the server call returns.
+    """
+    # Distribution default: single-operator instance, no login wall. The
+    # /v1 API-key system still gates programmatic access. Deployments that
+    # front a real auth service set --auth external (or HUGPY_AUTH_MODE).
+    if args.auth:
+        os.environ["HUGPY_AUTH_MODE"] = args.auth
+    else:
+        # Keep an auth mode already present in the environment.
+        os.environ.setdefault("HUGPY_AUTH_MODE", "open")
+    # Imported only after HUGPY_AUTH_MODE is set — presumably the app reads
+    # it while being imported/built; confirm before moving this import.
+    from hugpy.flask_app import get_hugpy_flask
+    # Empty/missing --origins collapses to None rather than an empty list.
+    origins = [o.strip() for o in (args.origins or "").split(",") if o.strip()] or None
+    flask_app = get_hugpy_flask(name="hugpy", allowed_origins=origins, debug=args.debug)
+    bind = f"{args.host}:{args.port}"
+    try:
+        from gunicorn.app.base import BaseApplication
+    except ImportError:
+        print(f"hugpy: gunicorn not installed; using the Flask dev server on {bind}",
+              file=sys.stderr)
+        flask_app.run(host=args.host, port=args.port, debug=args.debug)
+        return 0
+    # Minimal embedded gunicorn application wrapping the already-built app.
+    class _App(BaseApplication):
+        def load_config(self):
+            self.cfg.set("bind", bind)
+            self.cfg.set("workers", 1)          # singleton registries/job store
+            self.cfg.set("threads", args.threads)
+            self.cfg.set("timeout", 300)
+        def load(self):
+            return flask_app
+    print(f"hugpy serving on http://{bind}  (console at /, API at /api/v1)")
+    _App().run()
+    return 0
+def _worker(_args: argparse.Namespace, passthrough: list[str]) -> int:
+    from hugpy.worker_agent.agent import main as worker_main
+    return worker_main(passthrough)
+def main(argv: list[str] | None = None) -> int:
+    """Parse the command line and dispatch to ``serve`` or ``worker``.
+    ``argv`` defaults to ``sys.argv[1:]``. Returns the exit status of the
+    selected subcommand.
+    """
+    argv = list(sys.argv[1:] if argv is None else argv)
+    parser = argparse.ArgumentParser(prog="hugpy", description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    sub = parser.add_subparsers(dest="cmd", required=True)
+    s = sub.add_parser("serve", help="run the hugpy console + API in one process")
+    s.add_argument("--host", default="0.0.0.0")
+    s.add_argument("--port", type=int, default=7002)
+    s.add_argument("--threads", type=int, default=8)
+    s.add_argument("--auth", choices=("open", "external"),
+                   help="auth mode (default: open, or HUGPY_AUTH_MODE)")
+    s.add_argument("--origins", help="comma-separated CORS origins (default: same-origin only)")
+    s.add_argument("--debug", action="store_true")
+    # Registered so `worker` shows up in the top-level help; its arguments
+    # are never parsed by this parser.
+    w = sub.add_parser("worker", help="join a hugpy central as a worker",
+                       add_help=False)   # the agent owns its own --help
+    # Split: everything after `worker` belongs to the agent's parser.
+    # Only triggers when `worker` is the very first argument.
+    if argv and argv[0] == "worker":
+        # NOTE(review): `w` is an ArgumentParser, not a Namespace; _worker
+        # ignores its first argument, so this is harmless today.
+        return _worker(w, argv[1:])
+    args = parser.parse_args(argv)
+    if args.cmd == "serve":
+        return _serve(args)
+    # parser.error() prints usage and exits (status 2), so the return below
+    # is a defensive fallback that is not reached in practice.
+    parser.error("unknown command")
+    return 2
+if __name__ == "__main__":
+    raise SystemExit(main())

hugpy-0.1.0/hugpy/flask_app/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .app import *
2	+ from .wsgi_app import *

hugpy-0.1.0/hugpy/flask_app/app/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from .functions import *
+from .routes.llm_storage_routes import llm_bp
+from .routes.chat_routes import chat_bp
+from .routes.search_routes import search_bp
+from .routes.upload_routes import upload_bp
+from .routes.worker_routes import worker_bp

hugpy-0.1.0/hugpy/flask_app/app/functions/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .imports import *
+from .downloads import *
+from .chat import *

hugpy-0.1.0/hugpy/flask_app/app/functions/chat/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .streaming import *

hugpy-0.1.0/hugpy/flask_app/app/functions/chat/imports.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from ..imports import *

hugpy-0.1.0/hugpy/flask_app/app/functions/chat/streaming.py ADDED Viewed

@@ -0,0 +1,168 @@
+from .imports import *
+from flask import Response, stream_with_context
+from pydantic import BaseModel
+from typing import Optional, List
+def sse_event(payload: dict) -> bytes:
+    return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n".encode("utf-8")
+def event_to_sse(ev) -> bytes:
+    """Serialize a dispatch StreamEvent to the browser's SSE wire shape.
+    token/done/error get their minimal browser payloads; everything else
+    (status / provisioning progress / continuation markers — including events
+    relayed from a GPU worker) rides through verbatim via model_dump().
+    """
+    t = getattr(ev, "type", None)
+    if t == "token":
+        return sse_event({"type": "token", "text": ev.text})
+    if t == "done":
+        return sse_event({"type": "done", "finish_reason": ev.finish_reason})
+    if t == "error":
+        return sse_event({"type": "error", "message": ev.message})
+    return sse_event(ev.model_dump())
+def chat_iter_sync(agen):
+    """Drive an async generator from Flask's synchronous WSGI context.
+    Creates a private event loop, pulls items from ``agen`` one at a time and
+    yields them; ``str`` items are encoded to UTF-8 bytes, anything else is
+    yielded as-is. The loop is always torn down in ``finally``, including
+    when the consumer stops iterating early.
+    """
+    loop = asyncio.new_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        while True:
+            try:
+                # One blocking step of the async generator per iteration.
+                item = loop.run_until_complete(agen.__anext__())
+                if isinstance(item, str):
+                    item = item.encode("utf-8")
+                yield item
+            except StopAsyncIteration:
+                break
+    finally:
+        try:
+            # Let any async generators started on this loop run their cleanup.
+            loop.run_until_complete(loop.shutdown_asyncgens())
+        except Exception:
+            # Best-effort: a cleanup failure must not mask the original exit.
+            pass
+        asyncio.set_event_loop(None)
+        loop.close()
+def _resolve_max_new_tokens(body: ChatBody) -> int:
+    """Default to the model's full context when the client didn't cap it.
+    A tool, not a service — so when max_new_tokens is omitted we give the model
+    as much room as it has. The engine auto-continues past this per-call cap, so
+    this is the per-pass budget, not a hard ceiling on total output.
+    """
+    if body.max_new_tokens:
+        return body.max_new_tokens
+    try:
+        from .imports import get_model_config
+        cfg = get_model_config(body.model_key) if body.model_key else None
+        ctx = getattr(cfg, "model_max_length", None)
+        if ctx and int(ctx) > 0:
+            return int(ctx)
+    except Exception:
+        pass
+    # Fall back to the global default cap.
+    try:
+        from .imports import DEFAULT_MAX_TOKENS
+        return int(DEFAULT_MAX_TOKENS)
+    except Exception:
+        return 4096
+async def stream_events(body: ChatBody):
+    """Build prompt_kwargs and stream the unified chat engine to SSE.
+    The route is deliberately dumb: it does NOT decide local vs worker. It hands
+    prompt_kwargs to execute_chat_stream, which drives resolve() — and resolve()
+    is the single place that picks in-process / placement-peer / live-GPU-worker
+    and falls back to local. So local and worker chat now stream identically
+    (token-by-token, with auto-continuation past the cap), and there is no
+    separate worker-offload path in this route anymore.
+    Yields SSE-encoded bytes, one frame per engine event. An exception raised
+    while streaming is logged and turned into a final ``error`` frame instead
+    of propagating to the caller.
+    """
+    from .imports import execute_chat_stream
+    prompt_kwargs = {}
+    if body.max_new_tokens:
+        # Explicit cap from the client -> honor it (bounded, per-call).
+        prompt_kwargs["max_new_tokens"] = body.max_new_tokens
+    else:
+        # No cap requested -> run unbounded: the runner generates chunk-by-chunk
+        # until the model naturally stops, so the response is never truncated by
+        # a token limit. (Per-chunk size uses the model's context.)
+        prompt_kwargs["unbounded"] = True
+        prompt_kwargs["max_new_tokens"] = _resolve_max_new_tokens(body)
+    # Optional fields are forwarded only when the client supplied them.
+    if body.model_key:
+        prompt_kwargs["model_key"] = body.model_key
+    if body.temperature is not None:
+        prompt_kwargs["temperature"] = body.temperature
+    if body.do_sample is not None:
+        prompt_kwargs["do_sample"] = body.do_sample
+    # A message list takes precedence over a bare prompt string.
+    if body.messages:
+        prompt_kwargs["messages"] = messages_to_dicts(body.messages)
+    else:
+        prompt_kwargs["prompt"] = body.prompt
+    if body.file:
+        prompt_kwargs["file"] = body.file
+    if body.images:
+        prompt_kwargs["images"] = body.images
+    if body.request_id:
+        # Stable id the engine threads through every continuation pass; also lets
+        # the browser correlate the stream.
+        prompt_kwargs["request_id"] = body.request_id
+    # Text-only chat to a multi-task (e.g. vision) model: route to its
+    # text-generation task instead of the default image-text-to-text, so a
+    # plain prompt uses the text runner. The vision runner requires an image
+    # and would otherwise fail validation. Only do this when no image is given
+    # and the model actually lists text-generation.
+    if not body.images and not body.file and body.model_key:
+        try:
+            from .imports import get_model_config
+            cfg = get_model_config(body.model_key)
+            tasks = getattr(cfg, "tasks", None) or []
+            primary = getattr(cfg, "primary_task", None)
+            if primary != "text-generation" and "text-generation" in tasks:
+                prompt_kwargs["task"] = "text-generation"
+        except Exception:
+            # Config lookup is advisory; on failure the task is left unset.
+            pass
+    # NOTE(review): this logs the full request at INFO, including prompt
+    # text, messages and any image payloads — consider redacting or demoting.
+    logger.info("prompt_kwargs == %s", prompt_kwargs)
+    try:
+        async for event in execute_chat_stream(**prompt_kwargs):
+            yield event_to_sse(event)
+    except Exception as exc:
+        logger.exception("stream_events failed")
+        yield sse_event({"type": "error", "message": str(exc)})
+def chat_stream(mimetype=None, headers=None, **kwargs):
+    logger.info(kwargs)
+    body = ChatBody(**kwargs)
+    return Response(
+        stream_with_context(chat_iter_sync(stream_events(body))),
+        mimetype=mimetype or "text/event-stream",
+        headers=headers or {
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+            "Connection": "keep-alive",
+        },
+        direct_passthrough=True,
+    )

hugpy-0.1.0/hugpy/flask_app/app/functions/downloads/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .downloads import *
+from .downloader import *
+from .cancelable_downloads import *

hugpy-0.1.0/hugpy/flask_app/app/functions/downloads/cancelable_downloads.py ADDED Viewed

@@ -0,0 +1,269 @@
+import multiprocessing as mp
+import tempfile
+from datetime import datetime, timezone
+from flask import jsonify, abort
+from .imports import *
+from .downloader import *
+# ──────────────────────────────────────────────────────────────────────────
+# Tunables (env-overridable). A download that writes no new bytes for
+# STALL_SECONDS is considered stalled and gets killed + resumed. Each download
+# is attempted up to MAX_ATTEMPTS times; HF keeps partial files on disk so a
+# resume picks up where the previous attempt stopped.
+# ──────────────────────────────────────────────────────────────────────────
+STALL_SECONDS = int(os.environ.get("HUGPY_DOWNLOAD_STALL_SECONDS", "180"))
+MAX_ATTEMPTS  = int(os.environ.get("HUGPY_DOWNLOAD_MAX_ATTEMPTS", "4"))
+# ──────────────────────────────────────────────────────────────────────────
+# Error hand-off across the process boundary — the download runs in a child
+# process, so it writes its failure reason to a temp file the monitor reads.
+# ──────────────────────────────────────────────────────────────────────────
+def _error_path(job_id: str) -> str:
+    return os.path.join(tempfile.gettempdir(), f"hugpy-download-{job_id}.err")
+def _write_error(job_id: str, msg: str) -> None:
+    try:
+        with open(_error_path(job_id), "w", encoding="utf-8") as fh:
+            fh.write(msg[:2000])
+    except OSError:
+        pass
+def _read_error(job_id: str) -> str | None:
+    try:
+        with open(_error_path(job_id), "r", encoding="utf-8") as fh:
+            return fh.read().strip() or None
+    except OSError:
+        return None
+def _clear_error(job_id: str) -> None:
+    try:
+        os.remove(_error_path(job_id))
+    except OSError:
+        pass
+def update_model_status(model: dict) -> dict:
+    model.update(model_status(model))
+    return model
+def _estimate_total_bytes(model: dict) -> int | None:
+    """Sum the sizes of exactly the files this download will fetch, so the
+    progress bar can show a real percentage. Respects filename (single GGUF),
+    include patterns, or full repo. Returns None on any failure -> the bar
+    falls back to indeterminate, which still works."""
+    hub_id = model.get("hub_id")
+    if not hub_id:
+        return None
+    repo_id, _ = split_hub_id(hub_id)
+    try:
+        info = hfApi.model_info(repo_id, files_metadata=True)
+    except Exception as exc:
+        logger.info("size estimate failed for %s: %s", hub_id, exc)
+        return None
+    filename = model.get("filename")
+    include = model.get("include")
+    def will_download(path: str) -> bool:
+        if filename:
+            return path == filename or path.endswith("/" + filename)
+        if include:
+            pats = include if isinstance(include, list) else [include]
+            return any(fnmatch.fnmatch(path, p) for p in pats)
+        return True
+    total = sum((s.size or 0) for s in (info.siblings or []) if will_download(s.rfilename))
+    return total or None
+# ──────────────────────────────────────────────────────────────────────────
+# Subprocess worker — module-level so it's spawn-safe. Captures the real
+# failure reason (HF errors propagate out of download_one) into the error file,
+# then re-raises so the process exits non-zero and the monitor sees the failure.
+# ──────────────────────────────────────────────────────────────────────────
+def _download_worker(job_id: str, model_key: str, model: dict) -> None:
+    """Child-process entry point: run one download attempt for ``job_id``.
+    On success the job's error file is removed. On failure the exception
+    type and message are written to the error file and the exception is
+    re-raised.
+    """
+    # Become the leader of a new process group so the parent can signal this
+    # worker's whole group with killpg(). POSIX-only call.
+    os.setpgrp()
+    try:
+        download_one(model=model, model_key=model_key)   # writes hugpy.json via _stamp
+        _clear_error(job_id)
+    except Exception as exc:
+        _write_error(job_id, f"{type(exc).__name__}: {exc}")
+        raise
+def _dir_bytes(path: str) -> int:
+    total = 0
+    for root, _, files in os.walk(path):
+        for f in files:
+            try:
+                total += os.path.getsize(os.path.join(root, f))
+            except OSError:
+                pass
+    return total
+def _is_cancelled(job_id: str) -> bool:
+    cur = job_store.get(job_id)
+    return bool(cur and cur.status == "cancelled")
+def _watch(proc, job_id: str, dest: str, total_bytes: int | None) -> bool:
+    """Sample progress every second while ``proc`` runs.
+    Reports bytes/sec and percentage. Returns True if the transfer STALLED
+    (no new bytes for STALL_SECONDS) — in which case the process group is
+    killed so it can be resumed — or False if the process exited on its own.
+    """
+    last_bytes = _dir_bytes(dest)
+    last_change = time.time()
+    prev_bytes, prev_t = last_bytes, last_change
+    while proc.is_alive():
+        time.sleep(1.0)
+        if _is_cancelled(job_id):
+            return False
+        now = time.time()
+        got = _dir_bytes(dest)
+        bps = max(got - prev_bytes, 0) / max(now - prev_t, 1e-6)
+        prev_bytes, prev_t = got, now
+        if got > last_bytes:
+            last_bytes, last_change = got, now
+        pct = (got / total_bytes) if total_bytes else 0.0
+        job_store.update(job_id, progress=min(pct, 0.999),
+                         downloaded_bytes=got, bytes_per_second=bps, stalled=False)
+        if (now - last_change) >= STALL_SECONDS:
+            job_store.update(job_id, stalled=True)
+            try:
+                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
+            except (ProcessLookupError, PermissionError):
+                pass
+            return True
+    return False
+# ──────────────────────────────────────────────────────────────────────────
+# Launch: spawn the worker under a monitor that auto-resumes a stalled/failed
+# transfer with backoff, surfaces the real error, and resolves the terminal
+# state. A user cancel at any point (status -> cancelled) stops the loop.
+# ──────────────────────────────────────────────────────────────────────────
+def start_cancellable_download(job: Job, model: dict, total_bytes: int | None = None) -> None:
+    dest = route_destination(model=model)
+    logger.info("download -> %s", dest)
+    job_store.update(
+        job.id, status="running", message="Downloading…",
+        total_bytes=total_bytes, attempt=1, max_attempts=MAX_ATTEMPTS,
+        stalled=False, error=None, _model=model,
+    )
+    def _spawn():
+        _clear_error(job.id)
+        p = mp.Process(target=_download_worker, args=(job.id, job.model_key, model), daemon=True)
+        p.start()
+        job_store.update(job.id, _proc=p)
+        return p
+    def monitor() -> None:
+        nonlocal total_bytes
+        if total_bytes is None:
+            total_bytes = _estimate_total_bytes(model)
+            if total_bytes:
+                job_store.update(job.id, total_bytes=total_bytes)
+        attempt = 1
+        while True:
+            if attempt > 1:
+                job_store.update(
+                    job.id, attempt=attempt, status="running", stalled=False,
+                    message=f"Resuming (attempt {attempt}/{MAX_ATTEMPTS})…",
+                )
+            proc = _spawn()
+            stalled = _watch(proc, job.id, dest, total_bytes)
+            proc.join()
+            if _is_cancelled(job.id):
+                return
+            if not stalled and proc.exitcode == 0:
+                job_store.update(
+                    job.id, status="completed", progress=1.0, stalled=False,
+                    downloaded_bytes=_dir_bytes(dest), error=None,
+                    bytes_per_second=None, message=f"Installed at {dest}",
+                )
+                try:
+                    record_downloaded_model(model, dest)
+                    refresh_registry(run_discovery=False)
+                except Exception as exc:
+                    logger.warning("post-download registry refresh failed: %s", exc)
+                return
+            # Failed or stalled — figure out why, then resume or give up.
+            detail = _read_error(job.id) or (
+                f"stalled: no new data for {STALL_SECONDS}s"
+                if stalled else f"worker exited with code {proc.exitcode}"
+            )
+            if attempt >= MAX_ATTEMPTS:
+                job_store.update(
+                    job.id, status="failed", stalled=stalled, bytes_per_second=None,
+                    message="Download stalled." if stalled else "Download failed.",
+                    error=detail,
+                )
+                return
+            backoff = min(2 ** attempt, 30)
+            job_store.update(
+                job.id, status="running", stalled=stalled, error=detail,
+                message=(f"{'Stalled' if stalled else 'Error'}; retrying in {backoff}s "
+                         f"(attempt {attempt + 1}/{MAX_ATTEMPTS})…"),
+            )
+            for _ in range(backoff):
+                if _is_cancelled(job.id):
+                    return
+                time.sleep(1.0)
+            attempt += 1
+    threading.Thread(target=monitor, daemon=True).start()
+def cancel_download(job_id: str) -> dict:
+    job = job_store.get(job_id)
+    if not job:
+        abort(404, description="Unknown job ID.")
+    if job.status not in ("queued", "running"):
+        return {"cancelled": False, "reason": f"job is {job.status}"}
+    # Set status FIRST so the monitor's auto-resume loop sees the cancel and
+    # won't relaunch after we kill the current attempt.
+    job_store.update(job_id, status="cancelled", message="Cancelled by user.",
+                     stalled=False, bytes_per_second=None)
+    proc = getattr(job, "_proc", None)
+    if proc is not None and proc.is_alive():
+        try:
+            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
+        except (ProcessLookupError, PermissionError):
+            pass
+    return {"cancelled": True}
+def retry_download(job_id: str) -> dict:
+    """Resume a failed/cancelled download from where it stopped.
+    Reuses the same job id and the model context captured at first launch, so
+    partial files already on disk are continued (HF resumes), not re-fetched.
+    """
+    job = job_store.get(job_id)
+    if not job:
+        abort(404, description="Unknown job ID.")
+    if job.status in ("queued", "running"):
+        return {"retried": False, "reason": f"job is already {job.status}"}
+    model = getattr(job, "_model", None)
+    if not model:
+        return {"retried": False, "reason": "no model context to resume from"}
+    start_cancellable_download(job, model, total_bytes=job.total_bytes)
+    return {"retried": True, "id": job_id}