PyPI - freesolo-flash-dev - Versions diffs - 0.2.25__py3-none-any.whl - Mend

freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111) hide show

flash/__init__.py +29 -0
flash/_channel.py +23 -0
flash/_fileio.py +35 -0
flash/_logging.py +49 -0
flash/_update_check.py +266 -0
flash/catalog.py +253 -0
flash/cli/__init__.py +1 -0
flash/cli/main/__init__.py +227 -0
flash/cli/main/__main__.py +6 -0
flash/cli/main/commands.py +636 -0
flash/cli/main/envpush.py +317 -0
flash/cli/main/render.py +599 -0
flash/cli/main/training_doc.py +455 -0
flash/client/__init__.py +14 -0
flash/client/config.py +70 -0
flash/client/http.py +372 -0
flash/client/runtime_secrets.py +69 -0
flash/client/specs.py +20 -0
flash/cost/__init__.py +16 -0
flash/cost/analytical.py +175 -0
flash/cost/facts.py +114 -0
flash/cost/spec.py +113 -0
flash/cost/types.py +158 -0
flash/engine/__init__.py +6 -0
flash/engine/accounting.py +36 -0
flash/engine/chalk_kernels.py +116 -0
flash/engine/multiturn_rollout.py +780 -0
flash/engine/recipe.py +86 -0
flash/engine/vram.py +603 -0
flash/engine/worker/__init__.py +2916 -0
flash/engine/worker/__main__.py +4 -0
flash/engine/worker/kernel_warmup.py +400 -0
flash/engine/worker/lora.py +796 -0
flash/engine/worker/packing.py +366 -0
flash/engine/worker/perf.py +1048 -0
flash/envs/__init__.py +10 -0
flash/envs/adapter/__init__.py +883 -0
flash/envs/adapter/rubric.py +222 -0
flash/envs/base.py +52 -0
flash/envs/registry.py +62 -0
flash/mcp/__init__.py +1 -0
flash/mcp/server.py +85 -0
flash/providers/__init__.py +59 -0
flash/providers/_auth.py +24 -0
flash/providers/_http.py +230 -0
flash/providers/_instance.py +416 -0
flash/providers/_instance_bootstrap.py +517 -0
flash/providers/_poll.py +311 -0
flash/providers/allocator.py +193 -0
flash/providers/base.py +431 -0
flash/providers/hyperstack/__init__.py +127 -0
flash/providers/hyperstack/api.py +522 -0
flash/providers/hyperstack/auth.py +17 -0
flash/providers/hyperstack/gpus.py +29 -0
flash/providers/hyperstack/jobs/__init__.py +632 -0
flash/providers/hyperstack/jobs/builders.py +122 -0
flash/providers/hyperstack/preflight.py +23 -0
flash/providers/hyperstack/pricing.py +26 -0
flash/providers/hyperstack/train.py +25 -0
flash/providers/lambdalabs/__init__.py +139 -0
flash/providers/lambdalabs/api.py +261 -0
flash/providers/lambdalabs/auth.py +18 -0
flash/providers/lambdalabs/gpus.py +29 -0
flash/providers/lambdalabs/jobs/__init__.py +724 -0
flash/providers/lambdalabs/jobs/builders.py +118 -0
flash/providers/lambdalabs/preflight.py +27 -0
flash/providers/lambdalabs/pricing.py +51 -0
flash/providers/lambdalabs/train.py +27 -0
flash/providers/preflight.py +55 -0
flash/providers/realized.py +80 -0
flash/providers/runpod/__init__.py +130 -0
flash/providers/runpod/api.py +186 -0
flash/providers/runpod/auth.py +37 -0
flash/providers/runpod/cost.py +57 -0
flash/providers/runpod/gpus.py +46 -0
flash/providers/runpod/jobs.py +956 -0
flash/providers/runpod/keys.py +139 -0
flash/providers/runpod/preflight.py +30 -0
flash/providers/runpod/preload.py +915 -0
flash/providers/runpod/pricing.py +18 -0
flash/providers/runpod/slots.py +79 -0
flash/providers/runpod/train/__init__.py +150 -0
flash/providers/runpod/train/deps.py +395 -0
flash/providers/runpod/train/endpoints.py +820 -0
flash/py.typed +0 -0
flash/runner/__init__.py +686 -0
flash/runner/checkpoints.py +82 -0
flash/runner/deploy.py +422 -0
flash/runner/lifecycle.py +672 -0
flash/schema/__init__.py +375 -0
flash/schema/fields.py +331 -0
flash/serve/__init__.py +1 -0
flash/serve/deploy.py +326 -0
flash/serve/pricing.py +60 -0
flash/server/__init__.py +1 -0
flash/server/__main__.py +20 -0
flash/server/app.py +961 -0
flash/server/auth.py +263 -0
flash/server/billing.py +124 -0
flash/server/checkpoints.py +110 -0
flash/server/db.py +160 -0
flash/server/environment_registry.py +102 -0
flash/server/envs.py +360 -0
flash/server/reconcile.py +163 -0
flash/server/run_registry.py +150 -0
flash/spec.py +333 -0
freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0

flash/providers/lambdalabs/jobs/builders.py ADDED Viewed

@@ -0,0 +1,118 @@
+"""Pure, monkeypatch-free building blocks for the Lambda Cloud run lifecycle.
+The Lambda-specific leaf of ``flash.providers.lambdalabs.jobs``: the normalized dataclasses
+(``LambdaInstance``, ``LambdaJobHandle``) and the image accessor. The cross-provider pieces — the
+run-derived sweep label, the bootstrap payload, and the cloud-init ``user_data`` — are shared with
+Hyperstack in ``flash.providers._instance`` and re-exported here so the import path is unchanged.
+This module MUST NOT import the ``jobs`` package ``__init__`` (it is imported BY it).
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+# Shared instance-provider helpers (single source of truth; Lambda binds arm="lambda" + its image).
+from flash.providers._instance import (
+    build_payload as _shared_build_payload,
+)
+from flash.providers._instance import (
+    build_user_data as _shared_build_user_data,
+)
+from flash.providers._instance import (
+    instance_label,
+    run_label_prefix,
+)
+# Public names of this module: the two Lambda dataclasses and the image accessor defined below,
+# the ``build_payload`` / ``build_user_data`` wrappers defined below, and ``instance_label`` /
+# ``run_label_prefix`` re-exported unchanged from ``flash.providers._instance``.
+__all__ = [
+    "LambdaInstance",
+    "LambdaJobHandle",
+    "build_payload",
+    "build_user_data",
+    "instance_label",
+    "lambda_image",
+    "run_label_prefix",
+]
+@dataclass(frozen=True)
+class LambdaInstance:
+    """A launchable (region, instance_type, $/hr) for a managed GPU class — the Lambda analog of a
+    vetted Vast offer.
+    Declared ``frozen=True``, so fields cannot be reassigned after construction."""
+    gpu: str  # canonical class name (GPU_INFO key)
+    instance_type: str  # Lambda instance-type name (e.g. "gpu_1x_a10")
+    region: str  # region the instance type is offered in (presumably a Lambda region name)
+    vram_gb: int  # GPU memory, in GB
+    price_usd_hr: float  # hourly price, in USD
+@dataclass
+class LambdaJobHandle:
+    """Persisted in RunStatus.remote so any process can reattach/cancel (cf. base.JobHandle)."""
+    instance_id: str
+    instance_type: str
+    region: str
+    name: str  # the sweep-matchable instance name (run-derived; see ``instance_label``)
+    gpu: str
+    hourly_usd: float
+    attempt: int
+    started_ts: float
+    def to_dict(self) -> dict:
+        return {
+            "provider": "lambda",
+            "instance_id": self.instance_id,
+            "instance_type": self.instance_type,
+            "region": self.region,
+            "name": self.name,
+            "gpu": self.gpu,
+            "hourly_usd": self.hourly_usd,
+            "attempt": self.attempt,
+            "started_ts": self.started_ts,
+        }
+    @classmethod
+    def from_dict(cls, d: dict) -> LambdaJobHandle:
+        return cls(
+            instance_id=str(d["instance_id"]),
+            instance_type=str(d.get("instance_type") or ""),
+            region=str(d.get("region") or ""),
+            name=str(d.get("name") or ""),
+            gpu=str(d.get("gpu") or ""),
+            hourly_usd=float(d.get("hourly_usd") or 0),
+            attempt=int(d.get("attempt") or 0),
+            started_ts=float(d.get("started_ts") or 0),
+        )
+def lambda_image(gpu: str | None = None) -> str:
+    """Docker image the cloud-init runs on the Lambda host: the prebuilt, PUBLIC ``WORKER_IMAGE``
+    (the byte-identical training stack RunPod bakes). ``FLASH_WORKER_IMAGE`` overrides it; when the
+    operator opts into per-SM warmed images (``FLASH_WORKER_IMAGE_PER_SM`` /
+    ``FLASH_WORKER_IMAGE_TEMPLATE``), the GPU class selects the matching ``-smXX`` tag so the worker's
+    baked kernel cache matches the rented GPU's arch (the same selector RunPod uses)."""
+    from flash.providers.runpod.train import WORKER_IMAGE, worker_image_for_gpu
+    # allow_default=True -> always a concrete image to docker-pull (override / per-sm tag / base).
+    return worker_image_for_gpu(gpu, allow_default=True) or WORKER_IMAGE
+def build_payload(
+    spec, seed: int, attempt: int, runtime_secrets: dict | None = None,
+    cache_host_mount: str | None = None,
+    mode: str | None = None, models: list | None = None,
+) -> dict:
+    """The Lambda bootstrap payload (shared builder, arm='lambda'). ``cache_host_mount`` (the host
+    NFS mount of the attached weight-cache filesystem, /lambda/nfs/<name>) points HF_HOME at it.
+    ``mode='preload'`` + ``models`` makes it a download-only warm payload (no worker)."""
+    return _shared_build_payload(
+        spec, seed, attempt, arm="lambda", runtime_secrets=runtime_secrets,
+        cache_host_mount=cache_host_mount, mode=mode, models=models,
+    )
+def build_user_data(payload: dict, *, gpu: str | None = None) -> str:
+    """The Lambda cloud-init user_data (shared builder, runs the Lambda WORKER_IMAGE)."""
+    return _shared_build_user_data(payload, image=lambda_image(gpu))

flash/providers/lambdalabs/preflight.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""Fail-fast credential checks for the Lambda Cloud substrate (operator-side).
+Mirrors ``providers/runpod/preflight.py``. Lambda is OPT-IN (the allocator only reaches for it
+when ``LAMBDA_API_KEY`` is set), so the only Lambda-specific requirement is ``LAMBDA_API_KEY``;
+HF_TOKEN is a shared run requirement checked once centrally by the cross-provider preflight
+(``flash/providers/preflight.py``), which calls each provider-specific check with
+``require_hf=False`` so HF is never double-reported.
+"""
+from __future__ import annotations
+from flash.providers.lambdalabs.auth import load_api_key
+def missing_credentials(require_hf: bool = True) -> list[str]:
+    """Lambda-related operator config that is missing (empty list == ready).
+    ``require_hf`` is accepted only for signature parity with the RunPod check and is
+    intentionally ignored: Lambda has no provider-owned HF requirement (the shared HF_TOKEN is
+    checked once centrally in ``providers.preflight``).
+    """
+    problems: list[str] = []
+    if not load_api_key():
+        problems.append(
+            "  - LAMBDA_API_KEY: the operator's Lambda Cloud API key (for the lambda provider)"
+        )
+    return problems

flash/providers/lambdalabs/pricing.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Lambda Cloud $/hr: live ``/instance-types`` rate per class, static fallback.
+Lambda prices a fixed instance-type catalog (unlike Vast's live market), so a class's rate is just
+its instance type's ``price_cents_per_hour``. This module gives the provider interface a uniform
+``hourly_rate(gpu)``. Offline-safe: without ``LAMBDA_API_KEY`` (or on any fetch failure) it falls
+back to the static Lambda snapshot below.
+NB: the static fallback is a Lambda-specific map, NOT ``GpuClass.hourly_usd`` — that field is the
+RunPod secure-cloud snapshot, which differs from Lambda's list price for the shared classes (e.g.
+RTX A6000 is $0.49 on RunPod but $1.09 on Lambda).
+"""
+from __future__ import annotations
+from flash._logging import get_logger
+# Module logger; ``hourly_rate`` logs at debug level when live pricing is unavailable.
+logger = get_logger(__name__)
+# Lambda list prices (snapshot 2026-06-25, from /instance-types). Live rates override these.
+# Keys are the canonical GPU class names that ``_static_rate`` looks up; values are USD per hour.
+_STATIC_RATES: dict[str, float] = {
+    "A10": 1.29,
+    "RTX A6000": 1.09,
+    "A100 SXM 40GB": 1.99,
+    "H100": 3.29,
+}
+def _static_rate(name: str) -> float:
+    from flash.providers.base import GPU_INFO
+    # Prefer the Lambda snapshot; fall back to the class's nominal rate for a class we somehow
+    # don't have a Lambda price for (keeps ``hourly_rate`` total).
+    return _STATIC_RATES.get(name) or GPU_INFO[name].hourly_usd
+def hourly_rate(gpu_name: str) -> float:
+    """$/hr for one friendly GPU name on Lambda (live ``/instance-types`` if available, else static)."""
+    from flash.providers.base import canonical_gpu, get_gpu_info
+    name = canonical_gpu(gpu_name)
+    info = get_gpu_info(name)
+    if info.lambda_name:
+        try:
+            from flash.providers.lambdalabs.api import instance_type_price_usd_hr
+            live = instance_type_price_usd_hr(info.lambda_name)
+            if live:
+                return live
+        except Exception as exc:
+            logger.debug("live lambda pricing unavailable for %s (%s); using static", name, exc)
+    return _static_rate(name)

flash/providers/lambdalabs/train.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""Lambda Cloud train submission: build the instance payload + submit a run.
+The worker stack/env is substrate-neutral, so the per-run worker env and dependency resolution are
+shared with RunPod (``providers/runpod/train.py``); this module owns the Lambda-specific submission
+entrypoint and the instance payload shape. Provisioning, polling, and teardown live in
+``providers/lambdalabs/jobs``.
+"""
+from __future__ import annotations
+# Shared, substrate-neutral worker stack (single source of truth on RunPod's module).
+from flash.providers.lambdalabs.jobs import build_payload, submit_run_lambda
+from flash.providers.runpod.train import (
+    WORKER_DEPS,
+    WORKER_SYSTEM_DEPS,
+    build_worker_env,
+    resolve_worker_deps,
+)
+# Public names of this module. All are re-exports: ``build_payload`` / ``submit_run_lambda`` come
+# from ``flash.providers.lambdalabs.jobs``; the worker deps/env helpers come from
+# ``flash.providers.runpod.train``.
+__all__ = [
+    "WORKER_DEPS",
+    "WORKER_SYSTEM_DEPS",
+    "build_payload",
+    "build_worker_env",
+    "resolve_worker_deps",
+    "submit_run_lambda",
+]

flash/providers/preflight.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""RunPod startup preflight.
+``check_run_preflight`` aggregates RunPod's missing-config problems plus the shared Hugging Face
+dataset-repo requirements, so a single startup error lists everything missing.
+"""
+from __future__ import annotations
+import os
+from flash.providers.runpod.preflight import (
+    PreflightError,
+    missing_credentials,
+)
+# Public names of this module: ``PreflightError`` (re-exported from the RunPod preflight module)
+# and the aggregate ``check_run_preflight`` entry point defined below.
+__all__ = [
+    "PreflightError",
+    "check_run_preflight",
+]
+def _missing_hf_credentials() -> list[str]:
+    """Shared run infra every substrate needs."""
+    problems: list[str] = []
+    if not os.environ.get("GITHUB_TOKEN"):
+        problems.append("  - GITHUB_TOKEN: server token with access to managed Freesolo environments")
+    if not os.environ.get("HF_TOKEN"):
+        problems.append(
+            "  - HF_TOKEN: a token with write access to each run's "
+            "`[train] hf_repo`, e.g. `export HF_TOKEN=hf_...`"
+        )
+    return problems
+def _preflight_provider_names() -> set[str]:
+    """The providers whose operator config this control plane must satisfy."""
+    return {"runpod"}
+def check_run_preflight(require_hf: bool = True) -> None:
+    """Validate RunPod operator config; raise on missing."""
+    selected = _preflight_provider_names()
+    problems: list[str] = []
+    # The HF write token is shared run infra and is checked once so it isn't double-reported.
+    # The HF dataset repo itself is per-run (``[train] hf_repo``).
+    if "runpod" in selected:
+        problems += missing_credentials(require_hf=False)
+    if require_hf:
+        problems += _missing_hf_credentials()
+    if problems:
+        raise PreflightError(
+            "the Flash control plane is missing required operator configuration:\n"
+            + "\n".join(problems)
+            + "\n\nSet these on the control-plane host."
+        )

flash/providers/realized.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Realized provider cost (COGS) for a finished run -- the cost side of estimator accuracy.
+RunPod's billing API gives the dollars it ACTUALLY charged, which the reconciliation job
+compares against the run's charged pre-flight estimate. This module owns the ``RealizedCost``
+shape and dispatches to the RunPod shaper by the run's persisted handle
+(``RunStatus.remote['provider']``). The HTTP calls live in the provider's ``api.py``; the pure
+shaping lives in its ``cost.py`` so it stays offline-testable.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class RealizedCost:
+    """Realized provider cost (COGS) attributed to one finished run."""
+    provider: str  # provider tag, e.g. the ``provider`` value of the run's persisted handle
+    realized_usd: float  # total realized cost, in USD
+    by_resource: dict[str, float] = field(
+        default_factory=dict
+    )  # {"gpu": .., "disk": .., "bwd": ..}
+    wall_seconds: float | None = None  # billed wall-clock seconds when the shaper sets it, else None
+    source: dict = field(default_factory=dict)  # audit: resource ids / raw refs
+def realized_cost_for_remote(
+    remote: dict | None, *, start: float, end: float, run_end: float | None = None
+) -> RealizedCost | None:
+    """Pull realized cost for a run from its persisted provider handle, or None if unattributable.
+    ``remote`` is ``RunStatus.remote`` (the last/successful attempt's handle dict). Returns None
+    when there is no handle, no resource id, or an unknown provider -- the run then stays
+    unreconciled (and is retried).
+    Two distinct time bounds, because the two cost sources are different:
+      * ``start``/``end`` bound the RunPod BILLING-API query window. The caller pads ``end`` past the
+        run's terminal time so the settled invoice row is in range (see reconcile ``_SETTLE_SECONDS``).
+      * ``run_end`` is the run's ACTUAL terminal time (~teardown). The instance providers
+        (Lambda/Hyperstack) have no billing endpoint: an instance bills at a flat $/hr from launch to
+        teardown, so their realized COGS is wall x rate over ``started_ts -> run_end`` — it must NOT
+        use the settle-padded ``end`` or it would over-bill by the padding (up to an hour). Defaults
+        to ``end`` for back-compat when the caller doesn't distinguish.
+    """
+    if not remote:
+        return None
+    provider = remote.get("provider") or "runpod"
+    if provider == "runpod":
+        from flash.providers.runpod.cost import realized_cost as runpod_realized
+        return runpod_realized(remote.get("endpoint_id"), start=start, end=end)
+    if provider in ("lambda", "hyperstack"):
+        return _instance_realized_cost(remote, start=start, end=run_end if run_end is not None else end)
+    return None
+def _instance_realized_cost(
+    remote: dict, *, start: float, end: float
+) -> RealizedCost | None:
+    """Realized COGS for an instance-billed provider: wall-clock x the instance's flat $/hr.
+    The instance billed from its launch (``started_ts`` on the handle) until teardown (``end``, the
+    run's true terminal time — NOT a settle-padded billing-query bound). Unattributable -> None (no
+    rate persisted) so the run stays unreconciled rather than booking $0.
+    """
+    rate = remote.get("hourly_usd")
+    rid = remote.get("instance_id") or remote.get("vm_id")
+    # Honor the module contract: no rate OR no auditable resource id -> unattributable (None), so the
+    # run stays unreconciled rather than booking instance cost we can't tie to a resource.
+    if not rate or not rid:
+        return None
+    launch = remote.get("started_ts") or start
+    wall = max(0.0, float(end) - float(launch))
+    usd = round(wall / 3600.0 * float(rate), 6)
+    return RealizedCost(
+        provider=str(remote.get("provider")),
+        realized_usd=usd,
+        by_resource={"gpu": usd},
+        wall_seconds=wall,
+        source={"resource_id": str(rid), "hourly_usd": float(rate), "started_ts": float(launch)},
+    )

flash/providers/runpod/__init__.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""RunPod Flash provider: managed, serverless GPUs (no Docker) for Flash.
+Fine-tuning runs on a dedicated RunPod GPU provisioned by Flash. A decorated Python
+handler (``train._train_body``) executes ``flash.engine.worker`` on the GPU; Flash
+handles provisioning, dependency install, execution, and scale-to-zero teardown.
+Serving exposes an OpenAI-compatible endpoint for a trained LoRA adapter.
+``PROVIDER`` is the ``base.Provider`` implementation the registry hands out; the
+orchestrator/allocator only talk to its interface, never these modules directly.
+"""
+from __future__ import annotations
+from typing import Any
+from flash.providers.base import GpuClass, JobHandle, PollResult, Provider
+class RunpodProvider:
+    """``base.Provider`` for the RunPod Flash substrate."""
+    name = "runpod"
+    def is_configured(self) -> bool:
+        # RunPod is the ALWAYS-ON default substrate, so it is always "available" for
+        # allocation. Pricing is static, and a missing RUNPOD_API_KEY surfaces at provision time
+        # via ensure_auth / the preflight, never as a silent empty candidate list. This matches the
+        # historical ``available_providers()`` which listed runpod unconditionally.
+        return True
+    def preflight(self, require_hf: bool = True) -> list[str]:
+        from flash.providers.runpod.preflight import missing_credentials
+        return missing_credentials(require_hf=require_hf)
+    def gpu_classes(self) -> list[GpuClass]:
+        from flash.providers.runpod.gpus import gpu_classes
+        return gpu_classes()
+    def hourly_rate(self, gpu: str) -> float:
+        from flash.providers.runpod.pricing import hourly_rate
+        return hourly_rate(gpu)
+    def submit_run(
+        self,
+        spec,
+        seed: int,
+        *,
+        log: Any = None,
+        on_handle: Any = None,
+        attempt: int = 0,
+        runtime_secrets: dict[str, str] | None = None,
+        on_last_gpu: bool = False,
+    ) -> PollResult:
+        # ``on_last_gpu`` stretches the no-capacity grace when no further GPU attempt will be made
+        # after this one — either the candidate list is exhausted or the retry budget is exhausted (see
+        # ``jobs.stall_kwargs``); waiting longer can't cost a fallback there is none.
+        from flash.providers.runpod.jobs import submit_run
+        kwargs = {
+            "log": log,
+            "on_handle": on_handle,
+            "attempt": attempt,
+            "on_last_gpu": on_last_gpu,
+        }
+        if runtime_secrets:
+            kwargs["runtime_secrets"] = runtime_secrets
+        return submit_run(spec, seed, **kwargs)
+    def poll(self, handle: JobHandle, spec, seed: int, *, log: Any = None) -> PollResult:
+        from flash.providers.runpod.jobs import JobHandle as RunpodJobHandle
+        from flash.providers.runpod.jobs import (
+            make_hf_failure_detail_reader,
+            make_hf_heartbeat_reader,
+            poll_job,
+            stall_kwargs,
+        )
+        hf_repo = spec.train.hf_repo
+        prefix = f"{spec.phase}/{spec.run_id}/seed{seed}"
+        reader = make_hf_heartbeat_reader(hf_repo, prefix) if hf_repo else None
+        failure_reader = (
+            make_hf_failure_detail_reader(hf_repo, prefix, spec.phase) if hf_repo else None
+        )
+        rh = RunpodJobHandle.from_dict(handle.to_dict())
+        if log is not None:
+            print(f"attaching: job={rh.job_id} endpoint={rh.endpoint_name}", file=log, flush=True)
+        # Same stall tuning as the submit path so a reattached run isn't judged differently:
+        # the original submit's ``on_last_gpu`` is persisted in the handle (by the runner's
+        # on_handle), so reproduce its no-capacity grace here instead of defaulting to the
+        # shorter non-last window. Absent (a pre-persist / non-runpod handle) => False, the
+        # historical default.
+        on_last_gpu = bool(handle.to_dict().get("on_last_gpu", False))
+        return poll_job(
+            rh,
+            log=log,
+            heartbeat_reader=reader,
+            failure_detail_reader=failure_reader,
+            **stall_kwargs(on_last_gpu=on_last_gpu),
+        )
+    def cancel(self, handle: JobHandle) -> None:
+        from flash.providers.runpod import api as runpod_api
+        d = handle.to_dict()
+        if d.get("endpoint_id") and d.get("job_id"):
+            runpod_api.cancel_job(d["endpoint_id"], d["job_id"])
+    def destroy(self, handle: JobHandle) -> None:
+        from flash.providers.runpod import api as runpod_api
+        d = handle.to_dict()
+        if d.get("endpoint_id"):
+            runpod_api.delete_endpoint(d["endpoint_id"])
+    def gc(self, spec) -> None:
+        from flash.providers.runpod.train import terminate_endpoint
+        terminate_endpoint(spec.gpu.type, spec.run_id)
+    def sweep_orphans(self, active_labels: set[str] | None = None) -> list[int]:
+        # No-op: RunPod serverless endpoints have no standing per-run billing to reap on
+        # crash recovery (a failed-before-submit endpoint is GC'd by reconstructed name in
+        # recover_runs). Present for the ``base.Provider`` protocol.
+        return []
+PROVIDER: Provider = RunpodProvider()

flash/providers/runpod/api.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""Thin RunPod REST client (no SDK state): endpoints, queue jobs, health.
+Used by the run supervisor and endpoint GC so that a *fresh process* can
+reattach to / clean up after any run using only the persisted ids + RUNPOD_API_KEY —
+independent of the Flash SDK's local resource registry (which is per-directory,
+whole-dict, last-writer-wins and therefore unreliable across processes).
+"""
+from __future__ import annotations
+import urllib.error
+from typing import Any
+from flash.providers._http import RestClient
+from flash.providers.runpod import keys as _keys
+# Base URL for management calls in this module (endpoint listing/deletion, billing history).
+REST_BASE = "https://rest.runpod.io/v1"
+# Base URL for per-endpoint queue calls in this module (run / status / cancel / health).
+QUEUE_BASE = "https://api.runpod.ai/v2"
+class RunpodApiError(RuntimeError):
+    pass
+# Shared urllib client (full-URL form: callers pass absolute REST/QUEUE urls).
+# Env-only by design: ~/.flash/config.json holds the *Flash* key (client-side),
+# never the RunPod key — the operator sets RUNPOD_API_KEY on the control-plane host.
+#
+# ``RUNPOD_API_KEY`` may be a comma-separated pool of per-account keys: the client tries
+# them active-account-first per call (``keys.ordered_keys``) and fails over to the next
+# account on an auth/quota/not-found error (``keys.is_failover_error``). RunPod endpoints
+# are account-scoped, so a single-account op (status/cancel/delete) resolves no matter
+# which account a failed-over run was provisioned on. A single key => a pool of one.
+#
+# ``error_cls`` names ``RunpodApiError``, the exception type callers in this module catch.
+_CLIENT = RestClient(
+    env_var="RUNPOD_API_KEY",
+    error_cls=RunpodApiError,
+    keys_provider=_keys.ordered_keys,
+    failover_predicate=_keys.is_failover_error,
+)
+def request_with_retries(
+    url: str,
+    method: str = "GET",
+    body: dict | None = None,
+    retries: int = 4,
+    base_delay: float = 2.0,
+) -> Any:
+    """REST call hardened against transient network/5xx blips (jittered backoff)."""
+    return _CLIENT.request_with_retries(
+        url, method=method, body=body, retries=retries, base_delay=base_delay
+    )
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+def list_endpoints() -> list[dict]:
+    # ``RUNPOD_API_KEY`` may be a comma-separated pool of per-account keys. RunPod
+    # endpoints are account-scoped: a plain request_with_retries() call stops at the
+    # first key that succeeds and returns only *that* account's endpoints. Idle-sweep
+    # and slot-reconcile need the full fleet across every account in the pool, so we
+    # query each key independently (with per-key retries) and aggregate.
+    #
+    # Raises on any per-key failure so callers that treat an empty result as "confirmed
+    # absent" (teardown, slot-reconcile) don't act on an incomplete view. Both
+    # sweep_idle_endpoints() and the slot reconcile already catch and skip on exception.
+    pool = _keys.keys()
+    if not pool:
+        # No RUNPOD_API_KEY at all: an empty `pool` would make this return [] WITHOUT a single
+        # authenticated call, and callers read [] as "the fleet is empty / confirmed absent" and may
+        # act on that (teardown, slot-reconcile). Fail loud instead — matching the old single-call
+        # request_with_retries() behavior, which raised on a missing key.
+        raise RunpodApiError(
+            "RUNPOD_API_KEY is not set; refusing to report an empty endpoint fleet"
+        )
+    all_endpoints: list[dict] = []
+    for key in pool:
+        out = _CLIENT.request_with_retries_for_key(key, f"{REST_BASE}/endpoints", retries=2)
+        if not isinstance(out, list):
+            # A 200 whose body isn't the expected list is NOT an empty account — silently skipping it
+            # (the old behavior) yields a partial fleet view that callers trust as complete. Raise so
+            # the per-key failure surfaces, consistent with this function's "fail, don't under-report"
+            # contract above.
+            raise RunpodApiError(
+                f"unexpected /endpoints response for a pool key (got {type(out).__name__}, want list)"
+            )
+        all_endpoints.extend(out)
+    return all_endpoints
+def find_endpoints_by_name(substr: str) -> list[dict]:
+    return [e for e in list_endpoints() if substr in (e.get("name") or "")]
+def delete_endpoint(endpoint_id: str) -> bool:
+    try:
+        request_with_retries(f"{REST_BASE}/endpoints/{endpoint_id}", method="DELETE", retries=2)
+        return True
+    except RunpodApiError as e:
+        # An already-gone endpoint is a clean teardown, not a failure: a 404 (or a body
+        # saying the endpoint "does not exist") means the desired end state — no such
+        # endpoint — already holds. Reporting False here makes undeploy_adapter surface a
+        # misleading "may still be running" 502 for something that's provably gone.
+        return _is_not_found(e)
+def _is_not_found(err: RunpodApiError) -> bool:
+    """True only when a RunpodApiError represents a genuine 404 (endpoint already gone).
+    request_with_retries chains the original urllib HTTPError as ``__cause__`` for every
+    fast-failed 4xx (``raise ... from e``), so the status code is authoritative when a
+    cause is present: a 404 is "already gone", anything else (403/401/5xx) is a real
+    failure and must NOT be swallowed — a body that merely *mentions* "does not exist" on a
+    403 is still a 403. We only fall back to a text match when there is no HTTPError cause
+    (e.g. the "failed after N attempts" path), and even then only on an unambiguous 404.
+    """
+    cause = err.__cause__
+    if isinstance(cause, urllib.error.HTTPError):
+        return cause.code == 404
+    return "http 404" in str(err).lower()
+def endpoint_health(endpoint_id: str) -> dict:
+    return request_with_retries(f"{QUEUE_BASE}/{endpoint_id}/health")
+# ---------------------------------------------------------------------------
+# Queue jobs
+# ---------------------------------------------------------------------------
+def submit_job(endpoint_id: str, input_payload: dict) -> str:
+    """POST /run -> job id (async queue submission)."""
+    out = request_with_retries(
+        f"{QUEUE_BASE}/{endpoint_id}/run", method="POST", body={"input": input_payload}
+    )
+    job_id = out.get("id")
+    if not job_id:
+        raise RunpodApiError(f"submit_job: no job id in response: {out}")
+    return job_id
+def job_status(endpoint_id: str, job_id: str) -> dict:
+    """GET /status/<job_id> -> {status, output?, error?, ...}."""
+    return request_with_retries(f"{QUEUE_BASE}/{endpoint_id}/status/{job_id}")
+def cancel_job(endpoint_id: str, job_id: str) -> dict:
+    return request_with_retries(
+        f"{QUEUE_BASE}/{endpoint_id}/cancel/{job_id}", method="POST", retries=2
+    )
+# ---------------------------------------------------------------------------
+# Realized billing (COGS) -- what RunPod actually charged, for estimator accuracy.
+# ---------------------------------------------------------------------------
+def billing_endpoints(
+    *,
+    start_time: str,
+    end_time: str,
+    endpoint_id: str | None = None,
+    bucket_size: str = "day",
+) -> list[dict]:
+    """Realized serverless spend per endpoint over [start_time, end_time] (ISO-8601).
+    GET /v1/billing/endpoints -> records of {endpointId, time, amount (USD), timeBilledMs, ...}.
+    RunPod has no per-job cost; the finest realized granularity is per-endpoint per time bucket.
+    Flash provisions one endpoint per run, so filtering by ``endpoint_id`` yields that run's
+    realized cost even after the endpoint is torn down (billing history survives deletion).
+    """
+    from urllib.parse import urlencode
+    params: dict[str, str] = {
+        "startTime": start_time,
+        "endTime": end_time,
+        "bucketSize": bucket_size,
+    }
+    if endpoint_id:
+        params["endpointId"] = endpoint_id
+    out = request_with_retries(f"{REST_BASE}/billing/endpoints?{urlencode(params)}")
+    if isinstance(out, list):
+        return out
+    # Defensive: some RunPod list responses wrap rows under a key.
+    if isinstance(out, dict):
+        rows = out.get("data") or out.get("endpoints") or out.get("billing")
+        return rows if isinstance(rows, list) else []
+    return []