freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""RunPod multi-account key pool with quota failover ("waterfall").
|
|
2
|
+
|
|
3
|
+
``RUNPOD_API_KEY`` may hold a single key or a comma-separated list of keys, each for a
|
|
4
|
+
distinct RunPod account. The pool cycles in order: when the preferred account is
|
|
5
|
+
exhausted — out of worker quota or credits, or its key is rejected — provisioning
|
|
6
|
+
fails over to the next account so runs keep landing, and after the last account the
|
|
7
|
+
pointer wraps back to the first so quota recovered on earlier accounts is reused.
|
|
8
|
+
A single key (no comma) behaves exactly as before: the pool is a list of one and no
|
|
9
|
+
failover ever triggers.
|
|
10
|
+
|
|
11
|
+
Two cooperating notions of "which key":
|
|
12
|
+
|
|
13
|
+
* the **active** key (``_idx``) — the preferred account for *new provisioning*. Only
|
|
14
|
+
``advance_key`` moves it (on a deploy-time quota failover), and it also collapses
|
|
15
|
+
``RUNPOD_API_KEY`` to that single key so the ``runpod_flash`` SDK — which reads the
|
|
16
|
+
raw env var and would otherwise send ``"key1,key2"`` as one bearer token (a 401) —
|
|
17
|
+
authenticates against exactly one account.
|
|
18
|
+
* the **ordered** keys (``ordered_keys``) — the active account first, then the rest.
|
|
19
|
+
The REST client tries them in this order *per call* without moving ``_idx``, so an
|
|
20
|
+
operation on an endpoint that lives on a non-preferred account still resolves (RunPod
|
|
21
|
+
endpoints are account-scoped) even after a provisioning failover moved the pointer.
|
|
22
|
+
|
|
23
|
+
The pool is captured from the environment ONCE and cached, so collapsing
|
|
24
|
+
``RUNPOD_API_KEY`` to a single active key never loses the rest of the pool.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import threading
|
|
31
|
+
import urllib.error
|
|
32
|
+
|
|
33
|
+
# Environment variable that carries one key or a comma-separated list of keys.
_ENV_VAR = "RUNPOD_API_KEY"
# Guards the cached pool and the active-key index below.
_lock = threading.Lock()
# Parsed key pool; stays None until the environment is read for the first time.
_pool: list[str] | None = None
# Position of the active (preferred) key inside the pool.
_idx = 0

# HTTP statuses that are specific to ONE account/key, so the next key is worth trying:
#   401  key rejected
#   402  payment required (out of credits)
#   403  forbidden / spend limit
#   404  endpoint/job not on THIS account
#   429  quota/rate
# A genuine hard 4xx (400/409/422) or a 5xx server error is the same on every account,
# so those are NOT failover triggers.
_FAILOVER_CODES = frozenset((401, 402, 403, 404, 429))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _ensure_pool() -> list[str]:
    """Return the cached key pool, parsing ``RUNPOD_API_KEY`` on first use.

    The env var is split on commas; every piece is stripped of surrounding whitespace
    and empty pieces are dropped. The parsed list is stored in ``_pool``, so later
    rewrites of the env var do not change what this returns.
    """
    global _pool
    with _lock:
        if _pool is not None:
            return _pool
        # ``or ""`` keeps the split below safe even if the lookup yields a falsy value.
        raw_value = os.environ.get(_ENV_VAR, "") or ""
        stripped = (piece.strip() for piece in raw_value.split(","))
        _pool = [key for key in stripped if key]
        return _pool
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def keys() -> list[str]:
    """Return a copy of the configured key pool, in configured order.

    The result is empty when ``RUNPOD_API_KEY`` is unset or holds no usable key.
    A copy is returned so callers cannot mutate the cached pool.
    """
    return _ensure_pool().copy()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def key_count() -> int:
    """Return the number of keys in the configured pool (0 when none is configured)."""
    pool = _ensure_pool()
    return len(pool)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def active_key() -> str | None:
    """Return the key of the preferred account, or ``None`` when the pool is empty."""
    pool = _ensure_pool()
    if pool:
        with _lock:
            # Clamp so an index past the end of the pool still resolves to a key.
            position = min(_idx, len(pool) - 1)
            return pool[position]
    return None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def ordered_keys() -> list[str]:
    """Return every key, rotated so the active account comes first.

    This is the preferred-first order in which keys are tried per call; the
    active index itself is not moved.
    """
    pool = _ensure_pool()
    if not pool:
        return []
    with _lock:
        start = min(_idx, len(pool) - 1)
        head = pool[start:]
        tail = pool[:start]
        return head + tail
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def select_active() -> str | None:
    """Collapse ``RUNPOD_API_KEY`` to the single active key (for the SDK) and return it.

    The runpod_flash SDK reads the raw env var, so a comma-list would be sent as one
    bearer token. Collapsing to the active key keeps the SDK authenticated against one
    account; the cached pool still holds the rest for failover.

    Returns:
        The active key, or ``None`` when no key is configured (the env var is then
        left untouched).
    """
    pool = _ensure_pool()
    if not pool:
        return None
    # Read the active index and write the env var inside ONE critical section.
    # Reading first and writing after releasing the lock would let a concurrent
    # ``advance_key`` slip in between, after which the stale key would overwrite the
    # env var and leave it disagreeing with ``_idx``. ``active_key()`` is not called
    # here because ``_lock`` is a plain (non-reentrant) lock.
    with _lock:
        active = pool[min(_idx, len(pool) - 1)]
        os.environ[_ENV_VAR] = active
    return active
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def advance_key() -> bool:
    """Move the active pointer to the next account; return False only for a pool of 0 or 1.

    The pointer wraps after the last key (key1 → key2 → key1 → ...), so quota that has
    recovered on an earlier account gets reused. With a single key there is nowhere to
    go and nothing is changed.

    Contract caveat: because of the wrap-around, a multi-key pool ALWAYS returns True.
    True therefore means neither "a fresh, untried account is now active" nor "more
    accounts remain". Do NOT drain the pool with ``while advance_key(): ...`` — that
    loops forever once every account is exhausted. Bound the number of failovers by
    ``key_count()`` instead.

    Side effect: ``RUNPOD_API_KEY`` is rewritten to the newly active key, so readers of
    the raw env var and the preferred-first ordering both follow the failover.
    """
    global _idx
    pool = _ensure_pool()
    with _lock:
        size = len(pool)
        if size > 1:
            _idx = (_idx + 1) % size
            os.environ[_ENV_VAR] = pool[_idx]
            return True
        return False
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def reset() -> None:
    """Drop the cached pool and point back at the first account (intended for tests).

    Nothing is read here: the environment is parsed again lazily, by the next call
    that needs the pool.
    """
    global _pool, _idx
    with _lock:
        _idx = 0
        _pool = None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def is_failover_error(exc: Exception) -> bool:
    """Return True only for an account-specific HTTP status (``_FAILOVER_CODES``).

    Those are the cases where another account can actually serve the request
    (auth / credit / quota / not-found). A hard 4xx (400/409/422), a 5xx server error,
    and network / timeout failures are the same on every account, so none of them
    fail over.

    The REST client chains the underlying ``HTTPError`` as ``__cause__``, so the status
    code found on the cause chain is authoritative. The chain is walked starting at
    ``exc`` itself, which also covers an ``HTTPError`` passed in directly and one that
    has been wrapped more than once; the nearest ``HTTPError`` decides the result.

    Args:
        exc: The exception raised by a RunPod call.

    Returns:
        True if the nearest ``HTTPError`` on the ``__cause__`` chain carries a failover
        status; False otherwise, including when the chain holds no ``HTTPError``.
    """
    visited: set[int] = set()
    current: BaseException | None = exc
    # ``visited`` guards against a (pathological) cyclic ``__cause__`` chain.
    while current is not None and id(current) not in visited:
        if isinstance(current, urllib.error.HTTPError):
            return current.code in _FAILOVER_CODES
        visited.add(id(current))
        current = current.__cause__
    return False
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Fail-fast credential checks for the RunPod substrate (operator-side).
|
|
2
|
+
|
|
3
|
+
These run when the Flash server starts (and before any RunPod Flash provisioning) so
|
|
4
|
+
missing operator configuration produces one clear, actionable error instead of a
|
|
5
|
+
partial run that dies mid-provisioning. End users never see these — their preflight is
|
|
6
|
+
client-side ("do I have a Flash key?", see flash/client).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
from flash.providers.runpod.auth import load_api_key
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PreflightError(RuntimeError):
    """Signals that required operator credentials or configuration are absent."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def missing_credentials(require_hf: bool = True) -> list[str]:
    """List the RunPod-related operator settings that are not configured.

    Args:
        require_hf: When true, a missing or empty ``HF_TOKEN`` is reported as well.

    Returns:
        One human-readable bullet line per missing setting; an empty list means ready.
    """
    api_key_line = " - RUNPOD_API_KEY: the operator's RunPod API key"
    hf_token_line = (
        " - HF_TOKEN: a token with write access to each run's "
        "`[train] hf_repo`, e.g. `export HF_TOKEN=hf_...`"
    )

    found: list[str] = []
    if not load_api_key():
        found.append(api_key_line)
    hf_missing = require_hf and not os.environ.get("HF_TOKEN")
    if hf_missing:
        found.append(hf_token_line)
    return found
|