freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,139 @@
1
+ """RunPod multi-account key pool with quota failover ("waterfall").
2
+
3
+ ``RUNPOD_API_KEY`` may hold a single key or a comma-separated list of keys, each for a
4
+ distinct RunPod account. The pool cycles in order: when the preferred account is
5
+ exhausted — out of worker quota or credits, or its key is rejected — provisioning
6
+ fails over to the next account so runs keep landing, and after the last account the
7
+ pointer wraps back to the first so quota recovered on earlier accounts is reused.
8
+ A single key (no comma) behaves exactly as before: the pool is a list of one and no
9
+ failover ever triggers.
10
+
11
+ Two cooperating notions of "which key":
12
+
13
+ * the **active** key (``_idx``) — the preferred account for *new provisioning*. Only
14
+ ``advance_key`` moves it (on a deploy-time quota failover), and it also collapses
15
+ ``RUNPOD_API_KEY`` to that single key so the ``runpod_flash`` SDK — which reads the
16
+ raw env var and would otherwise send ``"key1,key2"`` as one bearer token (a 401) —
17
+ authenticates against exactly one account.
18
+ * the **ordered** keys (``ordered_keys``) — the active account first, then the rest.
19
+ The REST client tries them in this order *per call* without moving ``_idx``, so an
20
+ operation on an endpoint that lives on a non-preferred account still resolves (RunPod
21
+ endpoints are account-scoped) even after a provisioning failover moved the pointer.
22
+
23
+ The pool is captured from the environment ONCE and cached, so collapsing
24
+ ``RUNPOD_API_KEY`` to a single active key never loses the rest of the pool.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import os
30
+ import threading
31
+ import urllib.error
32
+
33
# Environment variable holding either one RunPod key or a comma-separated list of keys.
_ENV_VAR = "RUNPOD_API_KEY"
# Serializes access to the cached pool and the active-key index defined below.
_lock = threading.Lock()
# Parsed key list; stays None until the first ``_ensure_pool`` call fills it in.
_pool: list[str] | None = None
# Position in ``_pool`` of the active (preferred) key.
_idx = 0

# HTTP statuses that mean "this account/key can't serve the request — try the next key":
# 401 key rejected, 402 payment required (out of credits), 403 forbidden / spend limit,
# 404 endpoint/job not on THIS account, 429 quota/rate. A genuine hard 4xx (400/409/422)
# and a 5xx server error are the same on every account, so they are NOT failover triggers.
_FAILOVER_CODES = frozenset({401, 402, 403, 404, 429})
43
+
44
+
45
def _ensure_pool() -> list[str]:
    """Return the cached key pool, parsing ``RUNPOD_API_KEY`` on the first call.

    The variable's value is split on commas; each piece is stripped of surrounding
    whitespace and empty pieces are discarded. An unset or empty variable yields ``[]``.
    Later calls return the same cached list without re-reading the environment.
    """
    global _pool
    with _lock:
        if _pool is None:
            env_value = os.environ.get(_ENV_VAR, "") or ""
            stripped = (piece.strip() for piece in env_value.split(","))
            _pool = [key for key in stripped if key]
        return _pool
52
+
53
+
54
def keys() -> list[str]:
    """Return a fresh copy of the key pool in configured order.

    The list is empty when ``RUNPOD_API_KEY`` is unset. Mutating the returned list does
    not affect the cached pool.
    """
    return _ensure_pool().copy()
57
+
58
+
59
def key_count() -> int:
    """Return how many keys the pool holds (0 when none is configured)."""
    pool = _ensure_pool()
    return len(pool)
61
+
62
+
63
def active_key() -> str | None:
    """Return the key of the currently preferred account.

    Returns ``None`` when the pool is empty. The stored index is clamped to the last
    valid position before it is used to index the pool.
    """
    pool = _ensure_pool()
    if pool:
        with _lock:
            position = min(_idx, len(pool) - 1)
            return pool[position]
    return None
70
+
71
+
72
def ordered_keys() -> list[str]:
    """Return every key, rotated so the active account's key comes first.

    This is the preferred-first order for trying keys on a single call. The result is a
    new list; an empty pool yields ``[]``.
    """
    pool = _ensure_pool()
    if not pool:
        return []
    with _lock:
        # Clamp in case the stored index exceeds the current pool length.
        start = min(_idx, len(pool) - 1)
    return [*pool[start:], *pool[:start]]
80
+
81
+
82
def select_active() -> str | None:
    """Write the active key into ``RUNPOD_API_KEY`` and return it.

    Consumers that read the raw environment variable would otherwise see the whole
    comma-separated list as a single token. The cached pool is untouched, so the other
    keys remain available for failover. When no key is configured the environment is
    left alone and ``None`` is returned.
    """
    chosen = active_key()
    if chosen is None:
        return None
    os.environ[_ENV_VAR] = chosen
    return chosen
93
+
94
+
95
def advance_key() -> bool:
    """Move the active pointer to the next account, wrapping after the last one.

    Returns ``False`` only when the pool has zero or one key (there is nowhere to go);
    in that case nothing is changed. Otherwise the index advances by one modulo the pool
    size, ``RUNPOD_API_KEY`` is overwritten with the newly active key, and ``True`` is
    returned.

    Contract caveat: because the pointer wraps, a multi-key pool ALWAYS returns ``True``.
    ``True`` therefore does not mean that a fresh, untried account is now active or that
    more accounts remain. Never drain the pool with ``while advance_key(): ...`` — it
    would spin forever once every account is exhausted. Bound the number of failovers by
    ``key_count()`` instead.
    """
    global _idx
    pool = _ensure_pool()
    with _lock:
        size = len(pool)
        if size < 2:
            return False
        _idx = (_idx + 1) % size
        # Keep the raw env var in step so env-reading consumers follow the failover.
        os.environ[_ENV_VAR] = pool[_idx]
    return True
119
+
120
+
121
def reset() -> None:
    """Drop the cached pool and point back at the first account (intended for tests).

    The pool is re-read from the environment on the next access, so whatever
    ``RUNPOD_API_KEY`` holds at that moment becomes the new pool.
    """
    global _pool, _idx
    with _lock:
        _pool = None
        _idx = 0
126
+
127
+
128
def is_failover_error(exc: Exception) -> bool:
    """Return True only for an account-specific HTTP status (``_FAILOVER_CODES``).

    These are the cases where another account can actually serve the request:
    auth / credit / quota / not-found. A hard 4xx (400/409/422), a 5xx server error, and
    network / timeout failures are the same on every account, so none of them fail over.

    The REST client chains the underlying ``HTTPError`` as ``__cause__``
    (``raise ... from e``), so the direct cause is inspected. The exception itself is
    inspected as well, so an ``HTTPError`` that reaches the caller un-wrapped is
    classified by its own status instead of being silently treated as "not a failover
    error".

    Args:
        exc: The exception raised by a RunPod call.

    Returns:
        True if ``exc`` or its direct ``__cause__`` is a ``urllib.error.HTTPError`` whose
        status code is in ``_FAILOVER_CODES``; False otherwise.
    """
    return any(
        isinstance(candidate, urllib.error.HTTPError)
        and candidate.code in _FAILOVER_CODES
        for candidate in (exc, exc.__cause__)
    )
@@ -0,0 +1,30 @@
1
+ """Fail-fast credential checks for the RunPod substrate (operator-side).
2
+
3
+ These run when the Flash server starts (and before any RunPod Flash provisioning) so
4
+ missing operator configuration produces one clear, actionable error instead of a
5
+ partial run that dies mid-provisioning. End users never see these — their preflight is
6
+ client-side ("do I have a Flash key?", see flash/client).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+
13
+ from flash.providers.runpod.auth import load_api_key
14
+
15
+
16
class PreflightError(RuntimeError):
    """Signals that operator credentials or configuration required for RunPod are absent."""
18
+
19
+
20
def missing_credentials(require_hf: bool = True) -> list[str]:
    """List the RunPod-related operator settings that are not configured.

    Args:
        require_hf: When true, a missing or empty ``HF_TOKEN`` environment variable is
            reported as well.

    Returns:
        One human-readable bullet line per missing item; an empty list means ready.
    """
    problems: list[str] = []

    runpod_key = load_api_key()
    if not runpod_key:
        problems.append(" - RUNPOD_API_KEY: the operator's RunPod API key")

    hf_missing = require_hf and not os.environ.get("HF_TOKEN")
    if hf_missing:
        hf_line = (
            " - HF_TOKEN: a token with write access to each run's "
            "`[train] hf_repo`, e.g. `export HF_TOKEN=hf_...`"
        )
        problems.append(hf_line)

    return problems
+ return problems