freesolo-flash-dev 0.2.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. flash/__init__.py +29 -0
  2. flash/_channel.py +23 -0
  3. flash/_fileio.py +35 -0
  4. flash/_logging.py +49 -0
  5. flash/_update_check.py +266 -0
  6. flash/catalog.py +253 -0
  7. flash/cli/__init__.py +1 -0
  8. flash/cli/main/__init__.py +227 -0
  9. flash/cli/main/__main__.py +6 -0
  10. flash/cli/main/commands.py +636 -0
  11. flash/cli/main/envpush.py +317 -0
  12. flash/cli/main/render.py +599 -0
  13. flash/cli/main/training_doc.py +455 -0
  14. flash/client/__init__.py +14 -0
  15. flash/client/config.py +70 -0
  16. flash/client/http.py +372 -0
  17. flash/client/runtime_secrets.py +69 -0
  18. flash/client/specs.py +20 -0
  19. flash/cost/__init__.py +16 -0
  20. flash/cost/analytical.py +175 -0
  21. flash/cost/facts.py +114 -0
  22. flash/cost/spec.py +113 -0
  23. flash/cost/types.py +158 -0
  24. flash/engine/__init__.py +6 -0
  25. flash/engine/accounting.py +36 -0
  26. flash/engine/chalk_kernels.py +116 -0
  27. flash/engine/multiturn_rollout.py +780 -0
  28. flash/engine/recipe.py +86 -0
  29. flash/engine/vram.py +603 -0
  30. flash/engine/worker/__init__.py +2916 -0
  31. flash/engine/worker/__main__.py +4 -0
  32. flash/engine/worker/kernel_warmup.py +400 -0
  33. flash/engine/worker/lora.py +796 -0
  34. flash/engine/worker/packing.py +366 -0
  35. flash/engine/worker/perf.py +1048 -0
  36. flash/envs/__init__.py +10 -0
  37. flash/envs/adapter/__init__.py +883 -0
  38. flash/envs/adapter/rubric.py +222 -0
  39. flash/envs/base.py +52 -0
  40. flash/envs/registry.py +62 -0
  41. flash/mcp/__init__.py +1 -0
  42. flash/mcp/server.py +85 -0
  43. flash/providers/__init__.py +59 -0
  44. flash/providers/_auth.py +24 -0
  45. flash/providers/_http.py +230 -0
  46. flash/providers/_instance.py +416 -0
  47. flash/providers/_instance_bootstrap.py +517 -0
  48. flash/providers/_poll.py +311 -0
  49. flash/providers/allocator.py +193 -0
  50. flash/providers/base.py +431 -0
  51. flash/providers/hyperstack/__init__.py +127 -0
  52. flash/providers/hyperstack/api.py +522 -0
  53. flash/providers/hyperstack/auth.py +17 -0
  54. flash/providers/hyperstack/gpus.py +29 -0
  55. flash/providers/hyperstack/jobs/__init__.py +632 -0
  56. flash/providers/hyperstack/jobs/builders.py +122 -0
  57. flash/providers/hyperstack/preflight.py +23 -0
  58. flash/providers/hyperstack/pricing.py +26 -0
  59. flash/providers/hyperstack/train.py +25 -0
  60. flash/providers/lambdalabs/__init__.py +139 -0
  61. flash/providers/lambdalabs/api.py +261 -0
  62. flash/providers/lambdalabs/auth.py +18 -0
  63. flash/providers/lambdalabs/gpus.py +29 -0
  64. flash/providers/lambdalabs/jobs/__init__.py +724 -0
  65. flash/providers/lambdalabs/jobs/builders.py +118 -0
  66. flash/providers/lambdalabs/preflight.py +27 -0
  67. flash/providers/lambdalabs/pricing.py +51 -0
  68. flash/providers/lambdalabs/train.py +27 -0
  69. flash/providers/preflight.py +55 -0
  70. flash/providers/realized.py +80 -0
  71. flash/providers/runpod/__init__.py +130 -0
  72. flash/providers/runpod/api.py +186 -0
  73. flash/providers/runpod/auth.py +37 -0
  74. flash/providers/runpod/cost.py +57 -0
  75. flash/providers/runpod/gpus.py +46 -0
  76. flash/providers/runpod/jobs.py +956 -0
  77. flash/providers/runpod/keys.py +139 -0
  78. flash/providers/runpod/preflight.py +30 -0
  79. flash/providers/runpod/preload.py +915 -0
  80. flash/providers/runpod/pricing.py +18 -0
  81. flash/providers/runpod/slots.py +79 -0
  82. flash/providers/runpod/train/__init__.py +150 -0
  83. flash/providers/runpod/train/deps.py +395 -0
  84. flash/providers/runpod/train/endpoints.py +820 -0
  85. flash/py.typed +0 -0
  86. flash/runner/__init__.py +686 -0
  87. flash/runner/checkpoints.py +82 -0
  88. flash/runner/deploy.py +422 -0
  89. flash/runner/lifecycle.py +672 -0
  90. flash/schema/__init__.py +375 -0
  91. flash/schema/fields.py +331 -0
  92. flash/serve/__init__.py +1 -0
  93. flash/serve/deploy.py +326 -0
  94. flash/serve/pricing.py +60 -0
  95. flash/server/__init__.py +1 -0
  96. flash/server/__main__.py +20 -0
  97. flash/server/app.py +961 -0
  98. flash/server/auth.py +263 -0
  99. flash/server/billing.py +124 -0
  100. flash/server/checkpoints.py +110 -0
  101. flash/server/db.py +160 -0
  102. flash/server/environment_registry.py +102 -0
  103. flash/server/envs.py +360 -0
  104. flash/server/reconcile.py +163 -0
  105. flash/server/run_registry.py +150 -0
  106. flash/spec.py +333 -0
  107. freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
  108. freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
  109. freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
  110. freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
  111. freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,102 @@
1
+ """Best-effort reporting of published Flash environments to the Freesolo backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import json
7
+ import logging
8
+ import os
9
+ import urllib.error
10
+ import urllib.request
11
+
12
+ from .auth import INTERNAL_KEY_ENV, freesolo_base_url
13
+
14
+ _LOG = logging.getLogger("flash.server.environments")
15
+ _TIMEOUT_S = 10.0
16
+ _PATH = "/api/flash/environments/internal"
17
+ _USE_PATH = "/api/flash/environments/use/internal"
18
+ _DEFAULT_HUB_REPO = "freesolo-co/environment-hub"
19
+ _DEFAULT_HUB_REF = "main"
20
+
21
+
22
def record_published_environment(*, slug: str, name: str, key: dict) -> bool:
    """Persist Hub metadata for a published environment in the platform backend.

    The GitHub publish is the source of truth for the environment package. This
    metadata write exists so the web UI can list Flash environments, so it is
    deliberately best-effort and never blocks `flash env push`.

    Args:
        slug: Environment slug; also used to build the ``hubPath`` field.
        name: Environment name sent as the ``name`` field.
        key: Authenticated key record. ``org_id``, ``user_id`` and
            ``api_key_id`` are read from it.

    Returns:
        True when the backend answered with a 2xx status. False when the
        internal key or the org id is missing, or when the request failed
        (failures are logged as warnings, never raised).
    """
    # Function-scope import: only needed to name the protocol-level exception
    # that urllib lets through un-wrapped (see the except clause below).
    import http.client

    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    org_id = str(key.get("org_id") or "").strip()
    if not internal_key or not org_id:
        return False

    body = {
        "orgId": org_id,
        "slug": slug,
        "name": name,
        "hubRepo": _DEFAULT_HUB_REPO,
        "hubRef": _DEFAULT_HUB_REF,
        "hubPath": f"{slug}/environment.py",
        "publishedByUserId": key.get("user_id"),
        "apiKeyId": key.get("api_key_id"),
        "metadata": {"source": "flash.env.push"},
    }
    payload = json.dumps(body).encode("utf-8")
    try:
        # Request() itself raises ValueError for a URL without a usable scheme,
        # so it is built inside the try to keep this function best-effort.
        req = urllib.request.Request(
            f"{freesolo_base_url()}{_PATH}",
            data=payload,
            method="POST",
            headers={
                "Authorization": f"Bearer {internal_key}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as exc:
        detail = ""
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:500]
        _LOG.warning(
            "failed to record published environment %s: HTTP %s %s",
            slug,
            exc.code,
            detail,
        )
    except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError) as exc:
        # http.client.HTTPException (bad status line, truncated response, ...)
        # is not an OSError and is not wrapped in URLError by urllib, so it
        # must be listed explicitly or it would escape this handler.
        _LOG.warning("failed to record published environment %s: %s", slug, exc)
    return False
70
+
71
+
72
def record_environment_use(*, slug: str, run_id: str, key: dict) -> bool:
    """Report to the platform backend that a run used a published environment.

    Best-effort: every failure is logged as a warning and turned into a
    ``False`` return value instead of being raised.

    Args:
        slug: Environment slug sent as the ``slug`` field.
        run_id: Run identifier sent as the ``runId`` field.
        key: Authenticated key record; only ``org_id`` is read from it.

    Returns:
        True when the backend answered with a 2xx status. False when the
        internal key or the org id is missing, or when the request failed.
    """
    # Function-scope import: only needed to name the protocol-level exception
    # that urllib lets through un-wrapped (see the except clause below).
    import http.client

    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    org_id = str(key.get("org_id") or "").strip()
    if not internal_key or not org_id:
        return False

    payload = json.dumps({"orgId": org_id, "slug": slug, "runId": run_id}).encode("utf-8")
    try:
        # Request() itself raises ValueError for a URL without a usable scheme,
        # so it is built inside the try to keep this function best-effort.
        req = urllib.request.Request(
            f"{freesolo_base_url()}{_USE_PATH}",
            data=payload,
            method="POST",
            headers={
                "Authorization": f"Bearer {internal_key}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as exc:
        detail = ""
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:500]
        _LOG.warning(
            "failed to record environment use %s for run %s: HTTP %s %s",
            slug,
            run_id,
            exc.code,
            detail,
        )
    except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError) as exc:
        # http.client.HTTPException is neither an OSError nor wrapped in
        # URLError by urllib, so it must be listed explicitly.
        _LOG.warning("failed to record environment use %s for run %s: %s", slug, run_id, exc)
    return False
flash/server/envs.py ADDED
@@ -0,0 +1,360 @@
1
+ """Managed Freesolo environment publishing.
2
+
3
+ ``POST /v1/envs`` accepts a packaged Freesolo environment and uploads it to the
4
+ managed environment hub. The returned id is a Freesolo environment slug
5
+ (``namespace/name``) that Flash resolves internally.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ import io
12
+ import os
13
+ import re
14
+ import shutil
15
+ import subprocess
16
+ import tarfile
17
+ import tempfile
18
+ import time
19
+ import urllib.parse
20
+ from pathlib import Path
21
+
22
+ _MAX_UPLOAD_BYTES = 64 * 1024 * 1024
23
+ _MAX_UNCOMPRESSED_BYTES = 256 * 1024 * 1024
24
+ _MAX_MEMBERS = 5000
25
+ _DEFAULT_GITHUB_REPO = "freesolo-co/environment-hub"
26
+ _GITHUB_BRANCH = "main"
27
+ _DEFAULT_ENVIRONMENT_FILE = "environment.py"
28
+ _BLOCKED_TOP_LEVEL_PATHS = {
29
+ ".github",
30
+ ".git",
31
+ "source",
32
+ }
33
+ _GIT_TIMEOUT_S = 180
34
+ _GIT_PUSH_RETRY_DELAYS_SECONDS = (2.0, 5.0)
35
+
36
+
37
+ def _human_mb(n: int) -> str:
38
+ return f"{n / (1024 * 1024):.0f} MB"
39
+
40
+
41
class EnvPublishError(Exception):
    """Environment publishing failure that carries a numeric ``status`` code.

    ``status`` defaults to 400; callers in this module pass other values
    (e.g. 413, 502, 503, 504) for specific failure modes.
    """

    def __init__(self, message: str, *, status: int = 400):
        Exception.__init__(self, message)
        self.status = status
45
+
46
+
47
+ # Reserved hub namespace for the operator/internal service key. Its /v1/me identity is synthetic
48
+ # and shared (no per-user email), so it can't derive a namespace from an email like a user key does —
49
+ # but it IS a trusted identity that can submit runs and read everything, so it must be able to
50
+ # publish environments too. Matches the slug `internal@freesolo.co` would yield (see
51
+ # flash.server.db.ensure_internal_key) so the namespace is stable regardless of the row's email.
52
+ _INTERNAL_NAMESPACE = "internal-freesolo-co"
53
+
54
+
55
def namespace_for(key: dict) -> str:
    """Derive the hub namespace for an authenticated key.

    The internal service key (``auth_kind == "internal"``) is the one reserved
    special case: it gets the fixed ``_INTERNAL_NAMESPACE`` instead of needing
    an email. Every other key must carry an email containing ``@``; the
    namespace is that email lower-cased with each run of characters outside
    ``[a-z0-9]`` collapsed to ``-`` and leading/trailing ``-`` removed.

    Raises:
        EnvPublishError: when a non-internal key has no email containing ``@``.
    """
    is_internal = key.get("auth_kind") == "internal"
    if is_internal:
        return _INTERNAL_NAMESPACE

    email = str(key.get("email") or "")
    if "@" in email:
        derived = re.sub(r"[^a-z0-9]+", "-", email.lower()).strip("-")
        # An email made only of separators collapses to "", hence the fallback.
        return derived if derived else "user"

    raise EnvPublishError(
        "authenticated Freesolo key must include an email (used to derive the hub namespace) — "
        "publish with a key created at https://freesolo.co/sign-in (`flash login`)"
    )
70
+
71
+
72
+ def _sanitize_name(name: str) -> str:
73
+ slug = re.sub(r"[^a-z0-9._-]+", "-", name.lower()).strip("-")
74
+ if slug in {".", ".."} or not re.search(r"[a-z0-9]", slug):
75
+ return "env"
76
+ return slug or "env"
77
+
78
+
79
def _safe_extract(tar_bytes: bytes, dest: Path) -> None:
    """Validate and extract a gzip-compressed tar package into ``dest``.

    Each member is checked before it is written:

    * at most ``_MAX_MEMBERS`` members are accepted;
    * the name is normalized (backslashes treated as ``/``, empty and ``.``
      segments dropped) and any ``..`` segment is rejected;
    * the resolved target must stay inside ``dest``;
    * the first path segment must not be in ``_BLOCKED_TOP_LEVEL_PATHS``;
    * hard links, symlinks and any non-regular, non-directory member are
      rejected;
    * the running total of declared member sizes must not exceed
      ``_MAX_UNCOMPRESSED_BYTES``.

    Members whose name normalizes to nothing are skipped.

    Args:
        tar_bytes: Raw bytes of the ``.tar.gz`` archive.
        dest: Directory to extract into.

    Raises:
        EnvPublishError: on any validation failure, on an invalid archive
            (``tarfile.TarError``) or on an extraction ``OSError``.
    """
    root = dest.resolve()
    # Defence in depth for untrusted uploads: when the running Python supports
    # tarfile extraction filters, use the "data" filter so the stdlib also
    # strips unsafe metadata (ownership, special mode bits) on top of the
    # checks below. Feature-detected so older interpreters keep working.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    try:
        with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:gz") as tar:
            total = 0
            for count, member in enumerate(tar, start=1):
                if count > _MAX_MEMBERS:
                    raise EnvPublishError(
                        f"env package has too many members (limit {_MAX_MEMBERS})"
                    )
                segments: list[str] = []
                for segment in member.name.replace("\\", "/").split("/"):
                    if not segment or segment == ".":
                        continue
                    if segment == "..":
                        raise EnvPublishError(f"unsafe path in env package: {member.name!r}")
                    segments.append(segment)
                if not segments:
                    continue
                normalized_name = "/".join(segments)
                target = (dest / normalized_name).resolve()
                if target != root and root not in target.parents:
                    raise EnvPublishError(f"unsafe path in env package: {member.name!r}")
                if segments[0] in _BLOCKED_TOP_LEVEL_PATHS:
                    raise EnvPublishError(
                        "env packages must not contain repo-control or source top-level paths"
                    )
                if member.islnk() or member.issym():
                    raise EnvPublishError(f"links are not allowed in env packages: {member.name!r}")
                if not (member.isreg() or member.isdir()):
                    raise EnvPublishError(
                        f"only regular files and directories are allowed in env packages, "
                        f"but {member.name!r} is a special file"
                    )
                # Budget is enforced on the declared sizes before writing.
                total += max(0, member.size)
                if total > _MAX_UNCOMPRESSED_BYTES:
                    raise EnvPublishError(
                        "env package is too large uncompressed "
                        f"(limit {_human_mb(_MAX_UNCOMPRESSED_BYTES)})"
                    )
                # Extract under the normalized name, not the raw archive name.
                member.name = normalized_name
                tar.extract(member, dest, **extract_kwargs)
    except tarfile.TarError as exc:
        raise EnvPublishError(f"env package is not a valid .tar.gz archive: {exc}") from exc
    except OSError as exc:
        raise EnvPublishError(f"env package could not be extracted: {exc}") from exc
125
+
126
+
127
def _github_repo() -> str:
    """Return the GitHub repository that environments are published to."""
    repo = _DEFAULT_GITHUB_REPO
    return repo
129
+
130
+
131
+ def _github_token() -> str | None:
132
+ return os.environ.get("GITHUB_TOKEN")
133
+
134
+
135
+ def _redact(value: str, token: str) -> str:
136
+ if not token:
137
+ return value
138
+ return value.replace(token, "<redacted>").replace(
139
+ urllib.parse.quote(token, safe=""), "<redacted>"
140
+ )
141
+
142
+
143
+ def _credentialed_repo_url(repo: str, token: str) -> str:
144
+ quoted = urllib.parse.quote(token, safe="")
145
+ return f"https://x-access-token:{quoted}@github.com/{repo}.git"
146
+
147
+
148
def _run_git(cwd: Path, args: list[str], *, token: str) -> subprocess.CompletedProcess[str]:
    """Run ``git <args>`` in ``cwd`` and return the completed process.

    The command runs with ``GIT_TERMINAL_PROMPT=0`` added to the current
    environment, captured text output, and a ``_GIT_TIMEOUT_S`` timeout.

    Args:
        cwd: Working directory for the git command.
        args: Arguments passed after ``git``.
        token: Secret that is redacted from any error message.

    Returns:
        The ``subprocess.CompletedProcess`` of a command that exited with 0.

    Raises:
        EnvPublishError: status 503 when the ``git`` executable is missing,
            504 on timeout, 502 when git exits with a non-zero code.
    """
    env = {**os.environ, "GIT_TERMINAL_PROMPT": "0"}
    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT_S,
        )
    except FileNotFoundError as exc:
        raise EnvPublishError(
            "git is required to upload environments to Freesolo", status=503
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise EnvPublishError(
            f"Freesolo environment upload git command timed out after {_GIT_TIMEOUT_S}s",
            status=504,
        ) from exc
    if proc.returncode != 0:
        raw_output = f"{proc.stdout or ''}\n{proc.stderr or ''}".strip()
        # Redact BEFORE truncating: cutting to 1000 characters first could
        # slice the token in half, and the surviving prefix would no longer
        # match the replacement and would leak into the error message.
        output = _redact(raw_output, token)[:1000]
        cmd = _redact("git " + " ".join(args), token)
        raise EnvPublishError(
            f"Freesolo environment upload failed during `{cmd}`: {output}",
            status=502,
        )
    return proc
176
+
177
+
178
+ def _is_retryable_git_publish_error(message: str) -> bool:
179
+ lowered = message.lower()
180
+ permanent = (
181
+ "authentication failed",
182
+ "could not read username",
183
+ "repository not found",
184
+ "permission denied",
185
+ "403",
186
+ "401",
187
+ )
188
+ if any(marker in lowered for marker in permanent):
189
+ return False
190
+ retryable = (
191
+ "failed to push some refs",
192
+ "fetch first",
193
+ "non-fast-forward",
194
+ "stale info",
195
+ "cannot lock ref",
196
+ "connection reset",
197
+ "operation timed out",
198
+ "the remote end hung up",
199
+ "early eof",
200
+ "index.lock",
201
+ "rebase",
202
+ )
203
+ return any(marker in lowered for marker in retryable)
204
+
205
+
206
+ def _copy_package_to_checkout(*, source: Path, checkout: Path, publish_root: str) -> None:
207
+ target = checkout / publish_root
208
+ checkout_root = checkout.resolve()
209
+ target_root = target.resolve()
210
+ if target_root != checkout_root and checkout_root not in target_root.parents:
211
+ raise EnvPublishError("unsafe environment publish path")
212
+ shutil.rmtree(target, ignore_errors=True)
213
+ target.parent.mkdir(parents=True, exist_ok=True)
214
+ shutil.copytree(source, target)
215
+
216
+
217
def _commit_environment_update(
    *, checkout: Path, publish_root: str, message: str, token: str
) -> bool:
    """Stage ``publish_root`` in the checkout and commit it if anything changed.

    Returns:
        True when a commit was created, False when the staged tree has no
        differences under ``publish_root``.

    Raises:
        EnvPublishError: status 504 when the staged-diff check times out, 502
            when it exits with a code other than 0 or 1; errors from the other
            git commands propagate from ``_run_git``.
    """
    identity = (("user.name", "freesolo-bot"), ("user.email", "bot@freesolo.co"))
    for setting, value in identity:
        _run_git(checkout, ["config", setting, value], token=token)
    _run_git(checkout, ["add", "-A", "--", publish_root], token=token)

    try:
        diff = subprocess.run(
            ["git", "diff", "--cached", "--quiet", "--", publish_root],
            cwd=checkout,
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired as exc:
        raise EnvPublishError(
            f"Freesolo environment upload git command timed out after {_GIT_TIMEOUT_S}s",
            status=504,
        ) from exc

    # `git diff --quiet` exit codes: 0 = no staged changes, 1 = changes present.
    exit_code = diff.returncode
    if exit_code == 0:
        return False
    if exit_code == 1:
        _run_git(checkout, ["commit", "-m", message], token=token)
        return True

    output = f"{diff.stdout or ''}\n{diff.stderr or ''}".strip()
    raise EnvPublishError(
        _redact(
            f"Freesolo environment upload failed during staged diff check: {output}", token
        ),
        status=502,
    )
248
+
249
+
250
def _push_environment_commit(*, checkout: Path, token: str) -> None:
    """Rebase the local commit onto the remote branch, then push ``HEAD`` to it."""
    steps = (
        ["pull", "--rebase", "origin", _GITHUB_BRANCH],
        ["push", "origin", f"HEAD:{_GITHUB_BRANCH}"],
    )
    for git_args in steps:
        _run_git(checkout, git_args, token=token)
253
+
254
+
255
def _github_publish_once(
    *,
    dest: Path,
    repo: str,
    token: str,
    publish_root: str,
    message: str,
) -> None:
    """Make one publish attempt: clone, copy the package in, commit and push.

    The hub repository is cloned (single branch) into a fresh temporary
    directory, ``dest`` replaces ``publish_root`` in that checkout, and a push
    happens only when the copy produced a commit.
    """
    with tempfile.TemporaryDirectory(prefix="flash-env-hub-") as workdir:
        work_root = Path(workdir)
        checkout = work_root / "environment-hub"
        clone_args = [
            "clone",
            "--branch",
            _GITHUB_BRANCH,
            "--single-branch",
            _credentialed_repo_url(repo, token),
            str(checkout),
        ]
        _run_git(work_root, clone_args, token=token)
        _copy_package_to_checkout(source=dest, checkout=checkout, publish_root=publish_root)
        committed = _commit_environment_update(
            checkout=checkout,
            publish_root=publish_root,
            message=message,
            token=token,
        )
        if not committed:
            # Nothing changed under publish_root, so there is nothing to push.
            return
        _push_environment_commit(checkout=checkout, token=token)
286
+
287
+
288
def _environment_file_relative_path(root: Path) -> str:
    """Return the relative path of the environment file inside ``root``.

    Raises:
        EnvPublishError: when ``root`` has no regular file named
            ``_DEFAULT_ENVIRONMENT_FILE``.
    """
    if not (root / _DEFAULT_ENVIRONMENT_FILE).is_file():
        raise EnvPublishError("env package must contain environment.py")
    return _DEFAULT_ENVIRONMENT_FILE
293
+
294
+
295
def _github_publish(dest: Path, *, name: str, key: dict) -> str:
    """Publish the extracted package in ``dest`` to the hub repo, with retries.

    The publish path is ``<namespace>/<sanitized name>``. A failed attempt is
    retried after the delays in ``_GIT_PUSH_RETRY_DELAYS_SECONDS`` as long as
    the error is classified retryable; the last error is re-raised otherwise.

    Returns:
        The environment slug ``<namespace>/<sanitized name>``.

    Raises:
        EnvPublishError: status 503 when ``GITHUB_TOKEN`` is unset; otherwise
            whatever validation or git error ended the publish.
    """
    token = _github_token()
    if not token:
        raise EnvPublishError(
            "GITHUB_TOKEN is required to upload environments to Freesolo",
            status=503,
        )
    repo = _github_repo()
    ns = namespace_for(key)
    clean = _sanitize_name(name)
    slug = f"{ns}/{clean}"
    # Called for its validation only: raises when environment.py is missing.
    _environment_file_relative_path(dest)
    has_files = any(entry.is_file() for entry in dest.rglob("*"))
    if not has_files:
        raise EnvPublishError("env package contains no files")
    message = f"Upload Flash environment {ns}/{clean}"

    last_error: EnvPublishError | None = None
    # First attempt has no delay; each later attempt waits its configured delay.
    schedule = (None, *_GIT_PUSH_RETRY_DELAYS_SECONDS)
    final_attempt = len(schedule) - 1
    for attempt, delay in enumerate(schedule):
        if delay is not None:
            time.sleep(delay)
        try:
            _github_publish_once(
                dest=dest,
                repo=repo,
                token=token,
                publish_root=slug,
                message=message,
            )
        except EnvPublishError as exc:
            last_error = exc
            out_of_attempts = attempt == final_attempt
            if out_of_attempts or not _is_retryable_git_publish_error(str(exc)):
                raise
        else:
            return slug
    assert last_error is not None
    raise last_error
331
+
332
+
333
def publish_package(*, package_b64: str, name: str, key: dict) -> str:
    """Decode, validate, extract and publish a base64-encoded ``.tar.gz`` package.

    Args:
        package_b64: The package archive, base64-encoded.
        name: Requested environment name.
        key: Authenticated key record, forwarded to the publish step.

    Returns:
        The slug returned by ``_github_publish``.

    Raises:
        EnvPublishError: for a non-string or empty name, a non-string, invalid
            or empty package, or (status 413) a package above
            ``_MAX_UPLOAD_BYTES``; extraction and publish errors propagate.
    """
    if not isinstance(name, str):
        raise EnvPublishError("env name must be a string")
    if not isinstance(package_b64, str):
        raise EnvPublishError("env package must be a base64 string")
    if not name:
        raise EnvPublishError("missing env name")

    too_large = f"env package upload is too large (limit {_human_mb(_MAX_UPLOAD_BYTES)} compressed)"

    # Cap the encoded length first so an oversized upload is rejected before
    # it is decoded; the decoded size is checked again below.
    encoded_limit = 4 * ((_MAX_UPLOAD_BYTES + 2) // 3) + 3
    if len(package_b64) > encoded_limit:
        raise EnvPublishError(too_large, status=413)

    try:
        payload = base64.b64decode(package_b64, validate=True)
    except Exception as exc:
        raise EnvPublishError("env package is not valid base64") from exc

    if not payload:
        raise EnvPublishError("empty env package")
    if len(payload) > _MAX_UPLOAD_BYTES:
        raise EnvPublishError(too_large, status=413)

    with tempfile.TemporaryDirectory(prefix="flash-env-publish-") as workdir:
        extracted = Path(workdir)
        _safe_extract(payload, extracted)
        return _github_publish(extracted, name=name, key=key)
@@ -0,0 +1,163 @@
1
+ """Daily realized-cost reconciliation: pull what the GPU provider actually billed for each
2
+ finished run and report it to the freesolo backend for estimator accuracy tracking.
3
+
4
+ Flash charges customer-facing training usage from the completed run's final ``cost_usd``. This
5
+ job is the COGS side: the realized provider invoice (RunPod /v1/billing/endpoints).
6
+ The backend's training_cost_accuracy view joins the two per run to surface
7
+ charged-vs-realized error.
8
+
9
+ Best-effort and entirely off the run hot path: it runs in a background loop (see the server
10
+ lifespan), never blocks request handling, and any failure is swallowed and retried next cycle.
11
+ Realized cost is reported with the operator INTERNAL key (this is COGS, not a customer charge),
12
+ which also gates the whole feature -- with no FREESOLO_INTERNAL_KEY set, reconciliation is off.
13
+
14
+ Scope note (v1): cost is attributed from the run's last persisted handle (RunStatus.remote),
15
+ which is exact for the common single-seed run. A multi-seed run keeps only its final seed's
16
+ handle, so its realized cost is currently under-counted -- a known limitation to extend by
17
+ persisting every seed's resource id.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import contextlib
23
+ import json
24
+ import os
25
+ import time
26
+ import urllib.error
27
+ import urllib.request
28
+
29
+ from flash import runner
30
+ from flash.providers.realized import realized_cost_for_remote
31
+ from flash.server.auth import INTERNAL_KEY_ENV, freesolo_base_url
32
+
33
+ _REPORT_PATH = "/api/billing/training-cost"
34
+ _REPORT_TIMEOUT_S = 10.0
35
+ # Provider billing lags; wait this long after a run goes terminal before pulling (so the
36
+ # invoice has settled) and stop retrying once a run is older than the window.
37
+ _SETTLE_SECONDS = 3600.0 # 1h
38
+ _WINDOW_SECONDS = 7 * 86400.0 # only reconcile runs that finished within the last 7 days
39
+ # States that incur no GPU cost -> never reconciled.
40
+ _FREE_TERMINAL_STATES = frozenset({"dry_run"})
41
+ # States whose training is finished and whose GPU cost is therefore final -> eligible for
42
+ # reconciliation. The terminal billable states plus `deployed`: a deployed run finished
43
+ # training (its training invoice has settled) before serving was stood up on top of it, so
44
+ # its realized training cost is final and must be reconciled like any other finished run.
45
+ # (`deployed` is intentionally NOT in runner.TERMINAL_STATES -- it's a live, undeployable-back
46
+ # state -- so it has to be added explicitly here.) Excludes the free states (e.g. dry_run).
47
+ _RECONCILABLE_STATES = (runner.TERMINAL_STATES | {"deployed"}) - _FREE_TERMINAL_STATES
48
+
49
+
50
def reconcile_enabled() -> bool:
    """Tell whether reconciliation is on, i.e. the operator internal key is set and non-empty."""
    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    return True if internal_key else False
53
+
54
+
55
def _report(body: dict) -> bool:
    """POST realized cost to the backend with the internal key (Bearer).

    Best-effort: a metering report must never affect anything, so transport
    and protocol failures are turned into a ``False`` return value.

    Args:
        body: JSON-serializable report payload.

    Returns:
        True when the request completed and the response was read, False when
        the internal key is unset or the request failed.
    """
    # Function-scope import: only needed to name the protocol-level exception
    # that urllib lets through un-wrapped (see the except clause below).
    import http.client

    key = os.environ.get(INTERNAL_KEY_ENV)
    if not key:
        return False
    payload = json.dumps(body).encode("utf-8")
    try:
        # Request() raises ValueError for a URL without a usable scheme, so it
        # is built inside the try to honor the best-effort contract.
        req = urllib.request.Request(
            f"{freesolo_base_url()}{_REPORT_PATH}",
            data=payload,
            method="POST",
            headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=_REPORT_TIMEOUT_S) as resp:
            resp.read()
        return True
    except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError):
        # http.client.HTTPException (e.g. IncompleteRead from resp.read(), or a
        # bad status line) is neither an OSError nor wrapped in URLError.
        return False
74
+
75
+
76
+ def _terminal_ts(status: runner.RunStatus) -> float:
77
+ """The run's training-teardown time, used for both billing (``run_end``) and eligibility
78
+ (settle delay + window). Prefer the frozen ``finished_at`` over the mutable ``updated_at``:
79
+ deploy / late heartbeat / reconcile all move ``updated_at`` past teardown, which would both
80
+ DELAY the settle gate (it counts from the bump, not the finish) and let a long-finished run
81
+ that was merely bumped look "recent" and slip back inside ``_WINDOW_SECONDS``. ``finished_at``
82
+ is stamped once at the terminal transition and never moved; falls back to ``updated_at`` for
83
+ pre-feature runs. ``is not None`` (not truthiness) so a legitimate ``finished_at == 0.0`` is
84
+ honored rather than silently falling back to ``updated_at``."""
85
+ return float(status.finished_at if status.finished_at is not None else status.updated_at)
86
+
87
+
88
def _due(status: runner.RunStatus, now: float) -> bool:
    """Decide whether a run should be reconciled on this pass.

    A run is due when its state is in ``_RECONCILABLE_STATES``, it has not
    been reconciled yet, its age since teardown lies between
    ``_SETTLE_SECONDS`` and ``_WINDOW_SECONDS`` (inclusive), and it carries a
    provider handle (``status.remote``).
    """
    eligible_state = status.state in _RECONCILABLE_STATES
    if not eligible_state or status.reconciled_at:
        return False
    # Age is measured from teardown (_terminal_ts), not from a later
    # updated_at bump.
    age = now - _terminal_ts(status)
    if not (_SETTLE_SECONDS <= age <= _WINDOW_SECONDS):
        return False
    return True if status.remote else False
100
+
101
+
102
def reconcile_run(status: runner.RunStatus, *, now: float | None = None) -> bool:
    """Pull and report the realized cost of one run, marking it reconciled on success.

    Args:
        status: Snapshot of the run to reconcile.
        now: Timestamp recorded as ``reconciled_at``; defaults to the current time.

    Returns:
        True when a positive realized cost was reported. False when no
        positive cost is available yet or the report failed, which leaves the
        run unreconciled so a later cycle (within the window) can retry.
    """
    if now is None:
        now = time.time()
    remote = status.remote or {}

    # Truthiness on purpose (not an `is None` test): per the handle's
    # from_dict, a MISSING started_ts is coerced to 0.0, so 0.0 means
    # "unknown launch" rather than a 1970 launch. Billing a flat $/hr from 0.0
    # would massively inflate the realized cost, hence the created_at fallback.
    launched = remote.get("started_ts") or status.created_at
    start = float(launched)

    # True terminal time of the run (about teardown / billing stop): the
    # frozen finished_at rather than the mutable updated_at (see _terminal_ts).
    run_end = _terminal_ts(status)

    # The padded `end` lets RunPod's billing query reach past run end so the
    # settled invoice is in range; instance providers bill flat $/hr up to
    # teardown and therefore get the un-padded `run_end`.
    padded_end = run_end + _SETTLE_SECONDS
    realized = realized_cost_for_remote(remote, start=start, end=padded_end, run_end=run_end)
    if realized is None or realized.realized_usd <= 0:
        return False

    report = {
        "runId": status.run_id,
        "realizedCostUsd": realized.realized_usd,
        "provider": realized.provider,
        "gpu": remote.get("allocated_gpu") or remote.get("gpu"),
        "costByResource": realized.by_resource,
        "wallSeconds": realized.wall_seconds,
        "costBasis": "realized",
        "source": realized.source,
    }
    delivered = _report(report)
    if not delivered:
        return False

    # Persist locally so the run is not re-pulled/re-reported and so
    # `flash status` can show realized vs estimated. Only the realized-cost
    # fields are written, never `state`: `status` is an earlier snapshot, and
    # writing its state back could revert a run that has since advanced (e.g.
    # to `deployed`, which is non-terminal and so not protected by the
    # terminal-sticky CAS).
    with contextlib.suppress(Exception):
        runner.record_realized_cost(
            status.run_id,
            realized_cost_usd=realized.realized_usd,
            reconciled_at=now,
        )
    return True
149
+
150
+
151
def reconcile_once(*, now: float | None = None) -> int:
    """Sweep the local runs once and reconcile every run that is due.

    Args:
        now: Timestamp used for the eligibility checks; defaults to the current time.

    Returns:
        The number of runs whose realized cost was reported; 0 when
        reconciliation is disabled.
    """
    if not reconcile_enabled():
        return 0
    if now is None:
        now = time.time()

    reported = 0
    for status in runner.list_runs():
        if _due(status, now):
            # A failure on one run must not stop the sweep over the others.
            with contextlib.suppress(Exception):
                if reconcile_run(status, now=now):
                    reported += 1
    return reported