freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Best-effort reporting of published Flash environments to the Freesolo backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import urllib.error
|
|
10
|
+
import urllib.request
|
|
11
|
+
|
|
12
|
+
from .auth import INTERNAL_KEY_ENV, freesolo_base_url
|
|
13
|
+
|
|
14
|
+
_LOG = logging.getLogger("flash.server.environments")
|
|
15
|
+
_TIMEOUT_S = 10.0
|
|
16
|
+
_PATH = "/api/flash/environments/internal"
|
|
17
|
+
_USE_PATH = "/api/flash/environments/use/internal"
|
|
18
|
+
_DEFAULT_HUB_REPO = "freesolo-co/environment-hub"
|
|
19
|
+
_DEFAULT_HUB_REF = "main"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _post_internal(path: str, payload: dict, internal_key: str, what: str) -> bool:
    """POST ``payload`` as JSON to ``freesolo_base_url() + path`` with the internal key.

    Best-effort by design: returns True on a 2xx response and False on any
    failure, logging a warning that names ``what`` instead of raising.

    Args:
        path: URL path appended to ``freesolo_base_url()``.
        payload: JSON-serializable request body.
        internal_key: Bearer token sent in the ``Authorization`` header.
        what: Human-readable description of the record, used in log messages.
    """
    # Function-scope import: only needed here, to catch protocol-level failures
    # (IncompleteRead, BadStatusLine, ...) that are not OSError subclasses.
    import http.client

    try:
        # Building the request inside the ``try`` keeps serialization errors
        # (TypeError) and malformed-URL errors (ValueError) best-effort as well.
        req = urllib.request.Request(
            f"{freesolo_base_url()}{path}",
            data=json.dumps(payload).encode("utf-8"),
            method="POST",
            headers={
                "Authorization": f"Bearer {internal_key}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as exc:
        # HTTPError must be handled before URLError/OSError (it subclasses both).
        detail = ""
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:500]
        _LOG.warning("failed to record %s: HTTP %s %s", what, exc.code, detail)
    except (
        urllib.error.URLError,
        OSError,
        http.client.HTTPException,
        TypeError,
        ValueError,
    ) as exc:
        _LOG.warning("failed to record %s: %s", what, exc)
    return False


def record_published_environment(*, slug: str, name: str, key: dict) -> bool:
    """Persist Hub metadata in the platform backend.

    The GitHub publish is the source of truth for the environment package. This
    metadata write exists so the web UI can list Flash environments, so it is
    deliberately best-effort and never blocks `flash env push`.

    Args:
        slug: Environment slug; also used to build ``hubPath``.
        name: Environment name sent to the backend.
        key: Authenticated key record; ``org_id``, ``user_id`` and
            ``api_key_id`` are read from it.

    Returns:
        True when the backend answered 2xx; False when the internal key or
        ``org_id`` is missing, or when the request failed for any reason.
    """
    org_id = str(key.get("org_id") or "").strip()
    if not org_id:
        return False
    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    if not internal_key:
        return False

    body = {
        "orgId": org_id,
        "slug": slug,
        "name": name,
        "hubRepo": _DEFAULT_HUB_REPO,
        "hubRef": _DEFAULT_HUB_REF,
        "hubPath": f"{slug}/environment.py",
        "publishedByUserId": key.get("user_id"),
        "apiKeyId": key.get("api_key_id"),
        "metadata": {"source": "flash.env.push"},
    }
    return _post_internal(_PATH, body, internal_key, f"published environment {slug}")


def record_environment_use(*, slug: str, run_id: str, key: dict) -> bool:
    """Report to the platform backend that run ``run_id`` used environment ``slug``.

    Best-effort, like ``record_published_environment``: failures are logged and
    reported as False rather than raised.

    Args:
        slug: Environment slug that was used.
        run_id: Identifier of the run that used it.
        key: Authenticated key record; only ``org_id`` is read.

    Returns:
        True when the backend answered 2xx; False when the internal key or
        ``org_id`` is missing, or when the request failed for any reason.
    """
    org_id = str(key.get("org_id") or "").strip()
    if not org_id:
        return False
    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    if not internal_key:
        return False

    payload = {"orgId": org_id, "slug": slug, "runId": run_id}
    return _post_internal(
        _USE_PATH, payload, internal_key, f"environment use {slug} for run {run_id}"
    )
|
flash/server/envs.py
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""Managed Freesolo environment publishing.
|
|
2
|
+
|
|
3
|
+
``POST /v1/envs`` accepts a packaged Freesolo environment and uploads it to the
|
|
4
|
+
managed environment hub. The returned id is a Freesolo environment slug
|
|
5
|
+
(``namespace/name``) that Flash resolves internally.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import io
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import shutil
|
|
15
|
+
import subprocess
|
|
16
|
+
import tarfile
|
|
17
|
+
import tempfile
|
|
18
|
+
import time
|
|
19
|
+
import urllib.parse
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
_MAX_UPLOAD_BYTES = 64 * 1024 * 1024
|
|
23
|
+
_MAX_UNCOMPRESSED_BYTES = 256 * 1024 * 1024
|
|
24
|
+
_MAX_MEMBERS = 5000
|
|
25
|
+
_DEFAULT_GITHUB_REPO = "freesolo-co/environment-hub"
|
|
26
|
+
_GITHUB_BRANCH = "main"
|
|
27
|
+
_DEFAULT_ENVIRONMENT_FILE = "environment.py"
|
|
28
|
+
_BLOCKED_TOP_LEVEL_PATHS = {
|
|
29
|
+
".github",
|
|
30
|
+
".git",
|
|
31
|
+
"source",
|
|
32
|
+
}
|
|
33
|
+
_GIT_TIMEOUT_S = 180
|
|
34
|
+
_GIT_PUSH_RETRY_DELAYS_SECONDS = (2.0, 5.0)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _human_mb(n: int) -> str:
|
|
38
|
+
return f"{n / (1024 * 1024):.0f} MB"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class EnvPublishError(Exception):
    """Raised when an environment package cannot be validated or published.

    ``status`` is a numeric status code carried alongside the message; it
    defaults to 400. (Presumably it becomes the HTTP response status -- confirm
    in the request handler.)
    """

    def __init__(self, message: str, *, status: int = 400):
        super().__init__(message)
        self.status: int = status
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Reserved hub namespace for the operator/internal service key. Its /v1/me identity is synthetic
|
|
48
|
+
# and shared (no per-user email), so it can't derive a namespace from an email like a user key does —
|
|
49
|
+
# but it IS a trusted identity that can submit runs and read everything, so it must be able to
|
|
50
|
+
# publish environments too. Matches the slug `internal@freesolo.co` would yield (see
|
|
51
|
+
# flash.server.db.ensure_internal_key) so the namespace is stable regardless of the row's email.
|
|
52
|
+
_INTERNAL_NAMESPACE = "internal-freesolo-co"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def namespace_for(key: dict) -> str:
    """Return the hub namespace for the authenticated ``key`` record.

    The internal service key (``auth_kind == "internal"``) is the one reserved
    special case: it gets the fixed ``_INTERNAL_NAMESPACE`` instead of needing
    an email. Every other key is a user key and must carry an email containing
    ``@``; the namespace is that email lowercased, with each run of characters
    outside ``[a-z0-9]`` collapsed to ``-`` and edge dashes trimmed (``"user"``
    if nothing remains).

    Raises:
        EnvPublishError: a non-internal key has no usable email.
    """
    if key.get("auth_kind") == "internal":
        return _INTERNAL_NAMESPACE

    email = str(key.get("email") or "")
    if "@" in email:
        collapsed = re.sub(r"[^a-z0-9]+", "-", email.lower())
        trimmed = collapsed.strip("-")
        return trimmed if trimmed else "user"

    # The email requirement is never loosened for user keys.
    raise EnvPublishError(
        "authenticated Freesolo key must include an email (used to derive the hub namespace) — "
        "publish with a key created at https://freesolo.co/sign-in (`flash login`)"
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _sanitize_name(name: str) -> str:
|
|
73
|
+
slug = re.sub(r"[^a-z0-9._-]+", "-", name.lower()).strip("-")
|
|
74
|
+
if slug in {".", ".."} or not re.search(r"[a-z0-9]", slug):
|
|
75
|
+
return "env"
|
|
76
|
+
return slug or "env"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _safe_extract(tar_bytes: bytes, dest: Path) -> None:
    """Extract an untrusted gzip-compressed tar archive into ``dest``, validating each member.

    Each member is checked before it is written:

    * at most ``_MAX_MEMBERS`` members are accepted;
    * the name is normalized (backslashes become ``/``; empty and ``.``
      segments are dropped) and any ``..`` segment is rejected;
    * the resolved target must stay inside ``dest``;
    * the first segment must not be in ``_BLOCKED_TOP_LEVEL_PATHS``;
    * no segment at any depth may be ``.git`` (case-insensitive);
    * only regular files and directories are allowed (no links, no specials);
    * the running sum of member sizes must not exceed ``_MAX_UNCOMPRESSED_BYTES``.

    Args:
        tar_bytes: Raw bytes of the ``.tar.gz`` archive.
        dest: Existing directory to extract into.

    Raises:
        EnvPublishError: a check failed, the archive is invalid, or extraction
            hit an OS error.
    """
    root = dest.resolve()
    try:
        with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:gz") as tar:
            total = 0
            for count, member in enumerate(tar, start=1):
                if count > _MAX_MEMBERS:
                    raise EnvPublishError(
                        f"env package has too many members (limit {_MAX_MEMBERS})"
                    )
                segments: list[str] = []
                for segment in member.name.replace("\\", "/").split("/"):
                    if not segment or segment == ".":
                        continue
                    if segment == "..":
                        raise EnvPublishError(f"unsafe path in env package: {member.name!r}")
                    segments.append(segment)
                if not segments:
                    # Entries such as "./" that normalize to the root itself.
                    continue
                normalized_name = "/".join(segments)
                target = (dest / normalized_name).resolve()
                if target != root and root not in target.parents:
                    raise EnvPublishError(f"unsafe path in env package: {member.name!r}")
                if segments[0] in _BLOCKED_TOP_LEVEL_PATHS:
                    raise EnvPublishError(
                        "env packages must not contain repo-control or source top-level paths"
                    )
                # The package is later copied into a subdirectory of a git
                # checkout, so a nested ".git" is as hazardous as a top-level
                # one: git would treat the parent as an embedded repository
                # (or skip the entry) instead of publishing the files.
                if any(segment.lower() == ".git" for segment in segments[1:]):
                    raise EnvPublishError(
                        f"env packages must not contain a .git directory: {member.name!r}"
                    )
                if member.islnk() or member.issym():
                    raise EnvPublishError(f"links are not allowed in env packages: {member.name!r}")
                if not (member.isreg() or member.isdir()):
                    raise EnvPublishError(
                        f"only regular files and directories are allowed in env packages, "
                        f"but {member.name!r} is a special file"
                    )
                # Guard against a negative size in a malformed header.
                total += max(0, member.size)
                if total > _MAX_UNCOMPRESSED_BYTES:
                    raise EnvPublishError(
                        "env package is too large uncompressed "
                        f"(limit {_human_mb(_MAX_UNCOMPRESSED_BYTES)})"
                    )
                # Extract under the normalized name, not the raw archive name.
                member.name = normalized_name
                tar.extract(member, dest)
    except tarfile.TarError as exc:
        raise EnvPublishError(f"env package is not a valid .tar.gz archive: {exc}") from exc
    except OSError as exc:
        raise EnvPublishError(f"env package could not be extracted: {exc}") from exc
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _github_repo() -> str:
    """Return the GitHub repository (``owner/name``) that environments are published to."""
    repo = _DEFAULT_GITHUB_REPO
    return repo
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _github_token() -> str | None:
|
|
132
|
+
return os.environ.get("GITHUB_TOKEN")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _redact(value: str, token: str) -> str:
|
|
136
|
+
if not token:
|
|
137
|
+
return value
|
|
138
|
+
return value.replace(token, "<redacted>").replace(
|
|
139
|
+
urllib.parse.quote(token, safe=""), "<redacted>"
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _credentialed_repo_url(repo: str, token: str) -> str:
|
|
144
|
+
quoted = urllib.parse.quote(token, safe="")
|
|
145
|
+
return f"https://x-access-token:{quoted}@github.com/{repo}.git"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _run_git(cwd: Path, args: list[str], *, token: str) -> subprocess.CompletedProcess[str]:
    """Run ``git <args>`` in ``cwd`` and return the completed process.

    The child inherits the current environment with ``GIT_TERMINAL_PROMPT=0``,
    captures stdout/stderr as text, and is limited to ``_GIT_TIMEOUT_S`` seconds.

    Args:
        cwd: Working directory for the git command.
        args: Arguments following ``git``.
        token: Secret scrubbed from any error message raised here.

    Raises:
        EnvPublishError: status 503 when launching git raises
            ``FileNotFoundError``, 504 on timeout, 502 on a non-zero exit
            (message includes the redacted command and output).
    """
    env = {**os.environ, "GIT_TERMINAL_PROMPT": "0"}
    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT_S,
        )
    except FileNotFoundError as exc:
        raise EnvPublishError(
            "git is required to upload environments to Freesolo", status=503
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise EnvPublishError(
            f"Freesolo environment upload git command timed out after {_GIT_TIMEOUT_S}s",
            status=504,
        ) from exc
    if proc.returncode != 0:
        # Redact BEFORE truncating: cutting the raw output at a fixed length
        # could split the token and leave an unredacted prefix in the message.
        output = _redact(f"{proc.stdout or ''}\n{proc.stderr or ''}".strip(), token)
        cmd = _redact("git " + " ".join(args), token)
        raise EnvPublishError(
            f"Freesolo environment upload failed during `{cmd}`: {output[:1000]}",
            status=502,
        )
    return proc
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _is_retryable_git_publish_error(message: str) -> bool:
|
|
179
|
+
lowered = message.lower()
|
|
180
|
+
permanent = (
|
|
181
|
+
"authentication failed",
|
|
182
|
+
"could not read username",
|
|
183
|
+
"repository not found",
|
|
184
|
+
"permission denied",
|
|
185
|
+
"403",
|
|
186
|
+
"401",
|
|
187
|
+
)
|
|
188
|
+
if any(marker in lowered for marker in permanent):
|
|
189
|
+
return False
|
|
190
|
+
retryable = (
|
|
191
|
+
"failed to push some refs",
|
|
192
|
+
"fetch first",
|
|
193
|
+
"non-fast-forward",
|
|
194
|
+
"stale info",
|
|
195
|
+
"cannot lock ref",
|
|
196
|
+
"connection reset",
|
|
197
|
+
"operation timed out",
|
|
198
|
+
"the remote end hung up",
|
|
199
|
+
"early eof",
|
|
200
|
+
"index.lock",
|
|
201
|
+
"rebase",
|
|
202
|
+
)
|
|
203
|
+
return any(marker in lowered for marker in retryable)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _copy_package_to_checkout(*, source: Path, checkout: Path, publish_root: str) -> None:
|
|
207
|
+
target = checkout / publish_root
|
|
208
|
+
checkout_root = checkout.resolve()
|
|
209
|
+
target_root = target.resolve()
|
|
210
|
+
if target_root != checkout_root and checkout_root not in target_root.parents:
|
|
211
|
+
raise EnvPublishError("unsafe environment publish path")
|
|
212
|
+
shutil.rmtree(target, ignore_errors=True)
|
|
213
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
214
|
+
shutil.copytree(source, target)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _commit_environment_update(
    *, checkout: Path, publish_root: str, message: str, token: str
) -> bool:
    """Stage ``publish_root`` in ``checkout`` and commit it if anything changed.

    Sets the commit identity, stages the path with ``git add -A``, then runs
    ``git diff --cached --quiet``, which exits 0 when nothing is staged and 1
    when there are staged changes.

    Returns:
        True when a commit was created, False when there was nothing to commit.

    Raises:
        EnvPublishError: status 504 when the diff check times out, 502 when it
            exits with an unexpected code; errors from ``_run_git`` propagate.
    """
    identity = (("user.name", "freesolo-bot"), ("user.email", "bot@freesolo.co"))
    for config_key, config_value in identity:
        _run_git(checkout, ["config", config_key, config_value], token=token)
    _run_git(checkout, ["add", "-A", "--", publish_root], token=token)

    # Run directly rather than via _run_git: exit code 1 is an expected answer
    # here, not a failure.
    try:
        staged = subprocess.run(
            ["git", "diff", "--cached", "--quiet", "--", publish_root],
            cwd=checkout,
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired as exc:
        raise EnvPublishError(
            f"Freesolo environment upload git command timed out after {_GIT_TIMEOUT_S}s",
            status=504,
        ) from exc

    exit_code = staged.returncode
    if exit_code == 0:
        return False
    if exit_code == 1:
        _run_git(checkout, ["commit", "-m", message], token=token)
        return True

    output = f"{staged.stdout or ''}\n{staged.stderr or ''}".strip()
    raise EnvPublishError(
        _redact(
            f"Freesolo environment upload failed during staged diff check: {output}", token
        ),
        status=502,
    )
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def _push_environment_commit(*, checkout: Path, token: str) -> None:
    """Rebase the local commit onto ``origin/_GITHUB_BRANCH``, then push HEAD to that branch.

    Errors from either git command propagate from ``_run_git``.
    """
    steps = (
        ["pull", "--rebase", "origin", _GITHUB_BRANCH],
        ["push", "origin", f"HEAD:{_GITHUB_BRANCH}"],
    )
    for git_args in steps:
        _run_git(checkout, git_args, token=token)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _github_publish_once(
    *,
    dest: Path,
    repo: str,
    token: str,
    publish_root: str,
    message: str,
) -> None:
    """Make one publish attempt: clone, copy the package in, commit, push.

    Works in a fresh temporary directory that is removed afterwards. The push
    is skipped when the commit step reports that nothing changed.

    Args:
        dest: Directory holding the extracted package.
        repo: ``owner/name`` of the GitHub repository.
        token: GitHub token used for the credentialed clone URL.
        publish_root: Path inside the checkout that receives the package.
        message: Commit message.
    """
    with tempfile.TemporaryDirectory(prefix="flash-env-hub-") as scratch:
        scratch_dir = Path(scratch)
        checkout = scratch_dir / "environment-hub"
        clone_args = [
            "clone",
            "--branch",
            _GITHUB_BRANCH,
            "--single-branch",
            _credentialed_repo_url(repo, token),
            str(checkout),
        ]
        _run_git(scratch_dir, clone_args, token=token)
        _copy_package_to_checkout(source=dest, checkout=checkout, publish_root=publish_root)
        committed = _commit_environment_update(
            checkout=checkout,
            publish_root=publish_root,
            message=message,
            token=token,
        )
        if committed:
            _push_environment_commit(checkout=checkout, token=token)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _environment_file_relative_path(root: Path) -> str:
    """Return the package-relative path of the environment entry file.

    Raises:
        EnvPublishError: ``root`` has no ``_DEFAULT_ENVIRONMENT_FILE`` file.
    """
    if not (root / _DEFAULT_ENVIRONMENT_FILE).is_file():
        raise EnvPublishError("env package must contain environment.py")
    return _DEFAULT_ENVIRONMENT_FILE
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _github_publish(dest: Path, *, name: str, key: dict) -> str:
    """Publish the extracted package in ``dest`` to the hub repo and return its slug.

    The slug is ``<namespace>/<sanitized name>``. A failed attempt is retried
    after each delay in ``_GIT_PUSH_RETRY_DELAYS_SECONDS``, but only when
    ``_is_retryable_git_publish_error`` accepts the error message.

    Raises:
        EnvPublishError: status 503 when no GitHub token is configured; when
            the package lacks ``environment.py`` or has no files; or the last
            (or first non-retryable) publish error.
    """
    token = _github_token()
    if not token:
        raise EnvPublishError(
            "GITHUB_TOKEN is required to upload environments to Freesolo",
            status=503,
        )
    repo = _github_repo()
    ns = namespace_for(key)
    clean = _sanitize_name(name)
    slug = f"{ns}/{clean}"

    # Called for its validation side effect; the returned path is not used.
    _environment_file_relative_path(dest)
    has_file = False
    for path in dest.rglob("*"):
        if path.is_file():
            has_file = True
            break
    if not has_file:
        raise EnvPublishError("env package contains no files")
    message = f"Upload Flash environment {ns}/{clean}"

    last_error: EnvPublishError | None = None
    max_attempts = len(_GIT_PUSH_RETRY_DELAYS_SECONDS) + 1
    for attempt in range(max_attempts):
        if attempt > 0:
            # Back off before every retry (not before the first attempt).
            time.sleep(_GIT_PUSH_RETRY_DELAYS_SECONDS[attempt - 1])
        try:
            _github_publish_once(
                dest=dest,
                repo=repo,
                token=token,
                publish_root=slug,
                message=message,
            )
        except EnvPublishError as exc:
            last_error = exc
            out_of_attempts = attempt == max_attempts - 1
            if out_of_attempts or not _is_retryable_git_publish_error(str(exc)):
                raise
        else:
            return slug
    assert last_error is not None
    raise last_error
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def publish_package(*, package_b64: str, name: str, key: dict) -> str:
    """Validate a base64-encoded ``.tar.gz`` environment package and publish it.

    Args:
        package_b64: Base64 text of the archive (strictly validated).
        name: Requested environment name.
        key: Authenticated key record, passed on to ``_github_publish``.

    Returns:
        The value returned by ``_github_publish`` (the environment slug).

    Raises:
        EnvPublishError: bad argument types, missing name, invalid base64,
            empty payload, or an oversized payload (status 413); extraction and
            publish errors propagate.
    """
    if not isinstance(name, str):
        raise EnvPublishError("env name must be a string")
    if not isinstance(package_b64, str):
        raise EnvPublishError("env package must be a base64 string")
    if not name:
        raise EnvPublishError("missing env name")

    too_large = f"env package upload is too large (limit {_human_mb(_MAX_UPLOAD_BYTES)} compressed)"

    # Cheap pre-check on the encoded length, so an oversized payload is
    # rejected before it is decoded.
    encoded_limit = ((_MAX_UPLOAD_BYTES + 2) // 3) * 4 + 3
    if len(package_b64) > encoded_limit:
        raise EnvPublishError(too_large, status=413)

    try:
        tar_bytes = base64.b64decode(package_b64, validate=True)
    except Exception as exc:
        raise EnvPublishError("env package is not valid base64") from exc

    if not tar_bytes:
        raise EnvPublishError("empty env package")
    if len(tar_bytes) > _MAX_UPLOAD_BYTES:
        raise EnvPublishError(too_large, status=413)

    with tempfile.TemporaryDirectory(prefix="flash-env-publish-") as scratch:
        extracted = Path(scratch)
        _safe_extract(tar_bytes, extracted)
        return _github_publish(extracted, name=name, key=key)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Daily realized-cost reconciliation: pull what the GPU provider actually billed for each
|
|
2
|
+
finished run and report it to the freesolo backend for estimator accuracy tracking.
|
|
3
|
+
|
|
4
|
+
Flash charges customer-facing training usage from the completed run's final ``cost_usd``. This
|
|
5
|
+
job is the COGS side: the realized provider invoice (RunPod /v1/billing/endpoints).
|
|
6
|
+
The backend's training_cost_accuracy view joins the two per run to surface
|
|
7
|
+
charged-vs-realized error.
|
|
8
|
+
|
|
9
|
+
Best-effort and entirely off the run hot path: it runs in a background loop (see the server
|
|
10
|
+
lifespan), never blocks request handling, and any failure is swallowed and retried next cycle.
|
|
11
|
+
Realized cost is reported with the operator INTERNAL key (this is COGS, not a customer charge),
|
|
12
|
+
which also gates the whole feature -- with no FREESOLO_INTERNAL_KEY set, reconciliation is off.
|
|
13
|
+
|
|
14
|
+
Scope note (v1): cost is attributed from the run's last persisted handle (RunStatus.remote),
|
|
15
|
+
which is exact for the common single-seed run. A multi-seed run keeps only its final seed's
|
|
16
|
+
handle, so its realized cost is currently under-counted -- a known limitation to extend by
|
|
17
|
+
persisting every seed's resource id.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import contextlib
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import time
|
|
26
|
+
import urllib.error
|
|
27
|
+
import urllib.request
|
|
28
|
+
|
|
29
|
+
from flash import runner
|
|
30
|
+
from flash.providers.realized import realized_cost_for_remote
|
|
31
|
+
from flash.server.auth import INTERNAL_KEY_ENV, freesolo_base_url
|
|
32
|
+
|
|
33
|
+
_REPORT_PATH = "/api/billing/training-cost"
|
|
34
|
+
_REPORT_TIMEOUT_S = 10.0
|
|
35
|
+
# Provider billing lags; wait this long after a run goes terminal before pulling (so the
|
|
36
|
+
# invoice has settled) and stop retrying once a run is older than the window.
|
|
37
|
+
_SETTLE_SECONDS = 3600.0 # 1h
|
|
38
|
+
_WINDOW_SECONDS = 7 * 86400.0 # only reconcile runs that finished within the last 7 days
|
|
39
|
+
# States that incur no GPU cost -> never reconciled.
|
|
40
|
+
_FREE_TERMINAL_STATES = frozenset({"dry_run"})
|
|
41
|
+
# States whose training is finished and whose GPU cost is therefore final -> eligible for
|
|
42
|
+
# reconciliation. The terminal billable states plus `deployed`: a deployed run finished
|
|
43
|
+
# training (its training invoice has settled) before serving was stood up on top of it, so
|
|
44
|
+
# its realized training cost is final and must be reconciled like any other finished run.
|
|
45
|
+
# (`deployed` is intentionally NOT in runner.TERMINAL_STATES -- it's a live, undeployable-back
|
|
46
|
+
# state -- so it has to be added explicitly here.) Excludes the free states (e.g. dry_run).
|
|
47
|
+
_RECONCILABLE_STATES = (runner.TERMINAL_STATES | {"deployed"}) - _FREE_TERMINAL_STATES
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def reconcile_enabled() -> bool:
    """Return True only when the operator internal key environment variable is set and non-empty."""
    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    return True if internal_key else False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _report(body: dict) -> bool:
    """POST realized cost to the backend with the internal key (Bearer).

    Best-effort: returns True on a 2xx response and False on any failure
    (missing key, serialization error, malformed URL, network or HTTP protocol
    error). It never raises -- a metering report must never affect anything.

    Args:
        body: JSON-serializable payload for ``_REPORT_PATH``.
    """
    # Function-scope import: only needed here, to catch protocol-level failures
    # (IncompleteRead, BadStatusLine, ...) that are not OSError subclasses.
    import http.client

    key = os.environ.get(INTERNAL_KEY_ENV)
    if not key:
        return False
    try:
        # Built inside the ``try`` so a non-serializable body (TypeError) or a
        # malformed URL (ValueError) is swallowed too, as the contract promises.
        req = urllib.request.Request(
            f"{freesolo_base_url()}{_REPORT_PATH}",
            data=json.dumps(body).encode("utf-8"),
            method="POST",
            headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=_REPORT_TIMEOUT_S) as resp:
            resp.read()
            return 200 <= resp.status < 300
    except (
        urllib.error.URLError,
        OSError,
        http.client.HTTPException,
        TypeError,
        ValueError,
    ):
        return False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _terminal_ts(status: runner.RunStatus) -> float:
|
|
77
|
+
"""The run's training-teardown time, used for both billing (``run_end``) and eligibility
|
|
78
|
+
(settle delay + window). Prefer the frozen ``finished_at`` over the mutable ``updated_at``:
|
|
79
|
+
deploy / late heartbeat / reconcile all move ``updated_at`` past teardown, which would both
|
|
80
|
+
DELAY the settle gate (it counts from the bump, not the finish) and let a long-finished run
|
|
81
|
+
that was merely bumped look "recent" and slip back inside ``_WINDOW_SECONDS``. ``finished_at``
|
|
82
|
+
is stamped once at the terminal transition and never moved; falls back to ``updated_at`` for
|
|
83
|
+
pre-feature runs. ``is not None`` (not truthiness) so a legitimate ``finished_at == 0.0`` is
|
|
84
|
+
honored rather than silently falling back to ``updated_at``."""
|
|
85
|
+
return float(status.finished_at if status.finished_at is not None else status.updated_at)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _due(status: runner.RunStatus, now: float) -> bool:
    """Decide whether ``status`` should be reconciled in this pass.

    A run is due when its state is in ``_RECONCILABLE_STATES``, it has not been
    reconciled yet, its age since ``_terminal_ts`` is at least
    ``_SETTLE_SECONDS`` and at most ``_WINDOW_SECONDS``, and it carries a
    provider handle (``status.remote``).
    """
    if status.state not in _RECONCILABLE_STATES:
        return False
    if status.reconciled_at:
        return False
    # Age is measured from teardown, not from a later ``updated_at`` bump.
    age = now - _terminal_ts(status)
    if age < _SETTLE_SECONDS:
        return False
    if age > _WINDOW_SECONDS:
        return False
    return True if status.remote else False
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def reconcile_run(status: runner.RunStatus, *, now: float | None = None) -> bool:
    """Pull and report realized cost for one run, marking it reconciled on success.

    Args:
        status: Snapshot of the run to reconcile.
        now: Timestamp stored as ``reconciled_at``; defaults to ``time.time()``.

    Returns:
        True when a positive realized cost was reported. A zero/None cost or a
        failed report returns False and leaves the run unreconciled, so a later
        cycle (within the window) retries once the provider invoice settles.
    """
    if now is None:
        now = time.time()
    remote = status.remote or {}

    # Truthiness, NOT ``is not None``: a persisted provider handle coerces a
    # MISSING started_ts to 0.0, so 0.0 means "unknown launch", not a 1970
    # launch. Billing flat $/hr from 0.0 would massively inflate realized cost,
    # hence the fallback to created_at.
    start = float(remote.get("started_ts") or status.created_at)

    # True terminal time (~teardown / billing stop): the frozen finished_at,
    # not the mutable updated_at that deploy/heartbeat move past teardown.
    run_end = _terminal_ts(status)

    # RunPod's billing query is padded past run end so the settled invoice is
    # in range; the instance providers bill flat $/hr to teardown and get the
    # UN-padded run_end.
    realized = realized_cost_for_remote(
        remote,
        start=start,
        end=run_end + _SETTLE_SECONDS,
        run_end=run_end,
    )
    if realized is None or realized.realized_usd <= 0:
        return False

    reported = _report(
        {
            "runId": status.run_id,
            "realizedCostUsd": realized.realized_usd,
            "provider": realized.provider,
            "gpu": remote.get("allocated_gpu") or remote.get("gpu"),
            "costByResource": realized.by_resource,
            "wallSeconds": realized.wall_seconds,
            "costBasis": "realized",
            "source": realized.source,
        }
    )
    if not reported:
        return False

    # Persist locally so the run is not re-pulled/re-reported and `flash status`
    # can show realized vs estimated. COST-FIELDS-ONLY: ``status`` is an earlier
    # snapshot, so writing its ``state`` back could REVERT a run that has since
    # advanced (e.g. to `deployed`). Updating only the cost columns keeps the
    # run's current state intact. A persistence failure is deliberately ignored.
    with contextlib.suppress(Exception):
        runner.record_realized_cost(
            status.run_id,
            realized_cost_usd=realized.realized_usd,
            reconciled_at=now,
        )
    return True
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def reconcile_once(*, now: float | None = None) -> int:
    """Sweep every local run once, reconciling each one that is due.

    Does nothing (returns 0) when reconciliation is disabled. An exception
    raised while reconciling a single run is swallowed so the sweep continues.

    Args:
        now: Timestamp used for the due check and reconciliation; defaults to
            ``time.time()``.

    Returns:
        The number of runs whose realized cost was reported.
    """
    if not reconcile_enabled():
        return 0
    if now is None:
        now = time.time()
    reported = 0
    for status in runner.list_runs():
        if _due(status, now):
            with contextlib.suppress(Exception):
                if reconcile_run(status, now=now):
                    reported += 1
    return reported
|