freesolo-flash-dev 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flash/__init__.py +29 -0
- flash/_channel.py +23 -0
- flash/_fileio.py +35 -0
- flash/_logging.py +49 -0
- flash/_update_check.py +266 -0
- flash/catalog.py +253 -0
- flash/cli/__init__.py +1 -0
- flash/cli/main/__init__.py +227 -0
- flash/cli/main/__main__.py +6 -0
- flash/cli/main/commands.py +636 -0
- flash/cli/main/envpush.py +317 -0
- flash/cli/main/render.py +599 -0
- flash/cli/main/training_doc.py +455 -0
- flash/client/__init__.py +14 -0
- flash/client/config.py +70 -0
- flash/client/http.py +372 -0
- flash/client/runtime_secrets.py +69 -0
- flash/client/specs.py +20 -0
- flash/cost/__init__.py +16 -0
- flash/cost/analytical.py +175 -0
- flash/cost/facts.py +114 -0
- flash/cost/spec.py +113 -0
- flash/cost/types.py +158 -0
- flash/engine/__init__.py +6 -0
- flash/engine/accounting.py +36 -0
- flash/engine/chalk_kernels.py +116 -0
- flash/engine/multiturn_rollout.py +780 -0
- flash/engine/recipe.py +86 -0
- flash/engine/vram.py +603 -0
- flash/engine/worker/__init__.py +2916 -0
- flash/engine/worker/__main__.py +4 -0
- flash/engine/worker/kernel_warmup.py +400 -0
- flash/engine/worker/lora.py +796 -0
- flash/engine/worker/packing.py +366 -0
- flash/engine/worker/perf.py +1048 -0
- flash/envs/__init__.py +10 -0
- flash/envs/adapter/__init__.py +883 -0
- flash/envs/adapter/rubric.py +222 -0
- flash/envs/base.py +52 -0
- flash/envs/registry.py +62 -0
- flash/mcp/__init__.py +1 -0
- flash/mcp/server.py +85 -0
- flash/providers/__init__.py +59 -0
- flash/providers/_auth.py +24 -0
- flash/providers/_http.py +230 -0
- flash/providers/_instance.py +416 -0
- flash/providers/_instance_bootstrap.py +517 -0
- flash/providers/_poll.py +311 -0
- flash/providers/allocator.py +193 -0
- flash/providers/base.py +431 -0
- flash/providers/hyperstack/__init__.py +127 -0
- flash/providers/hyperstack/api.py +522 -0
- flash/providers/hyperstack/auth.py +17 -0
- flash/providers/hyperstack/gpus.py +29 -0
- flash/providers/hyperstack/jobs/__init__.py +632 -0
- flash/providers/hyperstack/jobs/builders.py +122 -0
- flash/providers/hyperstack/preflight.py +23 -0
- flash/providers/hyperstack/pricing.py +26 -0
- flash/providers/hyperstack/train.py +25 -0
- flash/providers/lambdalabs/__init__.py +139 -0
- flash/providers/lambdalabs/api.py +261 -0
- flash/providers/lambdalabs/auth.py +18 -0
- flash/providers/lambdalabs/gpus.py +29 -0
- flash/providers/lambdalabs/jobs/__init__.py +724 -0
- flash/providers/lambdalabs/jobs/builders.py +118 -0
- flash/providers/lambdalabs/preflight.py +27 -0
- flash/providers/lambdalabs/pricing.py +51 -0
- flash/providers/lambdalabs/train.py +27 -0
- flash/providers/preflight.py +55 -0
- flash/providers/realized.py +80 -0
- flash/providers/runpod/__init__.py +130 -0
- flash/providers/runpod/api.py +186 -0
- flash/providers/runpod/auth.py +37 -0
- flash/providers/runpod/cost.py +57 -0
- flash/providers/runpod/gpus.py +46 -0
- flash/providers/runpod/jobs.py +956 -0
- flash/providers/runpod/keys.py +139 -0
- flash/providers/runpod/preflight.py +30 -0
- flash/providers/runpod/preload.py +915 -0
- flash/providers/runpod/pricing.py +18 -0
- flash/providers/runpod/slots.py +79 -0
- flash/providers/runpod/train/__init__.py +150 -0
- flash/providers/runpod/train/deps.py +395 -0
- flash/providers/runpod/train/endpoints.py +820 -0
- flash/py.typed +0 -0
- flash/runner/__init__.py +686 -0
- flash/runner/checkpoints.py +82 -0
- flash/runner/deploy.py +422 -0
- flash/runner/lifecycle.py +672 -0
- flash/schema/__init__.py +375 -0
- flash/schema/fields.py +331 -0
- flash/serve/__init__.py +1 -0
- flash/serve/deploy.py +326 -0
- flash/serve/pricing.py +60 -0
- flash/server/__init__.py +1 -0
- flash/server/__main__.py +20 -0
- flash/server/app.py +961 -0
- flash/server/auth.py +263 -0
- flash/server/billing.py +124 -0
- flash/server/checkpoints.py +110 -0
- flash/server/db.py +160 -0
- flash/server/environment_registry.py +102 -0
- flash/server/envs.py +360 -0
- flash/server/reconcile.py +163 -0
- flash/server/run_registry.py +150 -0
- flash/spec.py +333 -0
- freesolo_flash_dev-0.2.25.dist-info/METADATA +192 -0
- freesolo_flash_dev-0.2.25.dist-info/RECORD +111 -0
- freesolo_flash_dev-0.2.25.dist-info/WHEEL +4 -0
- freesolo_flash_dev-0.2.25.dist-info/entry_points.txt +3 -0
- freesolo_flash_dev-0.2.25.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Best-effort reporting of managed Flash runs/checkpoints to the Freesolo backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import urllib.error
|
|
10
|
+
import urllib.request
|
|
11
|
+
from datetime import UTC, datetime
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from .auth import INTERNAL_KEY_ENV, freesolo_base_url
|
|
15
|
+
|
|
16
|
+
_LOG = logging.getLogger("flash.server.runs")
|
|
17
|
+
_TIMEOUT_S = 10.0
|
|
18
|
+
_RUN_PATH = "/api/flash/runs/internal"
|
|
19
|
+
_CHECKPOINT_PATH = "/api/flash/runs/checkpoints/internal"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _iso_from_epoch(value: float | int | None) -> str | None:
|
|
23
|
+
if value is None:
|
|
24
|
+
return None
|
|
25
|
+
try:
|
|
26
|
+
return datetime.fromtimestamp(float(value), tz=UTC).isoformat()
|
|
27
|
+
except (TypeError, ValueError, OSError):
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _post(path: str, body: dict[str, Any]) -> bool:
    """POST ``body`` as JSON to the Freesolo backend; report success as a bool.

    The request is authenticated with the bearer token read from the environment
    variable named by ``INTERNAL_KEY_ENV``. Reporting is best-effort: failures are
    logged as warnings and reported as ``False`` rather than raised.

    Args:
        path: URL path appended to ``freesolo_base_url()``.
        body: JSON-serialisable payload.

    Returns:
        ``True`` only when the backend answers with a 2xx status. ``False`` when the
        internal key is not configured, the payload cannot be serialised, the URL is
        malformed, or the request fails.
    """
    internal_key = os.environ.get(INTERNAL_KEY_ENV)
    if not internal_key:
        return False
    url = f"{freesolo_base_url()}{path}"
    try:
        # json.dumps raises TypeError/ValueError for non-serialisable or circular
        # payloads, and Request raises ValueError for a malformed URL; neither should
        # escape a best-effort reporter.
        req = urllib.request.Request(
            url,
            data=json.dumps(body).encode("utf-8"),
            method="POST",
            headers={
                "Authorization": f"Bearer {internal_key}",
                "Content-Type": "application/json",
            },
        )
    except (TypeError, ValueError) as exc:
        _LOG.warning("failed to report %s: %s", path, exc)
        return False
    try:
        with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp:
            return 200 <= resp.status < 300
    except urllib.error.HTTPError as exc:
        detail = ""
        # Reading the error body is itself best-effort; cap it to keep logs bounded.
        with contextlib.suppress(Exception):
            detail = exc.read().decode("utf-8", "replace")[:500]
        _LOG.warning("failed to report %s: HTTP %s %s", path, exc.code, detail)
    except (urllib.error.URLError, OSError) as exc:
        _LOG.warning("failed to report %s: %s", path, exc)
    return False
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _context_from_status(status: Any) -> dict[str, Any]:
|
|
58
|
+
platform = getattr(status, "platform_context", None)
|
|
59
|
+
if isinstance(platform, dict):
|
|
60
|
+
return platform
|
|
61
|
+
billing = getattr(status, "billing_context", None)
|
|
62
|
+
if isinstance(billing, dict):
|
|
63
|
+
return billing
|
|
64
|
+
return {}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _spec_from_status(status: Any) -> dict[str, Any]:
|
|
68
|
+
spec = getattr(status, "spec", None)
|
|
69
|
+
return spec if isinstance(spec, dict) else {}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _managed_environment_slug(spec: dict[str, Any]) -> str | None:
|
|
73
|
+
env = spec.get("environment") if isinstance(spec.get("environment"), dict) else {}
|
|
74
|
+
env_id = env.get("id")
|
|
75
|
+
if not isinstance(env_id, str) or not env_id.strip():
|
|
76
|
+
return None
|
|
77
|
+
try:
|
|
78
|
+
from flash.envs.adapter import is_managed_environment_slug
|
|
79
|
+
|
|
80
|
+
return env_id.strip() if is_managed_environment_slug(env_id.strip()) else None
|
|
81
|
+
except Exception:
|
|
82
|
+
return None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def record_training_run(*, status: Any, key: dict[str, Any] | None = None) -> bool:
    """Report a training run's current state to the backend run endpoint.

    The org/user context comes from the status object, with entries from ``key``
    taking precedence. Nothing is sent (and ``False`` is returned) when no ``org_id``
    can be determined.

    Args:
        status: Run status object; its attributes are read to build the payload.
        key: Optional context overrides (e.g. ``org_id``, ``user_id``, ``api_key_id``).

    Returns:
        The result of ``_post``: ``True`` on a 2xx response.
    """
    merged: dict[str, Any] = dict(_context_from_status(status))
    if key:
        merged.update(key)
    org_id = str(merged.get("org_id") or "").strip()
    if not org_id:
        return False

    spec = _spec_from_status(status)

    def _spec_str(name: str) -> str | None:
        # Only string values are forwarded for these spec fields.
        found = spec.get(name)
        return found if isinstance(found, str) else None

    gpu_section = spec.get("gpu")
    gpu_type = gpu_section.get("type") if isinstance(gpu_section, dict) else None
    if not isinstance(gpu_type, str):
        gpu_type = None

    payload = {
        "orgId": org_id,
        "runId": status.run_id,
        "status": status.state,
        "userId": merged.get("user_id"),
        "apiKeyId": merged.get("api_key_id"),
        "environmentSlug": _managed_environment_slug(spec),
        "model": _spec_str("model"),
        "algorithm": _spec_str("algorithm"),
        "phase": _spec_str("phase"),
        "gpuType": gpu_type,
        "costUsd": status.cost_usd,
        "realizedCostUsd": status.realized_cost_usd,
        "adapterRef": status.to_dict().get("adapter_ref"),
        "artifactsDir": status.artifacts_dir,
        "error": status.error,
        "spec": spec,
        "deployment": status.deployment,
        "lastHeartbeat": status.last_heartbeat,
        "gpuStatus": status.gpu_status,
        "createdAt": _iso_from_epoch(status.created_at),
        "updatedAt": _iso_from_epoch(status.updated_at),
        "metadata": {"source": "flash.control_plane"},
    }
    return _post(_RUN_PATH, payload)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def record_training_checkpoint(
    *,
    spec: Any,
    seed: int,
    metrics: dict[str, Any],
    artifact_path: str,
) -> bool:
    """Report one seed's checkpoint for a run to the backend checkpoint endpoint.

    The run status and adapter ref are looked up through ``flash.runner``; any failure
    there, or a status without an ``org_id`` in its context, results in ``False``
    without a request being made.

    Args:
        spec: Job spec object; ``run_id`` is read, and ``phase`` if present.
        seed: Seed the checkpoint belongs to.
        metrics: Metrics forwarded verbatim in the payload.
        artifact_path: Artifact location forwarded verbatim in the payload.

    Returns:
        The result of ``_post``: ``True`` on a 2xx response.
    """
    try:
        from flash.runner import adapter_ref, get_status

        run_status = get_status(spec.run_id)
        checkpoint_ref = adapter_ref(spec, seed=seed)
    except Exception:
        return False

    org_id = str(_context_from_status(run_status).get("org_id") or "").strip()
    if not org_id:
        return False

    payload = {
        "orgId": org_id,
        "runId": spec.run_id,
        "checkpointId": f"seed{seed}",
        "seed": seed,
        "phase": getattr(spec, "phase", None),
        "adapterRef": checkpoint_ref,
        "artifactPath": artifact_path,
        "metrics": metrics,
        "metadata": {"source": "flash.control_plane"},
        "updatedAt": _iso_from_epoch(getattr(run_status, "updated_at", None)),
    }
    return _post(_CHECKPOINT_PATH, payload)
|
flash/spec.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""Structured job specification shared by CLI/API/runner and GPU workers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import asdict, dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .catalog import DEFAULT_GPU, DEFAULT_MODEL, normalize_algorithm
|
|
11
|
+
|
|
12
|
+
_FALSE_STRINGS = {"", "0", "false", "no", "off", "none"}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _str_tuple(value: Any) -> tuple[str, ...]:
|
|
16
|
+
"""Normalize a string-or-list knob (e.g. stop_sequences) to a tuple of strings.
|
|
17
|
+
|
|
18
|
+
A bare string is ONE element — never iterated into characters ("</s>" must not become
|
|
19
|
+
('<','/','s','>')). None and empty strings -> () (no stop configured); empty entries
|
|
20
|
+
in a list are dropped."""
|
|
21
|
+
if value is None:
|
|
22
|
+
return ()
|
|
23
|
+
if isinstance(value, str):
|
|
24
|
+
return (value,) if value else ()
|
|
25
|
+
return tuple(s for s in (str(x) for x in value) if s)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def coerce_bool(value: Any) -> bool:
    """Interpret a loosely-typed value (JSON body / env / persisted dict) as a bool.

    Plain ``bool(...)`` treats every non-empty string as true, so "false"/"0"/"no"
    would come out True. Strings are therefore trimmed, lower-cased and checked against
    ``_FALSE_STRINGS``; anything else (including a real bool) goes through ``bool``.
    """
    if not isinstance(value, str):
        return bool(value)
    normalized = value.strip().lower()
    return normalized not in _FALSE_STRINGS
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _coerce_str_map(value: Any) -> dict[str, str]:
|
|
41
|
+
"""Coerce a loosely-typed spec field into a ``dict[str, str]``.
|
|
42
|
+
|
|
43
|
+
A malformed persisted spec (or programmatic caller) can set a mapping field to a non-dict;
|
|
44
|
+
`.items()` on that would crash `from_dict` with AttributeError. Treat a non-dict as empty,
|
|
45
|
+
mirroring how the other nested fields tolerate missing/garbage input.
|
|
46
|
+
"""
|
|
47
|
+
if not isinstance(value, dict):
|
|
48
|
+
return {}
|
|
49
|
+
return {str(k): str(v) for k, v in value.items()}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _coerce_wandb(value: Any) -> WandbSpec:
    """Build a ``WandbSpec`` from a loosely-typed ``wandb`` spec field.

    A malformed/older persisted spec may carry a non-dict here (e.g. a bare string);
    ``(value or {}).get(...)`` would then crash ``from_dict`` with AttributeError on the
    worker. A non-dict therefore yields the default ``WandbSpec`` (default naming), the
    same way ``_coerce_str_map`` treats bad input. Each label is stringified and trimmed
    so a non-string label can't reach the W&B SDK / run-name path; blank -> None (default).
    """
    if not isinstance(value, dict):
        return WandbSpec()

    def _clean(raw: Any) -> str | None:
        # None and whitespace-only labels both collapse to None.
        text = "" if raw is None else str(raw).strip()
        return text or None

    project = _clean(value.get("project"))
    run_name = _clean(value.get("run_name"))
    return WandbSpec(project=project, run_name=run_name)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _volume_gb(value: Any, default: int = 100) -> int:
|
|
73
|
+
"""Parse the platform-managed weight-cache volume size, defaulting on anything not a positive int.
|
|
74
|
+
|
|
75
|
+
Tolerant by design (the field is platform-set, and stale/hand-edited specs must still load): a
|
|
76
|
+
missing / null / empty / non-numeric value, or a non-positive size (incl. the string "0" or a
|
|
77
|
+
negative), all fall back to ``default`` rather than crashing or round-tripping a nonsensical size.
|
|
78
|
+
"""
|
|
79
|
+
if isinstance(value, bool):
|
|
80
|
+
# bool is an int subclass (int(True) == 1), so a stray boolean would become a 1 GB volume;
|
|
81
|
+
# treat it as invalid and default (mirrors the bool rejection in _opt_int).
|
|
82
|
+
return default
|
|
83
|
+
try:
|
|
84
|
+
gb = int(value)
|
|
85
|
+
except (TypeError, ValueError):
|
|
86
|
+
return default
|
|
87
|
+
return gb if gb > 0 else default
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _opt_int(value: Any) -> int | None:
|
|
91
|
+
"""Parse an optional int from a loosely-typed spec source; None stays None.
|
|
92
|
+
|
|
93
|
+
Rejects JSON booleans: ``bool`` is an ``int`` subclass in Python, so ``int(True)`` would
|
|
94
|
+
silently coerce a stray boolean train knob to 1 (and ``False`` to 0). Mirrors the
|
|
95
|
+
bool rejection in schema._train_int — a bool is a type error, not a number.
|
|
96
|
+
"""
|
|
97
|
+
if value is None:
|
|
98
|
+
return None
|
|
99
|
+
if isinstance(value, bool):
|
|
100
|
+
raise TypeError(f"expected a number, got bool {value!r}")
|
|
101
|
+
return int(value)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _opt_float(value: Any) -> float | None:
|
|
105
|
+
"""Parse an optional float from a loosely-typed spec source; None stays None.
|
|
106
|
+
|
|
107
|
+
Rejects JSON booleans (``bool`` is an ``int`` subclass) so a stray boolean train knob is
|
|
108
|
+
not silently coerced to 0.0/1.0; mirrors the bool rejection in schema._train_float.
|
|
109
|
+
"""
|
|
110
|
+
if value is None:
|
|
111
|
+
return None
|
|
112
|
+
if isinstance(value, bool):
|
|
113
|
+
raise TypeError(f"expected a number, got bool {value!r}")
|
|
114
|
+
return float(value)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass(frozen=True)
class EnvironmentSpec:
    # Which Freesolo environment a run uses, plus what the worker needs to load it.

    # Freesolo environment id. Deliberately has no meaningful default: every run has to
    # name its environment explicitly (validated in schema / the worker).
    id: str = ""
    params: dict[str, Any] = field(default_factory=dict)
    # Pip requirements the GPU worker must install for this environment. Populated
    # client-side from the local install manifest so the managed control plane never
    # relies on client-local state; empty means "derive on the server".
    pip: tuple[str, ...] = ()
    # NAMES of secret env vars the environment needs on the worker. Values never live in
    # the spec; the client reads matching local env/.env values and ships them
    # out-of-band via runtime_secrets.
    secrets: tuple[str, ...] = ()
    # Optional pinned commit SHA for the environment's GitHub ref. It is resolved ONCE in
    # the control plane (runner._assign_resolved_env_sha, called from submit_job after the
    # spec is finalized) so that every worker boots from an immutable sha rather than each
    # re-resolving the symbolic ref (e.g. "main") against the GitHub commits API — which
    # trips GitHub's secondary rate limit on a cold spawn wave. Empty (the default, and
    # whenever the control-plane resolve fails) keeps today's behavior: the worker
    # resolves the ref itself. The adapter only trusts a real 40-char sha (see
    # adapter._resolve_ref_sha), so a stale/garbage value falls back to live resolution.
    resolved_sha: str = ""
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass(frozen=True)
class TrainSpec:
    # Training knobs carried in the job spec.

    steps: int | None = None
    epochs: int | None = None
    lora_rank: int = 32
    lora_alpha: int = 64
    seeds: tuple[int, ...] = (0,)
    # Artifact-store adapter ref as printed by `flash status`:
    # ``<hf_repo>:<phase>/<run_id>/seed<N>``.
    init_from_adapter: str = ""
    # Per-run HuggingFace artifact repo ("owner/name") used for this run's
    # adapter/checkpoint/code storage AND serving. PLATFORM-MANAGED, not a user field:
    # the control plane assigns it server-side in runner.submit_job (a per-run private
    # dataset under the operator's namespace, written by the operator HF_TOKEN). A
    # user-supplied value is ignored by schema.spec_from_dict; this field only carries the
    # control-plane-assigned repo to the worker.
    hf_repo: str = ""
    # Optimizer/batching knobs (SFT + GRPO). None -> the worker's tuned recipe default.
    #   batch_size: the GLOBAL/effective batch (SFT: grad-accum is sized to hit it;
    #               GRPO: prompts per optimizer step).
    #   max_length: the SFT max sequence length.
    #   save_every: the checkpoint interval in optimizer steps.
    learning_rate: float | None = None
    batch_size: int | None = None
    max_length: int | None = None
    save_every: int | None = None
    # SFT caps (None/0 -> no cap). max_steps caps optimizer steps (cheap pre-flight
    # smoke); max_examples truncates the SFT dataset.
    max_steps: int | None = None
    max_examples: int | None = None
    # GRPO recipe knobs (datums parity), shipped by the SDK in [train] (NOT in
    # [environment.params], which is forwarded verbatim to the Freesolo env loader).
    # None/() -> recipe default.
    #   group_size: completions per prompt
    #   temperature: rollout sampling temp
    #   max_tokens: completion budget
    #   kl_penalty_coef: KL beta
    #   advantage_clip: centered-advantage clamp
    #   thinking_length_penalty_coef: per-<think>-token reward deduction
    #   stop_sequences: rollout stop strings
    group_size: int | None = None
    temperature: float | None = None
    max_tokens: int | None = None
    kl_penalty_coef: float | None = None
    advantage_clip: float | None = None
    thinking_length_penalty_coef: float | None = None
    stop_sequences: tuple[str, ...] = ()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@dataclass(frozen=True)
class GpuSpec:
    # GPU sizing / limits carried in the job spec.

    # The parse-time provisional GPU class (cheapest VALIDATED class that fits the model).
    # GPU pinning is gone: the submit-time allocator always re-picks the cheapest fitting
    # validated active RunPod class, so a config's gpu.type does NOT pin — ``type`` is
    # only the offline sizing/display default and the carrier the runner overwrites with
    # the actually-allocated class.
    type: str = DEFAULT_GPU
    disk_gb: int = 60
    max_wall_seconds: int = 24 * 3600
    # Auto-resubmit budget for infra-shaped failures (worker loss / stall / timeout);
    # every retry resumes from the latest streamed checkpoint.
    max_retries: int = 2
    # Persistent RunPod network-volume weight cache (platform-managed, NOT user config).
    # When set, the RunPod provider attaches a same-named volume in EVERY datacenter in
    # the cache fleet and allows the endpoint across all of them (no single-DC pin), and
    # the worker points HF_HOME at the mount so a model download is a one-time cost per
    # region instead of per run. Assigned by the runner (``_assign_weight_cache_volume``).
    # A single fixed datacenter is intentionally NOT a field — the DC SET is deploy-time
    # platform policy (see jobs.weight_cache_datacenters), so a run can never be
    # region-pinned. ``None`` = no volume (cold download, cross-region).
    network_volume: str | None = None
    network_volume_gb: int = 100
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@dataclass(frozen=True)
class WandbSpec:
    # Optional W&B naming taken from the [wandb] config table (first-class spec config,
    # NOT env vars). Both fields are non-secret labels; the actual WANDB_API_KEY stays an
    # env-var secret. None -> the worker's defaults ("flash" project,
    # "flash-<phase>-<run_id>-seedN" run name).
    project: str | None = None
    run_name: str | None = None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
@dataclass(frozen=True)
class JobSpec:
    """Immutable job description that round-trips through a dict / JSON."""

    model: str = DEFAULT_MODEL
    algorithm: str = "grpo"
    environment: EnvironmentSpec = field(default_factory=EnvironmentSpec)
    train: TrainSpec = field(default_factory=TrainSpec)
    gpu: GpuSpec = field(default_factory=GpuSpec)
    run_id: str = "local"
    # Per-run worker-environment overrides merged into the GPU worker's env (highest precedence
    # over the control-plane os.environ allowlist). The escape hatch for A/B kernel experiments
    # that must differ PER RUN, not globally: e.g. an optimizer or LoRA-init override on just the
    # experiment run while others keep the global default. Forwarded verbatim (string values);
    # never set secrets here.
    worker_env: dict[str, str] = field(default_factory=dict)
    # "catalog" (curated models only) or "allow" (any HF model that fits the GPU).
    model_policy: str = "catalog"
    # Thinking/reasoning mode (thinking-capable models only). One flag per run, consumed
    # identically by SFT rendering, RL rollouts, and serving (decoding parity). OFF by default
    # (operator preference: training defaults to no-reasoning; set thinking = true to enable).
    thinking: bool = False
    # Optional W&B run naming from the [wandb] config table. Carried as typed spec config
    # (round-tripped in the job-spec JSON the worker reads), not as environment variables.
    wandb: WandbSpec = field(default_factory=WandbSpec)

    @property
    def phase(self) -> str:
        """Return the phase label: ``"rl"`` for GRPO, otherwise the algorithm name."""
        return "rl" if self.algorithm == "grpo" else self.algorithm

    def to_dict(self) -> dict[str, Any]:
        """Return the spec as a nested plain dict (``dataclasses.asdict``)."""
        return asdict(self)

    def to_json(self) -> str:
        """Serialise the spec to JSON with sorted keys."""
        return json.dumps(self.to_dict(), sort_keys=True)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> JobSpec:
        """Build a spec from a loosely-typed dict (request body / persisted JSON).

        Missing keys take the dataclass defaults. The nested ``environment`` / ``train`` /
        ``gpu`` sections are treated as empty when absent, null, or not a mapping.

        Raises:
            ValueError: If the environment carries a local ``path``.
            TypeError: If an optional numeric train knob is a bool (see ``_opt_int`` /
                ``_opt_float``) or a value cannot be converted.
        """
        # Tolerate a non-dict section the same way worker_env / wandb do, instead of
        # crashing with AttributeError on the first ``.get``.
        env = data.get("environment")
        if not isinstance(env, dict):
            env = {}
        # Defense-in-depth: a stale/older payload may still carry a local `path`. The worker only
        # runs published Freesolo environment ids, so reject it here rather than silently
        # dropping it.
        if env.get("path"):
            raise ValueError(
                "local environment paths are no longer supported; the worker only runs "
                "published Freesolo environment ids"
            )
        train = data.get("train")
        if not isinstance(train, dict):
            train = {}
        gpu = data.get("gpu")
        if not isinstance(gpu, dict):
            gpu = {}

        # seeds: a missing or null value means the default single seed, and a bare
        # int/str is ONE seed (a string must not be iterated into its digits). A bool is
        # left to the iterable branch so it still fails as a type error.
        raw_seeds = train.get("seeds")
        if raw_seeds is None:
            seeds: tuple[int, ...] = (0,)
        elif isinstance(raw_seeds, (int, str)) and not isinstance(raw_seeds, bool):
            seeds = (int(raw_seeds),)
        else:
            seeds = tuple(int(s) for s in raw_seeds)

        return cls(
            model=data.get("model", cls.model),
            algorithm=normalize_algorithm(data.get("algorithm", cls.algorithm)),
            environment=EnvironmentSpec(
                id=env.get("id", ""),
                params=dict(env.get("params") or {}),
                # _str_tuple keeps a bare requirement string as one entry instead of
                # splitting it into characters.
                pip=_str_tuple(env.get("pip")),
                secrets=_str_tuple(env.get("secrets")),
                resolved_sha=str(env.get("resolved_sha") or ""),
            ),
            train=TrainSpec(
                steps=_opt_int(train.get("steps")),
                epochs=_opt_int(train.get("epochs")),
                lora_rank=int(train.get("lora_rank", 32)),
                lora_alpha=int(train.get("lora_alpha", 64)),
                seeds=seeds,
                init_from_adapter=str(train.get("init_from_adapter") or ""),
                hf_repo=str(train.get("hf_repo") or ""),
                learning_rate=_opt_float(train.get("learning_rate")),
                batch_size=_opt_int(train.get("batch_size")),
                max_length=_opt_int(train.get("max_length")),
                save_every=_opt_int(train.get("save_every")),
                max_steps=_opt_int(train.get("max_steps")),
                max_examples=_opt_int(train.get("max_examples")),
                group_size=_opt_int(train.get("group_size")),
                temperature=_opt_float(train.get("temperature")),
                max_tokens=_opt_int(train.get("max_tokens")),
                kl_penalty_coef=_opt_float(train.get("kl_penalty_coef")),
                advantage_clip=_opt_float(train.get("advantage_clip")),
                thinking_length_penalty_coef=_opt_float(train.get("thinking_length_penalty_coef")),
                stop_sequences=_str_tuple(train.get("stop_sequences")),
            ),
            gpu=GpuSpec(
                type=gpu.get("type", DEFAULT_GPU),
                disk_gb=int(gpu.get("disk_gb", 60)),
                max_wall_seconds=int(gpu.get("max_wall_seconds", 24 * 3600)),
                max_retries=int(gpu.get("max_retries", 2)),
                # network_volume/network_volume_gb round-trip so the runner-assigned weight cache
                # survives the to_dict()->from_dict() hops in _with_model_disk / _spec_with_gpu /
                # _assign_managed_hf_repo before deploy. A legacy ``datacenter`` key (from the
                # reverted single-DC pin) is intentionally ignored — the DC set is deploy-time
                # policy now, so stale specs carrying it are tolerated, never region-pinned.
                network_volume=gpu.get("network_volume"),
                # Tolerant: null / "" / "0" / 0 / negative / non-numeric / missing -> the default.
                # Platform-managed, so a stale or hand-edited spec must still load with a sane size.
                network_volume_gb=_volume_gb(gpu.get("network_volume_gb")),
            ),
            run_id=data.get("run_id", "local"),
            worker_env=_coerce_str_map(data.get("worker_env")),
            model_policy=data.get("model_policy", "catalog"),
            thinking=coerce_bool(data.get("thinking", False)),
            wandb=_coerce_wandb(data.get("wandb")),
        )

    @classmethod
    def from_json(cls, raw: str) -> JobSpec:
        """Parse a JSON document and build a spec from it via ``from_dict``."""
        return cls.from_dict(json.loads(raw))
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def load_job_spec_from_env() -> JobSpec | None:
    """Load FLASH_JOB_SPEC_JSON or FLASH_JOB_SPEC_PATH if present on a worker node.

    ``FLASH_JOB_SPEC_JSON`` (inline JSON) takes precedence over ``FLASH_JOB_SPEC_PATH``
    (a file containing the JSON).

    Returns:
        The parsed spec, or ``None`` when neither variable is set (or is empty) or the
        referenced file does not exist.
    """
    raw = os.environ.get("FLASH_JOB_SPEC_JSON")
    if raw:
        return JobSpec.from_json(raw)
    path = os.environ.get("FLASH_JOB_SPEC_PATH")
    if not path or not os.path.exists(path):
        return None
    try:
        # JSON text is UTF-8; do not depend on the worker's locale encoding.
        with open(path, encoding="utf-8") as f:
            text = f.read()
    except FileNotFoundError:
        # The file vanished between the existence check and the open: same as absent.
        return None
    return JobSpec.from_json(text)
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: freesolo-flash-dev
|
|
3
|
+
Version: 0.2.25
|
|
4
|
+
Summary: Flash — managed LoRA post-training (SFT/GRPO) for Freesolo environments, driven by the `flash` CLI
|
|
5
|
+
Project-URL: Homepage, https://github.com/freesolo-co/flash
|
|
6
|
+
Project-URL: Repository, https://github.com/freesolo-co/flash
|
|
7
|
+
Author: Freesolo
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: fine-tuning,freesolo,grpo,llm,lora,rl,sft
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: <3.13,>=3.11
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: datasets>=2.19; extra == 'dev'
|
|
20
|
+
Requires-Dist: fastapi; extra == 'dev'
|
|
21
|
+
Requires-Dist: freesolo>=0.2.49; extra == 'dev'
|
|
22
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
23
|
+
Requires-Dist: huggingface-hub>=0.34; extra == 'dev'
|
|
24
|
+
Requires-Dist: mypy>=1.13.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=9.0.3; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
27
|
+
Requires-Dist: runpod-flash; extra == 'dev'
|
|
28
|
+
Requires-Dist: uvicorn; extra == 'dev'
|
|
29
|
+
Provides-Extra: gpu
|
|
30
|
+
Requires-Dist: accelerate>=1.4; extra == 'gpu'
|
|
31
|
+
Requires-Dist: bitsandbytes>=0.49; extra == 'gpu'
|
|
32
|
+
Requires-Dist: datasets>=2.19; extra == 'gpu'
|
|
33
|
+
Requires-Dist: freesolo>=0.2.49; extra == 'gpu'
|
|
34
|
+
Requires-Dist: huggingface-hub>=0.34; extra == 'gpu'
|
|
35
|
+
Requires-Dist: peft>=0.19; extra == 'gpu'
|
|
36
|
+
Requires-Dist: torch==2.10.0; extra == 'gpu'
|
|
37
|
+
Requires-Dist: transformers<5.11,>=5.6; extra == 'gpu'
|
|
38
|
+
Requires-Dist: trl<1.7,>=1.6; extra == 'gpu'
|
|
39
|
+
Requires-Dist: vllm==0.19.1; extra == 'gpu'
|
|
40
|
+
Provides-Extra: server
|
|
41
|
+
Requires-Dist: datasets>=2.19; extra == 'server'
|
|
42
|
+
Requires-Dist: fastapi; extra == 'server'
|
|
43
|
+
Requires-Dist: freesolo>=0.2.49; extra == 'server'
|
|
44
|
+
Requires-Dist: httpx>=0.27; extra == 'server'
|
|
45
|
+
Requires-Dist: huggingface-hub>=0.34; extra == 'server'
|
|
46
|
+
Requires-Dist: runpod-flash; extra == 'server'
|
|
47
|
+
Requires-Dist: uvicorn; extra == 'server'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
# Flash
|
|
51
|
+
|
|
52
|
+
Managed LoRA post-training service: SFT and GRPO on managed RunPod Flash GPUs.
|
|
53
|
+
The allocator picks the cheapest validated RunPod GPU class that fits the run.
|
|
54
|
+
|
|
55
|
+
## Scope
|
|
56
|
+
|
|
57
|
+
- `flash train <cfg.toml>` / control-plane `POST /runs` — submit a training job;
|
|
58
|
+
one dedicated GPU per run, supervised server-side (stall watchdog, bounded
|
|
59
|
+
auto-retry resuming from the last streamed checkpoint, endpoint GC).
|
|
60
|
+
- `flash deploy`, `flash chat` — serving for trained adapters.
|
|
61
|
+
- **Freesolo SDK environments.** Every run names a Freesolo environment id.
|
|
62
|
+
Scaffold `environment.py` plus `datasets/train.jsonl`, upload `.` or another
|
|
63
|
+
folder with `flash env push --name <name> <folder>`, then reference the
|
|
64
|
+
returned id. The worker loads it through `freesolo.environments`. There are no
|
|
65
|
+
built-in task environments. Single-turn and bounded multi-turn environments are
|
|
66
|
+
supported.
|
|
67
|
+
|
|
68
|
+
## Layout
|
|
69
|
+
|
|
70
|
+
- `flash/catalog.py` — curated model catalog (Qwen3 dense supported tier;
|
|
71
|
+
Qwen3.5/3.6 experimental tier) + `model_policy = "allow"` VRAM-fit check + each
|
|
72
|
+
model's `thinking` capability (opt-in reasoning mode `thinking = true`)
|
|
73
|
+
- `flash/schema.py`, `flash/spec.py` — TOML → `JobSpec`
|
|
74
|
+
- `flash/runner.py` — server-side run supervisor (durable job handle,
|
|
75
|
+
retries, cost guard, endpoint GC)
|
|
76
|
+
- `flash/providers/` — RunPod Flash provider code (pricing, gpus, durable
|
|
77
|
+
submit/poll, preflight) behind the `base.Provider` protocol, with an
|
|
78
|
+
`allocator.py` that picks the cheapest fitting class
|
|
79
|
+
- `flash/engine/` — the on-GPU worker (TRL + colocated vLLM rollouts) and the
|
|
80
|
+
shared recipe; SFT targets and RL rewards route through the active environment
|
|
81
|
+
(task-specific grading lives with its example, not in the engine)
|
|
82
|
+
- `flash/envs/` — environment machinery: registry and the adapter that loads
|
|
83
|
+
Freesolo SDK environments onto the worker's interface
|
|
84
|
+
- `flash env setup` — scaffold a starter local Freesolo env, `datasets/train.jsonl`,
|
|
85
|
+
and ready-to-run configs to start from
|
|
86
|
+
- `flash/serve/`, `flash/server/` — adapter serving and the FastAPI control
|
|
87
|
+
plane (run operator-side via the separate `flash-server` command)
|
|
88
|
+
- `flash/mcp/` — stdio MCP bridge for coding agents
|
|
89
|
+
- `Dockerfile` — the control-plane image (used by the repo docker-compose)
|
|
90
|
+
- `tests/` — pytest suite (CPU-only; offline-by-default, no GPU/network)
|
|
91
|
+
|
|
92
|
+
## Local commands
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
cd flash
|
|
96
|
+
uv sync --extra server
|
|
97
|
+
uv run pytest # CPU tests (offline-by-default, no GPU/network)
|
|
98
|
+
uv run ruff check . && uv run ruff format .
|
|
99
|
+
uv run flash --help
|
|
100
|
+
uv run flash-server # control plane (operator-side, run once)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
The control plane owns provider credentials: `RUNPOD_API_KEY` is always required,
|
|
104
|
+
plus the shared `HF_TOKEN`.
|
|
105
|
+
The artifact repo is platform-managed and per-run (each run gets its own
|
|
106
|
+
`Freesolo-Co/flashrun-<run_id>`, written using the operator's `HF_TOKEN`); it is not a user
|
|
107
|
+
knob and not an operator-wide env var. Clients authenticate with their freesolo API key
|
|
108
|
+
(`flash login`).
|
|
109
|
+
|
|
110
|
+
## Release channels
|
|
111
|
+
|
|
112
|
+
Two channels are published to PyPI from the *same source*, distinguished by one line in
|
|
113
|
+
`flash/_channel.py` (`CHANNEL`):
|
|
114
|
+
|
|
115
|
+
| Channel | PyPI package | CLI | Default plane | Published from |
|
|
116
|
+
| --- | --- | --- | --- | --- |
|
|
117
|
+
| prod | `freesolo-flash` | `flash` | `flash.freesolo.co` | push to `main` that bumps `[project].version` (`.github/workflows/publish.yml`) |
|
|
118
|
+
| dev | `freesolo-flash-dev` | `flash-dev` | `flash-dev.freesolo.co` | push to `dev` whose `[tool.flash-dev].version` isn't on PyPI yet (`.github/workflows/publish-dev.yml`) |
|
|
119
|
+
|
|
120
|
+
The two install side by side (distinct package + CLI names). The dev build is produced by
|
|
121
|
+
`scripts/build_dev_dist.py`, which renames the package/CLI and flips `CHANNEL` to `dev` before
|
|
122
|
+
`uv build`. To cut a dev release, bump `[tool.flash-dev].version` and merge to `dev`. Either CLI
|
|
123
|
+
still honours an explicit `FLASH_API_URL` / `flash login --api-url`; the channel only sets the
|
|
124
|
+
default.
|
|
125
|
+
|
|
126
|
+
## Serving From an API
|
|
127
|
+
|
|
128
|
+
`flash chat` is a CLI wrapper around the Flash control-plane chat endpoint. To call a
|
|
129
|
+
deployed adapter from your own app, deploy the finished run once and then POST chat
|
|
130
|
+
requests with your freesolo API key:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
export FLASH_API_URL=https://flash.freesolo.co
|
|
134
|
+
export FREESOLO_API_KEY=fslo_...
|
|
135
|
+
export RUN_ID=flash-1782194170-ce1cfcff
|
|
136
|
+
|
|
137
|
+
curl -X POST "$FLASH_API_URL/v1/runs/$RUN_ID/deploy" \
|
|
138
|
+
-H "Authorization: Bearer $FREESOLO_API_KEY" \
|
|
139
|
+
-H "Content-Type: application/json" \
|
|
140
|
+
-d '{"dry_run": false}'
|
|
141
|
+
|
|
142
|
+
curl -X POST "$FLASH_API_URL/v1/runs/$RUN_ID/chat" \
|
|
143
|
+
-H "Authorization: Bearer $FREESOLO_API_KEY" \
|
|
144
|
+
-H "Content-Type: application/json" \
|
|
145
|
+
-d '{
|
|
146
|
+
"messages": [
|
|
147
|
+
{"role": "user", "content": "Write a two-sentence summary of the run."}
|
|
148
|
+
],
|
|
149
|
+
"temperature": 0.0,
|
|
150
|
+
"max_tokens": 256
|
|
151
|
+
}'
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
The response uses the OpenAI chat-completions shape:
|
|
155
|
+
|
|
156
|
+
```json
|
|
157
|
+
{
|
|
158
|
+
"choices": [
|
|
159
|
+
{
|
|
160
|
+
"message": {
|
|
161
|
+
"role": "assistant",
|
|
162
|
+
"content": "..."
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
]
|
|
166
|
+
}
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Use `choices[0].message.content` for the generated text. The run id is the adapter id
|
|
170
|
+
for serving. If the run is not deployed yet, `/v1/runs/<run_id>/chat` returns `409`
|
|
171
|
+
with a hint to deploy first.
|
|
172
|
+
|
|
173
|
+
Operators can also call the Modal serving app directly after the adapter is registered.
|
|
174
|
+
The default serving app is `https://clado-ai--freesolo-lora-serving.modal.run`, and
|
|
175
|
+
operators can point Flash at another serving app by setting `FREESOLO_SERVING_URL`.
|
|
176
|
+
Use that same base URL when calling the app directly; pass the run id as `model`:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
export FREESOLO_SERVING_URL=https://clado-ai--freesolo-lora-serving.modal.run
|
|
180
|
+
|
|
181
|
+
curl -X POST "$FREESOLO_SERVING_URL/v1/chat/completions" \
|
|
182
|
+
-H "Content-Type: application/json" \
|
|
183
|
+
-d '{
|
|
184
|
+
"model": "flash-1782194170-ce1cfcff",
|
|
185
|
+
"messages": [{"role": "user", "content": "Hello"}],
|
|
186
|
+
"temperature": 0.0,
|
|
187
|
+
"max_tokens": 256
|
|
188
|
+
}'
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Prefer the Flash control-plane endpoint for user apps because it enforces run ownership
|
|
192
|
+
and forwards per-run serving options such as thinking-mode parity.
|