benchmax 0.1.2.dev27__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmax/cli.py +78 -0
- benchmax/config.py +42 -1
- benchmax/envs/example_id.py +21 -19
- benchmax/envs/telestich/example.py +668 -0
- benchmax/envs/telestich/telestich_env.py +1107 -0
- benchmax/envs/types.py +99 -1
- benchmax/platform/__init__.py +13 -0
- benchmax/platform/client.py +316 -16
- benchmax/platform/credentials.py +259 -4
- benchmax/platform/device_auth.py +81 -0
- benchmax/platform/login.py +81 -0
- benchmax/platform/training_run.py +29 -3
- benchmax/platform/validation.py +418 -61
- benchmax/rag/corpus/chroma/search.py +63 -6
- benchmax/rag/corpus/postgres/client.py +9 -1
- benchmax/rag/corpus/postgres/source.py +21 -11
- benchmax/rag/qa_generation/filters/env_rollout.py +9 -1
- benchmax/rag/qa_generation/filters/grounding_llm.py +9 -1
- benchmax/rag/qa_generation/filters/hop_count_validity.py +7 -6
- benchmax/rag/qa_generation/filters/retrieval_llm.py +8 -1
- benchmax/rag/qa_generation/pipeline.py +10 -4
- benchmax/rag/qa_generation/pipeline_config.py +7 -3
- benchmax/rewards/__init__.py +0 -0
- benchmax/rewards/diversity.py +305 -0
- benchmax/rubrics/_utils.py +3 -2
- benchmax/rubrics/adaptive.py +4 -2
- benchmax/rubrics/rubric.py +127 -68
- benchmax/traces/__init__.py +6 -1
- benchmax/traces/adapter.py +113 -53
- benchmax/traces/braintrust/message_extraction.py +6 -79
- benchmax/traces/processing.py +16 -16
- benchmax-0.1.2.dev29.dist-info/METADATA +75 -0
- {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/RECORD +37 -29
- benchmax-0.1.2.dev29.dist-info/entry_points.txt +2 -0
- benchmax-0.1.2.dev27.dist-info/METADATA +0 -188
- {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/WHEEL +0 -0
- {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/licenses/LICENSE +0 -0
- {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/top_level.txt +0 -0
benchmax/cli.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""``castform`` CLI — browser-based login for the SDK.
|
|
2
|
+
|
|
3
|
+
Commands: ``login`` (device authorization), ``logout``, ``whoami``. The login
|
|
4
|
+
flow + the reusable ``ensure_session`` live in :mod:`benchmax.platform.login`;
|
|
5
|
+
this module is the thin argparse wrapper. After ``castform login`` the SDK
|
|
6
|
+
resolves its bearer from ``~/.castform`` automatically — no API key or URL.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
from benchmax.platform import credentials
|
|
15
|
+
from benchmax.platform.device_auth import DeviceAuthError
|
|
16
|
+
from benchmax.platform.login import _login
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _cmd_login(args: argparse.Namespace) -> int:
    """Handle ``castform login``: run the device-authorization flow.

    ``args.env == "staging"`` is forwarded to ``_login`` as ``"staging"``;
    every other value is forwarded as ``None``.

    Returns:
        ``0`` after a successful login (a confirmation naming ``args.env`` is
        printed to stdout), or ``1`` when ``_login`` raises
        ``DeviceAuthError`` (the error is reported on stderr).
    """
    target_env = None
    if args.env == "staging":
        target_env = "staging"

    try:
        _login(target_env)
    except DeviceAuthError as exc:
        print(f"Login failed: {exc}", file=sys.stderr)
        return 1
    else:
        print(f"\n✓ Logged in to {args.env}.")
        return 0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _cmd_logout(_args: argparse.Namespace) -> int:
    """Handle ``castform logout``: clear the cached session.

    The parsed arguments are accepted only to match the sub-command handler
    signature and are not read. Always returns ``0``.
    """
    exit_status = 0
    credentials.clear_castform_session()
    print("✓ Logged out.")
    return exit_status
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _cmd_whoami(_args: argparse.Namespace) -> int:
    """Handle ``castform whoami``: print who the cached session belongs to.

    The parsed arguments are accepted only to match the sub-command handler
    signature and are not read.

    Returns:
        ``0`` when a JWT was obtained from the session and the identity was
        printed to stdout; ``1`` (with a message on stderr) when no session
        is cached or no JWT could be obtained for it.
    """
    session = credentials.read_castform_session()
    if not session:
        print("Not logged in. Run `castform login`.", file=sys.stderr)
        return 1

    # A stored ``env`` of None (or an absent key) means prod, so fall back on
    # any falsy value rather than only on a missing key; otherwise the output
    # would read "env: None".
    env = session.get("env") or "prod"

    jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
    if not jwt:
        print(
            f"Session present (env: {env}), but couldn't reach auth-service to "
            "verify it (offline, or the session expired). If this persists, run "
            "`castform login` again.",
            file=sys.stderr,
        )
        return 1

    # NOTE(review): assumes _jwt_claims returns a mapping; ``or {}`` keeps this
    # from crashing should it return None/empty for an undecodable token.
    claims = credentials._jwt_claims(jwt) or {}
    # Chain with ``or`` so a present-but-null/empty claim still falls through
    # to the next candidate instead of printing "None".
    who = claims.get("email") or claims.get("sub") or "<unknown>"
    print(f"Logged in as {who} (env: {env}).")
    return 0
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the ``castform`` command line and dispatch to a sub-command.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv[1:]``.

    Returns:
        The integer status returned by the selected sub-command handler.
    """
    parser = argparse.ArgumentParser(prog="castform", description="Castform CLI")
    commands = parser.add_subparsers(dest="command", required=True)

    # ``login`` is the only sub-command that takes an option of its own.
    login_parser = commands.add_parser("login", help="Sign in via your browser")
    login_parser.add_argument(
        "--env",
        choices=["prod", "staging"],
        default="prod",
        help="Environment to sign in to (staging is internal-only)",
    )
    login_parser.set_defaults(func=_cmd_login)

    # The remaining sub-commands are option-less: name, help text, handler.
    simple_commands = (
        ("logout", "Clear the cached session", _cmd_logout),
        ("whoami", "Show the current login", _cmd_whoami),
    )
    for name, summary, command_func in simple_commands:
        commands.add_parser(name, help=summary).set_defaults(func=command_func)

    namespace = parser.parse_args(argv)
    selected = namespace.func
    return selected(namespace)


# Script entry point: the handler's return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
benchmax/config.py
CHANGED
|
@@ -16,7 +16,38 @@ DEFAULT_BASE_DOMAIN = "castform.com"
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def base_domain() -> str:
    """Resolve the platform base domain.

    Resolution order:

    1. A non-empty ``CASTFORM_BASE_DOMAIN`` environment variable wins.
    2. Otherwise, a cached device-auth session whose ``env`` is ``staging``
       selects ``castform.dev``.
    3. Otherwise ``DEFAULT_BASE_DOMAIN`` (prod) is returned.

    Because the ``env`` marker travels with the cached credential, a logged-in
    SDK routes to the environment it authenticated against. A prod session
    carries no ``env`` marker, so only staging logins deviate from the default.
    """
    explicit = os.environ.get("CASTFORM_BASE_DOMAIN")
    if explicit:
        return explicit
    return "castform.dev" if _session_env() == "staging" else DEFAULT_BASE_DOMAIN
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _session_env() -> str | None:
|
|
37
|
+
"""The ``env`` from the cached device-auth session, if any.
|
|
38
|
+
|
|
39
|
+
Lazy import: ``config`` is a leaf that ``benchmax.platform`` depends on, so a
|
|
40
|
+
top-level import would cycle (platform/__init__ → client → config)."""
|
|
41
|
+
try:
|
|
42
|
+
from benchmax.platform.credentials import read_castform_session
|
|
43
|
+
|
|
44
|
+
session = read_castform_session()
|
|
45
|
+
except Exception:
|
|
46
|
+
return None
|
|
47
|
+
if not session:
|
|
48
|
+
return None
|
|
49
|
+
env = session.get("env")
|
|
50
|
+
return env if isinstance(env, str) else None
|
|
20
51
|
|
|
21
52
|
|
|
22
53
|
def platform_url() -> str:
|
|
@@ -38,3 +69,13 @@ def web_app_url() -> str:
|
|
|
38
69
|
def llm_url() -> str:
    """OpenAI-compatible LLM endpoint hosted by the platform.

    A non-empty ``CASTFORM_LLM_URL`` environment variable is returned as-is;
    otherwise the URL is ``https://llm.<base domain>/v1``.
    """
    override = os.environ.get("CASTFORM_LLM_URL")
    if override:
        return override
    return f"https://llm.{base_domain()}/v1"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def auth_url() -> str:
    """Auth-service base URL (device-authorization + JWT mint endpoints).

    A non-empty ``CASTFORM_AUTH_URL`` environment variable is returned as-is.
    Otherwise the URL is ``https://auth.<base domain>``, derived from the same
    base domain as the other endpoints, so a session minted against
    ``staging`` talks to ``auth.castform.dev`` and a ``prod`` session to
    ``auth.castform.com``.
    """
    override = os.environ.get("CASTFORM_AUTH_URL")
    if override:
        return override
    return f"https://auth.{base_domain()}"
|
benchmax/envs/example_id.py
CHANGED
|
@@ -1,24 +1,23 @@
|
|
|
1
1
|
"""Canonical example identity.
|
|
2
2
|
|
|
3
3
|
``canonical_example_id(prompt_messages, task)`` returns a SHA-256 hex digest
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
v:2 bump went together with the ``seed_messages`` → ``prompt_messages``
|
|
20
|
-
field rename in 2026-05; v:1 hashes are obsolete.
|
|
4
|
+
stable across processes. Identity is computed only here, in Python — both the
|
|
5
|
+
trainer and rollout-service hash via this module.
|
|
6
|
+
|
|
7
|
+
Normalization keeps the digest loader-independent:
|
|
8
|
+
- integer-valued floats → int, -0.0 → 0; NaN/Inf rejected.
|
|
9
|
+
- dict keys whose value is ``None`` are dropped, so a key absent in one loader
|
|
10
|
+
and present-but-null in another (Arrow schema-unification) hashes the same;
|
|
11
|
+
nulls *inside lists* are kept (length/order are identity).
|
|
12
|
+
- ambiguous values rejected: non-str dict keys, ints beyond
|
|
13
|
+
``Number.MAX_SAFE_INTEGER``, byte strings, lone surrogates, unknown types.
|
|
14
|
+
- canonical JSON: sorted keys, no whitespace, no ASCII escaping.
|
|
15
|
+
|
|
16
|
+
Payload tag ``v:3``. History: v:1→v:2 = the 2026-05 ``seed_messages`` →
|
|
17
|
+
``prompt_messages`` rename; v:2→v:3 = drop null-valued dict keys (loader skew).
|
|
18
|
+
Older hashes are obsolete.
|
|
21
19
|
"""
|
|
20
|
+
|
|
22
21
|
from __future__ import annotations
|
|
23
22
|
|
|
24
23
|
import hashlib
|
|
@@ -78,7 +77,10 @@ def _normalize(v: Any) -> Any:
|
|
|
78
77
|
raise ValueError(
|
|
79
78
|
f"dict keys must be str for canonical hashing; got {type(k).__name__}"
|
|
80
79
|
)
|
|
81
|
-
|
|
80
|
+
nx = _normalize(x)
|
|
81
|
+
if nx is None:
|
|
82
|
+
continue
|
|
83
|
+
out[k] = nx
|
|
82
84
|
return out
|
|
83
85
|
raise ValueError(
|
|
84
86
|
f"type {type(v).__name__} is not JSON-canonicalizable; "
|
|
@@ -90,7 +92,7 @@ def canonical_example_id(
|
|
|
90
92
|
prompt_messages: Messages,
|
|
91
93
|
task: dict[str, Any] | None,
|
|
92
94
|
) -> str:
|
|
93
|
-
payload = {"v":
|
|
95
|
+
payload = {"v": 3, "prompt_messages": prompt_messages, "task": task}
|
|
94
96
|
serialized = json.dumps(
|
|
95
97
|
_normalize(payload),
|
|
96
98
|
sort_keys=True,
|