benchmax 0.1.2.dev27__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. benchmax/cli.py +78 -0
  2. benchmax/config.py +42 -1
  3. benchmax/envs/example_id.py +21 -19
  4. benchmax/envs/telestich/example.py +668 -0
  5. benchmax/envs/telestich/telestich_env.py +1107 -0
  6. benchmax/envs/types.py +99 -1
  7. benchmax/platform/__init__.py +13 -0
  8. benchmax/platform/client.py +316 -16
  9. benchmax/platform/credentials.py +259 -4
  10. benchmax/platform/device_auth.py +81 -0
  11. benchmax/platform/login.py +81 -0
  12. benchmax/platform/training_run.py +29 -3
  13. benchmax/platform/validation.py +418 -61
  14. benchmax/rag/corpus/chroma/search.py +63 -6
  15. benchmax/rag/corpus/postgres/client.py +9 -1
  16. benchmax/rag/corpus/postgres/source.py +21 -11
  17. benchmax/rag/qa_generation/filters/env_rollout.py +9 -1
  18. benchmax/rag/qa_generation/filters/grounding_llm.py +9 -1
  19. benchmax/rag/qa_generation/filters/hop_count_validity.py +7 -6
  20. benchmax/rag/qa_generation/filters/retrieval_llm.py +8 -1
  21. benchmax/rag/qa_generation/pipeline.py +10 -4
  22. benchmax/rag/qa_generation/pipeline_config.py +7 -3
  23. benchmax/rewards/__init__.py +0 -0
  24. benchmax/rewards/diversity.py +305 -0
  25. benchmax/rubrics/_utils.py +3 -2
  26. benchmax/rubrics/adaptive.py +4 -2
  27. benchmax/rubrics/rubric.py +127 -68
  28. benchmax/traces/__init__.py +6 -1
  29. benchmax/traces/adapter.py +113 -53
  30. benchmax/traces/braintrust/message_extraction.py +6 -79
  31. benchmax/traces/processing.py +16 -16
  32. benchmax-0.1.2.dev29.dist-info/METADATA +75 -0
  33. {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/RECORD +37 -29
  34. benchmax-0.1.2.dev29.dist-info/entry_points.txt +2 -0
  35. benchmax-0.1.2.dev27.dist-info/METADATA +0 -188
  36. {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/WHEEL +0 -0
  37. {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/licenses/LICENSE +0 -0
  38. {benchmax-0.1.2.dev27.dist-info → benchmax-0.1.2.dev29.dist-info}/top_level.txt +0 -0
benchmax/cli.py ADDED
@@ -0,0 +1,78 @@
1
+ """``castform`` CLI — browser-based login for the SDK.
2
+
3
+ Commands: ``login`` (device authorization), ``logout``, ``whoami``. The login
4
+ flow + the reusable ``ensure_session`` live in :mod:`benchmax.platform.login`;
5
+ this module is the thin argparse wrapper. After ``castform login`` the SDK
6
+ resolves its bearer from ``~/.castform`` automatically — no API key or URL.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import sys
13
+
14
+ from benchmax.platform import credentials
15
+ from benchmax.platform.device_auth import DeviceAuthError
16
+ from benchmax.platform.login import _login
17
+
18
+
19
def _cmd_login(args: argparse.Namespace) -> int:
    """Handle ``castform login``.

    Runs the login flow for the environment chosen with ``--env`` and
    reports the outcome.

    Args:
        args: Parsed CLI namespace; only ``args.env`` is read.

    Returns:
        ``0`` after a successful login, ``1`` if the flow raised
        ``DeviceAuthError``.
    """
    # Only staging is forwarded by name; every other choice is passed as None.
    target = None if args.env != "staging" else "staging"
    try:
        _login(target)
    except DeviceAuthError as err:
        print(f"Login failed: {err}", file=sys.stderr)
        return 1
    else:
        print(f"\n✓ Logged in to {args.env}.")
        return 0
28
+
29
+
30
def _cmd_logout(_args: argparse.Namespace) -> int:
    """Handle ``castform logout``.

    Clears the cached session via ``credentials.clear_castform_session`` and
    prints a confirmation.

    Args:
        _args: Parsed CLI namespace (unused).

    Returns:
        Always ``0``.
    """
    exit_code = 0
    credentials.clear_castform_session()
    print("✓ Logged out.")
    return exit_code
34
+
35
+
36
def _cmd_whoami(_args: argparse.Namespace) -> int:
    """Handle ``castform whoami``.

    Reads the cached session, obtains a JWT for it, and prints the identity
    found in the token's claims.

    Args:
        _args: Parsed CLI namespace (unused).

    Returns:
        ``0`` when an identity line was printed; ``1`` when there is no cached
        session or no JWT could be obtained for it (details go to stderr).
    """
    session = credentials.read_castform_session()
    if not session:
        print("Not logged in. Run `castform login`.", file=sys.stderr)
        return 1

    # `castform login` passes env=None for prod, so treat a stored null/empty
    # value the same as a missing key instead of printing "env: None".
    env = session.get("env") or "prod"

    jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
    if not jwt:
        print(
            f"Session present (env: {env}), but couldn't reach auth-service to "
            "verify it (offline, or the session expired). If this persists, run "
            "`castform login` again.",
            file=sys.stderr,
        )
        return 1

    # `_jwt_claims` is defined elsewhere; guard in case it yields nothing.
    claims = credentials._jwt_claims(jwt) or {}
    # Chain with `or` so a present-but-empty "sub" still falls back to the
    # placeholder (a `.get` default only applies when the key is absent).
    who = claims.get("email") or claims.get("sub") or "<unknown>"
    print(f"Logged in as {who} (env: {env}).")
    return 0
55
+
56
+
57
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``castform`` command-line tool.

    Builds the argparse parser with the ``login``, ``logout`` and ``whoami``
    sub-commands, parses ``argv`` and dispatches to the matching handler.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The exit code returned by the selected sub-command handler.
    """
    parser = argparse.ArgumentParser(prog="castform", description="Castform CLI")
    commands = parser.add_subparsers(dest="command", required=True)

    # `login` is the only sub-command that takes an option.
    login = commands.add_parser("login", help="Sign in via your browser")
    login.add_argument(
        "--env",
        choices=["prod", "staging"],
        default="prod",
        help="Environment to sign in to (staging is internal-only)",
    )
    login.set_defaults(func=_cmd_login)

    # The remaining sub-commands are argument-free: name, help text, handler.
    for name, summary, handler in (
        ("logout", "Clear the cached session", _cmd_logout),
        ("whoami", "Show the current login", _cmd_whoami),
    ):
        commands.add_parser(name, help=summary).set_defaults(func=handler)

    namespace = parser.parse_args(argv)
    return namespace.func(namespace)
75
+
76
+
77
+ if __name__ == "__main__":
78
+ sys.exit(main())
benchmax/config.py CHANGED
@@ -16,7 +16,38 @@ DEFAULT_BASE_DOMAIN = "castform.com"
16
16
 
17
17
 
18
def base_domain() -> str:
    """Return the platform base domain.

    Resolution order:

    1. A non-empty ``CASTFORM_BASE_DOMAIN`` environment variable.
    2. ``castform.dev`` when the cached device-auth session reports
       ``env == "staging"``.
    3. ``DEFAULT_BASE_DOMAIN`` otherwise.

    Because the ``env`` marker is stored with the credential, a logged-in SDK
    routes to the environment it authenticated against. Any session without a
    ``"staging"`` marker resolves to the default.
    """
    explicit = os.environ.get("CASTFORM_BASE_DOMAIN")
    if explicit:
        return explicit
    return "castform.dev" if _session_env() == "staging" else DEFAULT_BASE_DOMAIN
34
+
35
+
36
def _session_env() -> str | None:
    """Return the ``env`` value stored in the cached device-auth session.

    The credentials module is imported inside the function: ``config`` is a
    leaf that ``benchmax.platform`` depends on, so importing it at module level
    would create a cycle (platform/__init__ → client → config).

    Returns:
        The session's ``env`` when it is a string; ``None`` when the import or
        the session read raises, when there is no session, or when ``env`` is
        missing or not a string.
    """
    try:
        from benchmax.platform.credentials import read_castform_session

        cached = read_castform_session()
    except Exception:
        # Best-effort lookup: any failure here means "no session information".
        return None
    marker = cached.get("env") if cached else None
    return marker if isinstance(marker, str) else None
20
51
 
21
52
 
22
53
  def platform_url() -> str:
@@ -38,3 +69,13 @@ def web_app_url() -> str:
38
69
  def llm_url() -> str:
39
70
  """OpenAI-compatible LLM endpoint hosted by the platform."""
40
71
  return os.environ.get("CASTFORM_LLM_URL") or f"https://llm.{base_domain()}/v1"
72
+
73
+
74
def auth_url() -> str:
    """Return the auth-service base URL (device authorization + JWT mint).

    A non-empty ``CASTFORM_AUTH_URL`` environment variable takes precedence.
    Otherwise the URL is ``https://auth.<base domain>``, using the domain
    returned by ``base_domain()``.
    """
    explicit = os.environ.get("CASTFORM_AUTH_URL")
    if explicit:
        return explicit
    return f"https://auth.{base_domain()}"
benchmax/envs/example_id.py CHANGED
@@ -1,24 +1,23 @@
1
1
  """Canonical example identity.
2
2
 
3
3
  ``canonical_example_id(prompt_messages, task)`` returns a SHA-256 hex digest
4
- that is stable across processes and languages: a TypeScript port lives in
5
- ``platform-service/src/lib/canonical-example-id.ts`` and is exercised by a
6
- parity test.
7
-
8
- Determinism is achieved by:
9
- - normalizing numeric values so JSON output matches between Python and JS
10
- (JS has no int/float distinction; integer-valued floats are coerced to int,
11
- -0.0 to 0; NaN/Inf are rejected).
12
- - rejecting values whose JSON serialization diverges between Python and JS:
13
- non-string dict keys, integers outside JS ``Number.MAX_SAFE_INTEGER``,
14
- byte strings, lone surrogates, and unknown types.
15
- - emitting canonical JSON with sorted keys, no whitespace, and no ASCII
16
- escaping (modern JSON.stringify also preserves non-ASCII).
17
-
18
- The hash is computed over ``{"v": 2, "prompt_messages": ..., "task": ...}``.
19
- v:2 bump went together with the ``seed_messages`` → ``prompt_messages``
20
- field rename in 2026-05; v:1 hashes are obsolete.
4
+ stable across processes. Identity is computed only here, in Python — both the
5
+ trainer and rollout-service hash via this module.
6
+
7
+ Normalization keeps the digest loader-independent:
8
+ - integer-valued floats → int, -0.0 → 0; NaN/Inf rejected.
9
+ - dict keys whose value is ``None`` are dropped, so a key absent in one loader
10
+ and present-but-null in another (Arrow schema-unification) hashes the same;
11
+ nulls *inside lists* are kept (length/order are identity).
12
+ - ambiguous values rejected: non-str dict keys, ints beyond
13
+ ``Number.MAX_SAFE_INTEGER``, byte strings, lone surrogates, unknown types.
14
+ - canonical JSON: sorted keys, no whitespace, no ASCII escaping.
15
+
16
+ Payload tag ``v:3``. History: v:1→v:2 = the 2026-05 ``seed_messages`` →
17
+ ``prompt_messages`` rename; v:2→v:3 = drop null-valued dict keys (loader skew).
18
+ Older hashes are obsolete.
21
19
  """
20
+
22
21
  from __future__ import annotations
23
22
 
24
23
  import hashlib
@@ -78,7 +77,10 @@ def _normalize(v: Any) -> Any:
78
77
  raise ValueError(
79
78
  f"dict keys must be str for canonical hashing; got {type(k).__name__}"
80
79
  )
81
- out[k] = _normalize(x)
80
+ nx = _normalize(x)
81
+ if nx is None:
82
+ continue
83
+ out[k] = nx
82
84
  return out
83
85
  raise ValueError(
84
86
  f"type {type(v).__name__} is not JSON-canonicalizable; "
@@ -90,7 +92,7 @@ def canonical_example_id(
90
92
  prompt_messages: Messages,
91
93
  task: dict[str, Any] | None,
92
94
  ) -> str:
93
- payload = {"v": 2, "prompt_messages": prompt_messages, "task": task}
95
+ payload = {"v": 3, "prompt_messages": prompt_messages, "task": task}
94
96
  serialized = json.dumps(
95
97
  _normalize(payload),
96
98
  sort_keys=True,