PyPI - benchmax - Versions diffs - 0.1.2.dev28__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl - Mend

benchmax 0.1.2.dev28__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

benchmax/cli.py +78 -0
benchmax/config.py +42 -1
benchmax/platform/__init__.py +10 -0
benchmax/platform/client.py +303 -16
benchmax/platform/credentials.py +224 -4
benchmax/platform/device_auth.py +81 -0
benchmax/platform/login.py +81 -0
benchmax/platform/training_run.py +5 -3
benchmax/platform/validation.py +151 -7
benchmax/rag/corpus/postgres/client.py +9 -1
benchmax/rag/corpus/postgres/source.py +21 -11
benchmax/rag/qa_generation/filters/env_rollout.py +9 -1
benchmax/rag/qa_generation/filters/grounding_llm.py +9 -1
benchmax/rag/qa_generation/filters/hop_count_validity.py +7 -6
benchmax/rag/qa_generation/filters/retrieval_llm.py +8 -1
benchmax/rag/qa_generation/pipeline.py +10 -4
benchmax/rag/qa_generation/pipeline_config.py +7 -3
{benchmax-0.1.2.dev28.dist-info → benchmax-0.1.2.dev29.dist-info}/METADATA +1 -1
{benchmax-0.1.2.dev28.dist-info → benchmax-0.1.2.dev29.dist-info}/RECORD +23 -19
benchmax-0.1.2.dev29.dist-info/entry_points.txt +2 -0
{benchmax-0.1.2.dev28.dist-info → benchmax-0.1.2.dev29.dist-info}/WHEEL +0 -0
{benchmax-0.1.2.dev28.dist-info → benchmax-0.1.2.dev29.dist-info}/licenses/LICENSE +0 -0
{benchmax-0.1.2.dev28.dist-info → benchmax-0.1.2.dev29.dist-info}/top_level.txt +0 -0

benchmax/cli.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""``castform`` CLI — browser-based login for the SDK.
+Commands: ``login`` (device authorization), ``logout``, ``whoami``. The login
+flow + the reusable ``ensure_session`` live in :mod:`benchmax.platform.login`;
+this module is the thin argparse wrapper. After ``castform login`` the SDK
+resolves its bearer from ``~/.castform`` automatically — no API key or URL.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from benchmax.platform import credentials
+from benchmax.platform.device_auth import DeviceAuthError
+from benchmax.platform.login import _login
+def _cmd_login(args: argparse.Namespace) -> int:
+    env = "staging" if args.env == "staging" else None
+    try:
+        _login(env)
+    except DeviceAuthError as exc:
+        print(f"Login failed: {exc}", file=sys.stderr)
+        return 1
+    print(f"\n✓ Logged in to {args.env}.")
+    return 0
+def _cmd_logout(_args: argparse.Namespace) -> int:
+    credentials.clear_castform_session()
+    print("✓ Logged out.")
+    return 0
+def _cmd_whoami(_args: argparse.Namespace) -> int:
+    session = credentials.read_castform_session()
+    if not session:
+        print("Not logged in. Run `castform login`.", file=sys.stderr)
+        return 1
+    env = session.get("env", "prod")
+    jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
+    if not jwt:
+        print(
+            f"Session present (env: {env}), but couldn't reach auth-service to "
+            "verify it (offline, or the session expired). If this persists, run "
+            "`castform login` again.",
+            file=sys.stderr,
+        )
+        return 1
+    claims = credentials._jwt_claims(jwt)
+    who = claims.get("email") or claims.get("sub", "<unknown>")
+    print(f"Logged in as {who} (env: {env}).")
+    return 0
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="castform", description="Castform CLI")
+    sub = parser.add_subparsers(dest="command", required=True)
+    p_login = sub.add_parser("login", help="Sign in via your browser")
+    p_login.add_argument(
+        "--env",
+        choices=["prod", "staging"],
+        default="prod",
+        help="Environment to sign in to (staging is internal-only)",
+    )
+    p_login.set_defaults(func=_cmd_login)
+    sub.add_parser("logout", help="Clear the cached session").set_defaults(func=_cmd_logout)
+    sub.add_parser("whoami", help="Show the current login").set_defaults(func=_cmd_whoami)
+    args = parser.parse_args(argv)
+    return args.func(args)
+if __name__ == "__main__":
+    sys.exit(main())

benchmax/config.py CHANGED Viewed

@@ -16,7 +16,38 @@ DEFAULT_BASE_DOMAIN = "castform.com"
 def base_domain() -> str:
-    return os.environ.get("CASTFORM_BASE_DOMAIN", DEFAULT_BASE_DOMAIN)
+    """Resolve the platform base domain.
+    Precedence: explicit ``CASTFORM_BASE_DOMAIN`` → the cached device-auth
+    session's ``env`` (``staging`` → ``castform.dev``) → ``prod`` default
+    (``castform.com``). The ``env`` claim travels with the credential, so a
+    logged-in SDK routes to the same environment it authenticated against —
+    URL and credential can't desync. A prod session carries no ``env`` marker
+    (``None`` → prod), so only internal staging logins deviate from the default.
+    """
+    override = os.environ.get("CASTFORM_BASE_DOMAIN")
+    if override:
+        return override
+    if _session_env() == "staging":
+        return "castform.dev"
+    return DEFAULT_BASE_DOMAIN
+def _session_env() -> str | None:
+    """The ``env`` from the cached device-auth session, if any.
+    Lazy import: ``config`` is a leaf that ``benchmax.platform`` depends on, so a
+    top-level import would cycle (platform/__init__ → client → config)."""
+    try:
+        from benchmax.platform.credentials import read_castform_session
+        session = read_castform_session()
+    except Exception:
+        return None
+    if not session:
+        return None
+    env = session.get("env")
+    return env if isinstance(env, str) else None
 def platform_url() -> str:
@@ -38,3 +69,13 @@ def web_app_url() -> str:
 def llm_url() -> str:
     """OpenAI-compatible LLM endpoint hosted by the platform."""
     return os.environ.get("CASTFORM_LLM_URL") or f"https://llm.{base_domain()}/v1"
+def auth_url() -> str:
+    """Auth-service base URL (device-authorization + JWT mint endpoints).
+    Used by ``castform login`` and the per-process session→JWT mint. Derives from
+    the same base domain as everything else, so a session minted against ``staging``
+    talks to ``auth.castform.dev`` and a ``prod`` session to ``auth.castform.com``.
+    """
+    return os.environ.get("CASTFORM_AUTH_URL") or f"https://auth.{base_domain()}"

benchmax/platform/__init__.py CHANGED Viewed

@@ -1,15 +1,25 @@
 """Castform platform clients (storage, training runs, rollout)."""
 from .client import RolloutClient, StorageClient, TrainerClient
+from .credentials import platform_bearer
 from .training_run import UploadedTrainingRun, upload_training_run
 from .validation import ValidationReport, validate_env
+# Imported last: login depends on credentials/device_auth (siblings), so this
+# stays cycle-free as long as those are already loaded by the imports above.
+from .login import ensure_session
 __all__ = [
     "RolloutClient",
     "StorageClient",
     "TrainerClient",
     "UploadedTrainingRun",
     "ValidationReport",
+    # The seam token-getter: generated scripts pass it to a raw OpenAI client
+    # (e.g. the traces pivot), so it's part of the public surface alongside
+    # ensure_session — not just an internal credentials helper.
+    "platform_bearer",
+    "ensure_session",
     "upload_training_run",
     "validate_env",
 ]

benchmax/platform/client.py CHANGED Viewed

@@ -14,10 +14,9 @@ from typing import TYPE_CHECKING, Any
 import httpx
-logger = logging.getLogger(__name__)
 from benchmax import config
+from .credentials import TokenProvider, resolve_token_provider
 from .exceptions import (
     AuthenticationError,
     JobLaunchError,
@@ -28,6 +27,8 @@ from .exceptions import (
     TrainerError,
 )
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from types import ModuleType
@@ -74,10 +75,17 @@ class ValidationResult:
     """
     examples: list[ExampleValidation]
+    # Outcome of the compute_group_reward contract check, run on the real
+    # smoke-rollout transcripts. None when the env has no group reward, the
+    # env class wasn't supplied, or the check was skipped (deps not installed
+    # locally — it runs on the trainer instead). Its index is -1.
+    group_reward: ExampleValidation | None = None
     @property
     def ok(self) -> bool:
-        return all(ex.ok for ex in self.examples)
+        rollouts_ok = all(ex.ok for ex in self.examples)
+        group_ok = self.group_reward is None or self.group_reward.ok
+        return rollouts_ok and group_ok
     def __bool__(self) -> bool:
         return self.ok
@@ -110,6 +118,22 @@ def _file_hash(content: bytes, length: int = 8) -> str:
     return hashlib.sha256(content).hexdigest()[:length]
+class _BearerAuth(httpx.Auth):
+    """Resolve the platform bearer per request via ``token_provider``.
+    Built once but called on every request, so the auth header is never frozen
+    at construction — a rotating/expiring device or act-as token is picked up
+    each call (the "token expires mid-run" bug ``credentials.py`` warns about).
+    """
+    def __init__(self, token_provider: TokenProvider) -> None:
+        self._token_provider = token_provider
+    def auth_flow(self, request: httpx.Request):
+        request.headers["Authorization"] = f"Bearer {self._token_provider()}"
+        yield request
 @dataclass
 class StorageClient:
     """Client for uploading files to storage via pre-signed URLs.
@@ -117,6 +141,10 @@ class StorageClient:
     Uses the ``GET /api/storage/upload-url`` endpoint to obtain a pre-signed
     upload URL, then PUTs the file content directly to that URL.
+    ``api_key`` is optional: when omitted the bearer resolves per request via
+    the credential seam (``ACT_AS_TOKEN_PATH`` / ``PLATFORM_API_KEY``). Pass
+    ``api_key`` to override, or ``token_provider`` for a custom per-call source.
     Example:
         client = StorageClient(api_key="sk_...", base_url="http://localhost:3000")
         result = client.upload_file(
@@ -127,19 +155,22 @@ class StorageClient:
         print(f"Uploaded to {result['blobPath']}")
     """
-    api_key: str
+    api_key: str | None = None
     base_url: str = field(default_factory=config.platform_url)
     timeout: float = 60.0
     # SAS-URL PUTs are bounded by file size, not API latency. Default to
     # 30 minutes so multi-GB datasets don't time out at the platform-API timeout.
     upload_timeout: float = 1800.0
+    token_provider: TokenProvider | None = None
+    _token_provider: TokenProvider = field(init=False, repr=False)
     _http_client: httpx.Client = field(init=False, repr=False)
     def __post_init__(self) -> None:
-        """Initialize HTTP client with auth headers."""
+        """Initialize HTTP client; auth resolves per request, never baked here."""
+        self._token_provider = resolve_token_provider(self.api_key, self.token_provider)
         self._http_client = httpx.Client(
             base_url=self.base_url,
-            headers={"Authorization": f"Bearer {self.api_key}"},
+            auth=_BearerAuth(self._token_provider),
             timeout=self.timeout,
         )
@@ -294,6 +325,10 @@ class StorageClient:
 class TrainerClient:
     """Client for launching and managing training runs.
+    ``api_key`` is optional: when omitted the bearer resolves per request via
+    the credential seam (``ACT_AS_TOKEN_PATH`` / ``PLATFORM_API_KEY``). Pass
+    ``api_key`` to override, or ``token_provider`` for a custom per-call source.
     Example:
         client = TrainerClient(api_key="sk_...", base_url="http://localhost:3000")
         run_id = client.launch_training_run(
@@ -306,16 +341,19 @@ class TrainerClient:
         print(f"Launched: {run_id}")
     """
-    api_key: str
+    api_key: str | None = None
     base_url: str = field(default_factory=config.platform_url)
     timeout: float = 30.0
+    token_provider: TokenProvider | None = None
+    _token_provider: TokenProvider = field(init=False, repr=False)
     _http_client: httpx.Client = field(init=False, repr=False)
     def __post_init__(self) -> None:
-        """Initialize HTTP client with auth headers."""
+        """Initialize HTTP client; auth resolves per request, never baked here."""
+        self._token_provider = resolve_token_provider(self.api_key, self.token_provider)
         self._http_client = httpx.Client(
             base_url=self.base_url,
-            headers={"Authorization": f"Bearer {self.api_key}"},
+            auth=_BearerAuth(self._token_provider),
             timeout=self.timeout,
         )
@@ -716,23 +754,32 @@ class RolloutClient:
        raw file contents; they will be base64-encoded and sent inline.
     Args:
-        api_key:    Platform API key (``sk_``); forwarded as the Bearer token
-                    platform-service validates.
+        api_key:    Platform API key forwarded as the Bearer token
+                    platform-service validates. Optional — when omitted the
+                    bearer resolves per request via the credential seam
+                    (``ACT_AS_TOKEN_PATH`` / ``PLATFORM_API_KEY``).
         server_url: Base URL of platform-service. Defaults to
                     ``config.platform_url()``; the ``/v1/rollout/stream`` path is
                     appended per request.
         timeout:    Per-request timeout in seconds (default 300 — rollouts can be slow).
+        token_provider: Custom per-call bearer source; overrides the seam when
+                    ``api_key`` is unset.
     """
     _TERMINAL = {"rollout_completed", "worker_error", "cancelled", "error"}
     def __init__(
         self,
-        api_key: str,
+        api_key: str | None = None,
         server_url: str | None = None,
         timeout: float = 300.0,
+        *,
+        token_provider: TokenProvider | None = None,
     ) -> None:
-        self._api_key = api_key
+        # Bearer resolves per request (see stream_rollout): explicit api_key →
+        # token_provider → platform_bearer seam. Optional so a logged-in/CI
+        # caller need not pass one.
+        self._token_provider = resolve_token_provider(api_key, token_provider)
         # Resolve at construction time, not import time, so env-var changes
         # take effect (mirrors StorageClient/TrainerClient default_factory pattern).
         # Target platform-service (the API-key gate), not the rollout-service
@@ -834,6 +881,12 @@ class RolloutClient:
             env_metadata_bytes,
         )
+        # Resolve the platform bearer once, per request (never frozen at
+        # construction): a rotating/expiring device or act-as token is picked
+        # up each call. Used for the platform-service header below AND, when the
+        # LLM leg hits the platform's own endpoint, as that leg's key.
+        bearer = self._token_provider()
         # Resolve LLM URL lazily. The platform key is only auto-forwarded when
         # the LLM endpoint is the platform's own LLM service — pointing at a
         # third-party host (Azure OpenAI, Anthropic) requires an explicit
@@ -842,7 +895,7 @@ class RolloutClient:
         resolved_llm_url = llm_base_url or platform_llm_url
         if not llm_api_key:
             if resolved_llm_url == platform_llm_url:
-                llm_api_key = self._api_key
+                llm_api_key = bearer
             else:
                 raise ValueError(
                     "llm_api_key is required when llm_base_url points outside the "
@@ -870,7 +923,7 @@ class RolloutClient:
         # platform-service mounts the proxy at /v1/rollout/stream; it validates
         # the platform key and forwards to rollout-service with an act_as JWT.
         url = f"{self._server_url}/v1/rollout/stream"
-        headers = {"Authorization": f"Bearer {self._api_key}"}
+        headers = {"Authorization": f"Bearer {bearer}"}
         with httpx.stream(
             "POST",
@@ -960,6 +1013,8 @@ class RolloutClient:
         llm_api_key: str = "",
         llm_model: str = _VALIDATION_MODEL,
         max_turns: int = 4,
+        check_group_reward: bool = True,
+        group_reward_samples: int = 2,
         verbose: bool = True,
     ) -> ValidationResult:
         """Run rollouts on the first *n* examples and report pass/fail.
@@ -991,6 +1046,19 @@ class RolloutClient:
                                 ``llm_base_url`` points outside the platform LLM
                                 endpoint (stream_rollout refuses to forward the
                                 platform key to a third-party host).
+            check_group_reward: After the rollouts, run a REAL same-example
+                                group through rollout-service (one example,
+                                samples_per_example=N) so the env's
+                                ``compute_group_reward`` executes server-side in
+                                the trainer image, over co-located siblings —
+                                the trainer/external-eval path. A server-side
+                                failure (raise or contract violation) comes back
+                                as ``group_reward_error`` and fails validation.
+                                Only fires when ``env_class`` is given and the
+                                env overrides the method.
+            group_reward_samples: Size of that group (the batch's
+                                ``samples_per_example``). Costs this many extra
+                                rollouts; ignored unless the group check runs.
             verbose:            Print colored progress to stdout (default True for
                                 interactive/notebook UX). Set False for programmatic
                                 callers that consume the returned ValidationResult.
@@ -1026,6 +1094,18 @@ class RolloutClient:
             env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes
         )
+        # compute_group_reward runs on a whole rollout GROUP, which the
+        # per-example smoke above never forms (each is a group of 1). Run a real
+        # same-example group server-side (run_group, below) so the env method
+        # executes on the trainer's path; the server reports any failure as
+        # group_reward_error. Needs the env_class, and is pointless unless the
+        # env overrides the no-op default.
+        want_group = False
+        if check_group_reward and env_class is not None:
+            from .validation import overrides_compute_group_reward
+            want_group = overrides_compute_group_reward(env_class)
         sample = examples[:n]
         if verbose:
             print(
@@ -1066,7 +1146,55 @@ class RolloutClient:
                     print(_err(f"  Example {i} failed: {exc}"))
                 per_example.append(ExampleValidation(index=i, ok=False, error=str(exc)))
-        result = ValidationResult(examples=per_example)
+        group_reward: ExampleValidation | None = None
+        if want_group and sample and group_reward_samples >= 1:
+            # Faithful check: run a REAL same-example group through
+            # rollout-service (one example, samples_per_example=N) so the env's
+            # compute_group_reward runs server-side in the trainer image, over
+            # co-located siblings — exactly the trainer/external-eval path. A
+            # server-side failure comes back as group_reward_error per rollout.
+            if verbose:
+                print(
+                    _info(
+                        f"\n  Group reward — {group_reward_samples} server-side "
+                        "sibling(s) of example 0"
+                    )
+                )
+            try:
+                events = self.run_group(
+                    sample[0],
+                    samples=group_reward_samples,
+                    env_cls_path=env_cls_path,
+                    env_metadata_path=env_metadata_path,
+                    env_cls_bytes=env_cls_bytes,
+                    env_metadata_bytes=env_metadata_bytes,
+                    llm_base_url=llm_base_url,
+                    llm_api_key=llm_api_key,
+                    llm_model=llm_model,
+                    max_turns=max_turns,
+                    verbose=verbose,
+                )
+                group_reward = self._assess_group_events(
+                    events, group_reward_samples, verbose
+                )
+            except RolloutNotFound:
+                # The batch proxy (/v1/rollout/batch/stream) isn't deployed on
+                # this server yet — skip rather than fail, so the SDK can land
+                # ahead of platform-service. group_reward stays None; the offline
+                # local check still covered shape.
+                if verbose:
+                    print(
+                        _info(
+                            "  compute_group_reward: skipped — server has no "
+                            "/rollout/batch/stream yet"
+                        )
+                    )
+            except (RolloutError, RuntimeError) as exc:
+                if verbose:
+                    print(_err(f"  group reward check failed: {exc}"))
+                group_reward = ExampleValidation(index=-1, ok=False, error=str(exc))
+        result = ValidationResult(examples=per_example, group_reward=group_reward)
         if verbose:
             print()
             if result.ok:
@@ -1079,3 +1207,162 @@ class RolloutClient:
                 )
         return result
+    def run_group(
+        self,
+        example: dict[str, Any],
+        *,
+        samples: int,
+        env_cls_path: str | None = None,
+        env_metadata_path: str | None = None,
+        env_cls_bytes: bytes | None = None,
+        env_metadata_bytes: bytes | None = None,
+        llm_base_url: str | None = None,
+        llm_api_key: str = "",
+        llm_model: str = _VALIDATION_MODEL,
+        max_turns: int = 4,
+        verbose: bool = True,
+    ) -> list[dict[str, Any]]:
+        """Run ONE example as a real ``samples``-member group; return its
+        ``rollout_completed`` events.
+        Submits a one-row batch with ``samples_per_example=samples`` to
+        ``/v1/rollout/batch/stream``. rollout-service co-locates the siblings on
+        one worker and runs ``env.compute_group_reward`` over them — the same
+        path the trainer/external-eval use, in the trainer image. Each event
+        carries ``success``, ``rewards`` and (on a server new enough to report
+        it) ``group_reward_error``. Raises the same typed errors as
+        ``stream_rollout`` on a non-200.
+        """
+        env = self._build_env(
+            env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes
+        )
+        # Resolve the platform bearer once, per request (never frozen at
+        # construction) — used for the request header below AND, when the LLM leg
+        # hits the platform's own endpoint, as that leg's key. Mirrors stream_rollout.
+        bearer = self._token_provider()
+        # The platform key is only auto-forwarded to the platform's own LLM host
+        # (see stream_rollout for the no-leak rationale).
+        platform_llm_url = config.llm_url()
+        resolved_llm_url = llm_base_url or platform_llm_url
+        if not llm_api_key:
+            if resolved_llm_url == platform_llm_url:
+                llm_api_key = bearer
+            else:
+                raise ValueError(
+                    "llm_api_key is required when llm_base_url points outside the "
+                    f"platform LLM endpoint ({platform_llm_url}). Refusing to "
+                    "forward the platform API key to a third-party host."
+                )
+        payload = {
+            "dataset_bytes": base64.b64encode(json.dumps(example).encode()).decode(),
+            "is_dataset_standardized": False,
+            # One example → one group; compute_group_reward needs all siblings
+            # co-located on a single worker anyway. Pin to 1 so rollout-service
+            # doesn't spin up extra workers that get an empty partition and crash.
+            "concurrent_workers": 1,
+            "env": env,
+            "llm": {
+                "base_url": resolved_llm_url,
+                "api_key": llm_api_key,
+                "model": llm_model,
+            },
+            "options": {"max_turns": max_turns, "samples_per_example": samples},
+        }
+        # platform-service mounts the batch proxy at /v1/rollout/batch/stream; it
+        # validates the platform key and forwards to rollout-service with an
+        # act_as JWT, same as the single /v1/rollout/stream proxy.
+        url = f"{self._server_url}/v1/rollout/batch/stream"
+        headers = {"Authorization": f"Bearer {bearer}"}
+        completed: list[dict[str, Any]] = []
+        with httpx.stream(
+            "POST", url, json=payload, headers=headers, timeout=self._timeout
+        ) as response:
+            if response.status_code != 200:
+                body = response.read().decode()
+                if response.status_code in (401, 403):
+                    raise AuthenticationError(body[:300], response.status_code)
+                if response.status_code == 404:
+                    raise RolloutNotFound(body[:300], response.status_code)
+                if 500 <= response.status_code < 600:
+                    raise RolloutServerError(body[:300], response.status_code)
+                raise RolloutError(body[:300], response.status_code)
+            for event in _iter_sse(response):
+                etype = event.get("event")
+                if etype == "batch_started":
+                    if verbose:
+                        print(
+                            _info(
+                                f"  group batch started ({event.get('total')} rollouts)"
+                            )
+                        )
+                elif etype == "rollout_completed":
+                    completed.append(event)
+                elif etype == "worker_error":
+                    # A sandbox process crashed. Non-fatal to the group on its
+                    # own — the verdict comes from the rollout_completed events
+                    # (_assess_group_events fails if none succeeded). Surfaced
+                    # for visibility, not raised.
+                    if verbose:
+                        print(_err(f"  worker_error: {str(event.get('error'))[:200]}"))
+                elif etype == "error":
+                    raise RolloutError(str(event.get("error"))[:300], 500)
+                elif etype in ("batch_completed", "cancelled"):
+                    break
+        return completed
+    def _assess_group_events(
+        self,
+        events: list[dict[str, Any]],
+        samples: int,
+        verbose: bool,
+    ) -> ExampleValidation:
+        """Turn a group's ``rollout_completed`` events into a pass/fail verdict.
+        Fails when the server reported a ``group_reward_error`` for any sibling
+        — compute_group_reward raised or violated its contract (see
+        rollout-service's ``_compute_group_rewards_safe``). Also fails if every
+        group rollout failed (nothing to assess). Index -1.
+        Note: a server that predates ``group_reward_error`` can't report a
+        failure, so a green verdict there means "no failure observed", not
+        "verified" — the offline local check still covers shape regardless.
+        """
+        errors = [
+            e["group_reward_error"] for e in events if e.get("group_reward_error")
+        ]
+        if errors:
+            msg = str(errors[0])
+            if verbose:
+                print(_err(f"  compute_group_reward FAILED server-side: {msg}"))
+            return ExampleValidation(index=-1, ok=False, error=msg)
+        succeeded = [e for e in events if e.get("success")]
+        if not succeeded:
+            first = next(
+                (e.get("error") for e in events if e.get("error")),
+                "no successful group rollouts",
+            )
+            if verbose:
+                print(
+                    _err(
+                        "  group reward not validated — all group rollouts "
+                        f"failed: {first}"
+                    )
+                )
+            return ExampleValidation(
+                index=-1, ok=False, error=f"all group rollouts failed: {first}"
+            )
+        if verbose:
+            print(
+                _ok(
+                    "  compute_group_reward OK server-side on a group of "
+                    f"{len(succeeded)}"
+                )
+            )
+        return ExampleValidation(index=-1, ok=True)

benchmax 0.1.2.dev28__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl

benchmax 0.1.2.dev28__py3-none-any.whl → 0.1.2.dev29__py3-none-any.whl