PyPI - benchmax - Versions diffs - 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev30__py3-none-any.whl - Mend

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

benchmax/cli.py +7 -14
benchmax/config.py +11 -37
benchmax/envs/postgres_search/search_env.py +7 -1
benchmax/envs/reward_helpers.py +9 -2
benchmax/envs/telestich/example.py +28 -39
benchmax/envs/telestich/telestich_env.py +627 -414
benchmax/platform/credentials.py +1 -2
benchmax/platform/login.py +28 -17
benchmax/rag/corpus/chroma/client.py +97 -0
benchmax/rag/corpus/chroma/source.py +35 -5
benchmax/rubrics/rubric.py +101 -26
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/METADATA +1 -1
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/RECORD +17 -17
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/WHEEL +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/entry_points.txt +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/licenses/LICENSE +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev30.dist-info}/top_level.txt +0 -0

benchmax/cli.py CHANGED Viewed

@@ -11,19 +11,19 @@ from __future__ import annotations
 import argparse
 import sys
+from benchmax import config
 from benchmax.platform import credentials
 from benchmax.platform.device_auth import DeviceAuthError
 from benchmax.platform.login import _login
-def _cmd_login(args: argparse.Namespace) -> int:
-    env = "staging" if args.env == "staging" else None
+def _cmd_login(_args: argparse.Namespace) -> int:
     try:
-        _login(env)
+        _login()
     except DeviceAuthError as exc:
         print(f"Login failed: {exc}", file=sys.stderr)
         return 1
-    print(f"\n✓ Logged in to {args.env}.")
+    print(f"\n✓ Logged in to {config.base_domain()}.")
     return 0
@@ -38,19 +38,18 @@ def _cmd_whoami(_args: argparse.Namespace) -> int:
     if not session:
         print("Not logged in. Run `castform login`.", file=sys.stderr)
         return 1
-    env = session.get("env", "prod")
     jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
     if not jwt:
         print(
-            f"Session present (env: {env}), but couldn't reach auth-service to "
-            "verify it (offline, or the session expired). If this persists, run "
+            "Session present, but couldn't reach auth-service to verify it "
+            "(offline, or the session expired). If this persists, run "
             "`castform login` again.",
             file=sys.stderr,
         )
         return 1
     claims = credentials._jwt_claims(jwt)
     who = claims.get("email") or claims.get("sub", "<unknown>")
-    print(f"Logged in as {who} (env: {env}).")
+    print(f"Logged in as {who} ({config.base_domain()}).")
     return 0
@@ -59,12 +58,6 @@ def main(argv: list[str] | None = None) -> int:
     sub = parser.add_subparsers(dest="command", required=True)
     p_login = sub.add_parser("login", help="Sign in via your browser")
-    p_login.add_argument(
-        "--env",
-        choices=["prod", "staging"],
-        default="prod",
-        help="Environment to sign in to (staging is internal-only)",
-    )
     p_login.set_defaults(func=_cmd_login)
     sub.add_parser("logout", help="Clear the cached session").set_defaults(func=_cmd_logout)

benchmax/config.py CHANGED Viewed

@@ -1,8 +1,11 @@
 """Centralized URL configuration for the Castform platform.
-All URLs derive from a single base domain. Set ``CASTFORM_BASE_DOMAIN`` to
-point at a different environment (e.g. ``staging.castform.com``); individual
-URL components may be overridden via their own env vars when needed.
+All URLs derive from a single base domain, resolved from exactly two places: the
+``CASTFORM_BASE_DOMAIN`` env var, or the built-in ``castform.com`` default.
+Individual URLs may be overridden via their own env vars
+(``CASTFORM_PLATFORM_URL`` / ``CASTFORM_LLM_URL`` / ``CASTFORM_AUTH_URL`` /
+``CASTFORM_WEB_APP_URL``) — e.g. point platform at ``http://localhost:3000`` for
+local dev while auth keeps talking to the real host.
 Usage::
@@ -16,38 +19,10 @@ DEFAULT_BASE_DOMAIN = "castform.com"
 def base_domain() -> str:
-    """Resolve the platform base domain.
-    Precedence: explicit ``CASTFORM_BASE_DOMAIN`` → the cached device-auth
-    session's ``env`` (``staging`` → ``castform.dev``) → ``prod`` default
-    (``castform.com``). The ``env`` claim travels with the credential, so a
-    logged-in SDK routes to the same environment it authenticated against —
-    URL and credential can't desync. A prod session carries no ``env`` marker
-    (``None`` → prod), so only internal staging logins deviate from the default.
-    """
-    override = os.environ.get("CASTFORM_BASE_DOMAIN")
-    if override:
-        return override
-    if _session_env() == "staging":
-        return "castform.dev"
-    return DEFAULT_BASE_DOMAIN
-def _session_env() -> str | None:
-    """The ``env`` from the cached device-auth session, if any.
-    Lazy import: ``config`` is a leaf that ``benchmax.platform`` depends on, so a
-    top-level import would cycle (platform/__init__ → client → config)."""
-    try:
-        from benchmax.platform.credentials import read_castform_session
-        session = read_castform_session()
-    except Exception:
-        return None
-    if not session:
-        return None
-    env = session.get("env")
-    return env if isinstance(env, str) else None
+    """Resolve the platform base domain: ``CASTFORM_BASE_DOMAIN`` or the
+    ``castform.com`` default. To target another environment (e.g. internal
+    staging), export ``CASTFORM_BASE_DOMAIN=castform.dev``."""
+    return os.environ.get("CASTFORM_BASE_DOMAIN") or DEFAULT_BASE_DOMAIN
 def platform_url() -> str:
@@ -75,7 +50,6 @@ def auth_url() -> str:
     """Auth-service base URL (device-authorization + JWT mint endpoints).
     Used by ``castform login`` and the per-process session→JWT mint. Derives from
-    the same base domain as everything else, so a session minted against ``staging``
-    talks to ``auth.castform.dev`` and a ``prod`` session to ``auth.castform.com``.
+    the same base domain as everything else, or ``CASTFORM_AUTH_URL`` to override.
     """
     return os.environ.get("CASTFORM_AUTH_URL") or f"https://auth.{base_domain()}"

benchmax/envs/postgres_search/search_env.py CHANGED Viewed

@@ -285,8 +285,14 @@ tags. Cite your sources inline using [Source: <source_id>] next to each claim.
             if not text.strip():
                 return zeros
-            t = task or {}
+            # No final <answer> block → no answer to score. Return all-zero
+            # rewards so conciseness / citations / efficiency can't accrue
+            # from reasoning or tool-call text alone.
             answer = extract_answer_block(text)
+            if not answer:
+                return zeros
+            t = task or {}
             prompt = str(t.get("question") or t.get("prompt") or "")
             gt_str = str(t.get("ground_truth") or "")
             reference_chunks = t.get("reference_chunks", [])

benchmax/envs/reward_helpers.py CHANGED Viewed

@@ -82,9 +82,16 @@ def extract_completion_text(completion: str | list[dict[str, Any]]) -> str:
 def extract_answer_block(text: str) -> str:
-    """Extract content from <answer> tags, or return full text."""
+    """Extract content from ``<answer>`` tags.
+    Returns the (stripped) tag contents when an ``<answer>…</answer>`` block
+    is present, otherwise ``""``. A missing answer block is treated as "no
+    final answer" rather than silently falling back to the full completion —
+    consumers can gate rewards on a non-empty result. ``<answer></answer>``
+    likewise yields ``""``.
+    """
     match = _ANSWER_TAG_RE.search(text or "")
-    return (match.group(1) if match else text).strip()
+    return match.group(1).strip() if match else ""
 def clip01(value: Any) -> float:

benchmax/envs/telestich/example.py CHANGED Viewed

@@ -17,8 +17,8 @@ env's word-list / rhyme dependencies):
 (``CASTFORM_LLM_API_KEY`` is optional — it defaults to ``CASTFORM_API_KEY``.)
-By default this is a 2-example smoke run. Set ``TELESTICH_FULL_RUN=1`` to launch
-a real run on the full seed dataset (~90/10 train/eval split).
+This launches a real training run on the full committed seed dataset
+(~90/10 train/eval split).
 """
 import asyncio
@@ -42,7 +42,7 @@ from benchmax.rubrics import rubric as rubric_mod
 #
 # Defaults route through ``benchmax.config``: the prod LLM endpoint is
 # ``https://llm.castform.com/v1`` and the platform control plane is
-# ``https://api.castform.com``. Point at staging or a different env by setting
+# ``https://api.castform.com``. Point at a different environment by setting
 # ``CASTFORM_BASE_DOMAIN`` (or override URLs individually via
 # ``CASTFORM_PLATFORM_URL`` / ``CASTFORM_LLM_URL``).
 from benchmax import config
@@ -59,6 +59,10 @@ EXPERIMENT_PREFIX = "telestich"
 DATASET_PATH = str(Path(__file__).parent / "telestich_dataset.jsonl")
 NUM_EXAMPLES = 400
 CONCURRENCY = 15
+# Trainer model — the launch `model` arg selects the trainer YAML (and thus the GPU
+# pool) server-side. Supported: "Qwen/Qwen3.5-4B" (gpu4) or "Qwen/Qwen3.5-35B-A3B"
+# (gpu8). Override via TELESTICH_MODEL.
+MODEL = os.environ.get("TELESTICH_MODEL", "Qwen/Qwen3.5-4B")
 # (model, weight). Weights reflect observed reliability on our checks:
 # - Both grok models leak banned example words and rubber-stamp the CoT self-check.
@@ -552,7 +556,6 @@ def get_dataset():
 # alongside the pickle so a UI can show "what code is in this env" without
 # unpickling.
 if __name__ == "__main__":
-    import tempfile
     import uuid
     from benchmax.platform.client import TrainerClient
@@ -565,42 +568,25 @@ if __name__ == "__main__":
     print(f"Platform URL: {BASE_URL}")
     print(f"LLM URL:      {LLM_BASE_URL}\n")
-    # 1. Build the dataset.
-    #    Full run (TELESTICH_FULL_RUN=1): the committed seed dataset, topped up
-    #    to NUM_EXAMPLES via the platform LLM if short, split ~90/10 train/eval.
-    #    Default: a 2-example smoke that just exercises gen -> bundle -> upload
-    #    -> launch (and the key-less judge path), not a real training job.
-    full_run = bool(os.environ.get("TELESTICH_FULL_RUN"))
-    if full_run:
-        examples = get_dataset()
-        if len(examples) < 2:
-            raise SystemExit(f"Need >=2 examples for a full run, got {len(examples)}.")
-        # Hold out a representative eval set at random; keep TRAIN in curriculum
-        # order (simpler first) so the difficulty ramp is preserved.
-        n_eval = max(1, len(examples) // 10)
-        eval_idx = set(random.sample(range(len(examples)), n_eval))
-        eval_data = [e for i, e in enumerate(examples) if i in eval_idx]
-        train_data = [e for i, e in enumerate(examples) if i not in eval_idx]
-        print(f"Full run: {len(train_data)} train (curriculum order) / {len(eval_data)} eval.\n")
-    else:
-        with tempfile.TemporaryDirectory() as tmp:
-            gen_path = Path(tmp) / "gen.jsonl"
-            print(f"Generating 2 examples via {LLM_BASE_URL} ...")
-            asyncio.run(generate_dataset(n=2, path=str(gen_path), concurrency=2))
-            examples = load_dataset(str(gen_path))
-        if len(examples) < 2:
-            raise SystemExit(f"Needed 2 examples, only got {len(examples)}.")
-        train_data, eval_data = examples[:1], examples[1:2]
-        print(f"Smoke run: generated {len(examples)} examples — 1 train, 1 eval.\n")
+    # 1. Build the dataset from the committed seed file (curriculum order). Hold out a
+    #    representative eval set at random; keep TRAIN in curriculum order (simpler first)
+    #    so the difficulty ramp is preserved.
+    examples = get_dataset()
+    if len(examples) < 2:
+        raise SystemExit(f"Need >=2 examples, got {len(examples)}.")
+    n_eval = max(1, len(examples) // 10)
+    eval_idx = set(random.sample(range(len(examples)), n_eval))
+    eval_data = [e for i, e in enumerate(examples) if i in eval_idx]
+    train_data = [e for i, e in enumerate(examples) if i not in eval_idx]
+    print(f"{len(train_data)} train (curriculum order) / {len(eval_data)} eval.\n")
     # 2. Bundle the env class and upload everything to platform storage.
     # Bundle config, defined once so the pre-flight validation below exercises
     # the EXACT same env_args / by-value modules / deps as the launch.
     #  - local_modules: ship env + rubric by value (the platform's installed
     #    benchmax may not contain this version of these modules).
-    #  - judge_api_key="": satisfies the constructor without leaking a key; the
-    #    judge resolves its bearer at runtime via the platform act-as seam.
-    constructor_args = {"judge_base_url": LLM_BASE_URL, "judge_api_key": ""}
+    #  - judge bearer resolves at runtime via the device-auth / platform seam.
+    constructor_args = {"judge_base_url": LLM_BASE_URL}
     local_modules = [telestich_env_mod, rubric_mod]
     # All three are still required (is_valid_word → correctness; pronouncing →
     # rhyme). Removing word_bank did NOT free any of them.
@@ -623,10 +609,12 @@ if __name__ == "__main__":
         llm_api_key="",
         remote_examples=2,
     ):
-        raise SystemExit("Env validation failed — aborting before launch (see output above).")
+        raise SystemExit(
+            "Env validation failed — aborting before launch (see output above)."
+        )
     # 3. Bundle the env class and upload everything to platform storage.
-    run_name = f"telestich-{'full' if full_run else 'example'}-{uuid.uuid4().hex[:8]}"
+    run_name = f"telestich-full-{uuid.uuid4().hex[:8]}"
     print(f"\nUploading bundle + datasets as {run_name!r} ...")
     uploaded = upload_training_run(
         env_class=TelestichEnv,
@@ -647,8 +635,9 @@ if __name__ == "__main__":
     ):
         print(f"  {label:<14}: {path}")
-    # 4. Launch the training run. ``simple`` is the deployed 4B/gpu4 template.
-    print("\nLaunching training run ...")
+    # 4. Launch the training run. training_run_type="simple" + the `model` arg select
+    #    the trainer YAML/pool server-side (Qwen3.5-4B→gpu4, Qwen3.5-35B-A3B→gpu8).
+    print(f"\nLaunching training run (model={MODEL}) ...")
     with TrainerClient(api_key=API_KEY, base_url=BASE_URL) as trainer:
         run_id = trainer.launch_training_run(
             training_run_type="simple",
@@ -661,7 +650,7 @@ if __name__ == "__main__":
             # max_response_len 3000: a brief reason + 1-2 tool rounds + poem fits well
             # under this; lowered from 4000 to cut off in-head enumeration rambles
             # sooner (they truncate to a 0-reward anyway).
-            launcher_args={"max_response_len": 3000, "num_epochs": 10},
+            launcher_args={"model": MODEL, "max_response_len": 3000, "num_epochs": 10},
         )
     print(f"\n✓ Launched run_id={run_id}")

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev30__py3-none-any.whl

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev30__py3-none-any.whl