PyPI - benchmax - Versions diffs - 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev31__py3-none-any.whl - Mend

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

benchmax/bundle.py +74 -0
benchmax/cli.py +7 -14
benchmax/config.py +11 -37
benchmax/envs/postgres_search/search_env.py +7 -1
benchmax/envs/reward_helpers.py +9 -2
benchmax/envs/telestich/example.py +44 -48
benchmax/envs/telestich/telestich_env.py +627 -414
benchmax/platform/client.py +6 -2
benchmax/platform/credentials.py +1 -2
benchmax/platform/login.py +28 -17
benchmax/platform/validation.py +43 -1
benchmax/rag/corpus/chroma/client.py +97 -0
benchmax/rag/corpus/chroma/source.py +35 -5
benchmax/rag/corpus/pinecone/index_client.py +78 -5
benchmax/rag/corpus/pinecone/search.py +5 -0
benchmax/rag/corpus/pinecone/source.py +52 -26
benchmax/rag/corpus/turbopuffer/namespace.py +21 -0
benchmax/rag/corpus/turbopuffer/search.py +15 -3
benchmax/rag/corpus/turbopuffer/source.py +14 -8
benchmax/rubrics/rubric.py +101 -26
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/METADATA +1 -1
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/RECORD +26 -26
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/WHEEL +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/entry_points.txt +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/licenses/LICENSE +0 -0
{benchmax-0.1.2.dev29.dist-info → benchmax-0.1.2.dev31.dist-info}/top_level.txt +0 -0

benchmax/bundle.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import importlib
 import inspect
 import io
 import json
@@ -76,6 +77,7 @@ def dump_bundle(
     pip_dependencies: list[str] | None = None,
     local_modules: list[ModuleType] | None = None,
     env_class_source: str | None = None,
+    auto_local_modules: bool = True,
 ) -> Bundle:
     """Pickle ``(env_class, constructor_args)`` and stamp metadata.
@@ -90,6 +92,10 @@ def dump_bundle(
             recover it — e.g. a class produced by ``exec()`` into an in-memory
             namespace, which has no source file on disk. When ``None``
             (default), source is introspected from ``env_class``.
+        auto_local_modules: When True (default), any local module the pickle
+            references but that wasn't passed in ``local_modules`` is imported
+            and pickled by value automatically (a warning names them). When
+            False, such a reference raises ``BundlingError`` instead.
     Raises:
         BundlingError: bad env_class, cloudpickle failure, or pickle references
@@ -124,6 +130,46 @@ def dump_bundle(
                 except Exception:
                     pass
+    if auto_local_modules and _unregistered_local_refs(pickled):
+        # Import each referenced local module and re-dump with it pickled by
+        # value. Loop because a by-value module can surface further local refs;
+        # registrations accumulate (and are torn down once at the end) so an
+        # earlier module stays by-value while we resolve the ones it pulled in.
+        seen: set[str] = {m.__name__ for m in local_modules}
+        registered: list[ModuleType] = []
+        with _BUNDLE_LOCK:
+            try:
+                for _ in range(10):
+                    pending = [
+                        m for m in _unregistered_local_refs(pickled) if m not in seen
+                    ]
+                    if not pending:
+                        break
+                    new_mods: list[ModuleType] = []
+                    for name in pending:
+                        seen.add(name)  # unimportable names fall through to the guard
+                        try:
+                            new_mods.append(importlib.import_module(name))
+                        except Exception:
+                            pass
+                    if not new_mods:
+                        break
+                    logger.warning(
+                        "[bundle] %s: auto-bundling local module(s): %s ",
+                        env_class.__name__,
+                        ", ".join(sorted(m.__name__ for m in new_mods)),
+                    )
+                    for mod in new_mods:
+                        cloudpickle.register_pickle_by_value(mod)
+                        registered.append(mod)
+                    pickled = cloudpickle.dumps((env_class, constructor_args))
+            finally:
+                for mod in registered:
+                    try:
+                        cloudpickle.unregister_pickle_by_value(mod)
+                    except Exception:
+                        pass
     risky = _unregistered_local_refs(pickled)
     if risky:
         msg = (
@@ -259,6 +305,15 @@ def _referenced_modules(pickled: bytes) -> set[str]:
     # Hooks find_class so we see every (module, name) the unpickler would import —
     # i.e. exactly what'd raise ModuleNotFoundError on a fresh interpreter. The stub
     # lets unpickling proceed past missing classes so we collect every ref.
+    #
+    # find_class alone has a blind spot: a bare ``import foo`` that leaves a
+    # module *object* in the env's globals is pickled as
+    # ``cloudpickle.subimport("foo")`` — the module name is a REDUCE argument,
+    # not a find_class path, so we'd only see ``cloudpickle.cloudpickle`` (which
+    # looks installed) and miss ``foo``. We shim subimport to record its arg and
+    # return a stub instead of importing, so a missing module is captured rather
+    # than aborting the whole load early. (``dynamic_subimport`` is by-value /
+    # self-contained — leave it to the real find_class so we don't flag it.)
     refs: set[str] = set()
     class _Stub:
@@ -271,9 +326,28 @@ def _referenced_modules(pickled: bytes) -> set[str]:
         def __reduce__(self) -> tuple:
             return (type(self), ())
+    def _recording_subimport(name: str, *a: Any, **kw: Any) -> ModuleType:
+        refs.add(name)
+        return ModuleType(str(name))
+    def _noop_setstate(obj: Any, *a: Any, **kw: Any) -> Any:
+        # cloudpickle's _make_skeleton_class resolves the class_tracker_id back
+        # to the *live* class (it was tracked when env_class was dumped), so the
+        # real ``_class_setstate``/``_function_setstate`` would setattr the
+        # reconstructed (stub-globals) members onto the live class/function —
+        # mutating the caller's class mid-bundle and poisoning any later dump.
+        # We only need the refs from ``state``, which are already recorded while
+        # it's unpickled; the setter itself is a no-op here.
+        return obj
     class _Recorder(pickle.Unpickler):
         def find_class(self, module: str, name: str) -> Any:
             refs.add(module)
+            if module.startswith("cloudpickle"):
+                if name == "subimport":
+                    return _recording_subimport
+                if name in ("_class_setstate", "_function_setstate"):
+                    return _noop_setstate
             try:
                 return super().find_class(module, name)
             except Exception:

benchmax/cli.py CHANGED Viewed

@@ -11,19 +11,19 @@ from __future__ import annotations
 import argparse
 import sys
+from benchmax import config
 from benchmax.platform import credentials
 from benchmax.platform.device_auth import DeviceAuthError
 from benchmax.platform.login import _login
-def _cmd_login(args: argparse.Namespace) -> int:
-    env = "staging" if args.env == "staging" else None
+def _cmd_login(_args: argparse.Namespace) -> int:
     try:
-        _login(env)
+        _login()
     except DeviceAuthError as exc:
         print(f"Login failed: {exc}", file=sys.stderr)
         return 1
-    print(f"\n✓ Logged in to {args.env}.")
+    print(f"\n✓ Logged in to {config.base_domain()}.")
     return 0
@@ -38,19 +38,18 @@ def _cmd_whoami(_args: argparse.Namespace) -> int:
     if not session:
         print("Not logged in. Run `castform login`.", file=sys.stderr)
         return 1
-    env = session.get("env", "prod")
     jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
     if not jwt:
         print(
-            f"Session present (env: {env}), but couldn't reach auth-service to "
-            "verify it (offline, or the session expired). If this persists, run "
+            "Session present, but couldn't reach auth-service to verify it "
+            "(offline, or the session expired). If this persists, run "
             "`castform login` again.",
             file=sys.stderr,
         )
         return 1
     claims = credentials._jwt_claims(jwt)
     who = claims.get("email") or claims.get("sub", "<unknown>")
-    print(f"Logged in as {who} (env: {env}).")
+    print(f"Logged in as {who} ({config.base_domain()}).")
     return 0
@@ -59,12 +58,6 @@ def main(argv: list[str] | None = None) -> int:
     sub = parser.add_subparsers(dest="command", required=True)
     p_login = sub.add_parser("login", help="Sign in via your browser")
-    p_login.add_argument(
-        "--env",
-        choices=["prod", "staging"],
-        default="prod",
-        help="Environment to sign in to (staging is internal-only)",
-    )
     p_login.set_defaults(func=_cmd_login)
     sub.add_parser("logout", help="Clear the cached session").set_defaults(func=_cmd_logout)

benchmax/config.py CHANGED Viewed

@@ -1,8 +1,11 @@
 """Centralized URL configuration for the Castform platform.
-All URLs derive from a single base domain. Set ``CASTFORM_BASE_DOMAIN`` to
-point at a different environment (e.g. ``staging.castform.com``); individual
-URL components may be overridden via their own env vars when needed.
+All URLs derive from a single base domain, resolved from exactly two places: the
+``CASTFORM_BASE_DOMAIN`` env var, or the built-in ``castform.com`` default.
+Individual URLs may be overridden via their own env vars
+(``CASTFORM_PLATFORM_URL`` / ``CASTFORM_LLM_URL`` / ``CASTFORM_AUTH_URL`` /
+``CASTFORM_WEB_APP_URL``) — e.g. point platform at ``http://localhost:3000`` for
+local dev while auth keeps talking to the real host.
 Usage::
@@ -16,38 +19,10 @@ DEFAULT_BASE_DOMAIN = "castform.com"
 def base_domain() -> str:
-    """Resolve the platform base domain.
-    Precedence: explicit ``CASTFORM_BASE_DOMAIN`` → the cached device-auth
-    session's ``env`` (``staging`` → ``castform.dev``) → ``prod`` default
-    (``castform.com``). The ``env`` claim travels with the credential, so a
-    logged-in SDK routes to the same environment it authenticated against —
-    URL and credential can't desync. A prod session carries no ``env`` marker
-    (``None`` → prod), so only internal staging logins deviate from the default.
-    """
-    override = os.environ.get("CASTFORM_BASE_DOMAIN")
-    if override:
-        return override
-    if _session_env() == "staging":
-        return "castform.dev"
-    return DEFAULT_BASE_DOMAIN
-def _session_env() -> str | None:
-    """The ``env`` from the cached device-auth session, if any.
-    Lazy import: ``config`` is a leaf that ``benchmax.platform`` depends on, so a
-    top-level import would cycle (platform/__init__ → client → config)."""
-    try:
-        from benchmax.platform.credentials import read_castform_session
-        session = read_castform_session()
-    except Exception:
-        return None
-    if not session:
-        return None
-    env = session.get("env")
-    return env if isinstance(env, str) else None
+    """Resolve the platform base domain: ``CASTFORM_BASE_DOMAIN`` or the
+    ``castform.com`` default. To target another environment (e.g. internal
+    staging), export ``CASTFORM_BASE_DOMAIN=castform.dev``."""
+    return os.environ.get("CASTFORM_BASE_DOMAIN") or DEFAULT_BASE_DOMAIN
 def platform_url() -> str:
@@ -75,7 +50,6 @@ def auth_url() -> str:
     """Auth-service base URL (device-authorization + JWT mint endpoints).
     Used by ``castform login`` and the per-process session→JWT mint. Derives from
-    the same base domain as everything else, so a session minted against ``staging``
-    talks to ``auth.castform.dev`` and a ``prod`` session to ``auth.castform.com``.
+    the same base domain as everything else, or ``CASTFORM_AUTH_URL`` to override.
     """
     return os.environ.get("CASTFORM_AUTH_URL") or f"https://auth.{base_domain()}"

benchmax/envs/postgres_search/search_env.py CHANGED Viewed

@@ -285,8 +285,14 @@ tags. Cite your sources inline using [Source: <source_id>] next to each claim.
             if not text.strip():
                 return zeros
-            t = task or {}
+            # No final <answer> block → no answer to score. Return all-zero
+            # rewards so conciseness / citations / efficiency can't accrue
+            # from reasoning or tool-call text alone.
             answer = extract_answer_block(text)
+            if not answer:
+                return zeros
+            t = task or {}
             prompt = str(t.get("question") or t.get("prompt") or "")
             gt_str = str(t.get("ground_truth") or "")
             reference_chunks = t.get("reference_chunks", [])

benchmax/envs/reward_helpers.py CHANGED Viewed

@@ -82,9 +82,16 @@ def extract_completion_text(completion: str | list[dict[str, Any]]) -> str:
 def extract_answer_block(text: str) -> str:
-    """Extract content from <answer> tags, or return full text."""
+    """Extract content from ``<answer>`` tags.
+    Returns the (stripped) tag contents when an ``<answer>…</answer>`` block
+    is present, otherwise ``""``. A missing answer block is treated as "no
+    final answer" rather than silently falling back to the full completion —
+    consumers can gate rewards on a non-empty result. ``<answer></answer>``
+    likewise yields ``""``.
+    """
     match = _ANSWER_TAG_RE.search(text or "")
-    return (match.group(1) if match else text).strip()
+    return match.group(1).strip() if match else ""
 def clip01(value: Any) -> float:

benchmax/envs/telestich/example.py CHANGED Viewed

@@ -12,13 +12,15 @@ Run it from the benchmax project root (the ``telestich`` extra pulls in the
 env's word-list / rhyme dependencies):
     cd core/benchmax
-    CASTFORM_API_KEY=sk_... \
-        uv run --extra telestich python -m benchmax.envs.telestich.example
+    uv run --extra telestich python -m benchmax.envs.telestich.example
-(``CASTFORM_LLM_API_KEY`` is optional — it defaults to ``CASTFORM_API_KEY``.)
+Auth is the device-auth session (``ensure_session()`` opens a browser login if
+``~/.castform`` has no valid session) — no API key needed. ``CASTFORM_API_KEY``
+/ ``CASTFORM_LLM_API_KEY`` are only consulted by the offline dataset-generation
+helpers, not the launch path.
-By default this is a 2-example smoke run. Set ``TELESTICH_FULL_RUN=1`` to launch
-a real run on the full seed dataset (~90/10 train/eval split).
+This launches a real training run on the full committed seed dataset
+(~90/10 train/eval split).
 """
 import asyncio
@@ -42,7 +44,7 @@ from benchmax.rubrics import rubric as rubric_mod
 #
 # Defaults route through ``benchmax.config``: the prod LLM endpoint is
 # ``https://llm.castform.com/v1`` and the platform control plane is
-# ``https://api.castform.com``. Point at staging or a different env by setting
+# ``https://api.castform.com``. Point at a different environment by setting
 # ``CASTFORM_BASE_DOMAIN`` (or override URLs individually via
 # ``CASTFORM_PLATFORM_URL`` / ``CASTFORM_LLM_URL``).
 from benchmax import config
@@ -59,6 +61,12 @@ EXPERIMENT_PREFIX = "telestich"
 DATASET_PATH = str(Path(__file__).parent / "telestich_dataset.jsonl")
 NUM_EXAMPLES = 400
 CONCURRENCY = 15
+# Trainer model — the launch `model` arg selects the trainer YAML (and thus the GPU
+# pool) server-side. Supported: "Qwen/Qwen3.5-4B" (gpu4) or "Qwen/Qwen3.5-35B-A3B"
+# (gpu8). Override via TELESTICH_MODEL.
+MODEL = os.environ.get("TELESTICH_MODEL", "Qwen/Qwen3.5-4B")
+# Run name — defaults to a unique telestich-full-<uuid>. Override via TELESTICH_RUN_NAME.
+RUN_NAME = os.environ.get("TELESTICH_RUN_NAME", "")
 # (model, weight). Weights reflect observed reliability on our checks:
 # - Both grok models leak banned example words and rubber-stamp the CoT self-check.
@@ -552,55 +560,40 @@ def get_dataset():
 # alongside the pickle so a UI can show "what code is in this env" without
 # unpickling.
 if __name__ == "__main__":
-    import tempfile
     import uuid
+    from benchmax.platform import ensure_session
     from benchmax.platform.client import TrainerClient
     from benchmax.platform.training_run import upload_training_run
     from benchmax.platform.validation import validate_env
-    if not API_KEY:
-        raise SystemExit("Set CASTFORM_API_KEY before running this example.")
+    # Device-auth session bootstrap: browser login if no credential resolves.
+    # After this the platform bearer comes from ~/.castform — no API key needed,
+    # so we pass api_key="" to the platform calls below (resolves via the seam).
+    ensure_session()
     print(f"Platform URL: {BASE_URL}")
     print(f"LLM URL:      {LLM_BASE_URL}\n")
-    # 1. Build the dataset.
-    #    Full run (TELESTICH_FULL_RUN=1): the committed seed dataset, topped up
-    #    to NUM_EXAMPLES via the platform LLM if short, split ~90/10 train/eval.
-    #    Default: a 2-example smoke that just exercises gen -> bundle -> upload
-    #    -> launch (and the key-less judge path), not a real training job.
-    full_run = bool(os.environ.get("TELESTICH_FULL_RUN"))
-    if full_run:
-        examples = get_dataset()
-        if len(examples) < 2:
-            raise SystemExit(f"Need >=2 examples for a full run, got {len(examples)}.")
-        # Hold out a representative eval set at random; keep TRAIN in curriculum
-        # order (simpler first) so the difficulty ramp is preserved.
-        n_eval = max(1, len(examples) // 10)
-        eval_idx = set(random.sample(range(len(examples)), n_eval))
-        eval_data = [e for i, e in enumerate(examples) if i in eval_idx]
-        train_data = [e for i, e in enumerate(examples) if i not in eval_idx]
-        print(f"Full run: {len(train_data)} train (curriculum order) / {len(eval_data)} eval.\n")
-    else:
-        with tempfile.TemporaryDirectory() as tmp:
-            gen_path = Path(tmp) / "gen.jsonl"
-            print(f"Generating 2 examples via {LLM_BASE_URL} ...")
-            asyncio.run(generate_dataset(n=2, path=str(gen_path), concurrency=2))
-            examples = load_dataset(str(gen_path))
-        if len(examples) < 2:
-            raise SystemExit(f"Needed 2 examples, only got {len(examples)}.")
-        train_data, eval_data = examples[:1], examples[1:2]
-        print(f"Smoke run: generated {len(examples)} examples — 1 train, 1 eval.\n")
+    # 1. Build the dataset from the committed seed file (curriculum order). Hold out a
+    #    representative eval set at random; keep TRAIN in curriculum order (simpler first)
+    #    so the difficulty ramp is preserved.
+    examples = get_dataset()
+    if len(examples) < 2:
+        raise SystemExit(f"Need >=2 examples, got {len(examples)}.")
+    n_eval = max(1, len(examples) // 10)
+    eval_idx = set(random.sample(range(len(examples)), n_eval))
+    eval_data = [e for i, e in enumerate(examples) if i in eval_idx]
+    train_data = [e for i, e in enumerate(examples) if i not in eval_idx]
+    print(f"{len(train_data)} train (curriculum order) / {len(eval_data)} eval.\n")
     # 2. Bundle the env class and upload everything to platform storage.
     # Bundle config, defined once so the pre-flight validation below exercises
     # the EXACT same env_args / by-value modules / deps as the launch.
     #  - local_modules: ship env + rubric by value (the platform's installed
     #    benchmax may not contain this version of these modules).
-    #  - judge_api_key="": satisfies the constructor without leaking a key; the
-    #    judge resolves its bearer at runtime via the platform act-as seam.
-    constructor_args = {"judge_base_url": LLM_BASE_URL, "judge_api_key": ""}
+    #  - judge bearer resolves at runtime via the device-auth / platform seam.
+    constructor_args = {"judge_base_url": LLM_BASE_URL}
     local_modules = [telestich_env_mod, rubric_mod]
     # All three are still required (is_valid_word → correctness; pronouncing →
     # rhyme). Removing word_bank did NOT free any of them.
@@ -617,23 +610,25 @@ if __name__ == "__main__":
         eval_dataset=eval_data[:2],
         local_modules=local_modules,
         pip_dependencies=pip_dependencies,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         llm_base_url=LLM_BASE_URL,
         llm_api_key="",
         remote_examples=2,
     ):
-        raise SystemExit("Env validation failed — aborting before launch (see output above).")
+        raise SystemExit(
+            "Env validation failed — aborting before launch (see output above)."
+        )
     # 3. Bundle the env class and upload everything to platform storage.
-    run_name = f"telestich-{'full' if full_run else 'example'}-{uuid.uuid4().hex[:8]}"
+    run_name = RUN_NAME or f"telestich-full-{uuid.uuid4().hex[:8]}"
     print(f"\nUploading bundle + datasets as {run_name!r} ...")
     uploaded = upload_training_run(
         env_class=TelestichEnv,
         train_dataset=train_data,
         eval_dataset=eval_data,
         run_name=run_name,
-        api_key=API_KEY,
+        api_key="",  # session bearer via ensure_session()
         base_url=BASE_URL,
         local_modules=local_modules,
         constructor_args=constructor_args,
@@ -647,9 +642,10 @@ if __name__ == "__main__":
     ):
         print(f"  {label:<14}: {path}")
-    # 4. Launch the training run. ``simple`` is the deployed 4B/gpu4 template.
-    print("\nLaunching training run ...")
-    with TrainerClient(api_key=API_KEY, base_url=BASE_URL) as trainer:
+    # 4. Launch the training run. training_run_type="simple" + the `model` arg select
+    #    the trainer YAML/pool server-side (Qwen3.5-4B→gpu4, Qwen3.5-35B-A3B→gpu8).
+    print(f"\nLaunching training run (model={MODEL}) ...")
+    with TrainerClient(api_key="", base_url=BASE_URL) as trainer:
         run_id = trainer.launch_training_run(
             training_run_type="simple",
             env_cls_path=uploaded.env_cls_path,
@@ -658,10 +654,10 @@ if __name__ == "__main__":
             eval_dataset_path=uploaded.eval_dataset_path,
             name=run_name,
             # num_epochs: passes over the train set (platform default is 5).
-            # max_response_len 3000: a brief reason + 1-2 tool rounds + poem fits well
+            # max_rollout_len 3000: a brief reason + 1-2 tool rounds + poem fits well
             # under this; lowered from 4000 to cut off in-head enumeration rambles
             # sooner (they truncate to a 0-reward anyway).
-            launcher_args={"max_response_len": 3000, "num_epochs": 10},
+            launcher_args={"model": MODEL, "max_rollout_len": 3000, "num_epochs": 10},
         )
     print(f"\n✓ Launched run_id={run_id}")

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev31__py3-none-any.whl

benchmax 0.1.2.dev29__py3-none-any.whl → 0.1.2.dev31__py3-none-any.whl