PyPI - benchmax - Versions diffs - 0.1.2.dev33__py3-none-any.whl → 0.1.2.dev35__py3-none-any.whl - Mend

benchmax 0.1.2.dev33__py3-none-any.whl → 0.1.2.dev35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

benchmax/cli/__init__.py +71 -0
benchmax/{cli.py → cli/_auth.py} +16 -22
benchmax/cli/_client.py +49 -0
benchmax/cli/_output.py +134 -0
benchmax/cli/_project.py +138 -0
benchmax/cli/_providers.py +60 -0
benchmax/cli/control.py +28 -0
benchmax/cli/corpus.py +230 -0
benchmax/cli/data.py +441 -0
benchmax/cli/help.py +233 -0
benchmax/cli/launch.py +241 -0
benchmax/cli/runs.py +187 -0
benchmax/cli/scaffold/CLAUDE.md +132 -0
benchmax/cli/scaffold/STARTER.md +93 -0
benchmax/cli/scaffold/__init__.py +0 -0
benchmax/cli/scaffold/rag_run.py +72 -0
benchmax/cli/scaffold/skills/design-environment/SKILL.md +327 -0
benchmax/cli/scaffold/skills/generate-data/SKILL.md +192 -0
benchmax/cli/scaffold/skills/launch-run/SKILL.md +68 -0
benchmax/cli/scaffold/skills/verify-environment/SKILL.md +199 -0
benchmax/cli/scaffold/skills/view-progress/SKILL.md +63 -0
benchmax/cli/setup.py +286 -0
benchmax/cli/validate.py +448 -0
benchmax/envs/postgres_search/search_env.py +14 -3
benchmax/envs/telestich/example.py +2 -3
benchmax/platform/client.py +117 -9
benchmax/platform/training_run.py +0 -1
benchmax/platform/validation.py +12 -1
benchmax/rag/corpus/embed.py +54 -0
benchmax/rag/corpus/postgres/client.py +237 -12
benchmax/rag/corpus/postgres/exceptions.py +2 -2
benchmax/rag/corpus/postgres/source.py +93 -26
benchmax/rag/qa_generation/batch_processor.py +138 -12
benchmax/rag/qa_generation/filters/grounding_llm.py +117 -34
benchmax/rag/qa_generation/filters/hop_count_validity.py +116 -31
benchmax/rag/qa_generation/filters/retrieval_llm.py +131 -44
benchmax/rag/qa_generation/generators/direct_llm.py +123 -43
benchmax/rag/qa_generation/metadata_linker.py +179 -10
benchmax/rag/qa_generation/pipeline.py +297 -205
benchmax/rag/qa_generation/pipeline_config.py +89 -0
benchmax/rag/qa_generation/search_agent_linker.py +59 -6
benchmax/rag/qa_generation/wiki_chunk_linker.py +34 -6
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/METADATA +4 -2
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/RECORD +48 -25
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/WHEEL +0 -0
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/entry_points.txt +0 -0
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/licenses/LICENSE +0 -0
{benchmax-0.1.2.dev33.dist-info → benchmax-0.1.2.dev35.dist-info}/top_level.txt +0 -0

benchmax/cli/__init__.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""``castform`` CLI — a single argparse tree assembled from command groups.
+Each command group lives in its own module exposing ``register(sub)``;
+``build_parser`` wires them onto the top-level subparsers and ``main`` dispatches
+to the selected handler's ``func``. Bundled with the benchmax SDK — entry point
+``benchmax.cli:main`` (``pyproject.toml``). Argparse (not typer) is deliberate:
+bundled packaging means a CLI dep would land in the training-engine closure; see
+``docs/plans/castform-cli-rl-workflow.md`` slice 1.1.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from benchmax.cli import (
+    _auth,
+    control,
+    corpus,
+    data,
+    help,
+    launch,
+    runs,
+    setup,
+    validate,
+)
+# Re-export auth handlers — tests/unit/test_cli.py imports them as cli._cmd_*.
+from benchmax.cli._auth import _cmd_login, _cmd_logout, _cmd_whoami
+__all__ = ["build_parser", "main", "_cmd_login", "_cmd_logout", "_cmd_whoami"]
+def build_parser() -> argparse.ArgumentParser:
+    """Assemble the complete ``castform`` argument parser.
+    Each command-group module attaches its own subcommands via ``register``;
+    ``guide`` and ``help`` are wired here directly. Tests snapshot the result's
+    ``format_help()``, so keep the registration order stable.
+    """
+    parser = argparse.ArgumentParser(prog="castform", description="Castform CLI")
+    sub = parser.add_subparsers(dest="command", required=True, metavar="<command>")
+    # Groups are registered in this exact order.
+    for group in (_auth, runs, control, validate, launch, data, corpus, setup):
+        group.register(sub)
+    # `guide` renders the getting-started walkthrough (the renderer lives in
+    # ``benchmax.cli.help``). Named `guide`, not `quickstart`, because `setup`
+    # is itself the quickstart flow — keep the two from blurring together.
+    sub.add_parser("guide", help="Walk through your first run").set_defaults(
+        func=help._cmd_help
+    )
+    # `help` mirrors `-h`: just list the commands. Users reach for it by habit;
+    # the walkthrough is `castform guide`, not here.
+    def _list_commands(_args: argparse.Namespace) -> int:
+        parser.print_help()
+        return 0
+    sub.add_parser("help", help="List the available commands").set_defaults(
+        func=_list_commands
+    )
+    return parser
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv`` with the full parser and run the selected command.
+    Returns whatever the chosen handler (the subparser's ``func`` default) returns.
+    """
+    namespace = build_parser().parse_args(argv)
+    handler = namespace.func
+    return handler(namespace)
+if __name__ == "__main__":
+    sys.exit(main())

benchmax/{cli.py → cli/_auth.py} RENAMED Viewed

@@ -1,9 +1,8 @@
-"""``castform`` CLI — browser-based login for the SDK.
+"""castform auth commands: ``login`` / ``logout`` / ``whoami``.
-Commands: ``login`` (device authorization), ``logout``, ``whoami``. The login
-flow + the reusable ``ensure_session`` live in :mod:`benchmax.platform.login`;
-this module is the thin argparse wrapper. After ``castform login`` the SDK
-resolves its bearer from ``~/.castform`` automatically — no API key or URL.
+The device-auth flow + the reusable ``ensure_session`` live in
+:mod:`benchmax.platform.login`; these handlers are thin argparse wrappers. After
+``castform login`` the SDK resolves its bearer from ``~/.castform`` automatically.
 """
 from __future__ import annotations
@@ -38,7 +37,7 @@ def _cmd_whoami(_args: argparse.Namespace) -> int:
     if not session:
         print("Not logged in. Run `castform login`.", file=sys.stderr)
         return 1
-    jwt = credentials._session_jwt()  # mints from the session; None if invalid/expired/offline
+    jwt = credentials._session_jwt()  # None if invalid/expired/offline
     if not jwt:
         print(
             "Session present, but couldn't reach auth-service to verify it "
@@ -53,19 +52,14 @@ def _cmd_whoami(_args: argparse.Namespace) -> int:
     return 0
-def main(argv: list[str] | None = None) -> int:
-    parser = argparse.ArgumentParser(prog="castform", description="Castform CLI")
-    sub = parser.add_subparsers(dest="command", required=True)
-    p_login = sub.add_parser("login", help="Sign in via your browser")
-    p_login.set_defaults(func=_cmd_login)
-    sub.add_parser("logout", help="Clear the cached session").set_defaults(func=_cmd_logout)
-    sub.add_parser("whoami", help="Show the current login").set_defaults(func=_cmd_whoami)
-    args = parser.parse_args(argv)
-    return args.func(args)
-if __name__ == "__main__":
-    sys.exit(main())
+def register(sub: argparse._SubParsersAction) -> None:
+    """Attach the ``login``, ``logout`` and ``whoami`` subcommands to ``sub``."""
+    commands = (
+        ("login", "Sign in via your browser", _cmd_login),
+        ("logout", "Clear the cached session", _cmd_logout),
+        ("whoami", "Show the current login", _cmd_whoami),
+    )
+    for name, summary, handler in commands:
+        sub.add_parser(name, help=summary).set_defaults(func=handler)

benchmax/cli/_client.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Shared platform-client wiring + error handling for CLI command groups.
+Read/control commands resolve their bearer through the credential seam
+(:func:`benchmax.platform.credentials.platform_bearer`) — ``ACT_AS_TOKEN_PATH``
+→ ``PLATFORM_API_KEY`` → cached ``~/.castform`` session — against the host from
+:mod:`benchmax.config`. ``handle_errors`` keeps tracebacks out of normal failures.
+"""
+from __future__ import annotations
+import functools
+import sys
+from collections.abc import Callable
+import httpx
+from benchmax.platform.client import TrainerClient
+from benchmax.platform.exceptions import AuthenticationError, TrainerError
+def trainer_client() -> TrainerClient:
+    """Return a ``TrainerClient`` built with no arguments, so the platform host
+    and bearer come from the client's own default resolution."""
+    client = TrainerClient()
+    return client
+def handle_errors(func: Callable) -> Callable:
+    """Decorate a CLI handler so expected failures print one stderr line and
+    return exit status 1 instead of surfacing a traceback.
+    Handled, in this order: ``AuthenticationError``, ``TrainerError``,
+    ``RuntimeError`` and ``httpx.HTTPError``. Anything else propagates.
+    """
+    @functools.wraps(func)
+    def wrapper(args):
+        try:
+            return func(args)
+        except AuthenticationError:
+            failure = "Not logged in (or session expired). Run `castform login`."
+        except TrainerError as exc:
+            failure = f"Error: {exc.message}"
+        except RuntimeError as exc:  # platform_bearer with no resolvable credential
+            failure = f"Error: {exc}"
+        except httpx.HTTPError as exc:
+            failure = f"Network error: {exc}"
+        # Only reached when one of the handlers above ran.
+        print(failure, file=sys.stderr)
+        return 1
+    return wrapper

benchmax/cli/_output.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""Small, dependency-free output helpers shared by CLI command groups."""
+from __future__ import annotations
+import json as _json
+import os
+import re
+import shutil
+import sys
+from typing import Any
+# Castform brand palette (web-app diagram tokens: blue #3b76f6, orange #f97316),
+# softened for the terminal and tuned per background. We can't query the terminal
+# background synchronously, but many emulators export COLORFGBG ("fg;bg"); when bg
+# reads light we pick deeper, higher-contrast tones, otherwise softer pastels that
+# don't glare on black. Falls back to the dark (pastel) set. Rendered as truecolor.
+def _is_light_terminal() -> bool:
+    """Guess from ``COLORFGBG`` whether the terminal background is light.
+    The variable has the form ``"fg;bg"`` (some emulators insert a middle field);
+    only the last field is read. Returns ``False`` when the variable is unset or
+    its last field is not a plain ASCII number, so callers fall back to the dark set.
+    """
+    background = os.environ.get("COLORFGBG", "").rpartition(";")[2].strip()
+    # isdigit() alone accepts characters such as "²" that int() rejects.
+    if not (background.isascii() and background.isdigit()):
+        return False
+    index = int(background)
+    # 7 and 15 are the light greys/white; 8 is "bright black" (dark grey), so it
+    # must not count as a light background even though it is >= 7.
+    return index >= 7 and index != 8
+# Palette is chosen once at import: deeper tones when the background reads as
+# light, softer pastels otherwise.
+BLUE, ORANGE = (
+    # light: #2563eb (deeper, reads on white) / #c2571c (terracotta)
+    ((37, 99, 235), (194, 87, 28))
+    if _is_light_terminal()
+    # dark: #7da6e8 (soft sky) / #e6a663 (soft amber)
+    else ((125, 166, 232), (230, 166, 99))
+)
+_GREY = (140, 140, 140)
+_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m")
+def color_enabled() -> bool:
+    """Report whether ANSI styling should be emitted: stdout must be a TTY and
+    ``NO_COLOR`` must be unset or empty."""
+    if not sys.stdout.isatty():
+        return False
+    return not os.environ.get("NO_COLOR")
+def paint(
+    text: str,
+    rgb: tuple[int, int, int] | None = None,
+    *,
+    bold: bool = False,
+    italic: bool = False,
+    dim: bool = False,
+) -> str:
+    """Return ``text`` wrapped in an ANSI SGR sequence.
+    ``text`` comes back untouched when color is disabled or when no style was
+    requested. ``rgb`` is emitted as a truecolor (``38;2;r;g;b``) foreground.
+    """
+    if not color_enabled():
+        return text
+    # Fixed parameter order: bold (1), dim (2), italic (3), then the color.
+    styles = ((bold, "1"), (dim, "2"), (italic, "3"))
+    codes: list[str] = [code for wanted, code in styles if wanted]
+    if rgb is not None:
+        codes.append("38;2;{};{};{}".format(rgb[0], rgb[1], rgb[2]))
+    if not codes:
+        return text
+    sgr = ";".join(codes)
+    return f"\x1b[{sgr}m{text}\x1b[0m"
+def _visible_len(s: str) -> int:
+    """Return the length of ``s`` once ANSI color escapes are removed, i.e. the
+    width used when padding text that is already colored."""
+    plain = _ANSI_RE.sub("", s)
+    return len(plain)
+def term_width(default: int = 80) -> int:
+    """Return the terminal width in columns, falling back to ``default`` when the
+    size cannot be determined."""
+    size = shutil.get_terminal_size(fallback=(default, 24))
+    return size.columns
+def rule_label(text: str, color: tuple[int, int, int], width: int) -> str:
+    """Build the standard section divider — ``──────── title ────────`` — with
+    the title centered, ``width`` columns wide, painted bold in ``color``.
+    When ``text`` plus its two padding spaces exceeds ``width`` no rules are drawn.
+    """
+    dashes = max(width - len(text) - 2, 0)  # 2 = the spaces around the title
+    lead = dashes // 2
+    trail = dashes - lead  # the odd column, if any, goes on the right
+    divider = "".join(("─" * lead, f" {text} ", "─" * trail))
+    return paint(divider, color, bold=True)
+def boxed(
+    lines: list[str],
+    *,
+    color: tuple[int, int, int],
+    width: int,
+    title: str = "",
+) -> list[str]:
+    """Draw a rounded box around ``lines`` and return it as a list of rows.
+    ``width`` is the content width; one space of padding is added on each side.
+    Rows may already carry ANSI codes (or be nested boxes): padding is computed
+    from the visible length, so the right border stays aligned. A non-empty
+    ``title`` is centered in the top border.
+    """
+    span = width + 2  # content plus one padding column per side
+    label = f" {title} " if title else ""
+    dashes = max(span - _visible_len(label), 0)
+    lead = dashes // 2
+    top_border = "".join(("╭", "─" * lead, label, "─" * (dashes - lead), "╮"))
+    edge = paint("│", color)
+    rendered = [paint(top_border, color, bold=True)]
+    for content in lines:
+        filler = " " * max(width - _visible_len(content), 0)
+        rendered.append(f"{edge} {content}{filler} {edge}")
+    rendered.append(paint("╰" + "─" * span + "╯", color))
+    return rendered
+def print_json(obj: Any) -> None:
+    """Print ``obj`` as JSON indented by two spaces; values ``json`` cannot
+    encode are passed through ``str`` rather than raising."""
+    rendered = _json.dumps(obj, indent=2, default=str)
+    print(rendered)
+def render_table(headers: list[str], rows: list[list[Any]]) -> None:
+    """Print a left-aligned, fixed-width table with two spaces between columns.
+    No styling is applied, so the output is pipe-friendly. Cells are rendered
+    with ``str``. Ragged input is tolerated: the table is as wide as the longest
+    of ``headers`` and the rows, and missing cells print as blanks.
+    """
+    titles = [str(h) for h in headers]
+    body = [[str(cell) for cell in row] for row in rows]
+    ncols = max([len(titles), *(len(row) for row in body)])
+    def _padded(cells: list[str]) -> list[str]:
+        """Extend ``cells`` with empty strings up to the table's column count."""
+        return cells + [""] * (ncols - len(cells))
+    table = [_padded(titles), *(_padded(row) for row in body)]
+    widths = [max(len(line[i]) for line in table) for i in range(ncols)]
+    for line in table:
+        print("  ".join(cell.ljust(w) for cell, w in zip(line, widths)))
+def fmt_value(value: Any) -> str:
+    """Render floats with four significant digits (``.4g``); every other value
+    is returned as ``str(value)``."""
+    return format(value, ".4g") if isinstance(value, float) else str(value)

benchmax/cli/_project.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""Load a benchmax project (env class + datasets) from a directory.
+Convention mirrors the web-app scaffold (``buildAgentContextBody``): ``run.py``
+defines a single :class:`BaseEnv` subclass; ``train_dataset.jsonl`` /
+``eval_dataset.jsonl`` hold one JSON object per line. ``validate`` and ``launch``
+share this loader. An importable module path (``--module``) is an alternative to
+``run.py`` for shipped envs / fixtures.
+"""
+from __future__ import annotations
+import importlib
+import importlib.util
+import inspect
+import json
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from types import ModuleType
+from typing import Any
+class ProjectError(Exception):
+    """Raised when a project cannot be loaded: ``run.py`` or a dataset is
+    missing or unreadable, or the env class is absent or ambiguous."""
+@dataclass
+class LoadedProject:
+    """The env class, datasets and module resolved for one project."""
+    # The env class discovered in ``module``.
+    env_class: type
+    # Parsed rows of the train dataset, one dict per JSONL line.
+    train_dataset: list[dict[str, Any]]
+    # Parsed rows of the eval dataset; empty when no eval file exists.
+    eval_dataset: list[dict[str, Any]]
+    # The module the env class was discovered in.
+    module: ModuleType
+    # Loaded from a run.py path (pickle env by value) vs an importable module.
+    from_file: bool
+def _load_module_from_file(path: Path) -> ModuleType:
+    """Import the Python file at ``path`` as a module named after its stem.
+    The module is registered in ``sys.modules`` before execution so dataclass and
+    pickle name resolution work. If execution fails, the registration is rolled
+    back (any module previously stored under that name is restored) so a
+    half-initialised module is never left importable.
+    Raises ``ProjectError`` when no loader can be built for ``path`` or when
+    executing the file raises.
+    """
+    name = path.stem
+    spec = importlib.util.spec_from_file_location(name, path)
+    if spec is None or spec.loader is None:
+        raise ProjectError(f"Could not load a module from {path}")
+    module = importlib.util.module_from_spec(spec)
+    absent = object()  # sentinel: distinguishes "not registered" from any real value
+    previous = sys.modules.get(name, absent)
+    sys.modules[name] = module  # so dataclass/pickle name resolution works
+    try:
+        spec.loader.exec_module(module)
+    except Exception as exc:  # surface the user's import/syntax error cleanly
+        if previous is absent:
+            sys.modules.pop(name, None)
+        else:
+            sys.modules[name] = previous
+        raise ProjectError(f"Failed to import {path.name}: {exc}") from exc
+    return module
+def discover_env_class(module: ModuleType, explicit: str | None = None) -> type:
+    """Find the env class in ``module``.
+    With ``explicit``, return the BaseEnv subclass in the module namespace whose
+    ``__name__`` matches (imported classes are eligible). Without it, require
+    exactly one BaseEnv subclass *defined in* the module; imported ones are
+    ignored, and a class bound to several names counts once.
+    Raises ``ProjectError`` when nothing matches or the choice is ambiguous.
+    """
+    from benchmax.envs.base_env import BaseEnv
+    # Collect env classes once, de-duplicated by identity in first-seen order, so
+    # an alias (``Alias = MyEnv``) is not mistaken for a second env class.
+    seen_ids: set[int] = set()
+    env_classes: list[type] = []
+    for obj in vars(module).values():
+        if not (inspect.isclass(obj) and issubclass(obj, BaseEnv)):
+            continue
+        if obj is BaseEnv or id(obj) in seen_ids:
+            continue
+        seen_ids.add(id(obj))
+        env_classes.append(obj)
+    if explicit:
+        for cls in env_classes:
+            if cls.__name__ == explicit:
+                return cls
+        raise ProjectError(f"No BaseEnv subclass named {explicit!r} in the module.")
+    defined_here = [c for c in env_classes if c.__module__ == module.__name__]
+    if not defined_here:
+        raise ProjectError("No BaseEnv subclass defined in the module.")
+    if len(defined_here) > 1:
+        names = sorted(c.__name__ for c in defined_here)
+        raise ProjectError(
+            f"Multiple env classes {names}; pass --env-class to pick one."
+        )
+    return defined_here[0]
+def _load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Read a JSONL dataset: one JSON object per line, blank lines skipped.
+    Raises ``ProjectError`` when the file is missing, a line is not valid JSON,
+    a line is valid JSON but not an object, or no rows remain. Line-level
+    messages are prefixed with ``path:line``.
+    """
+    if not path.exists():
+        raise ProjectError(f"Dataset not found: {path}")
+    rows: list[dict[str, Any]] = []
+    # Iterate the file instead of calling str.splitlines(): splitlines also
+    # breaks on U+2028/U+2029/U+0085, which may appear unescaped inside JSON
+    # strings and would cut a valid row in two.
+    with path.open(encoding="utf-8") as handle:
+        for n, raw in enumerate(handle, 1):
+            line = raw.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ProjectError(f"{path}:{n}: invalid JSON ({exc})") from exc
+            if not isinstance(row, dict):
+                raise ProjectError(
+                    f"{path}:{n}: expected a JSON object, got {type(row).__name__}"
+                )
+            rows.append(row)
+    if not rows:
+        raise ProjectError(f"Dataset is empty: {path}")
+    return rows
+def load_project(
+    *,
+    directory: str = ".",
+    run_file: str = "run.py",
+    module_path: str | None = None,
+    env_class_name: str | None = None,
+    train_file: str = "train_dataset.jsonl",
+    eval_file: str = "eval_dataset.jsonl",
+    require_eval: bool = False,
+) -> LoadedProject:
+    """Load the env class + datasets for a project dir (or an importable module).
+    A non-empty ``module_path`` is imported by name; otherwise ``run_file`` is
+    loaded from ``directory``. Datasets are always read from ``directory``. The
+    eval dataset is optional unless ``require_eval`` is set.
+    Raises ``ProjectError`` for a missing run file, a failed import, an
+    unresolved env class, or a missing/invalid dataset.
+    """
+    base = Path(directory)
+    # Must mirror the branch below: an empty ``module_path`` loads from run.py,
+    # so it has to be reported as a file load too.
+    from_file = not module_path
+    if module_path:
+        try:
+            module = importlib.import_module(module_path)
+        except Exception as exc:  # missing dep, bad path, import-time error
+            raise ProjectError(
+                f"Could not import module {module_path!r}: {exc}"
+            ) from exc
+    else:
+        path = base / run_file
+        if not path.exists():
+            raise ProjectError(
+                f"{run_file} not found in {directory!r} — run inside a project dir, "
+                "or pass --module for an importable env."
+            )
+        module = _load_module_from_file(path)
+    env_class = discover_env_class(module, env_class_name)
+    train_dataset = _load_jsonl(base / train_file)
+    eval_path = base / eval_file
+    eval_dataset = _load_jsonl(eval_path) if eval_path.exists() else []
+    if require_eval and not eval_dataset:
+        raise ProjectError(f"Eval dataset required but not found: {eval_path}")
+    return LoadedProject(
+        env_class=env_class,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        module=module,
+        from_file=from_file,
+    )

benchmax/cli/_providers.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Single source of truth for the RAG corpus providers and their sandbox deps.
+``PROVIDER_PIP`` mirrors the per-provider extras in ``pyproject.toml`` — a unit test
+asserts byte-equality against ``[project.optional-dependencies]``, so the two can't
+silently drift (chroma must carry ``snowballstemmer``, which chromadb's BM25 needs but
+doesn't declare). ``data``/``validate``/``launch`` all read their ``--provider``
+choices and install hints from here instead of repeating the SDK names procedurally.
+"""
+from __future__ import annotations
+# Maps a provider key → the pip requirements its search SDK needs in the rollout
+# sandbox. Mirrors pyproject.toml's [project.optional-dependencies] provider extras.
+PROVIDER_PIP: dict[str, list[str]] = {
+    "turbopuffer": ["turbopuffer>=1.16.2"],
+    "pinecone": ["pinecone>=5.0.0"],
+    # chromadb's BM25 embedding function needs snowballstemmer but doesn't declare it.
+    "chroma": ["chromadb>=1.0.0", "snowballstemmer>=2.2.0"],
+}
+def provider_choices() -> list[str]:
+    """Return the keys of ``PROVIDER_PIP`` as a new list, for argparse
+    ``choices=`` on ``--provider``."""
+    return [*PROVIDER_PIP]
+def install_hint(provider: str) -> str:
+    """Return the user-facing ``Install with: pip install castform[<extra>]``
+    line, with ``provider`` as the extra."""
+    return "Install with: pip install castform[{}]".format(provider)
+def resolve_pip_dependencies(
+    explicit: list[str] | None,
+    env_class: type | None = None,
+    provider: str | None = None,
+) -> list[str] | None:
+    """Compose the rollout-sandbox pip deps for ``validate``/``launch``.
+    Sources are merged in this order: ``--pip`` (``explicit``), the env's
+    self-declared ``PIP_DEPENDENCIES`` class attribute, then
+    ``PROVIDER_PIP[provider]`` when ``--provider`` was passed. Duplicates are
+    dropped, keeping the first occurrence. Returns ``None`` when nothing
+    resolves, so the no-slot/no-provider case is the old ``args.pip or None``
+    verbatim (preserving the single ``dump_bundle`` channel).
+    Read CLI-side, not sandbox-side: ``dump_bundle`` pickles the env class by value
+    but the sandbox installs from ``BundleMetadata.pip_dependencies``, so the slot
+    must be resolved here and fed the existing ``pip_dependencies=`` channel.
+    """
+    merged: list[str] = list(explicit) if explicit else []
+    slot = getattr(env_class, "PIP_DEPENDENCIES", None)
+    if isinstance(slot, (list, tuple)):  # guard: a list-of-str slot only
+        merged += [item for item in slot if isinstance(item, str)]
+    if provider:
+        merged += PROVIDER_PIP.get(provider, [])
+    # dict keys keep insertion order, so this de-dupes on first occurrence.
+    unique = list(dict.fromkeys(merged))
+    return unique if unique else None

benchmax/cli/control.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""castform run-control verbs (slice 1.3).
+Top-level ``castform stop <id>`` → ``POST /v1/train/runs/{id}/cancel``. Owner-only
+(403 otherwise). A launched run emits ``training.cancelling`` + cancels its
+launcher job; a run with no job is marked complete directly (no cancelling event).
+"""
+from __future__ import annotations
+import argparse
+from benchmax.cli._client import handle_errors, trainer_client
+@handle_errors
+def _cmd_stop(args: argparse.Namespace) -> int:
+    """Cancel the run named by ``args.run_id`` and print a confirmation.
+    Prints the response's ``message`` when it has one, otherwise a generic
+    confirmation. Returns 0; failures are turned into exit status 1 by
+    ``handle_errors``.
+    """
+    with trainer_client() as client:
+        response = client.cancel_run(args.run_id)
+    text = response.get("message")
+    if not text:
+        text = "Cancellation requested"
+    print(f"✓ {text}")
+    return 0
+def register(sub: argparse._SubParsersAction) -> None:
+    """Attach the top-level ``stop`` verb, which takes one positional ``run_id``."""
+    stop_parser = sub.add_parser("stop", help="Stop (cancel) a run you own")
+    stop_parser.add_argument("run_id")
+    stop_parser.set_defaults(func=_cmd_stop)

benchmax 0.1.2.dev33__py3-none-any.whl → 0.1.2.dev35__py3-none-any.whl

benchmax 0.1.2.dev33__py3-none-any.whl → 0.1.2.dev35__py3-none-any.whl