npm - ma-agents - Versions diffs - 3.12.3 → 3.13.1 - Mend

ma-agents 3.12.3 → 3.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (257) hide show

package/lib/bmad-cache/bmb/skills/bmad-eval-runner/scripts/run_evals.py ADDED Viewed

@@ -0,0 +1,492 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.9"
+# ///
+"""Run a skill's artifact evals in isolated workspaces.
+For each eval, the runner:
+  1. Stages a fresh workspace (inside a Docker container, or locally under `<run-dir>/<eval-id>/workspace/`).
+  2. Applies the setup overlay (base then per-eval) so _bmad/ config and dependency
+     skills land in the workspace BEFORE the skill is staged — the skill's own copy
+     always wins over overlay content.
+  3. Copies the skill into .claude/skills/ so it is discoverable by claude.
+  4. Stages any fixture files declared in the eval's `files` list.
+  5. Runs `claude -p '<prompt>' --output-format stream-json --verbose`, capturing
+     the transcript. The Skill tool is available in -p mode and fires for installed
+     skills, so dependency skills provided by the setup overlay are properly invokable.
+  6. Copies the files claude wrote into `<run-dir>/<eval-id>/artifacts/` (Docker: rsync of the workspace; local: newly created files only).
+  7. Writes `metrics.json` (tool-call counts, timing, output sizes).
+Grading is performed separately by the parent skill's grader subagents.
+Usage:
+  python3 run_evals.py \\
+    --skill-path PATH \\
+    --evals-file PATH/evals.json \\
+    --project-root PATH \\
+    --output-dir PATH \\
+    --isolation docker|local \\
+    [--workers N] [--timeout SECS] [--eval-ids A1,B3] [--quiet]
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+SCRIPT_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(SCRIPT_DIR))
+from utils import (  # noqa: E402
+    apply_setup_overlay,
+    discover_setup_dirs,
+    new_run_id,
+    parse_skill_md,
+    read_json,
+    read_macos_keychain_credentials,
+    stage_credentials,
+    utc_now_iso,
+    write_json,
+)
+# Image name handed to `docker run` by run_eval_docker.
+DOCKER_IMAGE = "bmad-eval-runner:latest"
+# Evaluated once, at import time, and shared by every eval in the run.
+# NOTE(review): presumably None when no keychain entry is available — confirm in utils.
+_KEYCHAIN_CREDS: str | None = read_macos_keychain_credentials()
+# Names/patterns left out when the project tree is copied into a local workspace
+# (passed as rsync --exclude arguments, or to shutil.ignore_patterns when rsync is absent).
+RSYNC_EXCLUDES = (
+    ".git", ".bare", "node_modules", ".venv", "__pycache__",
+    ".pytest_cache", ".next", "dist", "build", ".cache",
+    ".DS_Store", "*.pyc",
+)
+def stage_workspace_local(
+    workspace: Path,
+    project_root: Path,
+    skill_path: Path,
+    fixtures: list[tuple[Path, str]],
+    setup_dirs: list[Path] | None = None,
+) -> Path:
+    """Build a clean local workspace and return the project copy inside it.
+    Layout created under ``workspace``:
+      * ``project/``       copy of ``project_root`` minus RSYNC_EXCLUDES
+      * ``.home/.claude/`` empty directory tree for the run's private HOME
+    Staging order is: project copy, setup overlay, skill, fixtures. The skill is
+    staged after the overlay and replaces anything already present at
+    ``project/.claude/skills/<skill name>``, so the skill under test always wins.
+    Args:
+        workspace: Directory that will hold the staged tree (created if missing).
+        project_root: Source project to copy.
+        skill_path: Directory of the skill under test.
+        fixtures: ``(source file, destination relative to the project copy)`` pairs.
+        setup_dirs: Optional overlay directories handed to apply_setup_overlay.
+    Returns:
+        Path of the staged project copy (``workspace / "project"``).
+    Raises:
+        subprocess.CalledProcessError: if rsync exits non-zero.
+    """
+    workspace.mkdir(parents=True, exist_ok=True)
+    project_dest = workspace / "project"
+    home_dir = workspace / ".home"
+    (home_dir / ".claude").mkdir(parents=True, exist_ok=True)
+    if shutil.which("rsync"):
+        excludes = [arg for pat in RSYNC_EXCLUDES for arg in ("--exclude", pat)]
+        subprocess.run(
+            ["rsync", "-a", *excludes, f"{project_root}/", f"{project_dest}/"],
+            check=True,
+        )
+    else:
+        # No rsync on PATH: fall back to a pure-Python copy with the same exclusions.
+        shutil.copytree(project_root, project_dest, dirs_exist_ok=True,
+                        ignore=shutil.ignore_patterns(*RSYNC_EXCLUDES))
+    # Apply setup overlay before staging the skill — the skill's own copy wins.
+    if setup_dirs:
+        apply_setup_overlay(setup_dirs, project_dest)
+    skills_dir = project_dest / ".claude" / "skills"
+    skills_dir.mkdir(parents=True, exist_ok=True)
+    skill_dest = skills_dir / skill_path.name
+    # Drop whatever the project copy or the overlay left at this path (including a
+    # dangling symlink, which exists() would not report) so the skill really wins.
+    if skill_dest.is_symlink() or skill_dest.is_file():
+        skill_dest.unlink()
+    elif skill_dest.is_dir():
+        shutil.rmtree(skill_dest)
+    # NOTE(review): a symlink lets the eval write through to the real skill
+    # directory; switch to an unconditional copy if that matters for isolation.
+    try:
+        os.symlink(skill_path, skill_dest)
+    except OSError:
+        # Symlinks unavailable (e.g. missing privilege): copy instead.
+        shutil.copytree(skill_path, skill_dest, dirs_exist_ok=True)
+    for src, dest_rel in fixtures:
+        dest = project_dest / dest_rel
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, dest)
+    return project_dest
+def run_eval_local(
+    eval_item: dict,
+    run_dir: Path,
+    skill_path: Path,
+    project_root: Path,
+    timeout: int,
+    setup_dirs: list[Path] | None = None,
+) -> dict:
+    """Run one eval with ``claude -p`` in a locally staged workspace.
+    Everything for the eval lives under ``run_dir/<eval id>/``: the staged
+    ``workspace/``, ``prompt.txt``, ``transcript.jsonl`` (claude's stdout),
+    ``artifacts/`` (files that did not exist before the run) and ``metrics.json``.
+    Args:
+        eval_item: Eval definition; reads ``id``, ``prompt`` and optional ``files``.
+        run_dir: Directory of the current run.
+        skill_path: Directory of the skill under test.
+        project_root: Project copied into the workspace.
+        timeout: Seconds claude may run before it is stopped.
+        setup_dirs: Optional overlay directories applied while staging.
+    Returns:
+        Summary dict with ``eval_id``, ``elapsed_s``, ``return_code`` (-1 on
+        timeout), run-relative ``transcript`` and ``artifacts_dir`` paths, and the
+        ``metrics`` dict.
+    Raises:
+        KeyError: if ``eval_item`` has no ``prompt``.
+    """
+    eval_id = str(eval_item.get("id", "unnamed"))
+    eval_dir = run_dir / eval_id
+    workspace_root = eval_dir / "workspace"
+    artifacts_dir = eval_dir / "artifacts"
+    transcript_path = eval_dir / "transcript.jsonl"
+    eval_dir.mkdir(parents=True, exist_ok=True)
+    artifacts_dir.mkdir(parents=True, exist_ok=True)
+    fixtures = resolve_fixtures(eval_item.get("files", []), project_root)
+    workspace_project = stage_workspace_local(
+        workspace_root, project_root, skill_path, fixtures, setup_dirs
+    )
+    (eval_dir / "prompt.txt").write_text(eval_item["prompt"], encoding="utf-8")
+    # Snapshot after staging so only files created by the run count as artifacts.
+    workspace_snapshot_before = snapshot_files(workspace_project)
+    home_dir = workspace_root / ".home"
+    stage_credentials(home_dir / ".claude", _KEYCHAIN_CREDS)
+    # Minimal environment: private HOME/config dir, the caller's PATH, nothing else.
+    env = {
+        "HOME": str(home_dir),
+        "CLAUDE_CONFIG_DIR": str(home_dir / ".claude"),
+        "PATH": os.environ.get("PATH", ""),
+    }
+    # Forward the API key only when the caller has one, matching the Docker
+    # runner's pass-through `-e ANTHROPIC_API_KEY`; never export an empty key.
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if api_key:
+        env["ANTHROPIC_API_KEY"] = api_key
+    cmd = [
+        "claude",
+        "-p", eval_item["prompt"],
+        "--output-format", "stream-json",
+        "--verbose",
+        "--dangerously-skip-permissions",
+    ]
+    start = time.time()
+    try:
+        # stdout streams straight to the transcript file; stderr is kept in memory.
+        with transcript_path.open("wb") as out:
+            proc = subprocess.run(
+                cmd,
+                stdout=out,
+                stderr=subprocess.PIPE,
+                cwd=str(workspace_project),
+                env=env,
+                timeout=timeout,
+            )
+        elapsed = time.time() - start
+        return_code = proc.returncode
+        stderr_tail = (proc.stderr or b"").decode("utf-8", errors="replace")[-2000:]
+    except subprocess.TimeoutExpired as e:
+        elapsed = time.time() - start
+        return_code = -1
+        stderr_tail = f"TIMEOUT after {timeout}s"
+        if e.stderr:
+            stderr_tail += "\n" + e.stderr.decode("utf-8", errors="replace")[-2000:]
+    new_files = diff_workspace(workspace_project, workspace_snapshot_before)
+    sync_artifacts(workspace_project, new_files, artifacts_dir)
+    metrics = compute_metrics(transcript_path, artifacts_dir, elapsed, return_code, stderr_tail)
+    write_json(eval_dir / "metrics.json", metrics)
+    return {
+        "eval_id": eval_id,
+        "elapsed_s": elapsed,
+        "return_code": return_code,
+        "transcript": str(transcript_path.relative_to(run_dir)),
+        "artifacts_dir": str(artifacts_dir.relative_to(run_dir)),
+        "metrics": metrics,
+    }
+def run_eval_docker(
+    eval_item: dict,
+    run_dir: Path,
+    skill_path: Path,
+    project_root: Path,
+    timeout: int,
+    setup_dirs: list[Path] | None = None,
+) -> dict:
+    """Run one eval with ``claude -p`` inside a throwaway Docker container.
+    The host prepares ``run_dir/<eval id>/`` (prompt, staged fixtures, merged
+    setup overlay, optional credentials) and mounts it at ``/output``; project
+    and skill are mounted read-only. Inside the container the project is copied
+    to ``/workspace``, the overlay, skill and fixtures are layered on top, claude
+    runs, and the workspace is rsynced to ``/output/artifacts``.
+    Args:
+        eval_item: Eval definition; reads ``id``, ``prompt`` and optional ``files``.
+        run_dir: Directory of the current run.
+        skill_path: Directory of the skill under test.
+        project_root: Project mounted read-only at ``/project``.
+        timeout: Eval budget in seconds; ``docker run`` gets 30 s on top of it.
+        setup_dirs: Optional overlay directories, pre-merged on the host.
+    Returns:
+        Summary dict with ``eval_id``, ``elapsed_s``, ``return_code`` (the
+        container's exit status, i.e. claude's; -1 on timeout), run-relative
+        ``transcript`` and ``artifacts_dir`` paths, and the ``metrics`` dict.
+    Raises:
+        KeyError: if ``eval_item`` has no ``prompt``.
+    """
+    import uuid  # only needed here, to give the container a unique name
+    eval_id = str(eval_item.get("id", "unnamed"))
+    eval_dir = run_dir / eval_id
+    artifacts_dir = eval_dir / "artifacts"
+    transcript_path = eval_dir / "transcript.jsonl"
+    eval_dir.mkdir(parents=True, exist_ok=True)
+    artifacts_dir.mkdir(parents=True, exist_ok=True)
+    fixtures_staging = eval_dir / "fixtures_in"
+    fixtures_staging.mkdir(parents=True, exist_ok=True)
+    fixtures = resolve_fixtures(eval_item.get("files", []), project_root)
+    for src, dest_rel in fixtures:
+        dest = fixtures_staging / dest_rel
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, dest)
+    (eval_dir / "prompt.txt").write_text(eval_item["prompt"], encoding="utf-8")
+    # Pre-merge setup overlay dirs on the host; mount as /setup:ro in the container.
+    setup_merged: Path | None = None
+    if setup_dirs:
+        setup_merged = eval_dir / "setup_merged"
+        apply_setup_overlay(setup_dirs, setup_merged)
+        # The overlay helper may have had nothing to write; treat a missing or
+        # empty directory as "no overlay" instead of failing on iterdir().
+        if not setup_merged.is_dir() or not any(setup_merged.iterdir()):
+            setup_merged = None
+    creds_dir: Path | None = None
+    if _KEYCHAIN_CREDS:
+        creds_dir = eval_dir / "creds"
+        creds_dir.mkdir(parents=True, exist_ok=True)
+        (creds_dir / ".credentials.json").write_text(_KEYCHAIN_CREDS, encoding="utf-8")
+    container_script = r"""
+set -e
+mkdir -p /workspace
+rsync -a \
+  --exclude=.git --exclude=.bare --exclude=node_modules --exclude=.venv \
+  --exclude=__pycache__ --exclude=.pytest_cache --exclude=.next \
+  --exclude=dist --exclude=build --exclude=.cache --exclude=.DS_Store \
+  --exclude='*.pyc' \
+  /project/ /workspace/
+if [ -d /setup ]; then
+  rsync -a /setup/ /workspace/
+fi
+mkdir -p /workspace/.claude/skills
+# Remove any copy left by the project or overlay: the skill's own files win, and
+# cp -R would otherwise nest the source inside the existing directory.
+rm -rf "/workspace/.claude/skills/${SKILL_NAME:?}"
+cp -R "$SKILL_SRC" "/workspace/.claude/skills/$SKILL_NAME"
+if [ -d /fixtures ]; then
+  cp -R /fixtures/. /workspace/
+fi
+if [ -f /creds/.credentials.json ]; then
+  mkdir -p /home/evaluator/.claude
+  cp /creds/.credentials.json /home/evaluator/.claude/.credentials.json
+fi
+cd /workspace
+# Remember claude's status but keep going so artifacts are always collected.
+claude_rc=0
+claude -p "$EVAL_PROMPT" \
+  --output-format stream-json --verbose \
+  --dangerously-skip-permissions \
+  > /output/transcript.jsonl 2> /output/stderr.log || claude_rc=$?
+mkdir -p /output/artifacts
+rsync -a --exclude=.claude --exclude=node_modules --exclude=.git \
+  --filter='+ */' --filter='+ *' \
+  /workspace/ /output/artifacts/
+exit "$claude_rc"
+"""
+    skill_name = skill_path.name
+    # A known name lets us stop the container if the docker client times out.
+    container_name = f"bmad-eval-{uuid.uuid4().hex[:12]}"
+    cmd = [
+        "docker", "run", "--rm",
+        "--name", container_name,
+        "-v", f"{project_root}:/project:ro",
+        "-v", f"{skill_path}:/skill_src:ro",
+        "-v", f"{eval_dir}:/output",
+        "-e", "ANTHROPIC_API_KEY",
+        "-e", f"EVAL_PROMPT={eval_item['prompt']}",
+        "-e", "SKILL_SRC=/skill_src",
+        "-e", f"SKILL_NAME={skill_name}",
+    ]
+    if creds_dir:
+        cmd += ["-v", f"{creds_dir}:/creds:ro"]
+    if fixtures:
+        cmd += ["-v", f"{fixtures_staging}:/fixtures:ro"]
+    if setup_merged:
+        cmd += ["-v", f"{setup_merged}:/setup:ro"]
+    cmd += [DOCKER_IMAGE, "bash", "-c", container_script]
+    start = time.time()
+    try:
+        try:
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                timeout=timeout + 30,
+            )
+            elapsed = time.time() - start
+            return_code = proc.returncode
+            stderr_tail = proc.stderr.decode("utf-8", errors="replace")[-2000:]
+            if proc.stdout:
+                (eval_dir / "docker.stdout.log").write_bytes(proc.stdout)
+        except subprocess.TimeoutExpired as e:
+            elapsed = time.time() - start
+            return_code = -1
+            stderr_tail = f"TIMEOUT after {timeout}s"
+            if e.stderr:
+                stderr_tail += "\n" + e.stderr.decode("utf-8", errors="replace")[-2000:]
+            # subprocess only killed the docker client; stop the container too.
+            try:
+                subprocess.run(
+                    ["docker", "kill", container_name],
+                    capture_output=True,
+                    timeout=30,
+                    check=False,
+                )
+            except (OSError, subprocess.SubprocessError):
+                pass  # best effort: the container may already be gone
+    finally:
+        # Staged inputs are not part of the run output; credentials in
+        # particular must not be left behind in the results directory.
+        shutil.rmtree(fixtures_staging, ignore_errors=True)
+        if creds_dir is not None:
+            shutil.rmtree(creds_dir, ignore_errors=True)
+    # claude's stderr goes to /output/stderr.log inside the container; surface
+    # its tail when the docker client itself reported nothing.
+    stderr_log = eval_dir / "stderr.log"
+    if not stderr_tail.strip() and stderr_log.is_file():
+        stderr_tail = stderr_log.read_text(encoding="utf-8", errors="replace")[-2000:]
+    metrics = compute_metrics(transcript_path, artifacts_dir, elapsed, return_code, stderr_tail)
+    write_json(eval_dir / "metrics.json", metrics)
+    return {
+        "eval_id": eval_id,
+        "elapsed_s": elapsed,
+        "return_code": return_code,
+        "transcript": str(transcript_path.relative_to(run_dir)),
+        "artifacts_dir": str(artifacts_dir.relative_to(run_dir)),
+        "metrics": metrics,
+    }
+def resolve_fixtures(files: list[str], project_root: Path) -> list[tuple[Path, str]]:
+    """Locate each declared fixture and decide where it lands in the workspace.
+    Each entry is looked up relative to ``project_root`` first, then relative to
+    the current working directory. Entries that match no file are reported on
+    stderr and skipped.
+    Args:
+        files: Fixture paths from the eval definition; ``None`` counts as empty.
+        project_root: Project directory used as the primary lookup base.
+    Returns:
+        ``(resolved source file, workspace-relative destination)`` pairs. A plain
+        relative entry is its own destination. An absolute entry, or one that
+        contains ``..``, is mapped to its path below ``project_root`` when it lives
+        there and to its bare file name otherwise, so the destination can never
+        point back at the source or outside the workspace.
+    """
+    root = project_root.resolve()
+    out: list[tuple[Path, str]] = []
+    for entry in files or []:
+        candidate = (project_root / entry).resolve()
+        if not candidate.is_file():
+            alt = Path(entry).resolve()
+            if not alt.is_file():
+                print(f"Warning: fixture not found: {entry}", file=sys.stderr)
+                continue
+            candidate = alt
+        entry_path = Path(entry)
+        if entry_path.is_absolute() or ".." in entry_path.parts:
+            # Joining such an entry onto the workspace would either yield the
+            # source itself (absolute) or climb out of the workspace ("..").
+            try:
+                dest_rel = candidate.relative_to(root).as_posix()
+            except ValueError:
+                dest_rel = candidate.name
+        else:
+            dest_rel = entry
+        out.append((candidate, dest_rel))
+    return out
+def snapshot_files(root: Path) -> set[str]:
+    """Return the root-relative path of every regular file found below ``root``.
+    Only paths are recorded; file contents and timestamps are not.
+    """
+    return {
+        str(entry.relative_to(root))
+        for entry in root.rglob("*")
+        if entry.is_file()
+    }
+def diff_workspace(root: Path, before: set[str]) -> list[str]:
+    """List, in sorted order, the files now under ``root`` that ``before`` lacks.
+    ``before`` is an earlier snapshot_files() result for the same root. Files
+    that already existed are never reported, even if they were modified since.
+    """
+    return sorted(path for path in snapshot_files(root) if path not in before)
+def sync_artifacts(workspace: Path, new_files: list[str], dest: Path) -> None:
+    """Copy newly created workspace files into the artifacts directory.
+    Args:
+        workspace: Root of the staged project the eval ran in.
+        new_files: Workspace-relative paths to copy.
+        dest: Artifacts directory; the relative layout is reproduced below it.
+    Entries that are no longer regular files are skipped, as is anything located
+    inside a ``.claude``, ``node_modules``, ``.git`` or ``.venv`` directory of the
+    workspace.
+    """
+    skipped_dirs = {".claude", "node_modules", ".git", ".venv"}
+    for rel in new_files:
+        src = workspace / rel
+        if not src.is_file():
+            continue
+        # Test only the workspace-relative components: the workspace itself may
+        # legitimately live below a directory with one of these names.
+        if skipped_dirs.intersection(Path(rel).parts):
+            continue
+        target = dest / rel
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, target)
+def compute_metrics(transcript: Path, artifacts: Path, elapsed: float,
+                    rc: int, stderr_tail: str) -> dict:
+    """Summarise one eval run from its transcript and artifacts.
+    Args:
+        transcript: stream-json transcript, one JSON event per line; may be absent.
+        artifacts: Directory holding the files produced by the run.
+        elapsed: Wall-clock duration in seconds.
+        rc: Exit status recorded for the run.
+        stderr_tail: Trailing stderr text to store with the metrics.
+    Returns:
+        Dict with ``elapsed_s`` (rounded to 2 decimals), ``return_code``,
+        ``tool_calls`` (name -> count over ``tool_use`` items of assistant events),
+        ``total_tool_calls``, ``total_steps`` (number of assistant events),
+        ``output_chars`` and ``transcript_chars`` (both are sizes in bytes) and
+        ``stderr_tail``.
+    Lines that are blank, are not valid JSON, or do not have the expected shape
+    are ignored rather than aborting the summary.
+    """
+    tool_calls: dict[str, int] = {}
+    total_steps = 0
+    transcript_size = 0
+    if transcript.is_file():
+        transcript_size = transcript.stat().st_size
+        # Iterate the file rather than str.splitlines(): the latter also breaks
+        # on U+2028/U+2029/U+0085, which may appear unescaped inside JSON strings
+        # and would cut an event in two.
+        with transcript.open(encoding="utf-8", errors="replace") as fh:
+            for raw in fh:
+                raw = raw.strip()
+                if not raw:
+                    continue
+                try:
+                    evt = json.loads(raw)
+                except json.JSONDecodeError:
+                    continue
+                if not isinstance(evt, dict) or evt.get("type") != "assistant":
+                    continue
+                total_steps += 1
+                message = evt.get("message")
+                content = message.get("content") if isinstance(message, dict) else None
+                if not isinstance(content, list):
+                    continue  # e.g. null message or plain-string content
+                for item in content:
+                    if isinstance(item, dict) and item.get("type") == "tool_use":
+                        name = item.get("name", "?")
+                        tool_calls[name] = tool_calls.get(name, 0) + 1
+    output_chars = 0
+    for f in artifacts.rglob("*"):
+        if f.is_file():
+            try:
+                output_chars += f.stat().st_size
+            except OSError:
+                pass  # file vanished or is unreadable; leave it out of the total
+    return {
+        "elapsed_s": round(elapsed, 2),
+        "return_code": rc,
+        "tool_calls": tool_calls,
+        "total_tool_calls": sum(tool_calls.values()),
+        "total_steps": total_steps,
+        "output_chars": output_chars,
+        "transcript_chars": transcript_size,
+        "stderr_tail": stderr_tail,
+    }
+def main() -> int:
+    """Command-line entry point: run the selected evals and write the run summary.
+    Parses the arguments, loads the evals file (either a bare list or an object
+    with an ``evals`` list), creates ``<output-dir>/<run id>/`` with ``run.json``,
+    runs every eval on a thread pool using the Docker or local runner, then
+    writes ``execution-summary.json`` and prints the same summary to stdout.
+    Returns:
+        0 once the summary has been written (individual eval failures are
+        reported in the summary, not through the exit status); 2 when the evals
+        file is missing or does not contain a list of eval objects.
+    """
+    parser = argparse.ArgumentParser(description="Run a skill's artifact evals in isolation")
+    parser.add_argument("--skill-path", required=True, type=Path)
+    parser.add_argument("--evals-file", required=True, type=Path)
+    parser.add_argument("--project-root", required=True, type=Path)
+    parser.add_argument("--output-dir", required=True, type=Path)
+    parser.add_argument("--isolation", choices=("docker", "local"), required=True)
+    parser.add_argument("--workers", type=int, default=8)
+    parser.add_argument("--timeout", type=int, default=600)
+    parser.add_argument("--eval-ids", default=None, help="Comma-separated subset of eval ids to run")
+    parser.add_argument("--quiet", action="store_true")
+    args = parser.parse_args()
+    skill_path = args.skill_path.resolve()
+    project_root = args.project_root.resolve()
+    evals_file = args.evals_file.resolve()
+    if not evals_file.is_file():
+        print(f"evals file not found: {evals_file}", file=sys.stderr)
+        return 2
+    skill_name, _, _ = parse_skill_md(skill_path)
+    data = read_json(evals_file)
+    evals = data["evals"] if isinstance(data, dict) and "evals" in data else data
+    # Reject malformed input up front instead of failing later with AttributeError.
+    if not isinstance(evals, list) or not all(isinstance(e, dict) for e in evals):
+        print(f"evals file must contain a list of eval objects: {evals_file}", file=sys.stderr)
+        return 2
+    if args.eval_ids:
+        wanted = {x.strip() for x in args.eval_ids.split(",") if x.strip()}
+        unknown = sorted(wanted - {str(e.get("id")) for e in evals})
+        if unknown:
+            print(f"Warning: unknown eval ids ignored: {', '.join(unknown)}", file=sys.stderr)
+        evals = [e for e in evals if str(e.get("id")) in wanted]
+    run_id = new_run_id(skill_name)
+    run_dir = (args.output_dir / run_id).resolve()
+    run_dir.mkdir(parents=True, exist_ok=True)
+    write_json(run_dir / "run.json", {
+        "run_id": run_id,
+        "skill_name": skill_name,
+        "skill_path": str(skill_path),
+        "project_root": str(project_root),
+        "evals_file": str(evals_file),
+        "isolation": args.isolation,
+        "started_at": utc_now_iso(),
+        "eval_count": len(evals),
+    })
+    runner = run_eval_docker if args.isolation == "docker" else run_eval_local
+    results: list[dict] = []
+    if not args.quiet:
+        print(
+            f"[run_evals] {len(evals)} evals, isolation={args.isolation}, run_dir={run_dir}",
+            file=sys.stderr,
+        )
+    # ThreadPoolExecutor rejects max_workers <= 0; fall back to a single worker.
+    with ThreadPoolExecutor(max_workers=max(1, args.workers)) as pool:
+        future_to_eval = {
+            pool.submit(
+                runner,
+                item,
+                run_dir,
+                skill_path,
+                project_root,
+                # A missing or null per-eval timeout means "use --timeout".
+                int(args.timeout if item.get("timeout") is None else item["timeout"]),
+                discover_setup_dirs(evals_file, str(item.get("id", ""))),
+            ): item
+            for item in evals
+        }
+        for fut in as_completed(future_to_eval):
+            item = future_to_eval[fut]
+            try:
+                res = fut.result()
+            except Exception as e:
+                # Boundary handler: one failing eval must not abort the others.
+                res = {"eval_id": str(item.get("id")), "error": str(e), "return_code": -1}
+            results.append(res)
+            if not args.quiet:
+                rc = res.get("return_code")
+                status = "ok" if rc == 0 else f"rc={rc}"
+                print(
+                    f"  [{status}] eval {res.get('eval_id')} ({res.get('elapsed_s', 0):.1f}s)",
+                    file=sys.stderr,
+                )
+    summary = {
+        "run_id": run_id,
+        "completed_at": utc_now_iso(),
+        "total": len(evals),
+        "executed": len(results),
+        "exec_failures": sum(1 for r in results if r.get("return_code") != 0),
+        "run_dir": str(run_dir),
+        "results": results,
+    }
+    write_json(run_dir / "execution-summary.json", summary)
+    print(json.dumps(summary, indent=2))
+    return 0
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())