npm - ma-agents - Versions diffs - 3.12.2 → 3.13.0 - Mend

ma-agents 3.12.2 → 3.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (255) hide show

package/lib/bmad-cache/bmb/skills/bmad-eval-runner/scripts/docker_setup.py ADDED Viewed

@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.9"
+# ///
+"""Detect Docker and build the bmad-eval-runner image when needed.
+Usage:
+  python3 docker_setup.py --check                # exit 0 if image is ready, 1 otherwise
+  python3 docker_setup.py --build                # build the image (no-op if present)
+  python3 docker_setup.py --rebuild              # force rebuild
+"""
+from __future__ import annotations
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+# Tag of the Docker image that this script checks for and builds.
+IMAGE_TAG = "bmad-eval-runner:latest"
+# Directory containing this script, with symlinks resolved.
+SCRIPT_DIR = Path(__file__).resolve().parent
+# The Dockerfile lives in the sibling "assets" folder; that folder is also
+# passed to `docker build` as the build context.
+DOCKERFILE = SCRIPT_DIR.parent / "assets" / "Dockerfile"
+def docker_available() -> tuple[bool, str]:
+    if shutil.which("docker") is None:
+        return False, "docker CLI not found on PATH"
+    try:
+        result = subprocess.run(
+            ["docker", "info"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        if result.returncode != 0:
+            return False, f"`docker info` failed: {result.stderr.strip().splitlines()[-1] if result.stderr.strip() else 'unknown'}"
+        return True, "ok"
+    except subprocess.TimeoutExpired:
+        return False, "`docker info` timed out"
+    except Exception as e:
+        return False, f"docker check error: {e}"
+def image_present(tag: str = IMAGE_TAG) -> bool:
+    try:
+        result = subprocess.run(
+            ["docker", "image", "inspect", tag],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            timeout=10,
+        )
+        return result.returncode == 0
+    except Exception:
+        return False
+def build_image(tag: str = IMAGE_TAG, force: bool = False, verbose: bool = True) -> int:
+    if not DOCKERFILE.is_file():
+        print(f"Dockerfile missing at {DOCKERFILE}", file=sys.stderr)
+        return 2
+    cmd = ["docker", "build", "-t", tag, "-f", str(DOCKERFILE), str(DOCKERFILE.parent)]
+    if force:
+        cmd.insert(2, "--no-cache")
+    if verbose:
+        print(f"Building {tag} from {DOCKERFILE} ...", file=sys.stderr)
+    proc = subprocess.run(cmd, stdout=sys.stderr if verbose else subprocess.DEVNULL, stderr=sys.stderr)
+    return proc.returncode
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Manage the bmad-eval-runner Docker image")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--check", action="store_true", help="Report status as JSON; exit 0 if image is ready")
+    group.add_argument("--build", action="store_true", help="Build the image (no-op if already present)")
+    group.add_argument("--rebuild", action="store_true", help="Force rebuild")
+    parser.add_argument("--quiet", action="store_true")
+    args = parser.parse_args()
+    available, reason = docker_available()
+    present = image_present() if available else False
+    if args.check:
+        print(json.dumps({
+            "docker_available": available,
+            "docker_reason": reason,
+            "image_present": present,
+            "image_tag": IMAGE_TAG,
+        }, indent=2))
+        return 0 if (available and present) else 1
+    if not available:
+        print(f"Docker is not available: {reason}", file=sys.stderr)
+        return 3
+    if args.rebuild:
+        return build_image(force=True, verbose=not args.quiet)
+    if args.build:
+        if present:
+            if not args.quiet:
+                print(f"{IMAGE_TAG} already present; skipping build (use --rebuild to force).", file=sys.stderr)
+            return 0
+        return build_image(force=False, verbose=not args.quiet)
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/lib/bmad-cache/bmb/skills/bmad-eval-runner/scripts/generate_report.py ADDED Viewed

@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.9"
+# ///
+"""Generate an aggregate HTML report for a run folder.
+Reads run.json, execution-summary.json, each <eval-id>/grading.json (if present),
+and triggers-result.json (if present), then renders a single-file HTML report.
+Usage:
+  python3 generate_report.py --run-dir PATH [-o report.html]
+"""
+from __future__ import annotations
+import argparse
+import html as html_lib
+import json
+import sys
+from pathlib import Path
+def esc(s: object) -> str:
+    return html_lib.escape(str(s), quote=True)
+def load(path: Path) -> dict | list | None:
+    if not path.is_file():
+        return None
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError:
+        return None
+def render(run_dir: Path) -> str:
+    run_meta = load(run_dir / "run.json") or {}
+    exec_summary = load(run_dir / "execution-summary.json") or {}
+    triggers = load(run_dir / "triggers-result.json")
+    eval_blocks: list[str] = []
+    grading_total = 0
+    grading_passed = 0
+    for res in exec_summary.get("results", []):
+        eval_id = str(res.get("eval_id", "?"))
+        eval_dir = run_dir / eval_id
+        grading = load(eval_dir / "grading.json")
+        metrics = res.get("metrics") or load(eval_dir / "metrics.json") or {}
+        rc = res.get("return_code")
+        rows: list[str] = []
+        if grading:
+            for exp in grading.get("expectations", []):
+                passed = bool(exp.get("passed"))
+                grading_total += 1
+                if passed:
+                    grading_passed += 1
+                rows.append(
+                    f'<tr class="{ "pass" if passed else "fail" }">'
+                    f'<td>{ "✔" if passed else "✘" }</td>'
+                    f'<td>{esc(exp.get("text", ""))}</td>'
+                    f'<td>{esc(exp.get("evidence", ""))}</td></tr>'
+                )
+        feedback = (grading or {}).get("eval_feedback") or {}
+        feedback_html = ""
+        if feedback:
+            sugg = feedback.get("suggestions") or []
+            sugg_html = "".join(
+                f"<li><strong>{esc(s.get('assertion','(general)'))}</strong>: {esc(s.get('reason',''))}</li>"
+                for s in sugg
+            )
+            overall = esc(feedback.get("overall", ""))
+            feedback_html = (
+                f'<details class="feedback"><summary>Grader feedback on the evals</summary>'
+                f'<p>{overall}</p>'
+                f'{"<ul>" + sugg_html + "</ul>" if sugg_html else ""}'
+                f'</details>'
+            )
+        artifacts_listing = ""
+        artifacts_dir = eval_dir / "artifacts"
+        if artifacts_dir.is_dir():
+            files = sorted(p for p in artifacts_dir.rglob("*") if p.is_file())
+            if files:
+                artifacts_listing = "<ul>" + "".join(
+                    f'<li><code>{esc(p.relative_to(eval_dir))}</code> '
+                    f'<span class="muted">({p.stat().st_size}b)</span></li>'
+                    for p in files
+                ) + "</ul>"
+        tool_calls = metrics.get("tool_calls", {})
+        tool_summary = ", ".join(f"{k}={v}" for k, v in sorted(tool_calls.items())) or "—"
+        eval_blocks.append(f"""
+        <section class="eval">
+          <h3>Eval {esc(eval_id)} <span class="muted">rc={esc(rc)} · {esc(metrics.get('elapsed_s', '?'))}s</span></h3>
+          <p class="muted">Tool calls: {esc(tool_summary)} · output {esc(metrics.get('output_chars', 0))}b · transcript {esc(metrics.get('transcript_chars', 0))}b</p>
+          { '<table><thead><tr><th></th><th>Expectation</th><th>Evidence</th></tr></thead><tbody>' + ''.join(rows) + '</tbody></table>' if rows else '<p class="muted">No grading.json yet.</p>' }
+          {feedback_html}
+          <details><summary>Artifacts</summary>{artifacts_listing or '<p class="muted">No artifacts captured.</p>'}</details>
+        </section>
+        """)
+    triggers_html = ""
+    if triggers:
+        rows = []
+        for r in triggers.get("results", []):
+            rows.append(
+                f'<tr class="{ "pass" if r["pass"] else "fail" }">'
+                f'<td>{ "✔" if r["pass"] else "✘" }</td>'
+                f'<td>{esc(r["query"])}</td>'
+                f'<td>{esc(r["should_trigger"])}</td>'
+                f'<td>{r["triggers"]}/{r["runs"]} ({r["trigger_rate"]:.2f})</td></tr>'
+            )
+        s = triggers.get("summary", {})
+        triggers_html = f"""
+        <section class="triggers">
+          <h2>Trigger Evals — {s.get('passed',0)}/{s.get('total',0)} pass</h2>
+          <table><thead><tr><th></th><th>Query</th><th>Should fire</th><th>Rate</th></tr></thead>
+          <tbody>{''.join(rows)}</tbody></table>
+        </section>
+        """
+    artifact_summary = ""
+    if exec_summary:
+        artifact_summary = (
+            f"<p>Executed {exec_summary.get('executed', 0)} / {exec_summary.get('total', 0)} "
+            f"evals · {exec_summary.get('exec_failures', 0)} execution failures · "
+            f"grader: {grading_passed}/{grading_total} expectations passed</p>"
+        )
+    return f"""<!doctype html>
+<html><head><meta charset="utf-8"><title>Eval Run — {esc(run_meta.get('skill_name','?'))}</title>
+<style>
+  body {{ font: 14px/1.5 system-ui, sans-serif; max-width: 1080px; margin: 2em auto; color: #222; padding: 0 1em; }}
+  h1, h2, h3 {{ font-weight: 600; }}
+  h1 {{ font-size: 1.6em; margin-bottom: 0.2em; }}
+  .meta {{ color: #666; margin-bottom: 1.5em; }}
+  .muted {{ color: #888; font-weight: normal; }}
+  section.eval {{ border: 1px solid #ddd; border-radius: 6px; padding: 1em 1.2em; margin: 1em 0; background: #fafafa; }}
+  table {{ width: 100%; border-collapse: collapse; margin: 0.5em 0; font-size: 13px; }}
+  th, td {{ text-align: left; padding: 6px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
+  tr.pass td:first-child {{ color: #2c8a3a; font-weight: 700; }}
+  tr.fail td:first-child {{ color: #b3261e; font-weight: 700; }}
+  tr.fail {{ background: #fdf3f2; }}
+  details.feedback {{ margin-top: 0.6em; padding: 0.4em 0.7em; background: #fff8e1; border-radius: 4px; }}
+  details summary {{ cursor: pointer; font-weight: 600; }}
+  code {{ background: #eee; padding: 1px 4px; border-radius: 3px; }}
+</style></head>
+<body>
+<h1>{esc(run_meta.get('skill_name','?'))} — eval run</h1>
+<div class="meta">
+  Run id: <code>{esc(run_meta.get('run_id','?'))}</code> ·
+  isolation: {esc(run_meta.get('isolation','?'))} ·
+  started: {esc(run_meta.get('started_at','?'))}
+</div>
+{artifact_summary}
+{''.join(eval_blocks)}
+{triggers_html}
+</body></html>
+"""
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Generate HTML report for an eval run folder")
+    parser.add_argument("--run-dir", required=True, type=Path)
+    parser.add_argument("-o", "--output", type=Path, default=None)
+    args = parser.parse_args()
+    run_dir = args.run_dir.resolve()
+    if not run_dir.is_dir():
+        print(f"run-dir not found: {run_dir}", file=sys.stderr)
+        return 2
+    out = args.output or (run_dir / "report.html")
+    out.write_text(render(run_dir), encoding="utf-8")
+    print(str(out))
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

package/lib/bmad-cache/bmb/skills/bmad-eval-runner/scripts/pty_runner.py ADDED Viewed

@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.9"
+# ///
+"""Run claude interactively via PTY so the Skill tool is available.
+In `claude -p` (print mode) the Skill tool is never offered — Claude handles
+everything inline. Running `claude` in interactive mode activates the Skill
+tool so dependency skills installed in .claude/skills/ can be properly invoked.
+The PTY tricks claude into thinking it has a terminal (interactive mode) while
+we capture its stream-json output programmatically.
+Usage:
+  python3 pty_runner.py --prompt-file /path/to/prompt.txt \\
+                        --output /path/to/transcript.jsonl \\
+                        [--timeout 600]
+  python3 pty_runner.py --prompt "Run headless. ..." --output transcript.jsonl
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import pty
+import re
+import select
+import subprocess
+import sys
+import time
+from pathlib import Path
+# Matches terminal control noise to strip from PTY output: two-character ESC
+# sequences, CSI sequences (ESC [ ... final byte), and carriage returns.
+ANSI_RE = re.compile(r"\x1b(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])|\r")
+# How long to wait for claude to initialize before sending the prompt.
+# Claude loads skill registry, checks credentials, etc. on startup.
+INIT_WAIT_S = 5.0
+# How long to wait after the stream-json 'result' event before killing claude.
+# Trailing tool-result output sometimes follows the result event.
+POST_RESULT_S = 4.0
+def _strip_ansi(text: str) -> str:
+    return ANSI_RE.sub("", text)
+def run_interactive(prompt: str, output: Path, timeout: int = 600) -> None:
+    """Spawn claude interactively via PTY, send one prompt, capture transcript."""
+    master, slave = pty.openpty()
+    proc = subprocess.Popen(
+        [
+            "claude",
+            "--output-format", "stream-json",
+            "--verbose",
+            "--dangerously-skip-permissions",
+        ],
+        stdin=slave,
+        stdout=slave,
+        stderr=slave,
+        close_fds=True,
+    )
+    os.close(slave)
+    json_lines: list[str] = []
+    buf = b""
+    prompt_sent = False
+    done_at: float | None = None
+    start = time.time()
+    try:
+        while True:
+            elapsed = time.time() - start
+            if elapsed > timeout:
+                print(f"[pty_runner] timeout after {elapsed:.0f}s", file=sys.stderr)
+                break
+            if done_at is not None and (time.time() - done_at) > POST_RESULT_S:
+                break
+            # Short select so we stay responsive but don't spin.
+            r, _, _ = select.select([master], [], [], 0.3)
+            if r:
+                try:
+                    chunk = os.read(master, 8192)
+                except OSError:
+                    break  # PTY closed — claude exited
+                buf += chunk
+                # Process all complete lines in buffer.
+                while b"\n" in buf:
+                    raw, buf = buf.split(b"\n", 1)
+                    line = _strip_ansi(raw.decode("utf-8", errors="replace")).strip()
+                    if not line.startswith("{"):
+                        continue
+                    json_lines.append(line)
+                    try:
+                        obj = json.loads(line)
+                        # 'result' marks end of a claude turn.
+                        if obj.get("type") == "result" and done_at is None:
+                            done_at = time.time()
+                            print(
+                                f"[pty_runner] result event at t={time.time()-start:.1f}s "
+                                f"({len(json_lines)} lines so far)",
+                                file=sys.stderr,
+                            )
+                    except json.JSONDecodeError:
+                        pass
+            else:
+                # Silence window — send prompt once claude has had time to init.
+                if not prompt_sent and (time.time() - start) >= INIT_WAIT_S:
+                    os.write(master, (prompt + "\n").encode())
+                    prompt_sent = True
+                    print(
+                        f"[pty_runner] prompt sent at t={time.time()-start:.1f}s",
+                        file=sys.stderr,
+                    )
+    finally:
+        # Politely ask claude to exit, then hard-kill if needed.
+        try:
+            os.write(master, b"exit\n")
+            time.sleep(0.3)
+        except OSError:
+            pass
+        try:
+            proc.terminate()
+            proc.wait(timeout=5)
+        except Exception:
+            try:
+                proc.kill()
+            except Exception:
+                pass
+        try:
+            os.close(master)
+        except OSError:
+            pass
+    output.parent.mkdir(parents=True, exist_ok=True)
+    content = "\n".join(json_lines) + ("\n" if json_lines else "")
+    output.write_text(content, encoding="utf-8")
+    print(
+        f"[pty_runner] wrote {len(json_lines)} transcript lines → {output}",
+        file=sys.stderr,
+    )
+def main() -> int:
+    p = argparse.ArgumentParser(
+        description="Run claude interactively via PTY and capture stream-json transcript"
+    )
+    grp = p.add_mutually_exclusive_group(required=True)
+    grp.add_argument("--prompt", help="Prompt text")
+    grp.add_argument("--prompt-file", type=Path, help="File containing the prompt")
+    p.add_argument("--output", type=Path, required=True, help="Output .jsonl transcript file")
+    p.add_argument("--timeout", type=int, default=600, help="Hard timeout in seconds")
+    args = p.parse_args()
+    prompt = (
+        args.prompt_file.read_text(encoding="utf-8").strip()
+        if args.prompt_file
+        else args.prompt
+    )
+    run_interactive(prompt, args.output, args.timeout)
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())