npm - agent-harness-kit - Versions diffs - 0.3.0 - Mend

agent-harness-kit 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

package/.claude-plugin/marketplace.json +27 -0
package/.claude-plugin/plugin.json +25 -0
package/LICENSE +21 -0
package/README.md +165 -0
package/bin/cli.mjs +261 -0
package/package.json +64 -0
package/src/core/detect-stack.mjs +181 -0
package/src/core/doctor.mjs +106 -0
package/src/core/patch-package-json.mjs +53 -0
package/src/core/render-templates.mjs +277 -0
package/src/core/upgrade.mjs +274 -0
package/src/templates/.claude/agents/api-consistency-reviewer.md +33 -0
package/src/templates/.claude/agents/architecture-reviewer.md.hbs +41 -0
package/src/templates/.claude/agents/performance-reviewer.md +35 -0
package/src/templates/.claude/agents/reliability-reviewer.md +38 -0
package/src/templates/.claude/agents/security-reviewer.md +39 -0
package/src/templates/.claude/hooks/hooks.json.hbs +39 -0
package/src/templates/.claude/settings.json.hbs +25 -0
package/src/templates/.claude/skills/add-adr/SKILL.md +60 -0
package/src/templates/.claude/skills/add-feature/SKILL.md.hbs +50 -0
package/src/templates/.claude/skills/debug-flow/SKILL.md.hbs +38 -0
package/src/templates/.claude/skills/doc-drift-scan/SKILL.md +43 -0
package/src/templates/.claude/skills/eval-runner/SKILL.md +55 -0
package/src/templates/.claude/skills/garbage-collection/SKILL.md.hbs +49 -0
package/src/templates/.claude/skills/inspect-app/SKILL.md +57 -0
package/src/templates/.claude/skills/inspect-module/SKILL.md.hbs +53 -0
package/src/templates/.claude/skills/propose-harness-improvement/SKILL.md +43 -0
package/src/templates/.claude/skills/structural-test-author/SKILL.md.hbs +46 -0
package/src/templates/.claude/skills/write-skill/SKILL.md +39 -0
package/src/templates/CLAUDE.md.hbs +70 -0
package/src/templates/_adapter-python/.importlinter +14 -0
package/src/templates/_adapter-python/harness/__init__.py +0 -0
package/src/templates/_adapter-python/harness/eval_runner.py +281 -0
package/src/templates/_adapter-python/harness/structural_test.py +195 -0
package/src/templates/_adapter-typescript/.dependency-cruiser.cjs +27 -0
package/src/templates/_adapter-typescript/eslint.config.mjs +38 -0
package/src/templates/_adapter-typescript/harness/eval-runner.mjs +322 -0
package/src/templates/_adapter-typescript/harness/structural-test.mjs +125 -0
package/src/templates/_ci/.github/workflows/eval-nightly.yml +59 -0
package/src/templates/_ci/.github/workflows/harness.yml +55 -0
package/src/templates/docs/adr/0001-use-agent-harness-kit.md.hbs +56 -0
package/src/templates/docs/agent-failures.md +25 -0
package/src/templates/docs/architecture.md.hbs +47 -0
package/src/templates/docs/core-beliefs.md.hbs +41 -0
package/src/templates/docs/golden-principles.md.hbs +80 -0
package/src/templates/docs/tech-debt-tracker.md +30 -0
package/src/templates/feature_list.json.hbs +29 -0
package/src/templates/harness.config.json.hbs +40 -0
package/src/templates/scripts/dev-up.sh.hbs +51 -0
package/src/templates/scripts/harness-report.mjs +189 -0
package/src/templates/scripts/install-git-hooks.sh +18 -0
package/src/templates/scripts/pre-push.sh +21 -0
package/src/templates/scripts/precompletion-checklist.sh.hbs +99 -0
package/src/templates/scripts/structural-test-on-edit.sh.hbs +53 -0
package/src/templates/scripts/telemetry-on-skill.sh +26 -0

package/src/templates/_adapter-python/harness/eval_runner.py ADDED Viewed

@@ -0,0 +1,281 @@
+"""Drive Claude Code through .harness/eval/tasks/*.json and grade each on
+outcome / process / style / efficiency.
+Per-task JSONL row goes to .harness/eval/results/<sha>.jsonl. On regression
+(any task failing in CI), exit 1 so the workflow blocks merge.
+Transports:
+    --transport=claude-cli  spawn `claude -p` and capture stream-json (default)
+    --transport=mock        synthetic transcript — use in CI smoke-tests, no API key needed
+Sets:
+    --quick                 first 3 tasks (~$0.30, ~2 min on Sonnet)
+    --full                  all tasks (~$2, ~15 min)
+    --tasks <glob>          custom set
+Usage::
+    python -m harness.eval_runner --quick
+    python -m harness.eval_runner --full --transport=mock
+    python -m harness.eval_runner --tasks 01-trivial-endpoint.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Callable
+# Repository root is the current working directory (not this file's location),
+# so the runner must be started from the project root.
+ROOT = Path.cwd()
+# Directory scanned for task definitions (``*.json``).
+TASKS_DIR = ROOT / ".harness" / "eval" / "tasks"
+# Default directory for the per-commit JSONL result files.
+RESULTS_DIR = ROOT / ".harness" / "eval" / "results"
+def _git_sha() -> str:
+    try:
+        return subprocess.check_output(
+            ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
+        ).decode().strip()
+    except Exception:
+        return "no-git"
+def _load_tasks(args: argparse.Namespace) -> list[dict]:
+    if not TASKS_DIR.exists():
+        print(f"No tasks directory at {TASKS_DIR}. Run `agent-harness-kit init` first.", file=sys.stderr)
+        sys.exit(1)
+    files = sorted(p for p in TASKS_DIR.glob("*.json"))
+    if args.tasks:
+        files = [p for p in files if args.tasks == p.name or args.tasks in p.name]
+    elif args.quick:
+        files = files[:3]
+    out = []
+    for f in files:
+        t = json.loads(f.read_text())
+        t["_file"] = str(f)
+        out.append(t)
+    return out
+# ---- transports ----
+def _transport_claude_cli(task: dict) -> dict:
+    """Spawn `claude -p` with stream-json output and flatten the wire format
+    into the same shape the mock transport produces (so the graders don't
+    have to know about both shapes).
+    Real wire format (Claude Code 2.1.x)::
+        {type:"assistant", message:{content:[{type:"tool_use", name, input}]}}
+        {type:"result", usage:{input_tokens, output_tokens, cache_*}}
+    Flat shape graders consume::
+        {type:"tool_use", tool:<name>, path:<input.file_path|input.path>}
+        {type:"token_usage", total:<sum of all token fields>}
+    """
+    proc = subprocess.run(
+        [
+            "claude",
+            "-p",
+            task["input"],
+            "--output-format",
+            "stream-json",
+            "--verbose",
+            "--max-turns",
+            "20",
+        ],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    events: list[dict] = []
+    for line in proc.stdout.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            raw = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        events.append({"type": raw.get("type"), "raw": raw})
+        # Flatten tool_use blocks from assistant messages.
+        if raw.get("type") == "assistant" and raw.get("message", {}).get("content"):
+            for block in raw["message"]["content"]:
+                if block.get("type") != "tool_use":
+                    continue
+                # /skill invocations come in as the Skill tool with input.skill.
+                if block.get("name") == "Skill" and block.get("input", {}).get("skill"):
+                    events.append({"type": "tool_use", "tool": block["input"]["skill"]})
+                path = block.get("input", {}).get("file_path") or block.get("input", {}).get("path")
+                events.append({"type": "tool_use", "tool": block.get("name"), "path": path})
+        # Final result has aggregated usage.
+        if raw.get("type") == "result" and raw.get("usage"):
+            u = raw["usage"]
+            total = (
+                u.get("input_tokens", 0)
+                + u.get("output_tokens", 0)
+                + u.get("cache_creation_input_tokens", 0)
+                + u.get("cache_read_input_tokens", 0)
+            )
+            events.append({"type": "token_usage", "total": total})
+    if proc.returncode != 0:
+        return {"events": events, "stderr": proc.stderr[:500], "error": True}
+    return {"events": events, "stderr": proc.stderr}
+def _transport_mock(task: dict) -> dict:
+    """Synthetic transcript that satisfies the default expectations."""
+    expected = task.get("expected", {})
+    events = []
+    for skill in expected.get("skillsInvoked", []):
+        events.append({"type": "tool_use", "tool": skill})
+    min_files = (expected.get("filesChanged") or {}).get("min", 1)
+    for i in range(min_files):
+        events.append({"type": "tool_use", "tool": "Write", "path": f"app/mock_{i}.py"})
+    events.append({"type": "token_usage", "total": min(expected.get("tokensMax", 5000), 5000)})
+    return {"events": events, "stderr": ""}
+# Registry mapping a transport name to the function that produces a transcript
+# for one task. Each callable takes the task dict and returns a dict with an
+# "events" list and a "stderr" string.
+TRANSPORTS: dict[str, Callable[[dict], dict]] = {
+    "claude-cli": _transport_claude_cli,
+    "mock": _transport_mock,
+}
+# ---- graders ----
+def _grade_outcome(task: dict) -> dict | None:
+    if task.get("expected", {}).get("structuralTest") != "pass":
+        return None
+    rc = subprocess.run(
+        ["python", "-m", "harness.structural_test"],
+        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+    ).returncode
+    return {
+        "dim": "outcome",
+        "score": 1 if rc == 0 else 0,
+        "info": "structural test passed" if rc == 0 else "structural test failed",
+    }
+def _grade_process(task: dict, transcript: dict) -> dict | None:
+    expected = task.get("expected", {}).get("skillsInvoked", [])
+    if not expected:
+        return None
+    invoked = {e.get("tool") for e in transcript["events"] if e.get("type") == "tool_use"}
+    missing = [s for s in expected if s not in invoked]
+    return {
+        "dim": "process",
+        "score": 1 if not missing else 0,
+        "info": "all expected skills invoked" if not missing else f"missing skills: {', '.join(missing)}",
+    }
+def _grade_style(task: dict, transcript: dict) -> dict | None:
+    rng = task.get("expected", {}).get("filesChanged")
+    if not rng:
+        return None
+    writes = [
+        e for e in transcript["events"]
+        if e.get("type") == "tool_use" and e.get("tool") in ("Write", "Edit", "MultiEdit")
+    ]
+    distinct = len({e.get("path") for e in writes if e.get("path")})
+    ok = rng["min"] <= distinct <= rng["max"]
+    return {
+        "dim": "style",
+        "score": 1 if ok else 0,
+        "info": f"{distinct} files changed (expected {rng['min']}-{rng['max']})",
+    }
+def _grade_efficiency(task: dict, transcript: dict) -> dict | None:
+    cap = task.get("expected", {}).get("tokensMax")
+    if not cap:
+        return None
+    tokens = sum(e.get("total", 0) for e in transcript["events"] if e.get("type") == "token_usage")
+    return {
+        "dim": "efficiency",
+        "score": 1 if tokens <= cap else 0,
+        "info": f"{tokens} tokens (cap {cap})",
+    }
+def run_eval(args: argparse.Namespace) -> dict:
+    tasks = _load_tasks(args)
+    if not tasks:
+        print("No tasks matched.", file=sys.stderr)
+        return {"results": [], "passed": 0}
+    transport = TRANSPORTS.get(args.transport)
+    if transport is None:
+        print(
+            f"Unknown transport: {args.transport}. Try: {', '.join(TRANSPORTS)}",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+    sha = _git_sha()
+    out_path = Path(args.out) if args.out else RESULTS_DIR / f"{sha}.jsonl"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    results: list[dict] = []
+    for task in tasks:
+        try:
+            transcript = transport(task)
+        except Exception as exc:
+            transcript = {"events": [], "stderr": str(exc), "error": True}
+        grades = [
+            g for g in (
+                _grade_outcome(task),
+                _grade_process(task, transcript),
+                _grade_style(task, transcript),
+                _grade_efficiency(task, transcript),
+            ) if g is not None
+        ]
+        passed = bool(grades) and all(g["score"] == 1 for g in grades)
+        row = {
+            "taskId": task["id"],
+            "sha": sha,
+            "ts": datetime.now(timezone.utc).isoformat(),
+            "grades": grades,
+            "passed": passed,
+        }
+        results.append(row)
+        with out_path.open("a") as fh:
+            fh.write(json.dumps(row) + "\n")
+    return {"results": results, "passed": sum(1 for r in results if r["passed"]), "outPath": str(out_path), "sha": sha}
+def _summarize(summary: dict) -> None:
+    print(f"\nEval run {summary['sha']} — {summary['passed']}/{len(summary['results'])} passed ({summary['outPath']})")
+    for r in summary["results"]:
+        mark = "✓" if r["passed"] else "✗"
+        print(f"  {mark} {r['taskId']}")
+        for g in r["grades"]:
+            m = "✓" if g["score"] == 1 else "✗"
+            print(f"      {m} {g['dim']}: {g['info']}")
+def _main() -> int:
+    ap = argparse.ArgumentParser(description="Drive eval tasks against Claude Code.")
+    ap.add_argument("--quick", action="store_true", help="run first 3 tasks only")
+    ap.add_argument("--full", action="store_true", help="run all tasks")
+    ap.add_argument("--tasks", help="filename or substring to filter tasks")
+    ap.add_argument("--transport", default="claude-cli", choices=list(TRANSPORTS), help="transport to use")
+    ap.add_argument("--out", help="results path (default .harness/eval/results/<sha>.jsonl)")
+    args = ap.parse_args()
+    summary = run_eval(args)
+    _summarize(summary)
+    if os.environ.get("CI") == "true" and summary["passed"] < len(summary["results"]):
+        return 1
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(_main())

package/src/templates/_adapter-python/harness/structural_test.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""Forward-only layer enforcement for Python projects.
+Reads ``harness.config.json``. For each domain, parses every source file's
+imports (via libcst) and asserts that no import goes "backward" through the
+layer order. New violations on existing code are baselined into
+``.harness/structural-baseline.json`` on first run.
+Exit codes:
+    0 -- clean (or only baselined violations)
+    2 -- new violations found (Claude Code reads stderr and re-prompts)
+Usage::
+    python -m harness.structural_test            # full repo
+    python -m harness.structural_test --file F   # single file (PostToolUse hook)
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+try:
+    import libcst as cst
+except ImportError:  # pragma: no cover -- helpful error for first-time users
+    print(
+        "libcst is not installed. Run `pip install libcst` (or add it to your dev deps).",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+# Repository root is the current working directory, so the test must be started
+# from the project root.
+ROOT = Path.cwd()
+# Project configuration that declares the domains and their layer order.
+CFG_PATH = ROOT / "harness.config.json"
+# JSON list of violation keys that are tolerated as pre-existing.
+BASELINE_PATH = ROOT / ".harness" / "structural-baseline.json"
+def _load_cfg() -> dict:
+    return json.loads(CFG_PATH.read_text())
+def _layer_of(path: Path, cfg: dict) -> tuple[str, dict] | None:
+    rel = str(path.relative_to(ROOT)) if path.is_absolute() else str(path)
+    for d in cfg["domains"]:
+        if not rel.startswith(d["root"]):
+            continue
+        for layer in d["layers"]:
+            if f"/{layer}/" in rel or rel.endswith(f"/{layer}.py"):
+                return layer, d
+    return None
+def _resolve_imported_module(module: str, cfg: dict) -> Path | None:
+    """Convert a dotted import path into the file path it most likely resolves to."""
+    parts = module.split(".")
+    candidate_pkg = ROOT / Path(*parts) / "__init__.py"
+    candidate_mod = ROOT / Path(*parts).with_suffix(".py")
+    if candidate_pkg.exists():
+        return candidate_pkg
+    if candidate_mod.exists():
+        return candidate_mod
+    return None
+class _ImportCollector(cst.CSTVisitor):
+    METADATA_DEPENDENCIES = (cst.metadata.PositionProvider,)
+    def __init__(self) -> None:
+        self.imports: list[tuple[int, str]] = []
+    def visit_Import(self, node: cst.Import) -> None:
+        for alias in node.names:
+            line = self.get_metadata(cst.metadata.PositionProvider, node).start.line
+            full = self._dotted(alias.name)
+            if full:
+                self.imports.append((line, full))
+    def visit_ImportFrom(self, node: cst.ImportFrom) -> None:
+        if node.module is None:
+            return
+        line = self.get_metadata(cst.metadata.PositionProvider, node).start.line
+        full = self._dotted(node.module)
+        if full:
+            self.imports.append((line, full))
+    @staticmethod
+    def _dotted(node: cst.CSTNode) -> str | None:
+        parts: list[str] = []
+        cur = node
+        while isinstance(cur, cst.Attribute):
+            if isinstance(cur.attr, cst.Name):
+                parts.insert(0, cur.attr.value)
+            cur = cur.value
+        if isinstance(cur, cst.Name):
+            parts.insert(0, cur.value)
+        return ".".join(parts) if parts else None
+def collect_violations(scoped_file: Path | None = None) -> list[dict]:
+    cfg = _load_cfg()
+    out: list[dict] = []
+    sources = list((ROOT / cfg["domains"][0]["root"]).rglob("*.py"))
+    for src_path in sources:
+        if scoped_file and src_path.resolve() != scoped_file.resolve():
+            continue
+        src = _layer_of(src_path, cfg)
+        if not src:
+            continue
+        src_layer, src_domain = src
+        src_idx = src_domain["layers"].index(src_layer)
+        try:
+            wrapper = cst.MetadataWrapper(cst.parse_module(src_path.read_text()))
+        except cst.ParserSyntaxError:
+            continue
+        col = _ImportCollector()
+        wrapper.visit(col)
+        for line, imported in col.imports:
+            tgt_path = _resolve_imported_module(imported, cfg)
+            if tgt_path is None:
+                continue
+            tgt = _layer_of(tgt_path, cfg)
+            if not tgt:
+                continue
+            tgt_layer, tgt_domain = tgt
+            if tgt_domain["name"] != src_domain["name"]:
+                continue
+            tgt_idx = tgt_domain["layers"].index(tgt_layer)
+            if src_idx < tgt_idx:
+                out.append(
+                    {
+                        "file": str(src_path),
+                        "line": line,
+                        "from": src_layer,
+                        "to": tgt_layer,
+                        "key": f"{src_path}::{imported}",
+                    }
+                )
+    return out
+def _main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--file", type=Path, default=None)
+    args = ap.parse_args()
+    cfg = _load_cfg()
+    baseline = (
+        set(json.loads(BASELINE_PATH.read_text()))
+        if BASELINE_PATH.exists()
+        else None
+    )
+    violations = [
+        v for v in collect_violations(args.file)
+        if baseline is None or v["key"] not in baseline
+    ]
+    # First-run baseline.
+    if baseline is None and violations:
+        BASELINE_PATH.parent.mkdir(parents=True, exist_ok=True)
+        BASELINE_PATH.write_text(
+            json.dumps([v["key"] for v in violations], indent=2) + "\n"
+        )
+        print(
+            f"✓ structural test: baselined {len(violations)} existing violations "
+            f"(.harness/structural-baseline.json)."
+        )
+        print(
+            "  New violations introduced after this point will block. "
+            "Existing ones can be fixed incrementally."
+        )
+        return 0
+    if not violations:
+        print("✓ structural test passed")
+        return 0
+    for v in violations:
+        print(
+            f"✖ {v['file']}:{v['line']}  layer={v['from']} → {v['to']}  (must be forward-only)",
+            file=sys.stderr,
+        )
+    print(
+        f"\n{len(violations)} new layer violation(s). Fix the import direction.",
+        file=sys.stderr,
+    )
+    print(
+        f"Layer order for domain \"{cfg['domains'][0]['name']}\": "
+        f"{' → '.join(cfg['domains'][0]['layers'])}",
+        file=sys.stderr,
+    )
+    return 2
+if __name__ == "__main__":
+    raise SystemExit(_main())

package/src/templates/_adapter-typescript/.dependency-cruiser.cjs ADDED Viewed

@@ -0,0 +1,27 @@
+// dependency-cruiser — third structural sensor (after ts-morph + eslint).
+// Catches circular imports and orphan modules in addition to layer violations.
+module.exports = {
+  forbidden: [
+    {
+      name: "no-backward-layer",
+      severity: "error",
+      from: { path: "^src/[^/]+/types/" },
+      to:   { path: "^src/[^/]+/(config|repo|service|runtime|ui)/" },
+    },
+    { name: "no-circular", severity: "error", from: {}, to: { circular: true } },
+    {
+      name: "no-orphan",
+      severity: "warn",
+      from: { orphan: true, pathNot: "(\\.spec|\\.test|/__tests__/|/__mocks__/)" },
+      to: {},
+    },
+  ],
+  options: {
+    tsConfig: { fileName: "tsconfig.json" },
+    enhancedResolveOptions: {
+      exportsFields: ["exports"],
+      conditionNames: ["import", "require", "node"],
+    },
+  },
+};

package/src/templates/_adapter-typescript/eslint.config.mjs ADDED Viewed

@@ -0,0 +1,38 @@
+// eslint-plugin-boundaries — defense in depth alongside the structural test.
+// The structural test is the source of truth; this catches the same violations
+// in the editor for faster feedback.
+//
+// Note: requires `eslint-plugin-boundaries` ≥ 5. Install if missing:
+//   npm i -D eslint-plugin-boundaries
+import boundaries from "eslint-plugin-boundaries";
+export default [
+  {
+    plugins: { boundaries },
+    settings: {
+      // One element type per layer; the `*` segment is the domain directory.
+      "boundaries/elements": [
+        { type: "types",   pattern: "src/*/types/**" },
+        { type: "config",  pattern: "src/*/config/**" },
+        { type: "repo",    pattern: "src/*/repo/**" },
+        { type: "service", pattern: "src/*/service/**" },
+        { type: "runtime", pattern: "src/*/runtime/**" },
+        { type: "ui",      pattern: "src/*/ui/**" },
+      ],
+      "boundaries/include": ["src/**/*"],
+    },
+    rules: {
+      // Severity 2 = error. Anything not explicitly allowed below is rejected.
+      // NOTE(review): the rule name and the object-style from/allow selectors
+      // depend on the installed plugin version — confirm against its docs.
+      "boundaries/dependencies": [2, {
+        default: "disallow",
+        rules: [
+          // Each layer may import only from the layers listed in `allow`.
+          // `ui` does not list `repo`, so ui → repo imports are rejected here.
+          { from: { type: "ui" },      allow: { to: { type: ["runtime","service","config","types"] } } },
+          { from: { type: "runtime" }, allow: { to: { type: ["service","repo","config","types"] } } },
+          { from: { type: "service" }, allow: { to: { type: ["repo","config","types"] } } },
+          { from: { type: "repo" },    allow: { to: { type: ["config","types"] } } },
+          { from: { type: "config" },  allow: { to: { type: ["types"] } } },
+          // `types` is the lowest layer and may import from no other layer.
+          { from: { type: "types" },   disallow: { to: { type: "*" } } },
+        ],
+      }],
+    },
+  },
+];