harness-maker 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. harness_maker/__init__.py +3 -0
  2. harness_maker/__main__.py +6 -0
  3. harness_maker/_metrics_io.py +82 -0
  4. harness_maker/add_domain.py +143 -0
  5. harness_maker/agent_quality.py +146 -0
  6. harness_maker/ai_readiness.py +281 -0
  7. harness_maker/autoloop_driver.py +511 -0
  8. harness_maker/block_merge.py +584 -0
  9. harness_maker/cache.py +93 -0
  10. harness_maker/cache_diagnostics.py +313 -0
  11. harness_maker/cli.py +1660 -0
  12. harness_maker/communication_audit.py +210 -0
  13. harness_maker/conditional_router.py +229 -0
  14. harness_maker/context_lint.py +144 -0
  15. harness_maker/crawler/__init__.py +82 -0
  16. harness_maker/crawler/anthropic_blog.py +105 -0
  17. harness_maker/crawler/arxiv.py +80 -0
  18. harness_maker/crawler/github_releases.py +112 -0
  19. harness_maker/crawler/osv_dev.py +146 -0
  20. harness_maker/detection_cache.py +161 -0
  21. harness_maker/drift_monitor.py +257 -0
  22. harness_maker/foreign_config.py +592 -0
  23. harness_maker/gates/__init__.py +1 -0
  24. harness_maker/gates/permission_gate.py +140 -0
  25. harness_maker/gates/spec_gate.py +168 -0
  26. harness_maker/gates/worktree_gate.py +157 -0
  27. harness_maker/hooks/__init__.py +1 -0
  28. harness_maker/hooks/flush_session.py +95 -0
  29. harness_maker/hooks/loop_gate.py +99 -0
  30. harness_maker/hooks/post_write_reminder.py +123 -0
  31. harness_maker/hooks/sessionstart_drift.py +475 -0
  32. harness_maker/i18n.py +55 -0
  33. harness_maker/i18n_messages.py +47 -0
  34. harness_maker/improvement.py +195 -0
  35. harness_maker/interview.py +982 -0
  36. harness_maker/io_utils.py +120 -0
  37. harness_maker/llm_judge.py +308 -0
  38. harness_maker/memory/__init__.py +8 -0
  39. harness_maker/memory/_locking.py +102 -0
  40. harness_maker/memory/episodic.py +109 -0
  41. harness_maker/memory/profile.py +94 -0
  42. harness_maker/memory/retrieval.py +60 -0
  43. harness_maker/memory/semantic.py +142 -0
  44. harness_maker/models.py +612 -0
  45. harness_maker/modular_edit.py +206 -0
  46. harness_maker/observability/__init__.py +7 -0
  47. harness_maker/observability/dashboard.py +273 -0
  48. harness_maker/observability/verification_cache.py +190 -0
  49. harness_maker/personalization_audit.py +476 -0
  50. harness_maker/plan_verify.py +178 -0
  51. harness_maker/profile.py +489 -0
  52. harness_maker/provenance.py +72 -0
  53. harness_maker/readiness.py +887 -0
  54. harness_maker/recommendation.py +343 -0
  55. harness_maker/reconcile.py +532 -0
  56. harness_maker/refdocs_index.py +252 -0
  57. harness_maker/relevance.py +431 -0
  58. harness_maker/render.py +787 -0
  59. harness_maker/review_telemetry.py +199 -0
  60. harness_maker/rubric_loader.py +78 -0
  61. harness_maker/rubrics/personalization.yaml +38 -0
  62. harness_maker/second_brain.py +473 -0
  63. harness_maker/secscan/__init__.py +24 -0
  64. harness_maker/secscan/dependency_cves.py +121 -0
  65. harness_maker/secscan/hallucination.py +167 -0
  66. harness_maker/secscan/hook_injection.py +73 -0
  67. harness_maker/secscan/permissions.py +83 -0
  68. harness_maker/secscan/prod_name_guard.py +122 -0
  69. harness_maker/secscan/prompt_injection.py +215 -0
  70. harness_maker/secscan/secrets.py +87 -0
  71. harness_maker/security_scanner.py +243 -0
  72. harness_maker/spec_quality.py +209 -0
  73. harness_maker/synthesize.py +595 -0
  74. harness_maker/telemetry.py +469 -0
  75. harness_maker/templates/agents/_partials/communication_full.md.j2 +10 -0
  76. harness_maker/templates/agents/_partials/communication_reframe.md.j2 +16 -0
  77. harness_maker/templates/agents/_partials/communication_soft.md.j2 +8 -0
  78. harness_maker/templates/agents/_partials/finding_schema.md.j2 +94 -0
  79. harness_maker/templates/agents/_partials/hard_rules.md.j2 +11 -0
  80. harness_maker/templates/agents/_partials/reasoning.md.j2 +14 -0
  81. harness_maker/templates/agents/_partials/rubric.md.j2 +15 -0
  82. harness_maker/templates/agents/_standards/_template.md.j2 +13 -0
  83. harness_maker/templates/agents/_standards/python.md.j2 +35 -0
  84. harness_maker/templates/agents/autoloop-coder.md.j2 +9 -0
  85. harness_maker/templates/agents/autoloop-coder_body.md.j2 +50 -0
  86. harness_maker/templates/agents/code-reviewer.md.j2 +29 -0
  87. harness_maker/templates/agents/code-reviewer_body.md.j2 +64 -0
  88. harness_maker/templates/agents/code-verifier.md.j2 +29 -0
  89. harness_maker/templates/agents/code-verifier_body.md.j2 +99 -0
  90. harness_maker/templates/agents/concurrency-reviewer.md.j2 +29 -0
  91. harness_maker/templates/agents/concurrency-reviewer_body.md.j2 +62 -0
  92. harness_maker/templates/agents/consensus-arbiter.md.j2 +9 -0
  93. harness_maker/templates/agents/consensus-arbiter_body.md.j2 +127 -0
  94. harness_maker/templates/agents/executor.md.j2 +33 -0
  95. harness_maker/templates/agents/executor_body.md.j2 +49 -0
  96. harness_maker/templates/agents/performance-reviewer.md.j2 +29 -0
  97. harness_maker/templates/agents/performance-reviewer_body.md.j2 +63 -0
  98. harness_maker/templates/agents/plan-validator.md.j2 +9 -0
  99. harness_maker/templates/agents/plan-validator_body.md.j2 +95 -0
  100. harness_maker/templates/agents/security-auditor.md.j2 +9 -0
  101. harness_maker/templates/agents/security-auditor_body.md.j2 +117 -0
  102. harness_maker/templates/agents/security-reviewer.md.j2 +29 -0
  103. harness_maker/templates/agents/security-reviewer_body.md.j2 +64 -0
  104. harness_maker/templates/agents/stuck.md.j2 +9 -0
  105. harness_maker/templates/agents/stuck_body.md.j2 +129 -0
  106. harness_maker/templates/agents/test-reviewer.md.j2 +9 -0
  107. harness_maker/templates/agents/test-reviewer_body.md.j2 +109 -0
  108. harness_maker/templates/agents/trajectory-monitor.md.j2 +71 -0
  109. harness_maker/templates/agents/ux-reviewer.md.j2 +29 -0
  110. harness_maker/templates/agents/ux-reviewer_body.md.j2 +65 -0
  111. harness_maker/templates/claude-md/Production.en.md.j2 +27 -0
  112. harness_maker/templates/claude-md/Production.ko.md.j2 +27 -0
  113. harness_maker/templates/claude-md/Side.en.md.j2 +23 -0
  114. harness_maker/templates/claude-md/Side.ko.md.j2 +23 -0
  115. harness_maker/templates/codex/AGENTS.md.j2 +57 -0
  116. harness_maker/templates/codex/agent.toml.j2 +8 -0
  117. harness_maker/templates/codex/config.toml.j2 +21 -0
  118. harness_maker/templates/codex/hooks.json.j2 +73 -0
  119. harness_maker/templates/codex/loop_skill.md.j2 +10 -0
  120. harness_maker/templates/codex/stage_skill.md.j2 +10 -0
  121. harness_maker/templates/codex/workflow_skill.md.j2 +38 -0
  122. harness_maker/templates/commands/hm/atomic_command.md.j2 +1 -0
  123. harness_maker/templates/commands/hm/configure.md.j2 +214 -0
  124. harness_maker/templates/commands/hm/health.md.j2 +46 -0
  125. harness_maker/templates/commands/hm/loop.md.j2 +779 -0
  126. harness_maker/templates/commands/hm/make.md.j2 +47 -0
  127. harness_maker/templates/commands/hm/uninstall.md.j2 +57 -0
  128. harness_maker/templates/commands/hm/workflow_command.md.j2 +69 -0
  129. harness_maker/templates/cursor/hooks.json.j2 +62 -0
  130. harness_maker/templates/cursor/mcp.json.j2 +10 -0
  131. harness_maker/templates/cursor/rules/harness.mdc.j2 +132 -0
  132. harness_maker/templates/foreign-configs/agents_md.md.j2 +22 -0
  133. harness_maker/templates/foreign-configs/aider_conf.yml.j2 +16 -0
  134. harness_maker/templates/foreign-configs/claude_md.md.j2 +19 -0
  135. harness_maker/templates/foreign-configs/continue_config.json.j2 +10 -0
  136. harness_maker/templates/foreign-configs/copilot_instructions.md.j2 +16 -0
  137. harness_maker/templates/foreign-configs/cursor_rules.mdc.j2 +23 -0
  138. harness_maker/templates/harness-yaml/Production.yaml.j2 +82 -0
  139. harness_maker/templates/harness-yaml/Side.yaml.j2 +82 -0
  140. harness_maker/templates/hooks/hooks.json.j2 +92 -0
  141. harness_maker/templates/memory/failures.en.md.j2 +20 -0
  142. harness_maker/templates/memory/failures.ko.md.j2 +20 -0
  143. harness_maker/templates/memory/session-readme.md.j2 +34 -0
  144. harness_maker/templates/memory/wiki.en.md.j2 +19 -0
  145. harness_maker/templates/memory/wiki.ko.md.j2 +19 -0
  146. harness_maker/templates/observability/dashboard.md.j2 +28 -0
  147. harness_maker/templates/rubrics/agent_prompt.yaml.j2 +49 -0
  148. harness_maker/templates/rubrics/claude_md.yaml.j2 +74 -0
  149. harness_maker/templates/rubrics/skill.yaml.j2 +43 -0
  150. harness_maker/templates/rubrics/workflow.yaml.j2 +40 -0
  151. harness_maker/templates/settings/Production.json.j2 +7 -0
  152. harness_maker/templates/settings/Side.json.j2 +4 -0
  153. harness_maker/templates/skills/agent-quality-rubric/SKILL.md.j2 +68 -0
  154. harness_maker/templates/skills/ai-readiness-rubric/SKILL.md.j2 +60 -0
  155. harness_maker/templates/skills/autoloop-driver/SKILL.md.j2 +153 -0
  156. harness_maker/templates/skills/conditional-router/SKILL.md.j2 +65 -0
  157. harness_maker/templates/skills/context-linter/SKILL.md.j2 +58 -0
  158. harness_maker/templates/skills/refdocs-search/SKILL.md.j2 +69 -0
  159. harness_maker/templates/skills/relevance-filter/SKILL.md.j2 +64 -0
  160. harness_maker/templates/skills/research-crawler/SKILL.md.j2 +54 -0
  161. harness_maker/templates/skills/security-scanner/SKILL.md.j2 +68 -0
  162. harness_maker/templates/skills/trajectory-monitor/SKILL.md.j2 +60 -0
  163. harness_maker/templates/skills/verify-before-completion/SKILL.md.j2 +113 -0
  164. harness_maker/templates/skills/worktree-isolator/SKILL.md.j2 +96 -0
  165. harness_maker/templates/stages/execute.md.j2 +288 -0
  166. harness_maker/templates/stages/plan.md.j2 +397 -0
  167. harness_maker/templates/stages/research.md.j2 +317 -0
  168. harness_maker/templates/stages/review.md.j2 +433 -0
  169. harness_maker/templates/stages/spec.md.j2 +299 -0
  170. harness_maker/templates/stages/verify.md.j2 +250 -0
  171. harness_maker/templates/stages/wrapup.md.j2 +277 -0
  172. harness_maker/test_dep_map.py +130 -0
  173. harness_maker/tool_cascade.py +102 -0
  174. harness_maker/two_pass_review.py +473 -0
  175. harness_maker/validators.py +80 -0
  176. harness_maker/verify.py +89 -0
  177. harness_maker/workflow_fuse.py +167 -0
  178. harness_maker/worktree.py +676 -0
  179. harness_maker-0.14.0.dist-info/METADATA +708 -0
  180. harness_maker-0.14.0.dist-info/RECORD +183 -0
  181. harness_maker-0.14.0.dist-info/WHEEL +4 -0
  182. harness_maker-0.14.0.dist-info/entry_points.txt +3 -0
  183. harness_maker-0.14.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3 @@
1
+ """harness-maker package."""
2
+
3
+ __version__ = "0.14.0"
@@ -0,0 +1,6 @@
1
+ """Module entrypoint: ``python -m harness_maker``."""
2
+
3
+ from harness_maker.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,82 @@
1
+ """Shared metrics.jsonl reader — date-sharded files + legacy fallback.
2
+
3
+ ADR-103: telemetry rotates per-day to ``metrics-YYYY-MM-DD.jsonl``. Readers
4
+ glob the obs dir and walk the most recent files first, falling back to the
5
+ pre-0.7.1 single ``metrics.jsonl`` so existing dashboards keep functioning
6
+ during the transition.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+ from collections.abc import Iterator
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ _DATED_RE = re.compile(r"^metrics-(\d{4}-\d{2}-\d{2})\.jsonl$")
18
+ _LEGACY_NAME = "metrics.jsonl"
19
+
20
+
21
def _candidate_files(obs_dir: Path, days: int) -> list[Path]:
    """Return metrics files to read, newest first, capped at ``days`` shards.

    Sharded ``metrics-YYYY-MM-DD.jsonl`` files sort chronologically by their
    ISO-date stem (lexicographic order equals chronological order for
    ``YYYY-MM-DD``). The legacy single ``metrics.jsonl`` (pre-0.7.1,
    unsharded) is always appended last and treated as the oldest data.
    """
    if not obs_dir.is_dir():
        return []
    legacy_file: Path | None = None
    sharded: list[tuple[str, Path]] = []
    for entry in obs_dir.iterdir():
        if not entry.is_file():
            continue
        match = _DATED_RE.match(entry.name)
        if match is not None:
            sharded.append((match.group(1), entry))
        elif entry.name == _LEGACY_NAME:
            legacy_file = entry
    # Date strings are unique within a directory, so the Path tiebreaker in
    # the tuple sort is never reached.
    sharded.sort(reverse=True)
    selected = [path for _, path in sharded[:days]]
    if legacy_file is not None:
        selected.append(legacy_file)
    return selected
46
+
47
+
48
def iter_recent_entries(
    obs_dir: Path,
    days: int = 7,
    event: str | None = None,
) -> Iterator[dict[str, Any]]:
    """Yield JSONL entries from the most recent ``days`` daily metrics files.

    Files are walked newest-first, and within each file the lines are yielded
    in reverse (newest line first), so callers collecting the last N matching
    entries can stop early. Malformed or non-dict lines are skipped silently —
    observability files are best-effort, never fatal.

    When ``event`` is supplied, only entries whose ``event`` field equals it
    are yielded; entries missing the field (pre-0.5.4) default to
    ``post_tool_use`` for backward compatibility.
    """
    for shard in _candidate_files(obs_dir, days):
        try:
            raw = shard.read_text(encoding="utf-8")
        except OSError:
            continue
        for raw_line in reversed(raw.splitlines()):
            if not raw_line.strip():
                continue
            try:
                entry = json.loads(raw_line)
            except ValueError:  # JSONDecodeError is a ValueError subclass
                continue
            if not isinstance(entry, dict):
                continue
            if event is not None and entry.get("event", "post_tool_use") != event:
                continue
            yield entry
@@ -0,0 +1,143 @@
1
+ """--add-domain helper: render a user-authored standards stub + register the name.
2
+
3
+ Why split this from cli.py: the work is a small, testable transform — validate
4
+ the domain name, render the skeleton template, and atomically update
5
+ ``harness.yaml``'s ``project.domains`` list. Surfacing it as a function lets
6
+ the unit tests drive it without typer.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from datetime import UTC, datetime
13
+ from pathlib import Path
14
+
15
+ import yaml
16
+
17
+ from harness_maker.io_utils import atomic_write
18
+ from harness_maker.render import _make_env
19
+
20
+ _NAME_PATTERN = re.compile(r"^[a-z][a-z0-9-]{0,30}$")
21
+
22
+
23
class AddDomainError(ValueError):
    """Raised when --add-domain inputs or filesystem state are invalid.

    A ``ValueError`` subclass raised for bad domain names, pre-existing stub
    files, and missing or malformed ``harness.yaml``.
    """
25
+
26
+
27
def validate_domain_name(name: str) -> str:
    """Validate ``name`` against the strict domain pattern and return it.

    The name becomes a filename and a Jinja include-path fragment in five
    reviewer agents; allowing shell metacharacters or path traversal here
    would propagate, hence the strict pattern.

    Raises:
        AddDomainError: when ``name`` does not match ``_NAME_PATTERN``.
    """
    if _NAME_PATTERN.fullmatch(name):
        return name
    msg = (
        f"invalid domain name {name!r}: must match {_NAME_PATTERN.pattern}; "
        "lowercase + digits + dashes, ≤ 31 chars, starts with a letter"
    )
    raise AddDomainError(msg)
41
+
42
+
43
+ def _today_iso() -> str:
44
+ return datetime.now(tz=UTC).date().isoformat()
45
+
46
+
47
def _render_skeleton(name: str, today: str) -> str:
    """Render the ``_template.md.j2`` standards skeleton for one domain.

    ``include_metadata=True`` switches on the HTML-comment annotation so the
    rendered user-side ``.md`` carries a ``last_reviewed_at`` marker that
    ``detect_stale_assets`` can parse later.
    """
    template = _make_env().get_template("agents/_standards/_template.md.j2")
    return template.render(domain_name=name, today=today, include_metadata=True)
57
+
58
+
59
def _read_yaml(path: Path) -> dict[str, object]:
    """Best-effort parse of ``path`` as a YAML mapping; ``{}`` on any failure.

    Strips the ``---`` provenance-frontmatter wrapper that render.py prepends
    to harness.yaml before parsing the body.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return {}
    if text.startswith("---\n"):
        marker = text.find("\n---\n", 4)
        if marker != -1:
            # Skip past the closing "\n---\n" (5 chars) to reach the body.
            text = text[marker + 5 :]
    try:
        parsed = yaml.safe_load(text)
    except yaml.YAMLError:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
75
+
76
+
77
def _format_yaml(data: dict[str, object]) -> str:
    """Serialize ``data`` in the established harness.yaml dump style.

    Insertion order and unicode are preserved; block style is forced so the
    output matches what render.py originally emitted.
    """
    return yaml.safe_dump(
        data,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
85
+
86
+
87
def update_harness_yaml(harness_yaml_path: Path, name: str) -> bool:
    """Append ``name`` to ``project.domains`` in-place. Return True if changed.

    The YAML frontmatter wrapper (``---`` block) is carried over untouched so
    provenance is not lost; a file without one (greenfield) is rewritten as a
    plain YAML body.

    Raises:
        AddDomainError: when the file is missing, not valid YAML, or the
            ``project``/``project.domains`` nodes have unexpected shapes.
    """
    if not harness_yaml_path.exists():
        msg = f"harness.yaml not found at {harness_yaml_path}; run /harness-maker:make first"
        raise AddDomainError(msg)

    text = harness_yaml_path.read_text(encoding="utf-8")
    frontmatter, body = "", text
    if text.startswith("---\n"):
        marker = text.find("\n---\n", 4)
        if marker != -1:
            frontmatter = text[: marker + 5]
            body = text[marker + 5 :]

    try:
        data = yaml.safe_load(body) or {}
    except yaml.YAMLError as e:
        msg = f"harness.yaml is not valid YAML: {e}"
        raise AddDomainError(msg) from e
    if not isinstance(data, dict):
        msg = "harness.yaml top-level must be a mapping"
        raise AddDomainError(msg)

    project = data.setdefault("project", {})
    if not isinstance(project, dict):
        msg = "harness.yaml: project must be a mapping"
        raise AddDomainError(msg)
    domains = project.setdefault("domains", [])
    if not isinstance(domains, list):
        msg = "harness.yaml: project.domains must be a list"
        raise AddDomainError(msg)

    if name in domains:
        return False
    domains.append(name)
    atomic_write(harness_yaml_path, frontmatter + _format_yaml(data))
    return True
126
+
127
+
128
def add_domain(target: Path, name: str, *, today: str | None = None) -> Path:
    """Create ``.claude/agents/_standards/<name>.md`` and register the domain.

    Returns the path of the created stub. An existing stub is never
    overwritten — AddDomainError is raised so the user can review the
    conflict themselves.

    Raises:
        AddDomainError: on an invalid name, a pre-existing stub, or a bad
            harness.yaml (via ``update_harness_yaml``).
    """
    validate_domain_name(name)
    stub_path = target / ".claude" / "agents" / "_standards" / f"{name}.md"
    if stub_path.exists():
        msg = f"{stub_path} already exists; remove it first if you intend to recreate"
        raise AddDomainError(msg)
    atomic_write(stub_path, _render_skeleton(name, today or _today_iso()))
    update_harness_yaml(target / ".claude" / "harness.yaml", name)
    return stub_path
@@ -0,0 +1,146 @@
1
+ """Agent prompt quality scoring → Platinum/Gold/Silver/Bronze tier.
2
+
3
+ Hybrid score: static structural checks (line count, frontmatter, bullets)
4
+ combined with an optional Layer-2 LLM judgment against the shipped
5
+ ``agent_prompt.yaml`` rubric. When a ``JudgeClient`` and ``rubric_dir`` are
6
+ provided, the LLM half lifts the score above the structural floor; on any
7
+ LLM failure we degrade to the static score with a logged warning.
8
+
9
+ Tier thresholds are preserved: composite ≥90 Platinum, ≥80 Gold, ≥70 Silver,
10
+ else Bronze (which auto-flags an agent for /hm:refresh anti-rot review).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import hashlib
16
+ import logging
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from harness_maker.cache import HttpCache
21
+ from harness_maker.llm_judge import JudgeClient, judge_file
22
+ from harness_maker.rubric_loader import load_rubric_file
23
+
24
+ _LOG = logging.getLogger(__name__)
25
+ _SKIP_TIERS = {"Platinum", "Gold"}
26
+
27
+
28
+ def _static_score(agent_md: Path) -> int:
29
+ try:
30
+ text = agent_md.read_text(encoding="utf-8")
31
+ except OSError:
32
+ return 0
33
+ if not text.strip():
34
+ return 0
35
+ score = 0
36
+ lines = text.splitlines()
37
+ line_count = len(lines)
38
+ if 100 <= line_count <= 500:
39
+ score += 40
40
+ elif 50 <= line_count < 100 or 500 < line_count <= 700:
41
+ score += 20
42
+ if text.startswith("---"):
43
+ rest = text[4:]
44
+ if "\n---" in rest:
45
+ score += 30
46
+ if any(line.lstrip().startswith(("-", "*", "+")) for line in lines) or "```" in text:
47
+ score += 30
48
+ return min(100, score)
49
+
50
+
51
+ def _tier(composite: int) -> str:
52
+ if composite >= 90:
53
+ return "Platinum"
54
+ if composite >= 80:
55
+ return "Gold"
56
+ if composite >= 70:
57
+ return "Silver"
58
+ return "Bronze"
59
+
60
+
61
+ def _content_hash(agent_md: Path) -> str:
62
+ try:
63
+ content = agent_md.read_bytes()
64
+ except OSError:
65
+ return ""
66
+ return hashlib.sha256(content).hexdigest()[:16]
67
+
68
+
69
def _get_cached_score(agent_md: Path) -> dict[str, Any] | None:
    """Return the previous score when it can safely be reused, else None.

    Reuse requires the cached tier to be in ``_SKIP_TIERS`` (Platinum/Gold)
    AND the file content hash to be unchanged; lower tiers are always
    re-scored.
    """
    cache = HttpCache("agent-quality")
    key = hashlib.sha256(str(agent_md.resolve()).encode()).hexdigest()[:16]
    # Infinite TTL: staleness is decided by the content hash, not by age.
    entry = cache.get(key, ttl=float("inf"))
    if not isinstance(entry, dict):
        return None
    if entry.get("tier") not in _SKIP_TIERS:
        return None
    if entry.get("content_hash") != _content_hash(agent_md):
        return None
    _LOG.info("agent_quality: skip (cached tier=%s) for %s", entry["tier"], agent_md.name)
    return entry
82
+
83
+
84
def _cache_score(agent_md: Path, result: dict[str, Any]) -> None:
    """Persist ``result`` plus the current content hash for later reuse checks."""
    key = hashlib.sha256(str(agent_md.resolve()).encode()).hexdigest()[:16]
    HttpCache("agent-quality").put(key, {**result, "content_hash": _content_hash(agent_md)})
89
+
90
+
91
def score_agent(
    agent_md: Path,
    *,
    rubric_dir: Path | None = None,
    client: JudgeClient | None = None,
    model: str = "claude-sonnet-4-6",
    force: bool = False,
) -> dict[str, Any]:
    """Score one agent prompt and emit a tier.

    Args:
        agent_md: Path to ``.claude/agents/<name>.md``.
        rubric_dir: When provided alongside ``client``, the
            ``.claude/rubrics/`` directory whose ``agent_prompt.yaml``
            drives the LLM judgment.
        client: Optional LLM client (``JudgeClient`` Protocol). When
            omitted, the LLM half is skipped and the score reflects
            structural signals only.
        model: Anthropic model id passed through to the judge.
        force: Bypass the Platinum/Gold reuse cache and re-score.

    Returns:
        ``{"static": int, "llm": int|None, "composite": int, "tier": str}``.
    """
    if not force:
        cached = _get_cached_score(agent_md)
        if cached is not None:
            wanted = ("static", "llm", "composite", "tier")
            return {k: cached[k] for k in wanted if k in cached}

    static = _static_score(agent_md)
    llm: int | None = None

    if client is not None and rubric_dir is not None:
        rubric_path = rubric_dir / "agent_prompt.yaml"
        rubric = load_rubric_file(rubric_path)
        if rubric is None:
            _LOG.warning("agent_quality: rubric not found at %s; static-only score", rubric_path)
        else:
            try:
                judged = judge_file(agent_md, rubric, client=client, model=model)
            except Exception as e:  # noqa: BLE001 — LLM transport degrades gracefully
                _LOG.warning("agent_quality: LLM judge failed (%s); static-only score", e)
                judged = None
            if judged is not None and judged.error is None:
                llm = judged.score
            elif judged is not None and judged.error:
                _LOG.warning("agent_quality: LLM judge reported %s", judged.error)

    # Composite: plain integer average when the LLM half ran; the static
    # score stands alone otherwise.
    composite = static if llm is None else (static + llm) // 2
    score: dict[str, Any] = {
        "static": static,
        "llm": llm,
        "composite": composite,
        "tier": _tier(composite),
    }
    _cache_score(agent_md, score)
    return score
@@ -0,0 +1,281 @@
1
+ """Orchestrator — combine readiness layers into a plan + renders.
2
+
3
+ PLAN health-consolidation Phase 1 (0.13.0) split the 3-layer composite
4
+ score into a ``structural`` field of the unified ``/hm:health`` dashboard.
5
+ The new entrypoint ``run_structural(project_dir, preset)`` returns a
6
+ minimal ``{"score": int, "signals_failed": [...]}`` dict suitable
7
+ for the dashboard third-section writer; the legacy ``run_ai_readiness``
8
+ and rendering helpers are retained so existing callers and tests in the
9
+ package continue to work until the templates catch up (Phase 2).
10
+
11
+ Public API:
12
+ - ``run_structural(project_dir, preset)`` — NEW, 0.13.0 health field.
13
+ - ``run_ai_readiness(project_dir, preset, ...)`` — legacy full pipeline.
14
+ - ``run_ai_readiness_structural(project_dir, preset, ...)`` — L1+L3 only.
15
+ - ``finalize_from_verdicts_json(scores_path, verdicts_path)`` — legacy.
16
+ - ``render_terminal_summary(plan)`` — concise text for CLI output.
17
+ - ``render_dashboard_markdown(plan, project_name)`` — legacy dashboard body.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from harness_maker.cache_diagnostics import CacheDiagnosis, diagnose_cache
27
+ from harness_maker.communication_audit import audit_communication
28
+ from harness_maker.improvement import ActionItem, ImprovementPlan, build_improvement_plan
29
+ from harness_maker.llm_judge import (
30
+ AnthropicJudgeClient,
31
+ JudgeClient,
32
+ JudgeResult,
33
+ RubricVerdict,
34
+ compute_score_from_verdicts,
35
+ judge_target,
36
+ )
37
+ from harness_maker.models import Preset
38
+ from harness_maker.readiness import ReadinessResult, compute_readiness
39
+ from harness_maker.rubric_loader import load_rubrics
40
+
41
+
42
def _build_judge_client() -> JudgeClient | None:
    """Best-effort Anthropic SDK client (requires ANTHROPIC_API_KEY).

    In Claude Code subscription contexts Layer 2 runs prompt-natively
    (the executing Claude agent evaluates rubrics inline). This fallback
    is kept for non-interactive / CI environments that do have an API key.

    Returns:
        A constructed ``AnthropicJudgeClient``, or ``None`` when
        construction fails for any reason (e.g. no API key configured).
    """
    try:
        return AnthropicJudgeClient()
    except Exception:  # noqa: BLE001 — missing API key etc.
        return None
53
+
54
+
55
def run_ai_readiness(
    project_dir: Path,
    *,
    preset: Preset,
    skip_llm: bool = False,
    judge_client: JudgeClient | None = None,
    model: str = "claude-sonnet-4-6",
) -> ImprovementPlan:
    """Run the 3-layer pipeline and return a composite improvement plan.

    Layer 1 (deterministic readiness) and Layer 3 (cache diagnostics) always
    run. Layer 2 (LLM rubric judging) runs only when ``skip_llm`` is False,
    rubrics exist under ``.claude/rubrics``, and a judge client is available
    (either ``judge_client`` or a best-effort SDK client).
    """
    readiness = compute_readiness(project_dir, preset)
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    cache = diagnose_cache(metrics_path, model=model)

    judge_results: list[JudgeResult] = []
    if not skip_llm:
        rubrics = load_rubrics(project_dir / ".claude" / "rubrics")
        if rubrics:
            judge = judge_client or _build_judge_client()
            if judge is not None:
                for rubric_file in rubrics.values():
                    judge_results.extend(
                        judge_target(project_dir, rubric_file, client=judge, model=model)
                    )

    return build_improvement_plan(readiness, judge_results, cache)
78
+
79
+
80
def run_ai_readiness_structural(
    project_dir: Path,
    *,
    preset: Preset,
    model: str = "claude-sonnet-4-6",
) -> dict[str, Any]:
    """Run L1+L3 only and return a JSON-serializable dict.

    Written out via ``--json-output`` so that ``ai-readiness-finalize`` can
    reconstruct a full plan once Claude supplies the L2 verdicts inline.
    """
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    return {
        "readiness": compute_readiness(project_dir, preset).model_dump(),
        "cache": diagnose_cache(metrics_path, model=model).model_dump(),
        "preset": preset.value,
    }
99
+
100
+
101
def run_structural(
    project_dir: Path,
    *,
    preset: Preset,
    model: str = "claude-sonnet-4-6",
) -> dict[str, Any]:
    """Compute the ``structural`` field for the /hm:health dashboard (0.13.0).

    Returns ``{"score": <0-100 int>, "signals_failed": [...],
    "communication_items": [...]}``. The score blends the deterministic L1
    readiness signals (70% weight) with the L3 cache-diagnostic score (5%),
    renormalized over the 0.75 total so a degenerate cache state can still
    pull the structural number down. L2 (the LLM-judged content score) is
    deliberately NOT folded in: it is a separate concern, and the
    verify-stage Check 3 contract names "structural" specifically.

    Key rename (0.13.1, PLAN-health-plugin-bugs-2026-05 ADR-001): the inner
    score key is ``"score"``, not ``"structural"`` — pre-0.13.1 the producer
    drifted to ``"structural"`` while the dashboard renderer read
    ``.get("score")``, so every rendered dashboard showed ``score: 0 / 100``.

    ``signals_failed`` lists one ``<dimension>:<signal_id>`` entry per failed
    deterministic check, plus ``communication_protocol:<target>`` entries from
    the PLAN-antisycophancy-2026-05 ADR-006 sub-check, so the dashboard
    reader can show counts without re-running either layer.
    """
    readiness = compute_readiness(project_dir, preset)
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    cache = diagnose_cache(metrics_path, model=model)

    # 70% readiness + 5% cache, renormalized to [0, 100]. The missing 25%
    # slot belongs to L2 (llm_judge), reported in a separate dashboard field
    # once the Phase 2 templates land; dropping it here keeps the structural
    # number comparable to the pre-0.13.0 single-score dashboard for users
    # mid-migration.
    blended = (readiness.composite * 0.70 + cache.score * 0.05) / 0.75
    structural_score = max(0, min(100, round(blended)))

    signals_failed: list[str] = [
        f"{dim_name}:{sig.id}"
        for dim_name, dim in readiness.dimensions.items()
        for sig in dim.signals
        if not sig.passed
    ]

    # PLAN-antisycophancy-2026-05 ADR-006: communication-protocol sub-check.
    # Discovers dispatcher templates + 5 pinned LLM-judgment skills, requires
    # `communication_variant` frontmatter on each, and verifies the rendered
    # marker matches source. Silent-miss (the R4 WRONG-probe failure mode)
    # surfaces as structured ActionItem records; the /hm:health
    # accept/reject/defer loop walks them unchanged (0.13.0 ADR-001).
    templates_root = Path(__file__).resolve().parent / "templates"
    output_root = project_dir / ".claude"
    comm_items = audit_communication(
        templates_root, output_dir=output_root if output_root.is_dir() else None
    )
    signals_failed.extend(f"communication_protocol:{item.target}" for item in comm_items)

    return {
        "score": structural_score,
        "signals_failed": signals_failed,
        "communication_items": [it.model_dump() for it in comm_items],
    }
168
+
169
+
170
def finalize_from_verdicts_json(
    scores_path: Path,
    verdicts_path: Path,
) -> ImprovementPlan:
    """Reconstruct a full ImprovementPlan from pre-computed L1+L3 + Claude L2 verdicts.

    ``scores_path`` holds the ``run_ai_readiness_structural`` JSON output.
    ``verdicts_path`` must contain a JSON array of objects in the form:
    ``[{"file": "...", "dimension": "...", "verdicts": [{...RubricVerdict fields...}]}]``

    Each entry's ``score`` is computed from its verdicts (neutral 50 when the
    list is empty); ``error`` defaults to null. Non-dict array entries and
    non-dict verdict items are skipped.

    Raises:
        ValueError: when either JSON file cannot be parsed or validated.
    """
    # Fix: the original caught (json.JSONDecodeError, KeyError, Exception) —
    # a redundant tuple, since Exception subsumes both (flake8-bugbear B014).
    # The broad catch is deliberate: bad JSON, missing keys, and pydantic
    # validation failures all collapse into one actionable ValueError.
    try:
        scores = json.loads(scores_path.read_text(encoding="utf-8"))
        readiness = ReadinessResult.model_validate(scores["readiness"])
        cache = CacheDiagnosis.model_validate(scores["cache"])
    except Exception as e:  # noqa: BLE001
        msg = f"Could not parse scores JSON at {scores_path}: {e}"
        raise ValueError(msg) from e

    try:
        raw_verdicts = json.loads(verdicts_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        msg = f"Could not parse verdicts JSON at {verdicts_path}: {e}"
        raise ValueError(msg) from e

    judge_results: list[JudgeResult] = []
    if isinstance(raw_verdicts, list):
        for entry in raw_verdicts:
            if not isinstance(entry, dict):
                continue
            verdicts = [
                RubricVerdict.model_validate(v)
                for v in entry.get("verdicts", [])
                if isinstance(v, dict)
            ]
            score = compute_score_from_verdicts(verdicts) if verdicts else 50
            judge_results.append(
                JudgeResult(
                    file=str(entry.get("file", "")),
                    dimension=str(entry.get("dimension", "")),
                    score=score,
                    verdicts=verdicts,
                    error=entry.get("error"),
                )
            )

    return build_improvement_plan(readiness, judge_results, cache)
216
+
217
+
218
+ def render_terminal_summary(plan: ImprovementPlan, *, max_actions: int = 10) -> str:
219
+ """Concise text suitable for stdout when /hm:ai-readiness is invoked."""
220
+ lines = [
221
+ f"ai-readiness: {plan.composite_score} / 100",
222
+ "",
223
+ "Layer scores:",
224
+ f" readiness : {plan.layer_scores['readiness']:>3} (deterministic structural)",
225
+ f" llm_judge : {plan.layer_scores['llm_judge']:>3} (LLM content quality)",
226
+ f" cache : {plan.layer_scores['cache']:>3} (prompt-caching efficiency)",
227
+ "",
228
+ ]
229
+ if not plan.actions:
230
+ lines.append("No actions — project looks healthy.")
231
+ return "\n".join(lines)
232
+
233
+ lines.append(f"Top {min(max_actions, len(plan.actions))} of {len(plan.actions)} actions:")
234
+ for a in plan.actions[:max_actions]:
235
+ lines.append(f" [{a.priority}] {a.dimension} :: {a.summary}")
236
+ lines.append(f" → {a.suggestion}")
237
+ if len(plan.actions) > max_actions:
238
+ lines.append(f" … {len(plan.actions) - max_actions} more (run --verbose for full list)")
239
+ return "\n".join(lines)
240
+
241
+
242
+ def _format_action_row(a: ActionItem) -> str:
243
+ suggestion = a.suggestion.replace("|", r"\|").replace("\n", " ")
244
+ summary = a.summary.replace("|", r"\|").replace("\n", " ")
245
+ return f"| {a.priority} | {a.dimension} | {summary} | {suggestion} |"
246
+
247
+
248
+ def render_dashboard_markdown(plan: ImprovementPlan, project_name: str) -> str:
249
+ """Markdown dashboard body for ``.claude/observability/dashboard.md``."""
250
+ lines: list[str] = [
251
+ f"# AI Readiness — {project_name}",
252
+ "",
253
+ f"**Composite:** {plan.composite_score} / 100",
254
+ "",
255
+ "## Layer scores",
256
+ "",
257
+ "| Layer | Score | What it measures |",
258
+ "|-------|------:|------------------|",
259
+ f"| readiness | {plan.layer_scores['readiness']} | "
260
+ "Deterministic structural signals (CLAUDE.md, hooks, tests, CI, …) |",
261
+ f"| llm_judge | {plan.layer_scores['llm_judge']} | LLM-judged content quality vs rubrics |",
262
+ f"| cache | {plan.layer_scores['cache']} | "
263
+ "Prompt-cache hit rate + failure-mode diagnosis |",
264
+ "",
265
+ ]
266
+ if not plan.actions:
267
+ lines.extend(["## Actions", "", "(none — project looks healthy)", ""])
268
+ return "\n".join(lines) + "\n"
269
+
270
+ lines.extend(
271
+ [
272
+ "## Actions",
273
+ "",
274
+ "| Priority | Dimension | Summary | Suggestion |",
275
+ "|----------|-----------|---------|------------|",
276
+ ]
277
+ )
278
+ for a in plan.actions:
279
+ lines.append(_format_action_row(a))
280
+ lines.append("")
281
+ return "\n".join(lines) + "\n"