harness-maker 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. harness_maker/__init__.py +3 -0
  2. harness_maker/__main__.py +6 -0
  3. harness_maker/_metrics_io.py +82 -0
  4. harness_maker/add_domain.py +143 -0
  5. harness_maker/agent_quality.py +146 -0
  6. harness_maker/ai_readiness.py +281 -0
  7. harness_maker/autoloop_driver.py +511 -0
  8. harness_maker/block_merge.py +584 -0
  9. harness_maker/cache.py +93 -0
  10. harness_maker/cache_diagnostics.py +313 -0
  11. harness_maker/cli.py +1660 -0
  12. harness_maker/communication_audit.py +210 -0
  13. harness_maker/conditional_router.py +229 -0
  14. harness_maker/context_lint.py +144 -0
  15. harness_maker/crawler/__init__.py +82 -0
  16. harness_maker/crawler/anthropic_blog.py +105 -0
  17. harness_maker/crawler/arxiv.py +80 -0
  18. harness_maker/crawler/github_releases.py +112 -0
  19. harness_maker/crawler/osv_dev.py +146 -0
  20. harness_maker/detection_cache.py +161 -0
  21. harness_maker/drift_monitor.py +257 -0
  22. harness_maker/foreign_config.py +592 -0
  23. harness_maker/gates/__init__.py +1 -0
  24. harness_maker/gates/permission_gate.py +140 -0
  25. harness_maker/gates/spec_gate.py +168 -0
  26. harness_maker/gates/worktree_gate.py +157 -0
  27. harness_maker/hooks/__init__.py +1 -0
  28. harness_maker/hooks/flush_session.py +95 -0
  29. harness_maker/hooks/loop_gate.py +99 -0
  30. harness_maker/hooks/post_write_reminder.py +123 -0
  31. harness_maker/hooks/sessionstart_drift.py +475 -0
  32. harness_maker/i18n.py +55 -0
  33. harness_maker/i18n_messages.py +47 -0
  34. harness_maker/improvement.py +195 -0
  35. harness_maker/interview.py +982 -0
  36. harness_maker/io_utils.py +120 -0
  37. harness_maker/llm_judge.py +308 -0
  38. harness_maker/memory/__init__.py +8 -0
  39. harness_maker/memory/_locking.py +102 -0
  40. harness_maker/memory/episodic.py +109 -0
  41. harness_maker/memory/profile.py +94 -0
  42. harness_maker/memory/retrieval.py +60 -0
  43. harness_maker/memory/semantic.py +142 -0
  44. harness_maker/models.py +612 -0
  45. harness_maker/modular_edit.py +206 -0
  46. harness_maker/observability/__init__.py +7 -0
  47. harness_maker/observability/dashboard.py +273 -0
  48. harness_maker/observability/verification_cache.py +190 -0
  49. harness_maker/personalization_audit.py +476 -0
  50. harness_maker/plan_verify.py +178 -0
  51. harness_maker/profile.py +489 -0
  52. harness_maker/provenance.py +72 -0
  53. harness_maker/readiness.py +887 -0
  54. harness_maker/recommendation.py +343 -0
  55. harness_maker/reconcile.py +532 -0
  56. harness_maker/refdocs_index.py +252 -0
  57. harness_maker/relevance.py +431 -0
  58. harness_maker/render.py +787 -0
  59. harness_maker/review_telemetry.py +199 -0
  60. harness_maker/rubric_loader.py +78 -0
  61. harness_maker/rubrics/personalization.yaml +38 -0
  62. harness_maker/second_brain.py +473 -0
  63. harness_maker/secscan/__init__.py +24 -0
  64. harness_maker/secscan/dependency_cves.py +121 -0
  65. harness_maker/secscan/hallucination.py +167 -0
  66. harness_maker/secscan/hook_injection.py +73 -0
  67. harness_maker/secscan/permissions.py +83 -0
  68. harness_maker/secscan/prod_name_guard.py +122 -0
  69. harness_maker/secscan/prompt_injection.py +215 -0
  70. harness_maker/secscan/secrets.py +87 -0
  71. harness_maker/security_scanner.py +243 -0
  72. harness_maker/spec_quality.py +209 -0
  73. harness_maker/synthesize.py +595 -0
  74. harness_maker/telemetry.py +469 -0
  75. harness_maker/templates/agents/_partials/communication_full.md.j2 +10 -0
  76. harness_maker/templates/agents/_partials/communication_reframe.md.j2 +16 -0
  77. harness_maker/templates/agents/_partials/communication_soft.md.j2 +8 -0
  78. harness_maker/templates/agents/_partials/finding_schema.md.j2 +94 -0
  79. harness_maker/templates/agents/_partials/hard_rules.md.j2 +11 -0
  80. harness_maker/templates/agents/_partials/reasoning.md.j2 +14 -0
  81. harness_maker/templates/agents/_partials/rubric.md.j2 +15 -0
  82. harness_maker/templates/agents/_standards/_template.md.j2 +13 -0
  83. harness_maker/templates/agents/_standards/python.md.j2 +35 -0
  84. harness_maker/templates/agents/autoloop-coder.md.j2 +9 -0
  85. harness_maker/templates/agents/autoloop-coder_body.md.j2 +50 -0
  86. harness_maker/templates/agents/code-reviewer.md.j2 +29 -0
  87. harness_maker/templates/agents/code-reviewer_body.md.j2 +64 -0
  88. harness_maker/templates/agents/code-verifier.md.j2 +29 -0
  89. harness_maker/templates/agents/code-verifier_body.md.j2 +99 -0
  90. harness_maker/templates/agents/concurrency-reviewer.md.j2 +29 -0
  91. harness_maker/templates/agents/concurrency-reviewer_body.md.j2 +62 -0
  92. harness_maker/templates/agents/consensus-arbiter.md.j2 +9 -0
  93. harness_maker/templates/agents/consensus-arbiter_body.md.j2 +127 -0
  94. harness_maker/templates/agents/executor.md.j2 +33 -0
  95. harness_maker/templates/agents/executor_body.md.j2 +49 -0
  96. harness_maker/templates/agents/performance-reviewer.md.j2 +29 -0
  97. harness_maker/templates/agents/performance-reviewer_body.md.j2 +63 -0
  98. harness_maker/templates/agents/plan-validator.md.j2 +9 -0
  99. harness_maker/templates/agents/plan-validator_body.md.j2 +95 -0
  100. harness_maker/templates/agents/security-auditor.md.j2 +9 -0
  101. harness_maker/templates/agents/security-auditor_body.md.j2 +117 -0
  102. harness_maker/templates/agents/security-reviewer.md.j2 +29 -0
  103. harness_maker/templates/agents/security-reviewer_body.md.j2 +64 -0
  104. harness_maker/templates/agents/stuck.md.j2 +9 -0
  105. harness_maker/templates/agents/stuck_body.md.j2 +129 -0
  106. harness_maker/templates/agents/test-reviewer.md.j2 +9 -0
  107. harness_maker/templates/agents/test-reviewer_body.md.j2 +109 -0
  108. harness_maker/templates/agents/trajectory-monitor.md.j2 +71 -0
  109. harness_maker/templates/agents/ux-reviewer.md.j2 +29 -0
  110. harness_maker/templates/agents/ux-reviewer_body.md.j2 +65 -0
  111. harness_maker/templates/claude-md/Production.en.md.j2 +27 -0
  112. harness_maker/templates/claude-md/Production.ko.md.j2 +27 -0
  113. harness_maker/templates/claude-md/Side.en.md.j2 +23 -0
  114. harness_maker/templates/claude-md/Side.ko.md.j2 +23 -0
  115. harness_maker/templates/codex/AGENTS.md.j2 +57 -0
  116. harness_maker/templates/codex/agent.toml.j2 +8 -0
  117. harness_maker/templates/codex/config.toml.j2 +21 -0
  118. harness_maker/templates/codex/hooks.json.j2 +73 -0
  119. harness_maker/templates/codex/loop_skill.md.j2 +10 -0
  120. harness_maker/templates/codex/stage_skill.md.j2 +10 -0
  121. harness_maker/templates/codex/workflow_skill.md.j2 +38 -0
  122. harness_maker/templates/commands/hm/atomic_command.md.j2 +1 -0
  123. harness_maker/templates/commands/hm/configure.md.j2 +214 -0
  124. harness_maker/templates/commands/hm/health.md.j2 +46 -0
  125. harness_maker/templates/commands/hm/loop.md.j2 +779 -0
  126. harness_maker/templates/commands/hm/make.md.j2 +47 -0
  127. harness_maker/templates/commands/hm/uninstall.md.j2 +57 -0
  128. harness_maker/templates/commands/hm/workflow_command.md.j2 +69 -0
  129. harness_maker/templates/cursor/hooks.json.j2 +62 -0
  130. harness_maker/templates/cursor/mcp.json.j2 +10 -0
  131. harness_maker/templates/cursor/rules/harness.mdc.j2 +132 -0
  132. harness_maker/templates/foreign-configs/agents_md.md.j2 +22 -0
  133. harness_maker/templates/foreign-configs/aider_conf.yml.j2 +16 -0
  134. harness_maker/templates/foreign-configs/claude_md.md.j2 +19 -0
  135. harness_maker/templates/foreign-configs/continue_config.json.j2 +10 -0
  136. harness_maker/templates/foreign-configs/copilot_instructions.md.j2 +16 -0
  137. harness_maker/templates/foreign-configs/cursor_rules.mdc.j2 +23 -0
  138. harness_maker/templates/harness-yaml/Production.yaml.j2 +82 -0
  139. harness_maker/templates/harness-yaml/Side.yaml.j2 +82 -0
  140. harness_maker/templates/hooks/hooks.json.j2 +92 -0
  141. harness_maker/templates/memory/failures.en.md.j2 +20 -0
  142. harness_maker/templates/memory/failures.ko.md.j2 +20 -0
  143. harness_maker/templates/memory/session-readme.md.j2 +34 -0
  144. harness_maker/templates/memory/wiki.en.md.j2 +19 -0
  145. harness_maker/templates/memory/wiki.ko.md.j2 +19 -0
  146. harness_maker/templates/observability/dashboard.md.j2 +28 -0
  147. harness_maker/templates/rubrics/agent_prompt.yaml.j2 +49 -0
  148. harness_maker/templates/rubrics/claude_md.yaml.j2 +74 -0
  149. harness_maker/templates/rubrics/skill.yaml.j2 +43 -0
  150. harness_maker/templates/rubrics/workflow.yaml.j2 +40 -0
  151. harness_maker/templates/settings/Production.json.j2 +7 -0
  152. harness_maker/templates/settings/Side.json.j2 +4 -0
  153. harness_maker/templates/skills/agent-quality-rubric/SKILL.md.j2 +68 -0
  154. harness_maker/templates/skills/ai-readiness-rubric/SKILL.md.j2 +60 -0
  155. harness_maker/templates/skills/autoloop-driver/SKILL.md.j2 +153 -0
  156. harness_maker/templates/skills/conditional-router/SKILL.md.j2 +65 -0
  157. harness_maker/templates/skills/context-linter/SKILL.md.j2 +58 -0
  158. harness_maker/templates/skills/refdocs-search/SKILL.md.j2 +69 -0
  159. harness_maker/templates/skills/relevance-filter/SKILL.md.j2 +64 -0
  160. harness_maker/templates/skills/research-crawler/SKILL.md.j2 +54 -0
  161. harness_maker/templates/skills/security-scanner/SKILL.md.j2 +68 -0
  162. harness_maker/templates/skills/trajectory-monitor/SKILL.md.j2 +60 -0
  163. harness_maker/templates/skills/verify-before-completion/SKILL.md.j2 +113 -0
  164. harness_maker/templates/skills/worktree-isolator/SKILL.md.j2 +96 -0
  165. harness_maker/templates/stages/execute.md.j2 +288 -0
  166. harness_maker/templates/stages/plan.md.j2 +397 -0
  167. harness_maker/templates/stages/research.md.j2 +317 -0
  168. harness_maker/templates/stages/review.md.j2 +433 -0
  169. harness_maker/templates/stages/spec.md.j2 +299 -0
  170. harness_maker/templates/stages/verify.md.j2 +250 -0
  171. harness_maker/templates/stages/wrapup.md.j2 +277 -0
  172. harness_maker/test_dep_map.py +130 -0
  173. harness_maker/tool_cascade.py +102 -0
  174. harness_maker/two_pass_review.py +473 -0
  175. harness_maker/validators.py +80 -0
  176. harness_maker/verify.py +89 -0
  177. harness_maker/workflow_fuse.py +167 -0
  178. harness_maker/worktree.py +676 -0
  179. harness_maker-0.14.0.dist-info/METADATA +708 -0
  180. harness_maker-0.14.0.dist-info/RECORD +183 -0
  181. harness_maker-0.14.0.dist-info/WHEEL +4 -0
  182. harness_maker-0.14.0.dist-info/entry_points.txt +3 -0
  183. harness_maker-0.14.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3 @@
1
+ """harness-maker package."""
2
+
3
+ __version__ = "0.14.0"
@@ -0,0 +1,6 @@
1
+ """Module entrypoint: ``python -m harness_maker``."""
2
+
3
+ from harness_maker.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,82 @@
1
+ """Shared metrics.jsonl reader — date-sharded files + legacy fallback.
2
+
3
+ ADR-103: telemetry rotates per-day to ``metrics-YYYY-MM-DD.jsonl``. Readers
4
+ glob the obs dir and walk the most recent files first, falling back to the
5
+ pre-0.7.1 single ``metrics.jsonl`` so existing dashboards keep functioning
6
+ during the transition.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import re
13
+ from collections.abc import Iterator
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ _DATED_RE = re.compile(r"^metrics-(\d{4}-\d{2}-\d{2})\.jsonl$")
18
+ _LEGACY_NAME = "metrics.jsonl"
19
+
20
+
21
def _candidate_files(obs_dir: Path, days: int) -> list[Path]:
    """Return metrics files to read, newest first, capped at ``days`` shards.

    Sharded ``metrics-YYYY-MM-DD.jsonl`` files sort chronologically by their
    ISO-date stem (lexicographic order equals chronological order for
    ``YYYY-MM-DD``). The legacy single ``metrics.jsonl`` (pre-0.7.1,
    unsharded) is always appended last and treated as the oldest data.
    """
    if not obs_dir.is_dir():
        return []
    legacy_file: Path | None = None
    sharded: list[tuple[str, Path]] = []
    for entry in obs_dir.iterdir():
        if not entry.is_file():
            continue
        match = _DATED_RE.match(entry.name)
        if match is not None:
            sharded.append((match.group(1), entry))
        elif entry.name == _LEGACY_NAME:
            legacy_file = entry
    # Date strings are unique within a directory, so the Path tiebreaker in
    # the tuple sort is never reached.
    sharded.sort(reverse=True)
    selected = [path for _, path in sharded[:days]]
    if legacy_file is not None:
        selected.append(legacy_file)
    return selected
46
+
47
+
48
def iter_recent_entries(
    obs_dir: Path,
    days: int = 7,
    event: str | None = None,
) -> Iterator[dict[str, Any]]:
    """Yield JSONL entries from the most recent ``days`` daily metrics files.

    Files are walked newest-first, and within each file the lines are yielded
    in reverse (newest line first), so callers collecting the last N matching
    entries can stop early. Malformed or non-dict lines are skipped silently —
    observability files are best-effort, never fatal.

    When ``event`` is supplied, only entries whose ``event`` field equals it
    are yielded; entries missing the field (pre-0.5.4) default to
    ``post_tool_use`` for backward compatibility.
    """
    for shard in _candidate_files(obs_dir, days):
        try:
            raw = shard.read_text(encoding="utf-8")
        except OSError:
            continue
        for raw_line in reversed(raw.splitlines()):
            if not raw_line.strip():
                continue
            try:
                entry = json.loads(raw_line)
            except ValueError:  # JSONDecodeError is a ValueError subclass
                continue
            if not isinstance(entry, dict):
                continue
            if event is not None and entry.get("event", "post_tool_use") != event:
                continue
            yield entry
@@ -0,0 +1,143 @@
1
+ """--add-domain helper: render a user-authored standards stub + register the name.
2
+
3
+ Why split this from cli.py: the work is a small, testable transform — validate
4
+ the domain name, render the skeleton template, and atomically update
5
+ ``harness.yaml``'s ``project.domains`` list. Surfacing it as a function lets
6
+ the unit tests drive it without typer.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from datetime import UTC, datetime
13
+ from pathlib import Path
14
+
15
+ import yaml
16
+
17
+ from harness_maker.io_utils import atomic_write
18
+ from harness_maker.render import _make_env
19
+
20
+ _NAME_PATTERN = re.compile(r"^[a-z][a-z0-9-]{0,30}$")
21
+
22
+
23
class AddDomainError(ValueError):
    """Raised when --add-domain inputs or filesystem state are invalid.

    A ``ValueError`` subclass raised for bad domain names, pre-existing stub
    files, and missing or malformed ``harness.yaml``.
    """
25
+
26
+
27
def validate_domain_name(name: str) -> str:
    """Validate ``name`` against the strict domain pattern and return it.

    The name becomes a filename and a Jinja include-path fragment in five
    reviewer agents; allowing shell metacharacters or path traversal here
    would propagate, hence the strict pattern.

    Raises:
        AddDomainError: when ``name`` does not match ``_NAME_PATTERN``.
    """
    if _NAME_PATTERN.fullmatch(name):
        return name
    msg = (
        f"invalid domain name {name!r}: must match {_NAME_PATTERN.pattern}; "
        "lowercase + digits + dashes, ≤ 31 chars, starts with a letter"
    )
    raise AddDomainError(msg)
41
+
42
+
43
+ def _today_iso() -> str:
44
+ return datetime.now(tz=UTC).date().isoformat()
45
+
46
+
47
def _render_skeleton(name: str, today: str) -> str:
    """Render the ``_template.md.j2`` standards skeleton for one domain.

    ``include_metadata=True`` switches on the HTML-comment annotation so the
    rendered user-side ``.md`` carries a ``last_reviewed_at`` marker that
    ``detect_stale_assets`` can parse later.
    """
    template = _make_env().get_template("agents/_standards/_template.md.j2")
    return template.render(domain_name=name, today=today, include_metadata=True)
57
+
58
+
59
def _read_yaml(path: Path) -> dict[str, object]:
    """Best-effort parse of ``path`` as a YAML mapping; ``{}`` on any failure.

    Strips the ``---`` provenance-frontmatter wrapper that render.py prepends
    to harness.yaml before parsing the body.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return {}
    if text.startswith("---\n"):
        marker = text.find("\n---\n", 4)
        if marker != -1:
            # Skip past the closing "\n---\n" (5 chars) to reach the body.
            text = text[marker + 5 :]
    try:
        parsed = yaml.safe_load(text)
    except yaml.YAMLError:
        return {}
    if isinstance(parsed, dict):
        return parsed
    return {}
75
+
76
+
77
def _format_yaml(data: dict[str, object]) -> str:
    """Serialize ``data`` in the established harness.yaml dump style.

    Insertion order and unicode are preserved; block style is forced so the
    output matches what render.py originally emitted.
    """
    return yaml.safe_dump(
        data,
        sort_keys=False,
        allow_unicode=True,
        default_flow_style=False,
    )
85
+
86
+
87
def update_harness_yaml(harness_yaml_path: Path, name: str) -> bool:
    """Append ``name`` to ``project.domains`` in-place. Return True if changed.

    The YAML frontmatter wrapper (``---`` block) is carried over untouched so
    provenance is not lost; a file without one (greenfield) is rewritten as a
    plain YAML body.

    Raises:
        AddDomainError: when the file is missing, not valid YAML, or the
            ``project``/``project.domains`` nodes have unexpected shapes.
    """
    if not harness_yaml_path.exists():
        msg = f"harness.yaml not found at {harness_yaml_path}; run /harness-maker:make first"
        raise AddDomainError(msg)

    text = harness_yaml_path.read_text(encoding="utf-8")
    frontmatter, body = "", text
    if text.startswith("---\n"):
        marker = text.find("\n---\n", 4)
        if marker != -1:
            frontmatter = text[: marker + 5]
            body = text[marker + 5 :]

    try:
        data = yaml.safe_load(body) or {}
    except yaml.YAMLError as e:
        msg = f"harness.yaml is not valid YAML: {e}"
        raise AddDomainError(msg) from e
    if not isinstance(data, dict):
        msg = "harness.yaml top-level must be a mapping"
        raise AddDomainError(msg)

    project = data.setdefault("project", {})
    if not isinstance(project, dict):
        msg = "harness.yaml: project must be a mapping"
        raise AddDomainError(msg)
    domains = project.setdefault("domains", [])
    if not isinstance(domains, list):
        msg = "harness.yaml: project.domains must be a list"
        raise AddDomainError(msg)

    if name in domains:
        return False
    domains.append(name)
    atomic_write(harness_yaml_path, frontmatter + _format_yaml(data))
    return True
126
+
127
+
128
def add_domain(target: Path, name: str, *, today: str | None = None) -> Path:
    """Create ``.claude/agents/_standards/<name>.md`` and register the domain.

    Returns the path of the created stub. An existing stub is never
    overwritten — AddDomainError is raised so the user can review the
    conflict themselves.

    Raises:
        AddDomainError: on an invalid name, a pre-existing stub, or a bad
            harness.yaml (via ``update_harness_yaml``).
    """
    validate_domain_name(name)
    stub_path = target / ".claude" / "agents" / "_standards" / f"{name}.md"
    if stub_path.exists():
        msg = f"{stub_path} already exists; remove it first if you intend to recreate"
        raise AddDomainError(msg)
    atomic_write(stub_path, _render_skeleton(name, today or _today_iso()))
    update_harness_yaml(target / ".claude" / "harness.yaml", name)
    return stub_path
@@ -0,0 +1,146 @@
1
+ """Agent prompt quality scoring → Platinum/Gold/Silver/Bronze tier.
2
+
3
+ Hybrid score: static structural checks (line count, frontmatter, bullets)
4
+ combined with an optional Layer-2 LLM judgment against the shipped
5
+ ``agent_prompt.yaml`` rubric. When a ``JudgeClient`` and ``rubric_dir`` are
6
+ provided, the LLM half lifts the score above the structural floor; on any
7
+ LLM failure we degrade to the static score with a logged warning.
8
+
9
+ Tier thresholds are preserved: composite ≥90 Platinum, ≥80 Gold, ≥70 Silver,
10
+ else Bronze (which auto-flags an agent for /hm:refresh anti-rot review).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import hashlib
16
+ import logging
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from harness_maker.cache import HttpCache
21
+ from harness_maker.llm_judge import JudgeClient, judge_file
22
+ from harness_maker.rubric_loader import load_rubric_file
23
+
24
+ _LOG = logging.getLogger(__name__)
25
+ _SKIP_TIERS = {"Platinum", "Gold"}
26
+
27
+
28
+ def _static_score(agent_md: Path) -> int:
29
+ try:
30
+ text = agent_md.read_text(encoding="utf-8")
31
+ except OSError:
32
+ return 0
33
+ if not text.strip():
34
+ return 0
35
+ score = 0
36
+ lines = text.splitlines()
37
+ line_count = len(lines)
38
+ if 100 <= line_count <= 500:
39
+ score += 40
40
+ elif 50 <= line_count < 100 or 500 < line_count <= 700:
41
+ score += 20
42
+ if text.startswith("---"):
43
+ rest = text[4:]
44
+ if "\n---" in rest:
45
+ score += 30
46
+ if any(line.lstrip().startswith(("-", "*", "+")) for line in lines) or "```" in text:
47
+ score += 30
48
+ return min(100, score)
49
+
50
+
51
+ def _tier(composite: int) -> str:
52
+ if composite >= 90:
53
+ return "Platinum"
54
+ if composite >= 80:
55
+ return "Gold"
56
+ if composite >= 70:
57
+ return "Silver"
58
+ return "Bronze"
59
+
60
+
61
+ def _content_hash(agent_md: Path) -> str:
62
+ try:
63
+ content = agent_md.read_bytes()
64
+ except OSError:
65
+ return ""
66
+ return hashlib.sha256(content).hexdigest()[:16]
67
+
68
+
69
def _get_cached_score(agent_md: Path) -> dict[str, Any] | None:
    """Return the previous score when it can safely be reused, else None.

    Reuse requires the cached tier to be in ``_SKIP_TIERS`` (Platinum/Gold)
    AND the file content hash to be unchanged; lower tiers are always
    re-scored.
    """
    cache = HttpCache("agent-quality")
    key = hashlib.sha256(str(agent_md.resolve()).encode()).hexdigest()[:16]
    # Infinite TTL: staleness is decided by the content hash, not by age.
    entry = cache.get(key, ttl=float("inf"))
    if not isinstance(entry, dict):
        return None
    if entry.get("tier") not in _SKIP_TIERS:
        return None
    if entry.get("content_hash") != _content_hash(agent_md):
        return None
    _LOG.info("agent_quality: skip (cached tier=%s) for %s", entry["tier"], agent_md.name)
    return entry
82
+
83
+
84
def _cache_score(agent_md: Path, result: dict[str, Any]) -> None:
    """Persist ``result`` plus the current content hash for later reuse checks."""
    key = hashlib.sha256(str(agent_md.resolve()).encode()).hexdigest()[:16]
    HttpCache("agent-quality").put(key, {**result, "content_hash": _content_hash(agent_md)})
89
+
90
+
91
def score_agent(
    agent_md: Path,
    *,
    rubric_dir: Path | None = None,
    client: JudgeClient | None = None,
    model: str = "claude-sonnet-4-6",
    force: bool = False,
) -> dict[str, Any]:
    """Score one agent prompt and emit a tier.

    Args:
        agent_md: Path to ``.claude/agents/<name>.md``.
        rubric_dir: When provided alongside ``client``, the
            ``.claude/rubrics/`` directory whose ``agent_prompt.yaml``
            drives the LLM judgment.
        client: Optional LLM client (``JudgeClient`` Protocol). When
            omitted, the LLM half is skipped and the score reflects
            structural signals only.
        model: Anthropic model id passed through to the judge.
        force: Bypass the Platinum/Gold reuse cache and re-score.

    Returns:
        ``{"static": int, "llm": int|None, "composite": int, "tier": str}``.
    """
    if not force:
        cached = _get_cached_score(agent_md)
        if cached is not None:
            wanted = ("static", "llm", "composite", "tier")
            return {k: cached[k] for k in wanted if k in cached}

    static = _static_score(agent_md)
    llm: int | None = None

    if client is not None and rubric_dir is not None:
        rubric_path = rubric_dir / "agent_prompt.yaml"
        rubric = load_rubric_file(rubric_path)
        if rubric is None:
            _LOG.warning("agent_quality: rubric not found at %s; static-only score", rubric_path)
        else:
            try:
                judged = judge_file(agent_md, rubric, client=client, model=model)
            except Exception as e:  # noqa: BLE001 — LLM transport degrades gracefully
                _LOG.warning("agent_quality: LLM judge failed (%s); static-only score", e)
                judged = None
            if judged is not None and judged.error is None:
                llm = judged.score
            elif judged is not None and judged.error:
                _LOG.warning("agent_quality: LLM judge reported %s", judged.error)

    # Composite: plain integer average when the LLM half ran; the static
    # score stands alone otherwise.
    composite = static if llm is None else (static + llm) // 2
    score: dict[str, Any] = {
        "static": static,
        "llm": llm,
        "composite": composite,
        "tier": _tier(composite),
    }
    _cache_score(agent_md, score)
    return score
@@ -0,0 +1,281 @@
1
+ """Orchestrator — combine readiness layers into a plan + renders.
2
+
3
+ PLAN health-consolidation Phase 1 (0.13.0) split the 3-layer composite
4
+ score into a ``structural`` field of the unified ``/hm:health`` dashboard.
5
+ The new entrypoint ``run_structural(project_dir, preset)`` returns a
6
+ minimal ``{"score": int, "signals_failed": [...]}`` dict suitable
7
+ for the dashboard third-section writer; the legacy ``run_ai_readiness``
8
+ and rendering helpers are retained so existing callers and tests in the
9
+ package continue to work until the templates catch up (Phase 2).
10
+
11
+ Public API:
12
+ - ``run_structural(project_dir, preset)`` — NEW, 0.13.0 health field.
13
+ - ``run_ai_readiness(project_dir, preset, ...)`` — legacy full pipeline.
14
+ - ``run_ai_readiness_structural(project_dir, preset, ...)`` — L1+L3 only.
15
+ - ``finalize_from_verdicts_json(scores_path, verdicts_path)`` — legacy.
16
+ - ``render_terminal_summary(plan)`` — concise text for CLI output.
17
+ - ``render_dashboard_markdown(plan, project_name)`` — legacy dashboard body.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from harness_maker.cache_diagnostics import CacheDiagnosis, diagnose_cache
27
+ from harness_maker.communication_audit import audit_communication
28
+ from harness_maker.improvement import ActionItem, ImprovementPlan, build_improvement_plan
29
+ from harness_maker.llm_judge import (
30
+ AnthropicJudgeClient,
31
+ JudgeClient,
32
+ JudgeResult,
33
+ RubricVerdict,
34
+ compute_score_from_verdicts,
35
+ judge_target,
36
+ )
37
+ from harness_maker.models import Preset
38
+ from harness_maker.readiness import ReadinessResult, compute_readiness
39
+ from harness_maker.rubric_loader import load_rubrics
40
+
41
+
42
def _build_judge_client() -> JudgeClient | None:
    """Best-effort Anthropic SDK client (requires ANTHROPIC_API_KEY).

    In Claude Code subscription contexts Layer 2 runs prompt-natively
    (the executing Claude agent evaluates rubrics inline). This fallback
    is kept for non-interactive / CI environments that do have an API key.

    Returns:
        A constructed ``AnthropicJudgeClient``, or ``None`` when
        construction fails for any reason (e.g. no API key configured).
    """
    try:
        return AnthropicJudgeClient()
    except Exception:  # noqa: BLE001 — missing API key etc.
        return None
53
+
54
+
55
def run_ai_readiness(
    project_dir: Path,
    *,
    preset: Preset,
    skip_llm: bool = False,
    judge_client: JudgeClient | None = None,
    model: str = "claude-sonnet-4-6",
) -> ImprovementPlan:
    """Run the 3-layer pipeline and return a composite improvement plan.

    Layer 1 (deterministic readiness) and Layer 3 (cache diagnostics) always
    run. Layer 2 (LLM rubric judging) runs only when ``skip_llm`` is False,
    rubrics exist under ``.claude/rubrics``, and a judge client is available
    (either ``judge_client`` or a best-effort SDK client).
    """
    readiness = compute_readiness(project_dir, preset)
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    cache = diagnose_cache(metrics_path, model=model)

    judge_results: list[JudgeResult] = []
    if not skip_llm:
        rubrics = load_rubrics(project_dir / ".claude" / "rubrics")
        if rubrics:
            judge = judge_client or _build_judge_client()
            if judge is not None:
                for rubric_file in rubrics.values():
                    judge_results.extend(
                        judge_target(project_dir, rubric_file, client=judge, model=model)
                    )

    return build_improvement_plan(readiness, judge_results, cache)
78
+
79
+
80
def run_ai_readiness_structural(
    project_dir: Path,
    *,
    preset: Preset,
    model: str = "claude-sonnet-4-6",
) -> dict[str, Any]:
    """Run L1+L3 only and return a JSON-serializable dict.

    Written out via ``--json-output`` so that ``ai-readiness-finalize`` can
    reconstruct a full plan once Claude supplies the L2 verdicts inline.
    """
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    return {
        "readiness": compute_readiness(project_dir, preset).model_dump(),
        "cache": diagnose_cache(metrics_path, model=model).model_dump(),
        "preset": preset.value,
    }
99
+
100
+
101
def run_structural(
    project_dir: Path,
    *,
    preset: Preset,
    model: str = "claude-sonnet-4-6",
) -> dict[str, Any]:
    """Compute the ``structural`` field for the /hm:health dashboard (0.13.0).

    Returns ``{"score": <0-100 int>, "signals_failed": [...],
    "communication_items": [...]}``. The score blends the deterministic L1
    readiness signals (70% weight) with the L3 cache-diagnostic score (5%),
    renormalized over the 0.75 total so a degenerate cache state can still
    pull the structural number down. L2 (the LLM-judged content score) is
    deliberately NOT folded in: it is a separate concern, and the
    verify-stage Check 3 contract names "structural" specifically.

    Key rename (0.13.1, PLAN-health-plugin-bugs-2026-05 ADR-001): the inner
    score key is ``"score"``, not ``"structural"`` — pre-0.13.1 the producer
    drifted to ``"structural"`` while the dashboard renderer read
    ``.get("score")``, so every rendered dashboard showed ``score: 0 / 100``.

    ``signals_failed`` lists one ``<dimension>:<signal_id>`` entry per failed
    deterministic check, plus ``communication_protocol:<target>`` entries from
    the PLAN-antisycophancy-2026-05 ADR-006 sub-check, so the dashboard
    reader can show counts without re-running either layer.
    """
    readiness = compute_readiness(project_dir, preset)
    metrics_path = project_dir / ".claude" / "observability" / "metrics.jsonl"
    cache = diagnose_cache(metrics_path, model=model)

    # 70% readiness + 5% cache, renormalized to [0, 100]. The missing 25%
    # slot belongs to L2 (llm_judge), reported in a separate dashboard field
    # once the Phase 2 templates land; dropping it here keeps the structural
    # number comparable to the pre-0.13.0 single-score dashboard for users
    # mid-migration.
    blended = (readiness.composite * 0.70 + cache.score * 0.05) / 0.75
    structural_score = max(0, min(100, round(blended)))

    signals_failed: list[str] = [
        f"{dim_name}:{sig.id}"
        for dim_name, dim in readiness.dimensions.items()
        for sig in dim.signals
        if not sig.passed
    ]

    # PLAN-antisycophancy-2026-05 ADR-006: communication-protocol sub-check.
    # Discovers dispatcher templates + 5 pinned LLM-judgment skills, requires
    # `communication_variant` frontmatter on each, and verifies the rendered
    # marker matches source. Silent-miss (the R4 WRONG-probe failure mode)
    # surfaces as structured ActionItem records; the /hm:health
    # accept/reject/defer loop walks them unchanged (0.13.0 ADR-001).
    templates_root = Path(__file__).resolve().parent / "templates"
    output_root = project_dir / ".claude"
    comm_items = audit_communication(
        templates_root, output_dir=output_root if output_root.is_dir() else None
    )
    signals_failed.extend(f"communication_protocol:{item.target}" for item in comm_items)

    return {
        "score": structural_score,
        "signals_failed": signals_failed,
        "communication_items": [it.model_dump() for it in comm_items],
    }
168
+
169
+
170
def finalize_from_verdicts_json(
    scores_path: Path,
    verdicts_path: Path,
) -> ImprovementPlan:
    """Reconstruct a full ImprovementPlan from pre-computed L1+L3 + Claude L2 verdicts.

    ``scores_path`` holds the ``run_ai_readiness_structural`` JSON output.
    ``verdicts_path`` must contain a JSON array of objects in the form:
    ``[{"file": "...", "dimension": "...", "verdicts": [{...RubricVerdict fields...}]}]``

    Each entry's ``score`` is computed from its verdicts (neutral 50 when the
    list is empty); ``error`` defaults to null. Non-dict array entries and
    non-dict verdict items are skipped.

    Raises:
        ValueError: when either JSON file cannot be parsed or validated.
    """
    # Fix: the original caught (json.JSONDecodeError, KeyError, Exception) —
    # a redundant tuple, since Exception subsumes both (flake8-bugbear B014).
    # The broad catch is deliberate: bad JSON, missing keys, and pydantic
    # validation failures all collapse into one actionable ValueError.
    try:
        scores = json.loads(scores_path.read_text(encoding="utf-8"))
        readiness = ReadinessResult.model_validate(scores["readiness"])
        cache = CacheDiagnosis.model_validate(scores["cache"])
    except Exception as e:  # noqa: BLE001
        msg = f"Could not parse scores JSON at {scores_path}: {e}"
        raise ValueError(msg) from e

    try:
        raw_verdicts = json.loads(verdicts_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        msg = f"Could not parse verdicts JSON at {verdicts_path}: {e}"
        raise ValueError(msg) from e

    judge_results: list[JudgeResult] = []
    if isinstance(raw_verdicts, list):
        for entry in raw_verdicts:
            if not isinstance(entry, dict):
                continue
            verdicts = [
                RubricVerdict.model_validate(v)
                for v in entry.get("verdicts", [])
                if isinstance(v, dict)
            ]
            score = compute_score_from_verdicts(verdicts) if verdicts else 50
            judge_results.append(
                JudgeResult(
                    file=str(entry.get("file", "")),
                    dimension=str(entry.get("dimension", "")),
                    score=score,
                    verdicts=verdicts,
                    error=entry.get("error"),
                )
            )

    return build_improvement_plan(readiness, judge_results, cache)
216
+
217
+
218
+ def render_terminal_summary(plan: ImprovementPlan, *, max_actions: int = 10) -> str:
219
+ """Concise text suitable for stdout when /hm:ai-readiness is invoked."""
220
+ lines = [
221
+ f"ai-readiness: {plan.composite_score} / 100",
222
+ "",
223
+ "Layer scores:",
224
+ f" readiness : {plan.layer_scores['readiness']:>3} (deterministic structural)",
225
+ f" llm_judge : {plan.layer_scores['llm_judge']:>3} (LLM content quality)",
226
+ f" cache : {plan.layer_scores['cache']:>3} (prompt-caching efficiency)",
227
+ "",
228
+ ]
229
+ if not plan.actions:
230
+ lines.append("No actions — project looks healthy.")
231
+ return "\n".join(lines)
232
+
233
+ lines.append(f"Top {min(max_actions, len(plan.actions))} of {len(plan.actions)} actions:")
234
+ for a in plan.actions[:max_actions]:
235
+ lines.append(f" [{a.priority}] {a.dimension} :: {a.summary}")
236
+ lines.append(f" → {a.suggestion}")
237
+ if len(plan.actions) > max_actions:
238
+ lines.append(f" … {len(plan.actions) - max_actions} more (run --verbose for full list)")
239
+ return "\n".join(lines)
240
+
241
+
242
+ def _format_action_row(a: ActionItem) -> str:
243
+ suggestion = a.suggestion.replace("|", r"\|").replace("\n", " ")
244
+ summary = a.summary.replace("|", r"\|").replace("\n", " ")
245
+ return f"| {a.priority} | {a.dimension} | {summary} | {suggestion} |"
246
+
247
+
248
+ def render_dashboard_markdown(plan: ImprovementPlan, project_name: str) -> str:
249
+ """Markdown dashboard body for ``.claude/observability/dashboard.md``."""
250
+ lines: list[str] = [
251
+ f"# AI Readiness — {project_name}",
252
+ "",
253
+ f"**Composite:** {plan.composite_score} / 100",
254
+ "",
255
+ "## Layer scores",
256
+ "",
257
+ "| Layer | Score | What it measures |",
258
+ "|-------|------:|------------------|",
259
+ f"| readiness | {plan.layer_scores['readiness']} | "
260
+ "Deterministic structural signals (CLAUDE.md, hooks, tests, CI, …) |",
261
+ f"| llm_judge | {plan.layer_scores['llm_judge']} | LLM-judged content quality vs rubrics |",
262
+ f"| cache | {plan.layer_scores['cache']} | "
263
+ "Prompt-cache hit rate + failure-mode diagnosis |",
264
+ "",
265
+ ]
266
+ if not plan.actions:
267
+ lines.extend(["## Actions", "", "(none — project looks healthy)", ""])
268
+ return "\n".join(lines) + "\n"
269
+
270
+ lines.extend(
271
+ [
272
+ "## Actions",
273
+ "",
274
+ "| Priority | Dimension | Summary | Suggestion |",
275
+ "|----------|-----------|---------|------------|",
276
+ ]
277
+ )
278
+ for a in plan.actions:
279
+ lines.append(_format_action_row(a))
280
+ lines.append("")
281
+ return "\n".join(lines) + "\n"