@hallucination-studio/harness-engine 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +185 -27
  3. package/bin/install.js +29 -17
  4. package/package.json +10 -4
  5. package/skills/harness-engine/SKILL.md +97 -0
  6. package/skills/harness-engine/agents/openai.yaml +4 -0
  7. package/skills/harness-engine/evals/cases.json +94 -0
  8. package/skills/harness-engine/evals/harness_engine_evals/__init__.py +1 -0
  9. package/skills/harness-engine/evals/harness_engine_evals/cases_frontend.py +211 -0
  10. package/skills/harness-engine/evals/harness_engine_evals/cases_lifecycle.py +1616 -0
  11. package/skills/harness-engine/evals/harness_engine_evals/helpers.py +155 -0
  12. package/skills/harness-engine/evals/harness_engine_evals/registry.py +55 -0
  13. package/skills/harness-engine/evals/harness_engine_evals/report.py +36 -0
  14. package/skills/harness-engine/evals/harness_engine_evals/runner.py +53 -0
  15. package/skills/harness-engine/evals/run_evals.py +14 -0
  16. package/skills/{harness-repo-bootstrap → harness-engine}/references/evaluation-loop.md +8 -2
  17. package/skills/harness-engine/references/evidence-first-evals.md +187 -0
  18. package/skills/harness-engine/references/exec-plans.md +59 -0
  19. package/skills/{harness-repo-bootstrap → harness-engine}/references/file-map.md +3 -3
  20. package/skills/{harness-repo-bootstrap → harness-engine}/references/knowledge-capture.md +2 -2
  21. package/skills/{harness-repo-bootstrap → harness-engine}/references/sop-index.md +3 -0
  22. package/skills/harness-engine/references/template-policy.md +17 -0
  23. package/skills/harness-engine/references/workflow.md +62 -0
  24. package/skills/harness-engine/scripts/harness_engine/__init__.py +1 -0
  25. package/skills/harness-engine/scripts/harness_engine/analysis.py +240 -0
  26. package/skills/harness-engine/scripts/harness_engine/checks.py +287 -0
  27. package/skills/harness-engine/scripts/harness_engine/cli.py +656 -0
  28. package/skills/harness-engine/scripts/harness_engine/common.py +977 -0
  29. package/skills/harness-engine/scripts/harness_engine/continuation.py +520 -0
  30. package/skills/harness-engine/scripts/harness_engine/git_ops.py +88 -0
  31. package/skills/harness-engine/scripts/harness_engine/knowledge.py +329 -0
  32. package/skills/harness-engine/scripts/harness_engine/plans.py +630 -0
  33. package/skills/harness-engine/scripts/harness_engine/templates.py +124 -0
  34. package/skills/harness-engine/scripts/manage_harness.py +14 -0
  35. package/skills/harness-repo-bootstrap/SKILL.md +0 -68
  36. package/skills/harness-repo-bootstrap/agents/openai.yaml +0 -4
  37. package/skills/harness-repo-bootstrap/evals/cases.json +0 -18
  38. package/skills/harness-repo-bootstrap/evals/run_evals.py +0 -337
  39. package/skills/harness-repo-bootstrap/references/exec-plans.md +0 -39
  40. package/skills/harness-repo-bootstrap/references/template-policy.md +0 -12
  41. package/skills/harness-repo-bootstrap/references/workflow.md +0 -47
  42. package/skills/harness-repo-bootstrap/scripts/manage_harness.py +0 -1181
  43. /package/skills/{harness-repo-bootstrap → harness-engine}/assets/repo-template/.keep +0 -0
  44. /package/skills/{harness-repo-bootstrap → harness-engine}/assets/sops/.keep +0 -0
  45. /package/skills/{harness-repo-bootstrap → harness-engine}/references/question-catalog.md +0 -0
@@ -0,0 +1,1616 @@
1
+ import json
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
+ import time
7
+ from pathlib import Path
8
+
9
+ from .helpers import *
10
+ from .report import build_report
11
+
12
def test_empty_repo_init(tmp_root):
    """Check ``analyze`` and ``init`` against a newly created, empty repository.

    The case first confirms that the analysis recommends ``init`` and reports
    missing exec-plan state and SOPs, and that ``analyze --output`` writes to a
    path whose parent directories do not exist yet. It then runs ``init`` and
    verifies the generated files, the phrases required inside them, and that
    no frontend design documents were produced.

    Args:
        tmp_root: Scratch directory (path object) that holds the repository,
            the answers file, and the nested analysis output.

    Raises:
        AssertionError: If any expectation is not met.
    """
    workspace = tmp_root / "empty-repo"
    workspace.mkdir()
    answers_file = tmp_root / "answers.json"
    write_answers(answers_file)
    repo_arg = str(workspace)

    report = run_manager("analyze", "--repo", repo_arg)
    if report["recommended_action"] != "init":
        raise AssertionError("Empty repo should recommend init")
    if not report["missing_exec_plan_state"]:
        raise AssertionError("Analysis should report missing exec-plan state")
    if not report["missing_sops"]:
        raise AssertionError("Analysis should report missing SOPs")

    # The parents "nested/generated" are deliberately not created beforehand.
    analysis_copy = tmp_root / "nested" / "generated" / "analysis.json"
    run_manager("analyze", "--repo", repo_arg, "--output", str(analysis_copy))
    if not analysis_copy.exists():
        raise AssertionError("analyze --output should create missing parent directories")

    run_manager("init", "--repo", repo_arg, "--answers", str(answers_file))

    expected_files = (
        "AGENTS.md",
        "ARCHITECTURE.md",
        "docs/PLANS.md",
        "docs/QUALITY_SCORE.md",
        "docs/exec-plans/workstreams.md",
        "docs/exec-plans/active/_template.md",
        "docs/exec-plans/completed/README.md",
        "docs/sops/encode-unseen-knowledge.md",
        "docs/sops/evidence-first-eval-loop.md",
    )
    for generated in expected_files:
        assert_exists(workspace, generated)

    # Ordered (document, phrases) pairs; the order fixes which failure is
    # reported first when several phrases are missing.
    required_phrases = (
        (
            "AGENTS.md",
            (
                "docs/exec-plans/active/",
                "docs/exec-plans/workstreams.md",
                "docs/sops/",
                "Codex runs the local harness check before handoff",
                "## Harness Task Intake",
                "Default rule: any request that changes repository files or behavior goes through the harness lifecycle",
                "No-plan exceptions are narrow",
                "plan-start",
                "acceptance-set",
                "quality-score",
                "plan-close",
                "## Issue Workflows",
                "Product contract or acceptance drift",
                "Backend, API, runtime behavior, background jobs, or integrations",
                "Architecture boundaries, layering, data flow, or dependency direction",
                "Data, state, migrations, cache, queues, or file formats",
                "Security, privacy, auth, authorization, secrets, or sensitive data",
                "Performance, capacity, timeout, resource use, or availability",
                "Convert requirements, risks, or reported failures into assertions, tests, smoke checks, or review evidence",
                "Log confirmed defects or missing evidence with `defect-log`",
            ),
        ),
        (
            "docs/PLANS.md",
            (
                "Create or reuse an execution plan for every repository change",
                "For small changes, keep the plan lightweight",
                "Only skip an execution plan for pure question answering",
            ),
        ),
        (
            "docs/exec-plans/active/README.md",
            ("Create one markdown file per in-flight repository change",),
        ),
        (
            "docs/exec-plans/active/_template.md",
            (
                "## Continuation Decision",
                "Decision: pending",
            ),
        ),
        (
            "docs/sops/evidence-first-eval-loop.md",
            ("Read Harness Task Intake in `AGENTS.md`",),
        ),
        (
            "docs/QUALITY_SCORE.md",
            (
                "Evidence Requirements",
                "Treat LLM or human judgment as a summary over evidence",
                "Backend and runtime scores must cite",
                "Architecture scores must cite",
                "Security scores must cite",
            ),
        ),
    )
    for document, phrases in required_phrases:
        for phrase in phrases:
            assert_contains(workspace, document, phrase)

    frontend_docs = (
        "docs/FRONTEND.md",
        "docs/DESIGN.md",
        "docs/design-docs/style-options.md",
    )
    unexpected = next(
        (candidate for candidate in frontend_docs if (workspace / candidate).exists()),
        None,
    )
    if unexpected is not None:
        raise AssertionError(f"Empty backend-shaped repo should not receive frontend design docs: {unexpected}")

    assert_contains(workspace, "docs/sops/evidence-first-eval-loop.md", "Report per-case results")
83
+
84
+
85
def test_init_reconciles_existing_harness(tmp_root):
    """Check that a second ``init`` repairs an already initialised harness.

    After a first ``init``, the case deletes one generated SOP and overwrites
    ``AGENTS.md`` with stale content that still carries the
    ``harness-engine:managed`` marker. A second ``init`` is then expected to
    report a reconcile operation, list the SOP as created and ``AGENTS.md`` as
    refreshed, and restore both files.

    Args:
        tmp_root: Scratch directory (path object) for the repository and the
            answers file.

    Raises:
        AssertionError: If any expectation is not met.
    """
    sop_relative = "docs/sops/evidence-first-eval-loop.md"

    workspace = tmp_root / "reconcile-repo"
    workspace.mkdir()
    answers_file = tmp_root / "reconcile-answers.json"
    write_answers(answers_file, project_name="reconcile-demo")
    init_args = ("init", "--repo", str(workspace), "--answers", str(answers_file))

    first_run = run_manager(*init_args)
    if not (first_run["mode"] == "init" and "AGENTS.md" in first_run["created"]):
        raise AssertionError("init should report created managed files")

    follow_up = run_manager("analyze", "--repo", str(workspace))
    if not (follow_up["recommended_action"] == "init" and follow_up["harness_state"] == "existing"):
        raise AssertionError("existing harnesses should still route through init reconciliation")

    # Damage the harness: drop one generated file and make a managed file stale.
    (workspace / "docs" / "sops" / "evidence-first-eval-loop.md").unlink()
    (workspace / "AGENTS.md").write_text("<!-- harness-engine:managed -->\n# stale managed router\n")

    second_run = run_manager(*init_args)
    if not (second_run["mode"] == "init" and second_run["operation"] == "reconciled"):
        raise AssertionError("init should reconcile an existing managed harness")
    if sop_relative not in second_run["created"]:
        raise AssertionError("init reconcile should create missing managed files introduced by newer templates")
    if "AGENTS.md" not in second_run["refreshed"]:
        raise AssertionError("init reconcile should refresh existing managed files")

    assert_contains(workspace, "AGENTS.md", "## Issue Workflows")
    assert_exists(workspace, sop_relative)
110
+
111
+
112
def _index_states(porcelain_output):
    """Map each path in ``git status --porcelain`` (v1) output to its index status.

    Each entry has the layout ``XY PATH``: ``X`` is the staged (index) status,
    ``Y`` the working-tree status, then one space and the path.

    Args:
        porcelain_output: Text printed by ``git status --porcelain``.

    Returns:
        Dict of path -> single-character index status (for example ``"D"`` for
        a staged deletion or ``"A"`` for a staged addition).
    """
    states = {}
    for entry in porcelain_output.splitlines():
        if len(entry) < 4:
            continue
        # Rename entries ("old -> new") and quoted paths are not unpacked; the
        # fixture paths used by this module never produce them.
        states[entry[3:]] = entry[0]
    return states


def test_clean_removes_runtime_state_and_untracks_artifacts(tmp_root):
    """Check ``clean`` in dry-run and ``--apply`` modes.

    Part one runs ``clean`` in a directory without git metadata and expects a
    dry-run that lists local generated evidence but no tracked candidates.

    Part two commits local skill installs, generated evidence, and durable
    plan state into a fresh git repository, then verifies that:

    * the dry-run lists only skill installs and generated evidence and deletes
      nothing;
    * ``--apply`` removes exactly those paths from the index, writes and
      stages ``.gitignore``, deletes generated evidence from disk, keeps the
      local skill install on disk, and leaves durable plan state untouched.

    The staged state is read from ``git status --porcelain`` and compared per
    path, so the checks do not depend on the column spacing of the status
    output.

    Args:
        tmp_root: Scratch directory (path object) for both repositories.

    Raises:
        AssertionError: If any expectation is not met.
        subprocess.CalledProcessError: If a git command fails.
    """
    non_git_repo = tmp_root / "clean-non-git-repo"
    non_git_repo.mkdir()
    generated = non_git_repo / "docs" / "generated" / "local-smoke.txt"
    generated.parent.mkdir(parents=True, exist_ok=True)
    generated.write_text("local generated evidence\n")
    non_git_dry_run = run_manager("clean", "--repo", str(non_git_repo))
    if non_git_dry_run["tracked_candidate_count"] != 0 or non_git_dry_run["tracked_candidates"]:
        raise AssertionError("clean dry-run should not require git metadata in a non-git repository")
    if "docs/generated/local-smoke.txt" not in non_git_dry_run["local_candidates"]:
        raise AssertionError("clean dry-run should still preview local generated evidence in non-git repositories")

    repo = tmp_root / "clean-repo"
    repo.mkdir()

    def git(*args):
        """Run one git command inside ``repo`` and fail loudly on errors."""
        return subprocess.run(
            ["git", *args],
            cwd=repo,
            text=True,
            capture_output=True,
            check=True,
        )

    git("init")
    # A committer identity is set locally so the commit below does not depend
    # on the machine's global git configuration for it.
    git("config", "user.email", "harness-eval@example.com")
    git("config", "user.name", "Harness Eval")

    tracked_files = [
        ".codex/skills/harness-engine/SKILL.md",
        "docs/generated/canvas-polish-desktop-final.png",
        "docs/generated/harness-analysis.json",
    ]
    durable_plan_files = [
        "docs/exec-plans/active/2026-06-11-old-task.md",
        "docs/exec-plans/active/2026-06-11-old-task.json",
        "docs/exec-plans/completed/2026-06-11-old-task.md",
        "docs/exec-plans/completed/2026-06-11-old-task.json",
        "docs/exec-plans/workstreams.md",
    ]
    all_files = tracked_files + durable_plan_files
    tracked_set = set(tracked_files)
    durable_set = set(durable_plan_files)

    for relative_path in all_files:
        path = repo / relative_path
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text("tracked harness file\n")
    git("add", *all_files)
    git("commit", "-m", "track runtime artifacts")

    dry_run = run_manager("clean", "--repo", str(repo))
    if dry_run["mode"] != "dry-run" or dry_run["tracked_candidate_count"] != len(tracked_files):
        raise AssertionError("clean should dry-run tracked runtime artifact candidates")
    if set(dry_run["tracked_candidates"]) != tracked_set:
        raise AssertionError("clean tracked candidates should include only local skill installs and generated evidence")
    if set(dry_run["tracked_candidates"]) & durable_set:
        raise AssertionError("clean dry-run should not list execution plans, sidecars, or workstreams as tracked candidates")
    if "docs/generated/canvas-polish-desktop-final.png" not in set(dry_run["local_candidates"]):
        raise AssertionError("clean should preview local generated evidence cleanup")
    if set(dry_run["local_candidates"]) & durable_set:
        raise AssertionError("clean dry-run should not list execution plans, sidecars, or workstreams as local cleanup candidates")
    for relative_path in all_files:
        if not (repo / relative_path).exists():
            raise AssertionError("clean dry-run should not delete local files")

    applied = run_manager("clean", "--repo", str(repo), "--apply")
    if applied["mode"] != "apply" or set(applied["removed_from_index"]) != tracked_set:
        raise AssertionError("clean --apply should remove candidates from the git index")
    if set(applied["removed_from_index"]) & durable_set:
        raise AssertionError("clean --apply should not untrack execution plans, sidecars, or workstreams")
    assert_contains(repo, ".gitignore", ".codex/skills/")
    assert_contains(repo, ".gitignore", "docs/generated/")

    # --porcelain keeps the "XY PATH" layout stable regardless of user config.
    staged = _index_states(git("status", "--porcelain").stdout)

    for relative_path in tracked_files:
        if staged.get(relative_path) != "D":
            raise AssertionError(f"clean should stage index deletion for {relative_path}")
    for relative_path in durable_plan_files:
        if staged.get(relative_path) == "D":
            raise AssertionError(f"clean should not stage index deletion for durable plan state {relative_path}")
        if not (repo / relative_path).exists():
            raise AssertionError(f"clean should keep durable plan state file {relative_path}")
    for relative_path in tracked_files:
        still_on_disk = (repo / relative_path).exists()
        if relative_path.startswith(".codex/skills/"):
            # Skill installs are untracked but must stay available locally.
            if not still_on_disk:
                raise AssertionError(f"clean should keep local skill install file for {relative_path}")
        elif still_on_disk:
            raise AssertionError(f"clean should delete local runtime file for {relative_path}")
    if staged.get(".gitignore") != "A":
        raise AssertionError("clean should stage the new .gitignore block")
212
+
213
+
214
def test_broad_task_intake_routes_repo_changes(tmp_root):
    """Check that the generated docs route every kind of repo change through intake.

    After ``init``, the case reads ``AGENTS.md``, ``docs/PLANS.md``, the active
    exec-plan README, and the evidence-first SOP, then looks for the intake
    rules, the routed scenarios, the minimum-evidence wording, and a handful
    of cross-document statements.

    Args:
        tmp_root: Scratch directory (path object) for the repository and the
            answers file.

    Raises:
        AssertionError: Naming the first phrase that is missing.
    """
    repo_dir = tmp_root / "task-intake-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "task-intake-answers.json"
    write_answers(answers_file, project_name="task-intake-demo")
    run_manager("init", "--repo", str(repo_dir), "--answers", str(answers_file))

    # All four documents are loaded up front, before any phrase is checked.
    docs_dir = repo_dir / "docs"
    agents_text = (repo_dir / "AGENTS.md").read_text()
    plans_text = (docs_dir / "PLANS.md").read_text()
    active_readme_text = (docs_dir / "exec-plans" / "active" / "README.md").read_text()
    sop_text = (docs_dir / "sops" / "evidence-first-eval-loop.md").read_text()

    # (failure label, phrases that must appear in AGENTS.md), checked in order.
    agents_expectations = (
        (
            "AGENTS.md should include broad task intake rule",
            (
                "## Harness Task Intake",
                "Default rule: any request that changes repository files or behavior goes through the harness lifecycle",
                "code, docs, configuration, tests, dependencies, generated templates, build/release scripts, runtime behavior, migrations, cleanup",
                "No-plan exceptions are narrow",
                "Codex creates or reuses an active plan with `plan-start`",
                "Codex defines a ready Acceptance Contract with `acceptance-set` before implementation",
                "have Codex score with `quality-score`",
                "Codex closes with `plan-close`",
                "Codex runs the local harness check before handoff",
            ),
        ),
        (
            "AGENTS.md should route scenario",
            (
                "New feature or product behavior",
                "Bug, regression, or user-reported issue",
                "Refactor, cleanup, or code organization",
                "Frontend, UI, design, layout, terminal interface, visual state, or interaction",
                "Tests, evals, fixtures, or validation harnesses",
                "Documentation, policy, specs, or generated harness templates",
                "Dependencies, tooling, package manager, or build system",
                "Build, release, deployment, or packaging",
                "Configuration, environment, flags, secrets handling, or policy gates",
                "Data, migrations, storage, cache, queues, or file formats",
                "Performance, reliability, observability, or operational behavior",
                "Security, privacy, auth, authorization, or sensitive data",
                "Code review finding or user feedback that requires changes",
            ),
        ),
        (
            "AGENTS.md should name minimum evidence",
            (
                "Product assertions, workflow checks, tests or smoke evidence",
                "Reproduction, regression assertion, fix validation, defect log if confirmed",
                "Before/after behavior checks, boundary or dependency notes, compatibility evidence",
                "Browser or local-runtime evidence for workflows, states, and relevant viewports",
                "Failing-before or coverage rationale, passing test/eval output, artifact paths when produced",
                "Doc diff review, link/path validation, generated-output or eval evidence when templates change",
                "Install/build/test output, lockfile or package diff, compatibility and rollback notes",
                "Repeatable build/package output, smoke check, release-risk notes",
                "Config diff, secret-handling review, permission or failure-mode evidence",
                "Fixtures or migration checks, rollback/compatibility evidence, data-loss risk notes",
                "Baseline measurement, repeatable benchmark or smoke check, logs/traces, before/after evidence",
                "Threat check, sensitive-data path, permission test, and secret-handling evidence",
            ),
        ),
    )
    for label, needles in agents_expectations:
        for needle in needles:
            if needle not in agents_text:
                raise AssertionError(f"{label}: {needle}")

    # (phrase, document text, failure message), checked in order.
    cross_document_checks = (
        (
            "Issue handling is one branch of Harness Task Intake",
            agents_text,
            "Issue Workflows should be subordinate to Harness Task Intake",
        ),
        (
            "Create or reuse an execution plan for every repository change",
            plans_text,
            "PLANS.md should require plans for every repository change",
        ),
        (
            "For small changes, keep the plan lightweight",
            plans_text,
            "PLANS.md should keep small changes lightweight but planned",
        ),
        (
            "Only skip an execution plan for pure question answering",
            plans_text,
            "PLANS.md should document no-plan exceptions",
        ),
        (
            "Create one markdown file per in-flight repository change",
            active_readme_text,
            "active README should cover any in-flight repository change",
        ),
        (
            "Read Harness Task Intake in `AGENTS.md`",
            sop_text,
            "SOP should start from Harness Task Intake",
        ),
    )
    for needle, haystack, message in cross_document_checks:
        if needle not in haystack:
            raise AssertionError(message)
289
+
290
+
291
def test_closed_loop_plan(tmp_root):
    """Drive execution plans from start to close, covering knowledge capture and scoring.

    Three plans are exercised in one repository:

    1. A plan created with ``plan-start`` whose knowledge item is closed with
       ``--append``. ``plan-close`` is expected to be refused before the
       acceptance contract is set and again before a passing score, and to
       succeed afterwards.
    2. The pre-formatted plan written by ``create_formatted_plan``, whose
       backticked knowledge item is marked written using a fact string that
       omits the backticks.
    3. A plan whose knowledge item is closed by id with an evidence file
       instead of appending the exact fact to the destination.

    Before ``init`` the case also checks language detection: the shell script
    must be detected while the Python file under ``.codex/skills`` must not.

    Args:
        tmp_root: Scratch directory (path object) for the repository, the
            answers file, and the evidence file.

    Raises:
        AssertionError: If any expectation is not met.
    """
    repo = tmp_root / "loop-repo"
    repo.mkdir()
    (repo / "snake.sh").write_text("#!/usr/bin/env bash\nprintf 'snake\\n'\n")
    skill_scripts = repo / ".codex" / "skills" / "demo" / "scripts"
    skill_scripts.mkdir(parents=True)
    (skill_scripts / "tool.py").write_text("print('ignore me')\n")
    answers = tmp_root / "loop-answers.json"
    write_answers(answers, project_name="loop-demo")

    repo_arg = str(repo)
    architecture_doc = repo / "ARCHITECTURE.md"

    def on_plan(command, plan, *extra, **options):
        """Run a manager command for this repository against *plan*."""
        return run_manager(command, "--repo", repo_arg, "--plan", plan, *extra, **options)

    def start_plan(slug, goal):
        """Start a plan, fill in its details, and return (path, repo-relative path)."""
        started = run_manager("plan-start", "--repo", repo_arg, "--slug", slug, "--goal", goal)
        path = Path(started["plan"])
        fill_plan_details(path)
        return path, str(path.resolve().relative_to(repo.resolve()))

    def scores(product, ux, architecture, reliability, security):
        """Build the five score flags in the order the cases pass them."""
        return (
            "--product-correctness",
            product,
            "--ux-operator-clarity",
            ux,
            "--architecture-maintainability",
            architecture,
            "--reliability-observability",
            reliability,
            "--security-data-handling",
            security,
        )

    def append_to_architecture(text):
        """Append *text* to ARCHITECTURE.md."""
        with architecture_doc.open("a") as stream:
            stream.write(text)

    detected = run_manager("analyze", "--repo", repo_arg)["languages"]
    if "Shell" not in detected:
        raise AssertionError("Shell should be detected from target project files")
    if "Python" in detected:
        raise AssertionError(".codex skill files should not affect target project language detection")
    run_manager("init", "--repo", repo_arg, "--answers", str(answers))

    # --- Plan 1: knowledge closed with --append ---------------------------
    plan_path, relative_plan = start_plan("knowledge-loop", "Validate durable knowledge closure")
    fact = "Install mode must distinguish local and global skill destinations"
    knowledge_target = ("--fact", fact, "--destination", "docs/PRODUCT_SENSE.md")
    on_plan("knowledge-log", relative_plan, *knowledge_target)

    early_close = on_plan("plan-close", relative_plan, "--summary", "done", expect_success=False)
    if early_close.get("reason") != "acceptance-contract-not-ready":
        raise AssertionError("plan-close should return structured acceptance-contract-not-ready JSON before acceptance")

    # Without --append the command is expected to fail; with it, to succeed.
    on_plan("knowledge-mark-written", relative_plan, *knowledge_target, expect_success=False)
    on_plan("knowledge-mark-written", relative_plan, *knowledge_target, "--append")
    assert_contains(repo, "docs/PRODUCT_SENSE.md", fact)

    set_acceptance(repo, relative_plan)
    unscored_close = on_plan("plan-close", relative_plan, "--summary", "done", expect_success=False)
    if unscored_close.get("reason") != "quality-result-not-passing":
        raise AssertionError("plan-close should return structured quality-result-not-passing JSON before scoring")

    low_architecture_note = "Plan closure review found architecture evidence below the required threshold."
    # NOTE(review): the note is passed explicitly and again through
    # quality_note_args; presumably the helper emits the same flag. Confirm
    # against helpers.py before removing either one.
    failing_score = on_plan(
        "quality-score",
        relative_plan,
        *scores("9", "8", "7", "8", "8"),
        "--architecture-note",
        low_architecture_note,
        *quality_note_args(architecture=low_architecture_note),
        expect_success=False,
    )
    if failing_score["status"] != "fail":
        raise AssertionError("Low dimension score should fail the quality gate")
    failed_plan_text = plan_path.read_text()
    if "## Rework Required" not in failed_plan_text:
        raise AssertionError("Failing quality score should keep a rework section")
    if "Improve Architecture and maintainability" not in failed_plan_text:
        raise AssertionError("Failing quality score should name the low dimension")

    set_continuation_complete(repo, relative_plan)
    if run_manager("check", "--repo", repo_arg)["status"] != "pass":
        raise AssertionError("Active plan check should require acceptance readiness, not a passing post-implementation score")

    passing_score = on_plan(
        "quality-score",
        relative_plan,
        *scores("9", "8", "8", "8", "8"),
        *quality_note_args(
            product="Requested behavior was validated by the closed-loop eval command.",
            architecture="Plan closure architecture was reviewed in plan sidecar files.",
        ),
    )
    if passing_score["status"] != "pass":
        raise AssertionError("Scores at or above the minimum should pass")

    closed = on_plan("plan-close", relative_plan, "--summary", "Closed after writing durable knowledge.")
    if closed["status"] != "closed":
        raise AssertionError("Plan should close after knowledge is marked written")
    if plan_path.exists():
        raise AssertionError("Active plan should be moved after close")
    assert_exists(repo, "docs/exec-plans/completed/" + plan_path.name)
    if run_manager("check", "--repo", repo_arg)["status"] != "pass":
        raise AssertionError("Harness check should pass after plan closure")

    # --- Plan 2: pre-formatted plan with a backticked knowledge item ------
    formatted_plan = create_formatted_plan(repo)
    formatted_relative_plan = str(formatted_plan.resolve().relative_to(repo.resolve()))
    formatted_fact = "snake.sh is the single runtime entrypoint and owns terminal control directly with stty and tput"
    append_to_architecture(
        "\n`snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`.\n"
    )
    on_plan(
        "knowledge-mark-written",
        formatted_relative_plan,
        "--fact",
        formatted_fact,
        "--destination",
        "ARCHITECTURE.md",
    )

    # --- Plan 3: knowledge closed by id with an evidence file -------------
    id_plan_path, id_relative_plan = start_plan(
        "id-knowledge-loop",
        "Validate id-based durable knowledge closure",
    )
    id_fact = "Runtime input is owned by the terminal runner and core game logic remains independent of terminal packages"
    logged = on_plan(
        "knowledge-log",
        id_relative_plan,
        "--fact",
        id_fact,
        "--destination",
        "ARCHITECTURE.md",
    )
    # The destination receives a paraphrase, never the exact fact text.
    append_to_architecture(
        "\nThe `main` package owns keyboard input and rendering, while `game` contains pure state transitions.\n"
    )
    evidence_file = tmp_root / "evidence.txt"
    evidence_file.write_text("main package owns keyboard input and rendering\n")
    on_plan(
        "knowledge-mark-written",
        id_relative_plan,
        "--id",
        logged["id"],
        "--evidence-file",
        str(evidence_file),
    )
    set_acceptance(repo, id_relative_plan)
    on_plan(
        "quality-score",
        id_relative_plan,
        *scores("8", "8", "8", "8", "8"),
        *quality_note_args(
            architecture="Id-based evidence closure was validated against ARCHITECTURE.md",
        ),
    )
    id_plan_text = id_plan_path.read_text()
    if id_fact in architecture_doc.read_text():
        raise AssertionError("Id/evidence closure should not require appending the exact fact to the destination")
    if "| evidence: main package owns keyboard input and rendering" not in id_plan_text:
        raise AssertionError("Closed knowledge item should record the verification evidence")
    set_continuation_complete(repo, id_relative_plan)
    on_plan("plan-close", id_relative_plan, "--summary", "Closed with id-based evidence.")
549
+
550
+
551
def create_formatted_plan(repo):
    """Write a hand-formatted active execution plan and return its path.

    The fixture holds a ``Quality Gate`` section already marked ``pass`` with
    all five dimensions scored 8.0, and one unchecked ``Durable Knowledge To
    Capture`` item that targets ``ARCHITECTURE.md`` and wraps ``snake.sh``,
    ``stty`` and ``tput`` in backticks.

    Args:
        repo: Repository root as a path object.

    Returns:
        The path ``docs/exec-plans/active/formatted-plan.md`` under *repo*.
    """
    plan_path = repo / "docs" / "exec-plans" / "active" / "formatted-plan.md"
    # Create the directory here so the fixture does not fail with
    # FileNotFoundError when the active-plans folder has not been set up yet.
    plan_path.parent.mkdir(parents=True, exist_ok=True)
    # An explicit encoding keeps the written bytes independent of the locale.
    plan_path.write_text(
        """# Execution Plan: Formatted Plan

## Quality Gate

Status: pass
Minimum score: 8.0
Average score: 8.0
Last scored: 2026-06-11T00:00:00Z

| Dimension | Score | Notes |
| --- | ---: | --- |
| Product correctness | 8.0 | ok |
| UX and operator clarity | 8.0 | ok |
| Architecture and maintainability | 8.0 | ok |
| Reliability and observability | 8.0 | ok |
| Security and data handling | 8.0 | ok |

## Durable Knowledge To Capture

- [ ] `snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`. -> `ARCHITECTURE.md`
""",
        encoding="utf-8",
    )
    return plan_path
577
+
578
+
579
def test_preserve_unmanaged_docs(tmp_root):
    """Check that ``init`` leaves a user-authored ``AGENTS.md`` alone.

    The repository starts with an ``AGENTS.md`` that has no managed marker.
    ``init`` is expected to list it under ``skipped``, keep its content, and
    still generate ``docs/PLANS.md``.

    Args:
        tmp_root: Scratch directory (path object) for the repository and the
            answers file.

    Raises:
        AssertionError: If any expectation is not met.
    """
    router_name = "AGENTS.md"

    workspace = tmp_root / "partial-repo"
    workspace.mkdir()
    (workspace / router_name).write_text("# Existing user router\n\nKeep this custom content.\n")
    answers_file = tmp_root / "partial-answers.json"
    write_answers(answers_file)

    outcome = run_manager("init", "--repo", str(workspace), "--answers", str(answers_file))
    skipped = outcome["skipped"]
    if router_name not in skipped:
        raise AssertionError("Unmanaged AGENTS.md should be skipped")

    assert_contains(workspace, router_name, "Keep this custom content.")
    assert_exists(workspace, "docs/PLANS.md")
591
+
592
+
593
def test_continuation_decision_workstream(tmp_root):
    """Exercise continuation decisions and the workstream ledger end to end.

    In order, the scenario checks that:

    * a freshly started plan renders a pending continuation decision, and
      both ``plan-close`` and ``check`` reject that state;
    * ``continuation-set --decision continue`` records the workstream in
      ``docs/exec-plans/workstreams.md`` with a goal other than ``none``;
    * ``plan-close`` repoints the ledger at the completed plan path, and
      ``check`` reports a ledger entry that references a missing plan;
    * a second plan closes after ``set_continuation_complete``;
    * ``pause`` decisions need a resume condition and resume notes, and an
      invalid ``continuation-set`` leaves the ledger untouched;
    * ``defer`` decisions need a target;
    * a legacy ``## Phase Continuity`` section validates and the ``phase-set``
      alias answers with a deprecation warning.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If any manager command deviates from the expectation.
    """
    repo_dir = tmp_root / "continuation-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "phase-answers.json"
    write_answers(answers_file, project_name="phase-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))

    ledger_name = "docs/exec-plans/workstreams.md"
    ledger = repo_dir / ledger_name
    active_dir = repo_dir / "docs" / "exec-plans" / "active"
    all_eights = (
        "--product-correctness", "8",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "8",
        "--security-data-handling", "8",
    )

    # Phased plan: while the decision is still pending, close and check fail.
    phased = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "local-workbench-phase-1",
        "--goal", "Complete Local Workbench Phase 1",
    )
    plan_file = Path(phased["plan"])
    fill_plan_details(plan_file)
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    for expected in ("## Continuation Decision", "Decision: pending"):
        assert_contains(repo_dir, relative_plan, expected)
    set_acceptance(repo_dir, relative_plan)
    plan_flag = (*repo_flag, "--plan", relative_plan)
    run_manager(
        "quality-score",
        *plan_flag,
        *all_eights,
        *quality_note_args(
            product="Phase 1 plan state was validated by the eval command.",
            architecture="Workstream continuity was inspected in docs/exec-plans/workstreams.md.",
        ),
    )
    premature_close = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Phase 1 done",
        expect_success=False,
    )
    if premature_close.get("reason") != "continuation-decision-incomplete":
        raise AssertionError("plan-close should return structured continuation-decision-incomplete JSON")
    pending_check = run_manager("check", *repo_flag, expect_success=False)
    if "continuation-decision-pending" not in {entry["code"] for entry in pending_check["issues"]}:
        raise AssertionError("check should flag plans that do not declare a continuation decision")

    # Record a "continue" decision and verify what lands in the ledger.
    run_manager(
        "continuation-set",
        *plan_flag,
        "--decision", "continue",
        "--workstream", "local-workbench",
        "--next-target", "docs/exec-plans/workstreams.md#local-workbench",
        "--next-action", "Create Phase 2 plan for command adapters",
        "--resume-notes", "Read completed Phase 1 plan and ARCHITECTURE.md before continuing",
    )
    for expected in (
        "local-workbench",
        "Create Phase 2 plan for command adapters",
        "Goal: Complete Local Workbench Phase 1",
    ):
        assert_contains(repo_dir, ledger_name, expected)
    if "Goal: none" in ledger.read_text():
        raise AssertionError("continuation-set should derive a useful workstream goal instead of writing Goal: none")
    closed = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Phase 1 done; Phase 2 recovery is recorded in workstreams.",
    )
    if closed["status"] != "closed":
        raise AssertionError("Phased plan should close after continuity and workstream recovery are recorded")
    completed_relative_plan = f"docs/exec-plans/completed/{plan_file.name}"
    ledger_text = ledger.read_text()
    if completed_relative_plan not in ledger_text:
        raise AssertionError("plan-close should update workstream ledger to the completed plan path")
    if relative_plan in ledger_text:
        raise AssertionError("workstream ledger should not keep stale active plan references after plan-close")

    # Corrupt the ledger on purpose so it points back at the moved plan.
    ledger.write_text(ledger_text.replace(completed_relative_plan, relative_plan))
    stale_check = run_manager("check", *repo_flag, expect_success=False)
    if "missing-workstream-plan-reference" not in {entry["code"] for entry in stale_check["issues"]}:
        raise AssertionError("check should fail when workstream ledger points to a missing plan")

    # Second plan: closes once set_continuation_complete has been applied.
    single = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "single-plan-complete",
        "--goal", "Validate complete continuation decision",
    )
    single_file = Path(single["plan"])
    fill_plan_details(single_file)
    single_relative = str(single_file.resolve().relative_to(repo_dir.resolve()))
    set_acceptance(repo_dir, single_relative)
    single_flag = (*repo_flag, "--plan", single_relative)
    run_manager(
        "quality-score",
        *single_flag,
        *all_eights,
        *quality_note_args(product="Complete continuation decision was validated by eval closure."),
    )
    set_continuation_complete(repo_dir, single_relative)
    single_close = run_manager(
        "plan-close",
        *single_flag,
        "--summary", "Closed as complete with no follow-up.",
    )
    if single_close["status"] != "closed":
        raise AssertionError("complete continuation decision should allow single-plan closure")

    # Pause decision: hand-written plan without resume condition or notes.
    pause_plan = active_dir / "pause-plan.md"
    pause_plan.write_text(
        "# Execution Plan: Pause Plan\n\n"
        "## Continuation Decision\n\n"
        "Decision: pause\n"
        "Workstream: pause-demo\n"
        "Next target: docs/exec-plans/workstreams.md#pause-demo\n"
        "Next action: Resume after dependency lands\n"
        "Closure reason: none\n"
        "Resume notes: none\n"
    )
    pause_issues = continuation_codes(repo_dir, pause_plan)
    if not ("missing-resume-condition" in pause_issues and "missing-resume-notes" in pause_issues):
        raise AssertionError("pause decisions should require resume condition and notes")
    pause_request = (
        "continuation-set",
        *repo_flag,
        "--plan", str(pause_plan.relative_to(repo_dir)),
        "--decision", "pause",
        "--workstream", "pause-demo",
        "--next-target", "docs/exec-plans/workstreams.md#pause-demo",
        "--next-action", "Resume after dependency lands",
    )
    rejected_pause = run_manager(*pause_request, expect_success=False)
    rejected_codes = {entry["code"] for entry in rejected_pause.get("issues", [])}
    if not ("missing-resume-condition" in rejected_codes and "missing-resume-notes" in rejected_codes):
        raise AssertionError("continuation-set should reject pause before writing when resume fields are missing")
    if "pause-demo" in ledger.read_text():
        raise AssertionError("invalid pause continuation-set should not write a half-valid workstream")
    run_manager(
        *pause_request,
        "--closure-reason", "Resume when the dependency is released",
        "--resume-notes", "Read dependency release notes before continuing",
    )
    if continuation_codes(repo_dir, pause_plan):
        raise AssertionError("pause decision with resume condition and notes should validate")

    # Defer decision without any target.
    defer_plan = active_dir / "defer-plan.md"
    defer_plan.write_text(
        "# Execution Plan: Defer Plan\n\n"
        "## Continuation Decision\n\n"
        "Decision: defer\n"
        "Workstream: none\n"
        "Next target: none\n"
        "Next action: none\n"
        "Closure reason: Follow-up is outside this workstream\n"
        "Resume notes: none\n"
    )
    if "missing-deferred-target" not in continuation_codes(repo_dir, defer_plan):
        raise AssertionError("defer decisions should require a tech-debt or follow-up target")

    # Legacy "Phase Continuity" section and the deprecated phase-set alias.
    legacy_plan = active_dir / "legacy-plan.md"
    legacy_plan.write_text(
        "# Execution Plan: Legacy Plan\n\n"
        "## Phase Continuity\n\n"
        "Mode: single-phase\n"
        "Workstream: none\n"
        "Current phase: none\n"
        "Next phase: none\n"
        "Continuation: none\n"
        "Next action: none\n"
        "Closure reason: Legacy single-phase plan is complete.\n"
        "Resume notes: none\n"
    )
    if continuation_codes(repo_dir, legacy_plan):
        raise AssertionError("legacy single-phase Phase Continuity should map to complete")
    alias_result = run_manager(
        "phase-set",
        *repo_flag,
        "--plan", str(legacy_plan.relative_to(repo_dir)),
        "--mode", "completed",
        "--closure-reason", "Legacy alias remains supported.",
    )
    if not (alias_result["decision"] == "complete" and "deprecated" in alias_result.get("warning", "")):
        raise AssertionError("phase-set should remain as a deprecated compatibility alias")
820
+
821
+
822
def test_plan_path_canonicalization(tmp_root):
    """Check that ``plan-close`` copes with differently spelled repo/plan paths.

    The plan is scored using the path exactly as ``plan-start`` reported it
    and given a ``continue`` decision using the repo-relative spelling. It is
    then closed with ``--repo`` set to ``os.path.realpath(repo)`` and
    ``--plan`` set to the reported plan path. Afterwards the workstream
    ledger must reference the completed plan only, and ``check`` must pass.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If closing fails, the ledger keeps a stale reference,
            or the final ``check`` does not pass.
    """
    repo = tmp_root / "canonical-repo"
    repo.mkdir()
    answers = tmp_root / "canonical-answers.json"
    write_answers(answers, project_name="canonical-demo")
    run_manager("init", "--repo", str(repo), "--answers", str(answers))

    plan_result = run_manager(
        "plan-start",
        "--repo",
        str(repo),
        "--slug",
        "canonical-close",
        "--goal",
        "Close a plan when repo and plan paths use different filesystem spellings",
    )
    plan_path = Path(plan_result["plan"])
    fill_plan_details(plan_path)
    relative_plan = str(plan_path.resolve().relative_to(repo.resolve()))
    set_acceptance(repo, relative_plan)
    run_manager(
        "quality-score",
        "--repo",
        str(repo),
        "--plan",
        # Deliberately the path as reported by plan-start, not relative_plan.
        str(plan_path),
        "--product-correctness",
        "8",
        "--ux-operator-clarity",
        "8",
        "--architecture-maintainability",
        "8",
        "--reliability-observability",
        "8",
        "--security-data-handling",
        "8",
        *quality_note_args(
            architecture="Canonical plan path normalization was validated by file path inspection.",
        ),
    )
    run_manager(
        "continuation-set",
        "--repo",
        str(repo),
        "--plan",
        relative_plan,
        "--decision",
        "continue",
        "--workstream",
        "canonical-close",
        "--next-target",
        "docs/exec-plans/workstreams.md#canonical-close",
        "--next-action",
        "Close after canonical path validation",
        "--resume-notes",
        "No special resume notes",
    )

    # Symlink-resolved repo path paired with the plan path as reported by
    # plan-start. The former conditional here only reassigned both variables
    # to the values they already held, so it was dropped as dead code.
    # NOTE(review): when tmp_root contains no symlinks both spellings
    # coincide and this test exercises only the plain absolute-path case.
    repo_arg = os.path.realpath(repo)
    plan_arg = str(plan_path)

    close_result = run_manager(
        "plan-close",
        "--repo",
        repo_arg,
        "--plan",
        plan_arg,
        "--summary",
        "Closed with canonicalized plan path.",
    )
    if close_result["status"] != "closed":
        raise AssertionError("plan-close should accept absolute plan paths inside the repo")
    completed_relative_plan = "docs/exec-plans/completed/" + plan_path.name
    workstreams_text = (repo / "docs/exec-plans/workstreams.md").read_text()
    if completed_relative_plan not in workstreams_text:
        raise AssertionError("canonicalized plan-close should update last completed plan")
    if relative_plan in workstreams_text:
        raise AssertionError("canonicalized plan-close should remove stale current plan references")
    check_result = run_manager("check", "--repo", str(repo))
    if check_result["status"] != "pass":
        raise AssertionError("canonicalized plan-close should leave harness check passing")
906
+
907
+
908
def test_defect_recovery_loop(tmp_root):
    """Walk a plan through logging, blocking on, and resolving a defect.

    After ``defect-log`` the plan must list the defect, show a pending
    quality status and carry the rework instruction. While the defect is
    open, ``quality-score``, ``check`` and ``plan-close`` must all fail with
    structured output. After ``defect-resolve`` the checkbox is ticked, a
    fresh score passes, and the plan closes without the default knowledge
    placeholder being marked as done.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If any step deviates from the expectation.
    """
    repo_dir = tmp_root / "defect-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "defect-answers.json"
    write_answers(answers_file, project_name="defect-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "snake-tail-collision",
        "--goal", "Validate defect recovery when Snake tail-cell collision behavior fails",
    )
    plan_file = Path(started["plan"])
    fill_plan_details(plan_file)
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    plan_flag = (*repo_flag, "--plan", relative_plan)

    # Log the defect; the command is expected to exit unsuccessfully here.
    logged = run_manager(
        "defect-log",
        *plan_flag,
        "--severity", "P1",
        "--summary",
        "Snake marks game over when the head moves into the current tail cell during a non-eating tick",
        "--evidence", "go test ./internal/game -run TestCanMoveIntoVacatedTailCell failed",
        expect_success=False,
    )
    defect_id = logged["id"]
    plan_text = plan_file.read_text()
    if not ("## Defects To Resolve" in plan_text and defect_id in plan_text):
        raise AssertionError("defect-log should record the open defect in the plan")
    for marker, complaint in (
        ("Status: pending", "defect-log should invalidate any existing quality result"),
        ("Resolve all open defects", "defect-log should turn the bug into rework input"),
    ):
        if marker not in plan_text:
            raise AssertionError(complaint)

    # Even perfect scores must not pass while the defect is open.
    set_acceptance(repo_dir, relative_plan)
    blocked_score = run_manager(
        "quality-score",
        *plan_flag,
        "--product-correctness", "10",
        "--ux-operator-clarity", "10",
        "--architecture-maintainability", "10",
        "--reliability-observability", "10",
        "--security-data-handling", "10",
        *quality_note_args(
            product="Open Snake defect remains unresolved in go test evidence.",
            reliability="Open defect blocking was validated by the eval command.",
        ),
        expect_success=False,
    )
    if not (blocked_score["status"] == "fail" and defect_id in blocked_score["open_defects"]):
        raise AssertionError("quality-score should fail while any defect is open")
    blocked_check = run_manager("check", *repo_flag, expect_success=False)
    if "open-defect" not in {entry["code"] for entry in blocked_check["issues"]}:
        raise AssertionError("check should surface unresolved defects")
    blocked_close = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Should not close with open defects",
        expect_success=False,
    )
    if blocked_close.get("reason") != "open-defects":
        raise AssertionError("plan-close should return structured open-defects JSON")

    # Resolve the defect and confirm the plan asks for a fresh score.
    run_manager(
        "defect-resolve",
        *plan_flag,
        "--id", defect_id,
        "--fix-evidence", "go test ./internal/game -run TestCanMoveIntoVacatedTailCell passed",
    )
    resolved_text = plan_file.read_text()
    if f"- [x] [bug:{defect_id}]" not in resolved_text:
        raise AssertionError("defect-resolve should close the defect checkbox")
    if "Defects resolved. Re-run validation and `quality-score` before closing." not in resolved_text:
        raise AssertionError("defect-resolve should require a fresh quality score")

    rescored = run_manager(
        "quality-score",
        *plan_flag,
        "--product-correctness", "9",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "9",
        "--security-data-handling", "10",
        *quality_note_args(
            product="Snake tail-cell defect was resolved with passing test evidence.",
            reliability="Defect recovery was validated with fresh passing evidence.",
        ),
    )
    if rescored["status"] != "pass":
        raise AssertionError("quality-score should pass after defects are resolved")
    set_continuation_complete(repo_dir, relative_plan)
    closed = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Closed after defect recovery and fresh quality score.",
    )
    if closed["status"] != "closed":
        raise AssertionError("plan-close should close after defect recovery")
    archived_text = (repo_dir / "docs" / "exec-plans" / "completed" / plan_file.name).read_text()
    if "- [x] Add durable facts here as they emerge" in archived_text:
        raise AssertionError("plan-close should not mark the default knowledge placeholder as completed")
1051
+
1052
+
1053
def test_quality_score_requires_notes(tmp_root):
    """Check that ``quality-score`` insists on an evidence note per dimension.

    A scoring call without notes must fail with reason
    ``missing-quality-notes`` and list five missing notes, including the
    ``--product-note`` and ``--security-note`` arguments. The same scores
    with all five notes supplied must pass, and the plan must not contain
    the ``No note provided`` placeholder.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If either scoring call behaves unexpectedly.
    """
    repo_dir = tmp_root / "quality-notes-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "quality-notes-answers.json"
    write_answers(answers_file, project_name="quality-notes-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "quality-notes",
        "--goal", "Validate quality-score evidence notes are required",
    )
    relative_plan = str(Path(started["plan"]).resolve().relative_to(repo_dir.resolve()))
    set_acceptance(repo_dir, relative_plan)

    # Shared by the failing and the passing call; only the notes differ.
    score_request = (
        "quality-score",
        *repo_flag,
        "--plan", relative_plan,
        "--product-correctness", "9",
        "--ux-operator-clarity", "9",
        "--architecture-maintainability", "9",
        "--reliability-observability", "9",
        "--security-data-handling", "9",
    )
    without_notes = run_manager(*score_request, expect_success=False)
    if without_notes["reason"] != "missing-quality-notes":
        raise AssertionError("quality-score should fail with a missing-quality-notes reason")
    if len(without_notes["missing_notes"]) != 5:
        raise AssertionError("quality-score should name every dimension missing an evidence note")
    named_arguments = {entry["argument"] for entry in without_notes["missing_notes"]}
    if not {"--product-note", "--security-note"}.issubset(named_arguments):
        raise AssertionError("quality-score should name the missing note arguments")

    with_notes = run_manager(
        *score_request,
        *quality_note_args(
            product="Product assertions were checked by the eval command.",
            ux="User workflow evidence was reviewed in the generated plan.",
            architecture="Architecture evidence was inspected in plan files.",
            reliability="Validation command evidence was checked.",
            security="Security evidence was reviewed in generated metadata files.",
        ),
    )
    if with_notes["status"] != "pass":
        raise AssertionError("quality-score should pass when all evidence notes are present")
    if "No note provided" in Path(started["plan"]).read_text():
        raise AssertionError("quality-score should not write placeholder notes when evidence is required")
1126
+
1127
+
1128
def test_knowledge_evidence_verbatim(tmp_root):
    """Check that ``knowledge-mark-written`` only accepts verbatim evidence.

    A fact is logged against ``docs/product-specs/snake.md`` and that file is
    then written with one exact sentence. Marking the item written with a
    paraphrase must not succeed and must leave the item open. Supplying the
    exact sentence through ``--evidence-file`` must tick the item and record
    the evidence text in the plan.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo, the
            answers file and the evidence file are created.

    Raises:
        AssertionError: If the paraphrase is accepted or the exact evidence
            is not recorded.
    """
    repo_dir = tmp_root / "knowledge-evidence-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "knowledge-evidence-answers.json"
    write_answers(answers_file, project_name="knowledge-evidence-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "knowledge-evidence",
        "--goal", "Validate durable knowledge evidence must be exact destination text",
    )
    plan_file = Path(started["plan"])
    fill_plan_details(plan_file)
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    plan_flag = (*repo_flag, "--plan", relative_plan)
    logged = run_manager(
        "knowledge-log",
        *plan_flag,
        "--fact",
        "Snake non-growth movement may enter the current tail cell because the tail leaves during the same tick",
        "--destination", "docs/product-specs/snake.md",
    )

    # Write the destination document containing the sentence used as evidence.
    spec_file = repo_dir / "docs" / "product-specs" / "snake.md"
    spec_file.parent.mkdir(parents=True, exist_ok=True)
    exact_evidence = "On a non-eating tick, moving into the current tail cell is legal because the tail leaves during the same tick."
    spec_file.write_text(f"# Snake Rules\n\n- {exact_evidence}\n")

    item_id = logged["id"]
    mark_request = ("knowledge-mark-written", *plan_flag, "--id", item_id)
    closed_marker = f"- [x] [id:{item_id}]"

    # A paraphrase of the destination text must be rejected.
    paraphrased = run_manager(
        *mark_request,
        "--evidence", "docs/product-specs/snake.md now states the tail-vacating rule.",
        expect_success=False,
    )
    if paraphrased:
        raise AssertionError("Paraphrased knowledge evidence should not succeed")
    if closed_marker in plan_file.read_text():
        raise AssertionError("Failed knowledge evidence should not close the knowledge item")

    # The exact sentence, passed via a file, must close the item.
    evidence_file = tmp_root / "snake-evidence.txt"
    evidence_file.write_text(exact_evidence + "\n")
    run_manager(*mark_request, "--evidence-file", str(evidence_file))
    final_text = plan_file.read_text()
    if closed_marker not in final_text:
        raise AssertionError("Exact destination evidence should close the knowledge item")
    if f"| evidence: {exact_evidence}" not in final_text:
        raise AssertionError("Closed knowledge item should record the exact verification evidence")
1200
+
1201
+
1202
def test_structured_plan_sidecar_and_acceptance(tmp_root):
    """Check the JSON sidecar and the acceptance-contract readiness gate.

    ``plan-start`` must create a ``.json`` sidecar next to the plan with a
    ``draft`` acceptance contract, and the markdown must render the
    acceptance and quality sections. ``check`` must fail while the contract
    is a draft, ``acceptance-set`` must reject generic criteria, and after
    ``set_acceptance`` plus ``set_continuation_complete`` the ``check``
    command must pass.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If any step deviates from the expectation.
    """
    repo_dir = tmp_root / "structured-plan-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "structured-answers.json"
    write_answers(answers_file, project_name="structured-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))

    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "structured-sidecar",
        "--goal", "Validate structured sidecar creation and acceptance readiness",
    )
    plan_file = Path(started["plan"])
    sidecar_file = plan_file.with_suffix(".json")
    if not sidecar_file.exists():
        raise AssertionError("plan-start should create a JSON sidecar")
    sidecar_state = json.loads(sidecar_file.read_text())
    if sidecar_state["acceptance_contract"]["status"] != "draft":
        raise AssertionError("new plan sidecar should start with draft acceptance contract")
    rendered = plan_file.read_text()
    if not all(heading in rendered for heading in ("## Acceptance Contract", "## Quality Result")):
        raise AssertionError("new plan markdown should render acceptance and quality sections")

    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    draft_check = run_manager("check", *repo_flag, expect_success=False)
    draft_codes = {entry["code"] for entry in draft_check["issues"]}
    if "acceptance-contract-not-ready" not in draft_codes:
        raise AssertionError("active check should require ready acceptance contract")

    # The same boilerplate sentence for every dimension must be rejected.
    generic_criteria = []
    for flag in ("--product", "--ux", "--architecture", "--reliability", "--security"):
        generic_criteria += [flag, "Confirm requested behavior is complete."]
    rejected = run_manager(
        "acceptance-set",
        *repo_flag,
        "--plan", relative_plan,
        *generic_criteria,
        expect_success=False,
    )
    if rejected["reason"] != "acceptance-criteria-not-specific":
        raise AssertionError("acceptance-set should reject generic template criteria")

    accepted = set_acceptance(repo_dir, relative_plan)
    if not (accepted["status"] == "ready" and accepted["criteria_fingerprint"]):
        raise AssertionError("acceptance-set should mark the contract ready with a fingerprint")
    set_continuation_complete(repo_dir, relative_plan)
    ready_check = run_manager("check", *repo_flag)
    if ready_check["status"] != "pass":
        raise AssertionError("active check should pass with ready acceptance contract and no open defects")
1261
+
1262
+
1263
def test_quality_score_requires_ready_acceptance(tmp_root):
    """Check that ``quality-score`` is refused before acceptance is set.

    The plan is scored straight after ``plan-start``, without calling
    ``set_acceptance``. The command must fail with reason
    ``acceptance-contract-not-ready``.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If the failure reason differs.
    """
    repo_dir = tmp_root / "quality-contract-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "quality-contract-answers.json"
    write_answers(answers_file, project_name="quality-contract-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))
    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "quality-contract",
        "--goal", "Validate quality-score blocks before acceptance is ready",
    )
    relative_plan = str(Path(started["plan"]).resolve().relative_to(repo_dir.resolve()))

    score_flags = []
    for dimension in (
        "--product-correctness",
        "--ux-operator-clarity",
        "--architecture-maintainability",
        "--reliability-observability",
        "--security-data-handling",
    ):
        score_flags += [dimension, "8"]
    refused = run_manager(
        "quality-score",
        *repo_flag,
        "--plan", relative_plan,
        *score_flags,
        *quality_note_args(),
        expect_success=False,
    )
    if refused["reason"] != "acceptance-contract-not-ready":
        raise AssertionError("quality-score should require a ready acceptance contract before scoring")
1300
+
1301
+
1302
def test_plan_close_rejects_template_placeholders(tmp_root):
    """Check that ``plan-close`` refuses a plan whose details were never filled.

    Unlike the sibling scenarios, ``fill_plan_details`` is not called here.
    After acceptance, scoring and ``set_continuation_complete``,
    ``plan-close`` must fail with reason ``plan-placeholders-unresolved``
    and leave the active plan file where it is.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If the close is not rejected as expected or the plan
            file disappears.
    """
    repo_dir = tmp_root / "placeholder-close-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "placeholder-close-answers.json"
    write_answers(answers_file, project_name="placeholder-close-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))
    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "placeholder-close",
        "--goal", "Validate plan-close rejects unresolved starter placeholders",
    )
    plan_file = Path(started["plan"])
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    plan_flag = (*repo_flag, "--plan", relative_plan)
    set_acceptance(repo_dir, relative_plan)
    run_manager(
        "quality-score",
        *plan_flag,
        "--product-correctness", "8",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "8",
        "--security-data-handling", "8",
        *quality_note_args(),
    )
    set_continuation_complete(repo_dir, relative_plan)
    refused = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Should reject placeholders",
        expect_success=False,
    )
    if refused.get("reason") != "plan-placeholders-unresolved":
        raise AssertionError("plan-close should return structured plan-placeholders-unresolved JSON")
    still_active = plan_file.exists()
    if not still_active:
        raise AssertionError("plan-close should leave the active plan in place when placeholders remain")
1353
+
1354
+
1355
def test_plan_close_returns_open_knowledge_json(tmp_root):
    """Check that ``plan-close`` reports open durable-knowledge items as JSON.

    A fact is logged with ``knowledge-log`` and never marked written. After
    scoring and ``set_continuation_complete``, ``plan-close`` must fail with
    reason ``open-durable-knowledge`` and include the logged fact among
    ``details.open_items``.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If the reason differs or the fact is not listed.
    """
    repo_dir = tmp_root / "open-knowledge-close-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "open-knowledge-close-answers.json"
    write_answers(answers_file, project_name="open-knowledge-close-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))
    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "open-knowledge-close",
        "--goal", "Validate structured close output for open durable knowledge",
    )
    plan_file = Path(started["plan"])
    fill_plan_details(plan_file)
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    plan_flag = (*repo_flag, "--plan", relative_plan)
    set_acceptance(repo_dir, relative_plan)

    open_fact = "Structured plan-close output should identify open durable knowledge items"
    run_manager(
        "knowledge-log",
        *plan_flag,
        "--fact", open_fact,
        "--destination", "docs/QUALITY_SCORE.md",
    )
    run_manager(
        "quality-score",
        *plan_flag,
        "--product-correctness", "8",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "8",
        "--security-data-handling", "8",
        *quality_note_args(),
    )
    set_continuation_complete(repo_dir, relative_plan)
    refused = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Should reject open knowledge",
        expect_success=False,
    )
    if refused.get("reason") != "open-durable-knowledge":
        raise AssertionError("plan-close should return structured open-durable-knowledge JSON")
    # Missing "details"/"open_items" keys degrade to an empty listing.
    reported_items = refused.get("details", {}).get("open_items", [])
    if open_fact not in "\n".join(reported_items):
        raise AssertionError("structured open knowledge JSON should include the blocked item")
1419
+
1420
+
1421
def test_plan_close_moves_sidecar_and_rejects_stale_score(tmp_root):
    """Check stale-score rejection and sidecar relocation on ``plan-close``.

    After a score is recorded, one acceptance criterion is edited directly
    in the JSON sidecar, and ``plan-close`` must fail with reason
    ``acceptance-fingerprint-stale``. Once acceptance is set again and the
    plan is rescored, ``plan-close`` must succeed and the markdown plus its
    sidecar must have moved from the active location to
    ``docs/exec-plans/completed``. Two consecutive ``check`` runs must pass
    without changing the completed sidecar's contents.

    Args:
        tmp_root: Scratch directory (path-like) in which the repo and the
            answers file are created.

    Raises:
        AssertionError: If any step deviates from the expectation.
    """
    repo_dir = tmp_root / "stale-score-repo"
    repo_dir.mkdir()
    answers_file = tmp_root / "stale-score-answers.json"
    write_answers(answers_file, project_name="stale-score-demo")
    repo_flag = ("--repo", str(repo_dir))
    run_manager("init", *repo_flag, "--answers", str(answers_file))
    started = run_manager(
        "plan-start",
        *repo_flag,
        "--slug", "stale-score",
        "--goal", "Validate plan-close rejects stale fingerprints and moves sidecars",
    )
    plan_file = Path(started["plan"])
    fill_plan_details(plan_file)
    relative_plan = str(plan_file.resolve().relative_to(repo_dir.resolve()))
    plan_flag = (*repo_flag, "--plan", relative_plan)
    score_request = (
        "quality-score",
        *plan_flag,
        "--product-correctness", "8",
        "--ux-operator-clarity", "8",
        "--architecture-maintainability", "8",
        "--reliability-observability", "8",
        "--security-data-handling", "8",
    )
    set_acceptance(repo_dir, relative_plan)
    run_manager(*score_request, *quality_note_args())

    # Edit a criterion behind the manager's back so the score becomes stale.
    sidecar_file = plan_file.with_suffix(".json")
    sidecar_state = json.loads(sidecar_file.read_text())
    criteria = sidecar_state["acceptance_contract"]["criteria"]
    criteria["product_correctness"] = "A changed product criterion makes the previous score stale."
    sidecar_file.write_text(json.dumps(sidecar_state, indent=2) + "\n")
    refused = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Should reject stale score",
        expect_success=False,
    )
    if refused.get("reason") != "acceptance-fingerprint-stale":
        raise AssertionError("plan-close should return structured acceptance-fingerprint-stale JSON")

    # Re-accept with a new product criterion, rescore, then close.
    set_acceptance(
        repo_dir,
        relative_plan,
        product="The stale score plan closes only after rescoring the changed product criterion.",
    )
    run_manager(
        *score_request,
        *quality_note_args(product="Changed acceptance criterion was rescored with eval command evidence."),
    )
    set_continuation_complete(repo_dir, relative_plan)
    closed = run_manager(
        "plan-close",
        *plan_flag,
        "--summary", "Closed after rescoring changed acceptance contract.",
    )
    if closed["status"] != "closed":
        raise AssertionError("plan-close should close after fresh passing score")
    if any(leftover.exists() for leftover in (plan_file, sidecar_file)):
        raise AssertionError("plan-close should remove active markdown and sidecar")
    archived_plan = repo_dir / "docs" / "exec-plans" / "completed" / plan_file.name
    archived_sidecar = archived_plan.with_suffix(".json")
    if not (archived_plan.exists() and archived_sidecar.exists()):
        raise AssertionError("plan-close should move markdown and sidecar to completed")

    first_check = run_manager("check", *repo_flag)
    if first_check["status"] != "pass":
        raise AssertionError("completed structured plan should satisfy check")
    # Snapshot the sidecar, run check again, and compare byte for byte.
    sidecar_snapshot = archived_sidecar.read_text()
    second_check = run_manager("check", *repo_flag)
    if second_check["status"] != "pass":
        raise AssertionError("repeated completed-plan check should still pass")
    if archived_sidecar.read_text() != sidecar_snapshot:
        raise AssertionError("check should not rewrite unchanged completed plan sidecars or bump updated_at")
1525
+
1526
+
1527
def test_evidence_prune_generated_artifacts(tmp_root):
    """Check `evidence-prune` in its default dry-run mode and with `--apply`.

    Four files are placed under `docs/generated`: one old and unreferenced,
    one old but mentioned in `docs/PLANS.md`, one with a fresh mtime, and one
    old file carrying the managed marker comment. With a 14-day threshold only
    the old unreferenced file may be listed as a candidate, and only it may be
    deleted once `--apply` is passed.

    Raises:
        AssertionError: when the manager output or the filesystem state does
            not match those expectations.
    """
    repo = tmp_root / "prune-repo"
    repo.mkdir()
    answers = tmp_root / "prune-answers.json"
    write_answers(answers, project_name="prune-demo")
    run_manager("init", "--repo", str(repo), "--answers", str(answers))

    generated = repo / "docs" / "generated"
    stale = generated / "old-layout.json"
    referenced = generated / "kept-layout.json"
    recent = generated / "recent-layout.json"
    managed = generated / "managed-starter.md"
    fixtures = (
        (stale, '{"old": true}\n'),
        (referenced, '{"referenced": true}\n'),
        (recent, '{"recent": true}\n'),
        (managed, "<!-- harness-engine:managed -->\n# Starter\n"),
    )
    for fixture_path, fixture_body in fixtures:
        fixture_path.write_text(fixture_body)

    # Backdate everything except `recent` to 30 days ago so that age alone
    # cannot explain why `referenced` and `managed` survive the prune.
    old_time = time.time() - (30 * 24 * 60 * 60)
    for aged_path in (stale, referenced, managed):
        os.utime(aged_path, (old_time, old_time))

    # Mention the kept file in the plans document so it counts as referenced.
    plans_doc = repo / "docs" / "PLANS.md"
    plans_doc.write_text(
        plans_doc.read_text()
        + "\nKeep evidence at docs/generated/kept-layout.json for the closed mobile layout plan.\n"
    )

    prune_command = ("evidence-prune", "--repo", str(repo), "--older-than-days", "14")

    dry_run = run_manager(*prune_command)
    candidate_paths = {item["path"] for item in dry_run["candidates"]}
    if dry_run["mode"] != "dry-run" or dry_run["removed"]:
        raise AssertionError("evidence-prune should dry-run by default")
    if "docs/generated/old-layout.json" not in candidate_paths:
        raise AssertionError("stale unreferenced generated evidence should be a prune candidate")
    protected = (
        (
            "docs/generated/kept-layout.json",
            "referenced generated evidence should not be a prune candidate",
        ),
        (
            "docs/generated/recent-layout.json",
            "recent generated evidence should not be a prune candidate",
        ),
        (
            "docs/generated/managed-starter.md",
            "managed starter files should not be prune candidates",
        ),
    )
    for protected_path, failure_message in protected:
        if protected_path in candidate_paths:
            raise AssertionError(failure_message)
    if not stale.exists():
        raise AssertionError("dry-run should not delete candidates")

    applied = run_manager(*prune_command, "--apply")
    if "docs/generated/old-layout.json" not in applied["removed"]:
        raise AssertionError("apply should remove stale unreferenced generated evidence")
    survivors = (referenced, recent, managed)
    if stale.exists() or not all(path.exists() for path in survivors):
        raise AssertionError("apply should delete only stale unreferenced evidence")
1578
+
1579
+
1580
def test_eval_report_shape(tmp_root):
    """Check the structure of the report built from one pass and one fail.

    Two hand-written case results are passed to `build_report`; the report
    must expose the schema version, aggregate status and score, the
    `case_pass_rate` metric, both per-case entries, and a user-facing message
    that points at `case_results`.

    Args:
        tmp_root: Not used by this test body (presumably kept so every case
            shares one signature; confirm against the registry).

    Raises:
        AssertionError: when any expected report field is missing or wrong.
    """
    case_metadata = load_case_metadata()
    passing_case = {
        "id": "empty-repo-init",
        "status": "pass",
        "description": case_metadata["empty-repo-init"]["description"],
        "score": 1.0,
        "duration_seconds": 0.01,
        "findings": [],
        "recommended_actions": [],
    }
    failing_case = {
        "id": "frontend-analysis",
        "status": "fail",
        "description": case_metadata["frontend-analysis"]["description"],
        "score": 0.0,
        "duration_seconds": 0.02,
        "findings": ["Frontend repo should ask frontend confirmation questions"],
        "recommended_actions": ["Fix frontend-analysis before release."],
    }
    report = build_report([passing_case, failing_case])

    if report["schema_version"] != "harness-eval-report.v1":
        raise AssertionError("Eval report should expose a stable schema version")
    if report["status"] != "fail" or report["score"] != 50:
        raise AssertionError("Eval report should expose aggregate status and score")
    if report["metrics"]["case_pass_rate"] != 0.5:
        raise AssertionError("Eval report should expose detailed aggregate metrics")
    if "case_results" not in report or len(report["case_results"]) != 2:
        raise AssertionError("Eval report should expose per-case results")

    # The failing case was submitted second, so it is read back from index 1.
    failed_case = report["case_results"][1]
    if not failed_case["findings"] or not failed_case["recommended_actions"]:
        raise AssertionError("Failed eval cases should expose findings and recommended actions")
    if "Review `case_results`" not in report["user_message"]:
        raise AssertionError("Eval report should include a user-facing failure message")