@hallucination-studio/harness-engine 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/LICENSE +21 -0
  2. package/README.md +185 -27
  3. package/bin/install.js +29 -17
  4. package/package.json +10 -4
  5. package/skills/harness-engine/SKILL.md +97 -0
  6. package/skills/harness-engine/agents/openai.yaml +4 -0
  7. package/skills/harness-engine/evals/cases.json +94 -0
  8. package/skills/harness-engine/evals/harness_engine_evals/__init__.py +1 -0
  9. package/skills/harness-engine/evals/harness_engine_evals/cases_frontend.py +211 -0
  10. package/skills/harness-engine/evals/harness_engine_evals/cases_lifecycle.py +1616 -0
  11. package/skills/harness-engine/evals/harness_engine_evals/helpers.py +155 -0
  12. package/skills/harness-engine/evals/harness_engine_evals/registry.py +55 -0
  13. package/skills/harness-engine/evals/harness_engine_evals/report.py +36 -0
  14. package/skills/harness-engine/evals/harness_engine_evals/runner.py +53 -0
  15. package/skills/harness-engine/evals/run_evals.py +14 -0
  16. package/skills/{harness-repo-bootstrap → harness-engine}/references/evaluation-loop.md +8 -2
  17. package/skills/harness-engine/references/evidence-first-evals.md +187 -0
  18. package/skills/harness-engine/references/exec-plans.md +59 -0
  19. package/skills/{harness-repo-bootstrap → harness-engine}/references/file-map.md +3 -3
  20. package/skills/{harness-repo-bootstrap → harness-engine}/references/knowledge-capture.md +2 -2
  21. package/skills/{harness-repo-bootstrap → harness-engine}/references/sop-index.md +3 -0
  22. package/skills/harness-engine/references/template-policy.md +17 -0
  23. package/skills/harness-engine/references/workflow.md +62 -0
  24. package/skills/harness-engine/scripts/harness_engine/__init__.py +1 -0
  25. package/skills/harness-engine/scripts/harness_engine/analysis.py +240 -0
  26. package/skills/harness-engine/scripts/harness_engine/checks.py +287 -0
  27. package/skills/harness-engine/scripts/harness_engine/cli.py +656 -0
  28. package/skills/harness-engine/scripts/harness_engine/common.py +977 -0
  29. package/skills/harness-engine/scripts/harness_engine/continuation.py +520 -0
  30. package/skills/harness-engine/scripts/harness_engine/git_ops.py +88 -0
  31. package/skills/harness-engine/scripts/harness_engine/knowledge.py +329 -0
  32. package/skills/harness-engine/scripts/harness_engine/plans.py +630 -0
  33. package/skills/harness-engine/scripts/harness_engine/templates.py +124 -0
  34. package/skills/harness-engine/scripts/manage_harness.py +14 -0
  35. package/skills/harness-repo-bootstrap/SKILL.md +0 -68
  36. package/skills/harness-repo-bootstrap/agents/openai.yaml +0 -4
  37. package/skills/harness-repo-bootstrap/evals/cases.json +0 -18
  38. package/skills/harness-repo-bootstrap/evals/run_evals.py +0 -337
  39. package/skills/harness-repo-bootstrap/references/exec-plans.md +0 -39
  40. package/skills/harness-repo-bootstrap/references/template-policy.md +0 -12
  41. package/skills/harness-repo-bootstrap/references/workflow.md +0 -47
  42. package/skills/harness-repo-bootstrap/scripts/manage_harness.py +0 -1181
  43. /package/skills/{harness-repo-bootstrap → harness-engine}/assets/repo-template/.keep +0 -0
  44. /package/skills/{harness-repo-bootstrap → harness-engine}/assets/sops/.keep +0 -0
  45. /package/skills/{harness-repo-bootstrap → harness-engine}/references/question-catalog.md +0 -0
@@ -0,0 +1,155 @@
1
+ import json
2
+ import subprocess
3
+ import sys
4
+ from pathlib import Path
5
+
6
# Resolved location of this module; every path constant below is derived from it.
_THIS_FILE = Path(__file__).resolve()

# parents[2] of this file is treated as the skill directory, and the repository
# root is taken to be two further levels above that.
SKILL_DIR = _THIS_FILE.parents[2]
REPO_ROOT = SKILL_DIR.parents[1]
SCRIPTS_DIR = SKILL_DIR / "scripts"
MANAGER = SCRIPTS_DIR / "manage_harness.py"
CASES_PATH = _THIS_FILE.parents[1] / "cases.json"

# Put the scripts directory at the front of sys.path (once) so that the
# `harness_engine` package import that follows can be resolved.
_scripts_entry = str(SCRIPTS_DIR)
if _scripts_entry not in sys.path:
    sys.path.insert(0, _scripts_entry)
13
+
14
+ from harness_engine.continuation import continuation_decision_issues
15
+
16
def load_case_metadata():
    """Load eval case metadata from ``CASES_PATH`` keyed by case id.

    Returns:
        A dict mapping each entry's ``"id"`` value to the whole entry, or an
        empty dict when ``CASES_PATH`` does not exist.

    Raises:
        KeyError: If an entry has no ``"id"`` key.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if not CASES_PATH.exists():
        return {}
    # JSON is exchanged as UTF-8; decode explicitly so the result does not
    # depend on the platform's locale default encoding.
    cases = json.loads(CASES_PATH.read_text(encoding="utf-8"))
    return {case["id"]: case for case in cases}
20
+
21
+
22
def run_manager(*args, expect_success=True):
    """Run the manager script in a subprocess and decode its JSON output.

    Args:
        *args: Command-line arguments placed after the ``MANAGER`` script path.
        expect_success: When true, a non-zero exit status is an error; when
            false, a zero exit status is an error.

    Returns:
        The value parsed from stdout as JSON, or ``{}`` when stdout is empty
        or contains only whitespace.

    Raises:
        AssertionError: If the exit status contradicts ``expect_success``.
    """
    command = [sys.executable, str(MANAGER)]
    command.extend(args)
    completed = subprocess.run(command, text=True, capture_output=True, check=False)
    succeeded = completed.returncode == 0

    if expect_success and not succeeded:
        # Report stderr when there is any; otherwise fall back to stdout.
        raise AssertionError(completed.stderr or completed.stdout)
    if succeeded and not expect_success:
        raise AssertionError("Command succeeded unexpectedly")

    output = completed.stdout
    return json.loads(output) if output.strip() else {}
36
+
37
+
38
def write_answers(path, project_name="demo"):
    """Write a fixed set of answers as indented JSON to ``path``.

    Args:
        path: A ``pathlib.Path``-like object exposing ``write_text``.
        project_name: Value stored under the ``"project_name"`` key.
    """
    payload = {
        "project_name": project_name,
        "project_summary": "A developer tooling project used to install and maintain Codex harness docs.",
        "primary_users": "Codex users and maintainers",
        "deployment_targets": "npm package and local repositories",
        "product_domain": "developer tooling",
        "reliability_targets": "Repeatable local commands and safe init behavior",
        "security_constraints": "Do not write secrets or overwrite user-owned docs without consent",
        "frontend_stack_notes": "Frontend changes require browser validation when a UI is detected",
        "design_style_direction": "A restrained developer-tool interface with high-contrast text, calm neutral surfaces, compact spacing, and no decorative gradients.",
        "quality_focus": "installer behavior, generated docs, plan closure, and knowledge capture",
        "frontend_scope": "No frontend unless one is detected by analysis",
    }
    serialized = json.dumps(payload, indent=2)
    # The file always ends with exactly one trailing newline.
    path.write_text(f"{serialized}\n")
53
+
54
+
55
def assert_exists(repo, relative_path):
    """Raise ``AssertionError`` unless ``repo / relative_path`` exists."""
    if (repo / relative_path).exists():
        return
    raise AssertionError(f"Expected {relative_path} to exist")
59
+
60
+
61
def assert_contains(repo, relative_path, needle):
    """Raise ``AssertionError`` unless the file's text contains ``needle``.

    The file at ``repo / relative_path`` is read in full; a missing file
    surfaces as whatever error ``read_text`` raises.
    """
    contents = (repo / relative_path).read_text()
    if needle in contents:
        return
    raise AssertionError(f"Expected {relative_path} to contain {needle!r}")
65
+
66
+
67
def quality_note_args(
    product="Product behavior was validated by the eval case command.",
    ux="User/operator workflow evidence was reviewed in the generated plan.",
    architecture="Architecture and plan state were inspected in repository files.",
    reliability="Repeatable validation command evidence was produced by the eval case.",
    security="Security and data-handling assumptions were reviewed in generated metadata files.",
):
    """Build the flag/value argument list for the five quality notes.

    Returns:
        A flat list alternating each ``--<dimension>-note`` flag with its
        text, in the order product, ux, architecture, reliability, security.
    """
    flagged_notes = (
        ("--product-note", product),
        ("--ux-note", ux),
        ("--architecture-note", architecture),
        ("--reliability-note", reliability),
        ("--security-note", security),
    )
    # Flatten the (flag, value) pairs into one argv-style list.
    return [token for pair in flagged_notes for token in pair]
86
+
87
+
88
def acceptance_args(
    product="The requested behavior is verified against a concrete product assertion for this eval case.",
    ux="The user or operator workflow remains understandable for this eval case.",
    architecture="The implementation keeps lifecycle state and repository boundaries maintainable for this eval case.",
    reliability="The eval case records repeatable command evidence for the lifecycle behavior.",
    security="The eval case confirms no secrets or sensitive data are introduced into plan metadata.",
):
    """Build the flag/value argument list for the five acceptance criteria.

    Returns:
        A flat list alternating each dimension flag with its text, in the
        order product, ux, architecture, reliability, security.
    """
    criteria = {
        "--product": product,
        "--ux": ux,
        "--architecture": architecture,
        "--reliability": reliability,
        "--security": security,
    }
    argv = []
    for flag, text in criteria.items():
        argv.extend((flag, text))
    return argv
107
+
108
+
109
def set_acceptance(repo, relative_plan, **kwargs):
    """Run the ``acceptance-set`` manager command for a plan.

    Args:
        repo: Repository path; passed to the command as a string.
        relative_plan: Plan path passed through unchanged after ``--plan``.
        **kwargs: Overrides forwarded to ``acceptance_args``.

    Returns:
        Whatever ``run_manager`` returns for the command.
    """
    command = ["acceptance-set", "--repo", str(repo), "--plan", relative_plan]
    command += acceptance_args(**kwargs)
    return run_manager(*command)
118
+
119
+
120
def set_continuation_complete(repo, relative_plan):
    """Run ``continuation-set`` with the ``complete`` decision for a plan.

    Returns:
        Whatever ``run_manager`` returns for the command.
    """
    closure_reason = "The eval plan is complete and has no follow-up workstream."
    command = (
        "continuation-set",
        "--repo", str(repo),
        "--plan", relative_plan,
        "--decision", "complete",
        "--closure-reason", closure_reason,
    )
    return run_manager(*command)
132
+
133
+
134
def continuation_codes(repo, plan_path):
    """Return the set of ``"code"`` values reported for a plan.

    The plan text is read from ``plan_path`` and handed, together with
    ``repo`` and ``plan_path``, to ``continuation_decision_issues``.
    """
    plan_text = plan_path.read_text()
    issues = continuation_decision_issues(repo, plan_path, plan_text)
    return set(issue["code"] for issue in issues)
139
+
140
+
141
def fill_plan_details(plan_path):
    """Rewrite a plan file, swapping known starter text for eval-specific text.

    Args:
        plan_path: Path (string or ``Path``) of the plan file to edit in place.

    Each known placeholder snippet is replaced wherever it occurs; text that
    matches none of them is left untouched.
    """
    steps_text = (
        "1. Prepare the target plan state.\n"
        "2. Run the lifecycle command under test.\n"
        "3. Verify the command result and persisted files."
    )
    # (placeholder, replacement) pairs, applied in this order.
    substitutions = (
        (
            "- Define in-scope work.\n- Define out-of-scope work.",
            "- Implement the requested lifecycle behavior.\n- Keep unrelated repository behavior out of scope.",
        ),
        (
            "- Add relevant product, architecture, reliability, security, or delivery constraints.",
            "- Preserve existing command semantics unless this eval explicitly changes them.\n- Keep all validation evidence in repository-local files.",
        ),
        # Two wordings of the starter steps are handled; both map to the same text.
        ("1. Add the first concrete step.\n2. Add the next concrete step.", steps_text),
        ("1. Add the first concrete step.\n2. Add the next step.", steps_text),
        (
            "- Describe how the work will be verified.",
            "- Run the relevant eval command and inspect generated Markdown and JSON state.",
        ),
    )
    target = Path(plan_path)
    content = target.read_text()
    for placeholder, replacement in substitutions:
        content = content.replace(placeholder, replacement)
    target.write_text(content)
154
+
155
+
@@ -0,0 +1,55 @@
1
+ from .cases_lifecycle import (
2
+ test_empty_repo_init,
3
+ test_init_reconciles_existing_harness,
4
+ test_clean_removes_runtime_state_and_untracks_artifacts,
5
+ test_broad_task_intake_routes_repo_changes,
6
+ test_closed_loop_plan,
7
+ test_continuation_decision_workstream,
8
+ test_plan_path_canonicalization,
9
+ test_defect_recovery_loop,
10
+ test_quality_score_requires_notes,
11
+ test_knowledge_evidence_verbatim,
12
+ test_structured_plan_sidecar_and_acceptance,
13
+ test_quality_score_requires_ready_acceptance,
14
+ test_plan_close_rejects_template_placeholders,
15
+ test_plan_close_returns_open_knowledge_json,
16
+ test_plan_close_moves_sidecar_and_rejects_stale_score,
17
+ test_evidence_prune_generated_artifacts,
18
+ test_eval_report_shape,
19
+ test_preserve_unmanaged_docs,
20
+ )
21
+ from .cases_frontend import (
22
+ test_frontend_analysis,
23
+ test_backend_init_skips_frontend_design_docs,
24
+ test_frontend_design_control_plane,
25
+ test_no_external_design_dependency,
26
+ test_pack_excludes_external_design_dependency,
27
+ )
28
+
29
# Ordered registry of (case id, test callable) pairs.
# The runner iterates this list in order, calls each callable, and uses the
# id to look up the case's metadata, so each id should match an "id" entry in
# cases.json for its description to appear in the report.
EVALS = [
    ("empty-repo-init", test_empty_repo_init),
    ("frontend-analysis", test_frontend_analysis),
    ("init-reconciles-existing-harness", test_init_reconciles_existing_harness),
    ("clean-removes-runtime-state-and-untracks-artifacts", test_clean_removes_runtime_state_and_untracks_artifacts),
    ("broad-task-intake-routes-repo-changes", test_broad_task_intake_routes_repo_changes),
    ("closed-loop-plan", test_closed_loop_plan),
    ("continuation-decision-workstream", test_continuation_decision_workstream),
    ("plan-path-canonicalization", test_plan_path_canonicalization),
    ("defect-recovery-loop", test_defect_recovery_loop),
    ("quality-score-requires-notes", test_quality_score_requires_notes),
    ("knowledge-evidence-verbatim", test_knowledge_evidence_verbatim),
    ("structured-plan-sidecar-and-acceptance", test_structured_plan_sidecar_and_acceptance),
    ("quality-score-requires-ready-acceptance", test_quality_score_requires_ready_acceptance),
    ("plan-close-rejects-template-placeholders", test_plan_close_rejects_template_placeholders),
    ("plan-close-returns-open-knowledge-json", test_plan_close_returns_open_knowledge_json),
    ("plan-close-moves-sidecar-and-rejects-stale-score", test_plan_close_moves_sidecar_and_rejects_stale_score),
    ("evidence-prune-generated-artifacts", test_evidence_prune_generated_artifacts),
    ("eval-report-shape", test_eval_report_shape),
    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
    ("backend-init-skips-frontend-design-docs", test_backend_init_skips_frontend_design_docs),
    ("frontend-design-control-plane", test_frontend_design_control_plane),
    ("no-external-design-dependency", test_no_external_design_dependency),
    ("pack-excludes-external-design-dependency", test_pack_excludes_external_design_dependency),
]
54
+
55
+
@@ -0,0 +1,36 @@
1
def build_report(results):
    """Aggregate per-case eval results into a ``harness-eval-report.v1`` dict.

    Args:
        results: List of per-case dicts. Each must have a ``"status"`` key;
            cases whose status is ``"fail"`` may carry a
            ``"recommended_actions"`` list.

    Returns:
        A dict with ``schema_version``, ``status``, ``score`` (whole number
        0-100), ``summary``, ``metrics``, ``case_results`` (the ``results``
        list itself), ``user_message`` and ``recommended_actions``.

    Notes:
        Any case whose status is not ``"pass"`` counts as failed in the
        totals, but only ``"fail"`` cases contribute recommended actions.
        An empty ``results`` list yields status ``"pass"`` with score 0.
    """
    total = len(results)
    passed = sum(1 for result in results if result["status"] == "pass")
    failed = total - passed
    all_passed = passed == total

    # Collect actions from failed cases in first-seen order, dropping repeats
    # so an action shared by several failed cases is listed only once.
    recommended_actions = []
    for result in results:
        if result["status"] != "fail":
            continue
        # A failed case without the key simply contributes no actions.
        for action in result.get("recommended_actions", []):
            if action not in recommended_actions:
                recommended_actions.append(action)

    if all_passed:
        summary_message = f"All {total} harness eval cases passed."
        user_message = "Harness evals passed. No release-blocking eval findings were detected."
    else:
        summary_message = f"{failed} of {total} harness eval cases failed."
        user_message = (
            "Harness evals failed. Review `case_results` and fix the listed "
            "findings before handoff or release."
        )

    return {
        "schema_version": "harness-eval-report.v1",
        "status": "pass" if all_passed else "fail",
        # Guard every ratio against division by zero when there are no cases.
        "score": round((passed / total) * 100) if total else 0,
        "summary": {
            "passed": passed,
            "failed": failed,
            "total": total,
            "message": summary_message,
        },
        "metrics": {
            "case_pass_rate": round(passed / total, 4) if total else 0,
            "case_fail_rate": round(failed / total, 4) if total else 0,
            "failed_case_count": failed,
        },
        "case_results": results,
        "user_message": user_message,
        "recommended_actions": recommended_actions,
    }
36
+
@@ -0,0 +1,53 @@
1
+ import json
2
+ import sys
3
+ import tempfile
4
+ import time
5
+ from pathlib import Path
6
+
7
+ from .helpers import load_case_metadata
8
+ from .registry import EVALS
9
+
10
+ from .report import build_report
11
def main():
    """Run every registered eval case and print a JSON report to stdout.

    Each ``(eval_id, test_func)`` pair in ``EVALS`` is called with one shared
    temporary directory, which is removed once all cases have run. A case
    passes when its callable returns without raising; any exception marks
    the case as failed and becomes its finding.

    Exits the process with status 1 when the report status is not ``"pass"``.
    """
    results = []
    case_metadata = load_case_metadata()
    with tempfile.TemporaryDirectory() as tmp:
        tmp_root = Path(tmp)
        for eval_id, test_func in EVALS:
            started = time.monotonic()
            metadata = case_metadata.get(eval_id, {})
            try:
                test_func(tmp_root)
            except Exception as error:
                # Report boundary: any failure becomes a failed case rather
                # than aborting the remaining evals.
                status, score = "fail", 0.0
                # An exception raised without a message (e.g. a bare
                # `assert`) has an empty str(); fall back to its repr so the
                # finding is never blank.
                findings = [str(error) or repr(error)]
                recommended_actions = [
                    f"Reproduce `{eval_id}` locally with python3 skills/harness-engine/evals/run_evals.py.",
                    "Treat the failing assertion as the next implementation input before release.",
                ]
            else:
                status, score = "pass", 1.0
                findings = []
                recommended_actions = []
            results.append(
                {
                    "id": eval_id,
                    "status": status,
                    "description": metadata.get("description", ""),
                    "score": score,
                    "duration_seconds": round(time.monotonic() - started, 3),
                    "findings": findings,
                    "recommended_actions": recommended_actions,
                }
            )

    report = build_report(results)
    print(json.dumps(report, indent=2) + "\n")
    if report["status"] != "pass":
        sys.exit(1)
53
+
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from pathlib import Path
4
+ import sys
5
+
6
# Directory that contains this script.
EVALS_DIR = Path(__file__).resolve().parent
# Put that directory at the front of sys.path (once) so the
# `harness_engine_evals` package import below can be resolved when this file
# is run directly as a script.
if str(EVALS_DIR) not in sys.path:
    sys.path.insert(0, str(EVALS_DIR))

# Deliberately imported after the sys.path adjustment above.
from harness_engine_evals.runner import main


if __name__ == "__main__":
    main()
@@ -5,13 +5,19 @@ Use this loop when changing the skill, templates, scripts, or policy references:
5
5
  1. Draft the behavior in `SKILL.md`, `references/`, templates, or scripts.
6
6
  2. Test it with the deterministic commands in `scripts/manage_harness.py`.
7
7
  3. Evaluate it with `python3 evals/run_evals.py`.
8
- 4. Iterate until the runner passes and the score stays at 100.
8
+ 4. Read the structured `harness-eval-report.v1` output: aggregate metrics, per-case results,
9
+ findings, user message, and recommended actions.
10
+ 5. Iterate until the runner passes, the score stays at 100, and failed-case output would be
11
+ actionable for a user if the eval regressed.
9
12
 
10
13
  ## What The Evals Cover
11
14
 
12
15
  - first-time initialization of an empty repository
13
16
  - frontend-aware repository analysis
14
- - execution-plan and knowledge-capture closure
17
+ - execution-plan sidecars, acceptance contracts, and knowledge-capture closure
18
+ - quality results that block closure and force rework when scores fail or become stale
19
+ - continuation decisions and workstream recovery for resumable work
20
+ - structured eval report output with per-case findings and recommended actions
15
21
  - preservation of unmanaged user-owned docs
16
22
  - local harness checks that do not require user-project CI
17
23
 
@@ -0,0 +1,187 @@
1
+ # Evidence-First Evals
2
+
3
+ Use this reference when a task needs stronger validation than an LLM-written quality estimate.
4
+ The quality score is the final readiness summary, not the eval itself.
5
+
6
+ ## Core Rule
7
+
8
+ Every eval must separate four layers:
9
+
10
+ 1. **Product contract checks**: machine-readable assertions derived from `product.md`,
11
+ product specs, acceptance criteria, or the user's prompt.
12
+ 2. **Runtime behavior checks**: tests, API smoke checks, CLI checks, browser interactions,
13
+ and state assertions that prove the implementation works.
14
+ 3. **Visual and UX evidence**: screenshots, DOM/accessibility snapshots, responsive viewport
15
+ checks, and layout invariants for user-facing surfaces.
16
+ 4. **Reviewer judgment**: LLM or human scoring only after the first three layers have produced
17
+ evidence and logged defects.
18
+
19
+ If a requirement cannot be checked directly, write down why and replace it with the narrowest
20
+ observable proxy. Do not silently convert it into a vague score.
21
+
22
+ Repository-mutating eval work follows Harness Task Intake: create or reuse an active plan, set
23
+ acceptance before implementation, validate with evidence, score with `quality-score`, close the
24
+ plan, and run `check`. This applies to feature, bug, refactor, docs/policy, dependency/tooling,
25
+ UI, test/eval, security, and performance changes.
26
+
27
+ ## Eval Case Shape
28
+
29
+ Model each case like an OpenAI eval sample: stable id, input, expected behavior, recorded events,
30
+ and aggregate metrics.
31
+
32
+ Recommended fields:
33
+
34
+ - `id`: stable case id, versioned when the case changes materially.
35
+ - `source`: product spec, user request, bug report, design file, or regression source.
36
+ - `risk`: what failure this case is meant to catch.
37
+ - `setup`: fixtures, seed data, feature flags, viewport, network state, or browser route.
38
+ - `actions`: exact commands, API calls, browser actions, or user flows.
39
+ - `assertions`: deterministic checks that must pass.
40
+ - `artifacts`: logs, screenshots, traces, DOM snapshots, accessibility snapshots, or diffs.
41
+ - `defect_policy`: severity and `defect-log` summary to use if the case fails.
42
+ - `metrics`: pass/fail fields and numeric measurements to aggregate.
43
+
44
+ Do not accept an eval case whose only assertion is "LLM rates this highly".
45
+
46
+ ## Product Contract Checks
47
+
48
+ Before implementation, extract product requirements into a checklist that can be tested:
49
+
50
+ - required capabilities and forbidden capabilities
51
+ - key user workflows and edge cases
52
+ - copy, information architecture, and domain terminology that must appear
53
+ - persistence, permissions, latency, error handling, and empty states
54
+ - explicit non-goals such as "do not add CI" or "do not introduce auth"
55
+
56
+ For every product claim in the final answer, there should be a matching command, test, browser
57
+ assertion, artifact, or explicitly documented limitation.
58
+
59
+ ## Domain Issue Workflows
60
+
61
+ Issue triage is one branch of Harness Task Intake and should be domain-routed before implementation.
62
+ The generated `AGENTS.md` owns the current routing table; use it to decide which durable docs and
63
+ SOPs to read first.
64
+
65
+ Minimum expectations by domain:
66
+
67
+ - Product contract: convert requirements, specs, and acceptance criteria into assertions.
68
+ - Frontend/UI: capture browser or local-runtime evidence for the affected workflow and viewport.
69
+ - Backend/runtime: reproduce the behavior narrowly and verify with tests, API smoke checks, logs,
70
+ or integration evidence.
71
+ - Architecture: document boundary, dependency, data-flow, migration, and compatibility impact.
72
+ - Data/state: verify fixtures, migrations, rollback or compatibility behavior, and data-loss risk.
73
+ - Security/privacy: review sensitive data paths, permissions, auth boundaries, and secret handling.
74
+ - Performance/reliability: collect baseline measurement, repeatable benchmark or smoke evidence,
75
+ and before/after comparison.
76
+
77
+ Confirmed defects or evidence gaps should be logged into the active plan before quality scoring.
78
+ Before implementation, write the concrete Acceptance Contract with `acceptance-set`. Each
79
+ `quality-score` dimension must include a concrete evidence note tied to that contract. A numeric
80
+ score without evidence is not a valid readiness signal.
81
+
82
+ Use exact evidence when closing knowledge items: the text passed to `knowledge-mark-written`
83
+ must already appear in the durable destination doc. If the destination uses different wording,
84
+ copy a short phrase from that destination into an evidence file and pass `--evidence-file`.
85
+
86
+ ## Frontend Checks
87
+
88
+ For frontend work, use browser evidence instead of relying on a screenshot glance:
89
+
90
+ - Open the live route in a browser, not only static file inspection.
91
+ - Capture at least one desktop and one mobile viewport for meaningful UI changes.
92
+ - Assert important text, controls, selected state, loading state, empty state, error state,
93
+ and primary interaction outcomes from the DOM or accessibility tree.
94
+ - Check layout invariants: no critical overlap, no clipped primary text, stable toolbar/grid
95
+ dimensions, usable tap targets, and visible focus/selected states.
96
+ - For canvas/WebGL/game UIs, add pixel or scene-state checks so a blank canvas cannot pass.
97
+ - Save screenshots or snapshot paths in the plan or `docs/generated/` when visual evidence
98
+ matters for later review.
99
+
100
+ If the browser tool is unavailable, record the limitation as validation evidence and replace it
101
+ with the strongest available fallback: static DOM checks, component tests, image snapshots, or
102
+ API smoke checks. Do not mark UX as fully validated without saying what was missing.
103
+
104
+ ## Frontend Issue Reports
105
+
106
+ Frontend feedback is an eval trigger even when the harness skill was not explicitly invoked.
107
+ Handle any UI, layout, interaction, responsive behavior, visual state, canvas, or design fidelity
108
+ question through the repository's frontend workflow.
109
+
110
+ The correct response is:
111
+
112
+ - read `docs/FRONTEND.md`, `docs/DESIGN.md`, and the relevant SOP
113
+ - inspect the affected route, component, viewport, and user workflow
114
+ - reproduce the behavior with browser or local-runtime evidence when possible
115
+ - turn the finding into product/UX assertions or a regression case
116
+ - log confirmed defects or missing evidence in the active plan
117
+ - fix and validate against the same workflow before claiming the UI is acceptable
118
+
119
+ Do not answer from memory or aesthetic judgment alone when the question is about a concrete
120
+ frontend behavior.
121
+
122
+ ## Bug Discovery Evals
123
+
124
+ Add regression cases for failures that were previously missed.
125
+
126
+ A good bug-discovery eval proves two things:
127
+
128
+ - the bad implementation fails a narrow test or observable assertion
129
+ - the harness blocks closure through `acceptance-set`, `defect-log`, `quality-score`, `plan-close`, and `check`
130
+
131
+ Track missed-bug classes separately from generic test pass rate. Examples:
132
+
133
+ - product-spec drift not detected
134
+ - browser layout defect not detected
135
+ - generated app behavior bug not detected
136
+ - unresolved defect allowed through handoff
137
+ - missing visual evidence accepted as UX validation
138
+
139
+ ## Metrics
140
+
141
+ Record sample-level events first, then aggregate.
142
+
143
+ Useful aggregate metrics:
144
+
145
+ - `case_pass_rate`: passed cases divided by total cases
146
+ - `product_contract_pass_rate`: product assertions passed divided by product assertions
147
+ - `visual_evidence_coverage`: frontend cases with required screenshots/snapshots
148
+ - `defect_block_rate`: known defects that blocked closure when injected
149
+ - `missed_defect_count`: known defects that reached a passing Quality Result
150
+ - `artifact_completeness`: required logs/screenshots/traces present
151
+ - `llm_judge_agreement`: optional reviewer score agreement with labeled cases
152
+
153
+ Fail release or handoff when a P0/P1 defect is missed, required product assertions are untested,
154
+ or frontend evidence is absent for meaningful UI work.
155
+
156
+ ## Report Output
157
+
158
+ Eval runners should emit structured JSON that can be shown to users and consumed by tools.
159
+ Use a stable schema name and include both aggregate and per-case results.
160
+
161
+ Recommended top-level fields:
162
+
163
+ - `schema_version`: stable report schema such as `harness-eval-report.v1`.
164
+ - `status`: `pass` or `fail`.
165
+ - `score`: whole-number aggregate score from `0` to `100`.
166
+ - `summary`: passed, failed, total, and one concise message.
167
+ - `metrics`: named aggregate metrics, not only one score.
168
+ - `case_results`: one object per case with `id`, `description`, `status`, `score`,
169
+ `duration_seconds`, `findings`, and `recommended_actions`.
170
+ - `user_message`: direct text the agent can relay to the user.
171
+ - `recommended_actions`: deduplicated next actions for failed cases.
172
+
173
+ Failure output must name the specific failed case, failed assertion or evidence gap, and the next
174
+ action. Passing output should still include per-case scores so the user can see what was actually
175
+ covered.
176
+
177
+ ## Meta-Eval Calibration
178
+
179
+ When an LLM judge is used, keep a small labeled meta-eval set:
180
+
181
+ - examples that should pass
182
+ - examples that should fail product correctness
183
+ - examples that should fail visual/UX evidence
184
+ - examples with open defects that must block handoff
185
+
186
+ Run the judge against these labels and treat disagreement as an eval bug. The judge may summarize
187
+ evidence and suggest risks, but it must not override deterministic failures.
@@ -0,0 +1,59 @@
1
+ # Execution Plans
2
+
3
+ Execution plans are required for every repository-mutating change. This includes code, docs, configuration, tests, dependencies, build/release scripts, generated templates, runtime behavior, migrations, cleanup, and fixes found during review.
4
+
5
+ ## When To Create One
6
+
7
+ - any file will be edited or created
8
+ - repository behavior, policy, generated templates, dependency state, build output, runtime behavior, or validation coverage will change
9
+ - a review finding, user feedback item, bug, or regression requires a repository change
10
+ - work will span enough time that another agent may resume it later
11
+
12
+ Only skip a plan for pure question answering, read-only investigation, showing command output, or status reporting with no file changes. If the work moves from investigation to editing files, create or reuse an active plan before editing.
13
+
14
+ ## Location
15
+
16
+ - Workstream recovery ledger: `docs/exec-plans/workstreams.md`
17
+ - Active: `docs/exec-plans/active/`
18
+ - Completed: `docs/exec-plans/completed/`
19
+
20
+ Active plans, completed plans, JSON sidecars, and `workstreams.md` are durable project state and should be version-controlled.
21
+
22
+ ## Minimum Sections
23
+
24
+ - goal
25
+ - scope
26
+ - constraints
27
+ - steps
28
+ - validation
29
+ - acceptance contract
30
+ - quality result
31
+ - defects to resolve
32
+ - rework required
33
+ - continuation decision
34
+ - durable knowledge to capture
35
+ - completion notes
36
+
37
+ ## Operating Rule
38
+
39
+ Update the active plan during the work. Define the Acceptance Contract before implementation, score the completed work against that contract, complete any required rework, record the continuation decision, move it to `completed`, and leave behind any durable facts in the right permanent docs.
40
+
41
+ For small changes, keep the plan lightweight: narrow scope, short steps, and focused validation are acceptable. Do not skip `acceptance-set`, evidence-backed validation, `quality-score`, `plan-close`, or the final `check`.
42
+
43
+ Before scoring or closing, replace generic starter text with task-specific content. Do not leave placeholders such as "Define in-scope work", "Add the first concrete step", or "Describe how the work will be verified". The default unused durable-knowledge line may remain open, but any real knowledge TODO must be logged, written, and marked complete.
44
+
45
+ ## Closed Loop
46
+
47
+ Codex should use the script, not ad hoc manual edits, for the lifecycle. Users express intent in natural language; Codex translates that intent into these commands:
48
+
49
+ - `plan-start`: create a new active execution plan
50
+ - `acceptance-set`: write concrete product, UX, architecture, reliability, and security acceptance criteria before implementation; this updates the structured sidecar fingerprint
51
+ - `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id; use `--fact-file` for shell-sensitive facts
52
+ - `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; evidence must be exact text already present in the destination doc; prefer `--id <knowledge-id> --evidence-file <file>` for shell-sensitive evidence, and use `--append` only to append the exact fact first
53
+ - `defect-log`: record a bug found by validation, evals, browser testing, or code review; this invalidates any existing quality result and makes the defect the next rework input
54
+ - `defect-resolve`: mark a logged defect fixed with validation or code evidence; re-run validation and `quality-score` before closing
55
+ - `quality-score`: write a scored Quality Result into the plan based on the ready Acceptance Contract; every dimension must include an evidence note; if it fails, the generated `## Rework Required` section becomes the next implementation input
56
+ - `continuation-set`: declare whether the work is complete, continues, pauses, stops, or is deferred; `continue` and `pause` update `docs/exec-plans/workstreams.md` automatically after required fields validate, and `--goal` can set the resumable workstream goal
57
+ - `workstream-upsert`: manually update `docs/exec-plans/workstreams.md` when repairing or migrating resumable workstream state
58
+ - `plan-close`: refuse to close cleanly until the Acceptance Contract is ready, the Quality Result passes against the current contract fingerprint, the continuation decision is recorded, and the listed knowledge items are marked as written to durable docs; blocked closes return structured JSON with `status: "blocked"`, `reason`, `message`, and `details`
59
+ - `check`: run a local handoff check without requiring target-repo CI
@@ -2,7 +2,7 @@
2
2
 
3
3
  - `AGENTS.md`: short router, reading order, repo-specific guardrails
4
4
  - `ARCHITECTURE.md`: domain boundaries, runtime topology, integration seams
5
- - `docs/PLANS.md`: plan lifecycle and storage rules
5
+ - `docs/PLANS.md`: default repository-change plan lifecycle and storage rules
6
6
  - `docs/PRODUCT_SENSE.md`: product heuristics and tradeoff rules
7
7
  - `docs/QUALITY_SCORE.md`: quality rubric by domain and layer
8
8
  - `docs/RELIABILITY.md`: SLOs, failure modes, observability expectations
@@ -11,7 +11,7 @@
11
11
  - `docs/FRONTEND.md`: frontend stack conventions and validation loop
12
12
  - `docs/design-docs/`: durable design decisions
13
13
  - `docs/product-specs/`: durable product specs
14
- - `docs/exec-plans/`: active plans, completed plans, and tech debt tracker
14
+ - `docs/exec-plans/`: durable active plans, completed plans, JSON sidecars, workstreams, and tech debt tracker
15
15
  - `docs/sops/`: mechanical procedures for recurring workflows and validation loops
16
- - `docs/generated/`: generated facts such as schemas
16
+ - `docs/generated/`: generated evidence and facts such as schemas, browser screenshots, DOM snapshots, layout summaries, and smoke outputs; use `evidence-prune` to preview stale unreferenced artifacts before deleting them
17
17
  - `docs/references/`: external references rewritten or linked for model-friendly discovery
@@ -27,9 +27,9 @@ Prefer the script workflow:
27
27
 
28
28
  1. Log the fact into the active execution plan with `knowledge-log`.
29
29
  2. Write the fact into its permanent destination doc.
30
- 3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<text present in durable doc>"`.
30
+ 3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<verbatim text present in durable doc>"`.
31
31
  4. Close the plan with `plan-close`.
32
32
 
33
33
  `knowledge-log` returns a stable id. Prefer id-based closure so permanent docs can use concise, natural wording rather than duplicating the exact plan fact.
34
34
 
35
- `knowledge-mark-written` verifies that the destination file contains either the provided evidence text or, for legacy calls, the exact fact. Use `--append` only when the exact fact should be appended to the destination doc by the tool.
35
+ `knowledge-mark-written` verifies that the destination file contains either the provided evidence text or the exact fact. Evidence must be copied from the destination doc; a summary such as "the doc now states this rule" is rejected unless that exact sentence is in the doc. Use `--append` only when the exact fact should be appended to the destination doc by the tool.
@@ -6,5 +6,8 @@ Choose an SOP whenever the task touches one of these areas:
6
6
  - missing durable repository knowledge: `docs/sops/encode-unseen-knowledge.md`
7
7
  - runtime debugging or observability setup: `docs/sops/local-observability-feedback-loop.md`
8
8
  - user interface work: `docs/sops/chrome-devtools-ui-validation-loop.md`
9
+ - product correctness, frontend layout, or bug-discovery evals: `docs/sops/evidence-first-eval-loop.md`
10
+ - repository-mutating work: start from Harness Task Intake in `AGENTS.md`, then follow the matching SOP and domain docs listed there
11
+ - backend behavior, architecture boundaries, data/state, security, or performance issue triage: use the Issue Workflows branch in `AGENTS.md`, then follow the domain docs listed there
9
12
 
10
13
  If no SOP exists for a recurring workflow, create one in `docs/sops/` as part of the task.
@@ -0,0 +1,17 @@
1
+ # Template Policy
2
+
3
+ Every generated file starts with a managed marker:
4
+
5
+ `<!-- harness-engine:managed -->`
6
+
7
+ Command behavior:
8
+
9
+ - `init`: create missing files for new repositories; when an existing managed harness is detected, refresh managed files and create missing files while preserving unmanaged files
10
+ - `clean`: removes transient runtime state under `docs/generated/`
11
+ - `clean`: maintains `.gitignore` entries for `.codex/skills/` and `docs/generated/`
12
+ - `clean`: previews or stages removal of already tracked local skill installs and generated evidence from the git index; it requires `--apply` before running `git rm --cached`
13
+ - execution plans, JSON sidecars, and `docs/exec-plans/workstreams.md` are durable project state and must not be cleaned, ignored, or untracked by default
14
+
15
+ Use `init` as the normal workspace command so creation and reconciliation share one path. Use `--force` only when the human explicitly accepts overwriting existing files.
16
+
17
+ If a file exists without the managed marker, treat it as user-owned unless the human explicitly asks to replace it.