npm - @hallucination-studio/harness-engine - Versions diffs - 1.0.0 → 1.0.1 - Mend

@hallucination-studio/harness-engine 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/skills/harness-engine/evals/harness_engine_evals/helpers.py ADDED Viewed

@@ -0,0 +1,155 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+# Skill root: two directory levels above this module's package
+# (harness_engine_evals/ -> evals/ -> harness-engine/).
+SKILL_DIR = Path(__file__).resolve().parents[2]
+REPO_ROOT = SKILL_DIR.parents[1]
+# CLI entry point that run_manager() executes as a subprocess.
+MANAGER = SKILL_DIR / "scripts" / "manage_harness.py"
+# cases.json lives in the evals/ directory, one level above this package.
+CASES_PATH = Path(__file__).resolve().parents[1] / "cases.json"
+SCRIPTS_DIR = SKILL_DIR / "scripts"
+# Put the scripts directory on sys.path before the harness_engine import below;
+# presumably that is where the harness_engine package lives (confirm against the package layout).
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+from harness_engine.continuation import continuation_decision_issues
+def load_case_metadata():
+    """Load eval case metadata from ``CASES_PATH``, keyed by case id.
+
+    Returns:
+        A dict mapping each entry's ``"id"`` to the full entry, or an empty
+        dict when the cases file does not exist.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but is not valid JSON.
+        KeyError: If an entry has no ``"id"`` field.
+    """
+    try:
+        # JSON is UTF-8 by specification; do not depend on the locale default,
+        # which would mis-decode non-ASCII descriptions on some platforms.
+        raw = CASES_PATH.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        return {}
+    return {item["id"]: item for item in json.loads(raw)}
+def run_manager(*args, expect_success=True):
+    """Run ``manage_harness.py`` with ``args`` and return its parsed JSON stdout.
+
+    Args:
+        *args: Command-line arguments passed to the manager script.
+        expect_success: When truthy, a non-zero exit code is a failure; when
+            falsy, a zero exit code is a failure.
+
+    Returns:
+        The decoded JSON from stdout, or ``{}`` when stdout is blank.
+
+    Raises:
+        AssertionError: If the exit status contradicts ``expect_success``.
+    """
+    command = [sys.executable, str(MANAGER), *args]
+    completed = subprocess.run(
+        command,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    succeeded = completed.returncode == 0
+    if expect_success:
+        if not succeeded:
+            # Prefer stderr; fall back to stdout when stderr is empty.
+            raise AssertionError(completed.stderr or completed.stdout)
+    elif succeeded:
+        raise AssertionError("Command succeeded unexpectedly")
+    output = completed.stdout
+    return json.loads(output) if output.strip() else {}
+def write_answers(path, project_name="demo"):
+    """Write a fixed set of init answers to ``path`` as indented JSON.
+
+    Args:
+        path: Destination ``Path``; the file is overwritten.
+        project_name: Value stored under the ``project_name`` key.
+    """
+    answers = dict(
+        project_name=project_name,
+        project_summary="A developer tooling project used to install and maintain Codex harness docs.",
+        primary_users="Codex users and maintainers",
+        deployment_targets="npm package and local repositories",
+        product_domain="developer tooling",
+        reliability_targets="Repeatable local commands and safe init behavior",
+        security_constraints="Do not write secrets or overwrite user-owned docs without consent",
+        frontend_stack_notes="Frontend changes require browser validation when a UI is detected",
+        design_style_direction="A restrained developer-tool interface with high-contrast text, calm neutral surfaces, compact spacing, and no decorative gradients.",
+        quality_focus="installer behavior, generated docs, plan closure, and knowledge capture",
+        frontend_scope="No frontend unless one is detected by analysis",
+    )
+    serialized = json.dumps(answers, indent=2)
+    # The file ends with a single trailing newline.
+    path.write_text(f"{serialized}\n")
+def assert_exists(repo, relative_path):
+    """Raise ``AssertionError`` unless ``repo / relative_path`` exists."""
+    if (repo / relative_path).exists():
+        return
+    raise AssertionError(f"Expected {relative_path} to exist")
+def assert_contains(repo, relative_path, needle):
+    """Raise ``AssertionError`` unless the file's text contains ``needle``.
+
+    The file at ``repo / relative_path`` is read in full; a missing file
+    surfaces as the error raised by ``read_text``.
+    """
+    content = (repo / relative_path).read_text()
+    if needle in content:
+        return
+    raise AssertionError(f"Expected {relative_path} to contain {needle!r}")
+def quality_note_args(
+    product="Product behavior was validated by the eval case command.",
+    ux="User/operator workflow evidence was reviewed in the generated plan.",
+    architecture="Architecture and plan state were inspected in repository files.",
+    reliability="Repeatable validation command evidence was produced by the eval case.",
+    security="Security and data-handling assumptions were reviewed in generated metadata files.",
+):
+    """Return the five evidence-note flags, each followed by its note text.
+
+    The result is a flat list in the order product, ux, architecture,
+    reliability, security.
+    """
+    flag_pairs = (
+        ("--product-note", product),
+        ("--ux-note", ux),
+        ("--architecture-note", architecture),
+        ("--reliability-note", reliability),
+        ("--security-note", security),
+    )
+    # Flatten (flag, value) pairs into one argv-style list.
+    return [token for pair in flag_pairs for token in pair]
+def acceptance_args(
+    product="The requested behavior is verified against a concrete product assertion for this eval case.",
+    ux="The user or operator workflow remains understandable for this eval case.",
+    architecture="The implementation keeps lifecycle state and repository boundaries maintainable for this eval case.",
+    reliability="The eval case records repeatable command evidence for the lifecycle behavior.",
+    security="The eval case confirms no secrets or sensitive data are introduced into plan metadata.",
+):
+    """Return the five acceptance-criteria flags, each followed by its text.
+
+    The result is a flat list in the order product, ux, architecture,
+    reliability, security.
+    """
+    flag_pairs = (
+        ("--product", product),
+        ("--ux", ux),
+        ("--architecture", architecture),
+        ("--reliability", reliability),
+        ("--security", security),
+    )
+    # Flatten (flag, value) pairs into one argv-style list.
+    return [token for pair in flag_pairs for token in pair]
+def set_acceptance(repo, relative_plan, **kwargs):
+    """Run ``acceptance-set`` for a plan and return the manager's JSON result.
+
+    Args:
+        repo: Repository root passed as ``--repo``.
+        relative_plan: Plan path passed as ``--plan``.
+        **kwargs: Overrides forwarded to ``acceptance_args``.
+    """
+    command = ["acceptance-set", "--repo", str(repo), "--plan", relative_plan]
+    command.extend(acceptance_args(**kwargs))
+    return run_manager(*command)
+def set_continuation_complete(repo, relative_plan):
+    """Run ``continuation-set`` with decision ``complete`` for a plan.
+
+    Returns the manager's JSON result.
+    """
+    closure_reason = "The eval plan is complete and has no follow-up workstream."
+    command = (
+        "continuation-set",
+        "--repo",
+        str(repo),
+        "--plan",
+        relative_plan,
+        "--decision",
+        "complete",
+        "--closure-reason",
+        closure_reason,
+    )
+    return run_manager(*command)
+def continuation_codes(repo, plan_path):
+    """Return the set of ``code`` values from the plan's continuation issues.
+
+    The plan file is read from ``plan_path`` and passed, with ``repo`` and the
+    path itself, to ``continuation_decision_issues``.
+    """
+    plan_text = plan_path.read_text()
+    codes = set()
+    for issue in continuation_decision_issues(repo, plan_path, plan_text):
+        codes.add(issue["code"])
+    return codes
+def fill_plan_details(plan_path):
+    """Replace known template placeholder text in a plan file, in place.
+
+    Each placeholder is substituted wherever it occurs; text that matches no
+    placeholder is left untouched. The file is rewritten even if nothing
+    changed.
+    """
+    target = Path(plan_path)
+    substitutions = (
+        (
+            "- Define in-scope work.\n- Define out-of-scope work.",
+            "- Implement the requested lifecycle behavior.\n- Keep unrelated repository behavior out of scope.",
+        ),
+        (
+            "- Add relevant product, architecture, reliability, security, or delivery constraints.",
+            "- Preserve existing command semantics unless this eval explicitly changes them.\n- Keep all validation evidence in repository-local files.",
+        ),
+        (
+            "1. Add the first concrete step.\n2. Add the next concrete step.",
+            "1. Prepare the target plan state.\n2. Run the lifecycle command under test.\n3. Verify the command result and persisted files.",
+        ),
+        (
+            "1. Add the first concrete step.\n2. Add the next step.",
+            "1. Prepare the target plan state.\n2. Run the lifecycle command under test.\n3. Verify the command result and persisted files.",
+        ),
+        (
+            "- Describe how the work will be verified.",
+            "- Run the relevant eval command and inspect generated Markdown and JSON state.",
+        ),
+    )
+    content = target.read_text()
+    for placeholder, filled in substitutions:
+        content = content.replace(placeholder, filled)
+    target.write_text(content)

package/skills/harness-engine/evals/harness_engine_evals/registry.py ADDED Viewed

@@ -0,0 +1,55 @@
+from .cases_lifecycle import (
+    test_empty_repo_init,
+    test_init_reconciles_existing_harness,
+    test_clean_removes_runtime_state_and_untracks_artifacts,
+    test_broad_task_intake_routes_repo_changes,
+    test_closed_loop_plan,
+    test_continuation_decision_workstream,
+    test_plan_path_canonicalization,
+    test_defect_recovery_loop,
+    test_quality_score_requires_notes,
+    test_knowledge_evidence_verbatim,
+    test_structured_plan_sidecar_and_acceptance,
+    test_quality_score_requires_ready_acceptance,
+    test_plan_close_rejects_template_placeholders,
+    test_plan_close_returns_open_knowledge_json,
+    test_plan_close_moves_sidecar_and_rejects_stale_score,
+    test_evidence_prune_generated_artifacts,
+    test_eval_report_shape,
+    test_preserve_unmanaged_docs,
+)
+from .cases_frontend import (
+    test_frontend_analysis,
+    test_backend_init_skips_frontend_design_docs,
+    test_frontend_design_control_plane,
+    test_no_external_design_dependency,
+    test_pack_excludes_external_design_dependency,
+)
+# Ordered registry of (case id, test callable) pairs.
+# The runner iterates this list in order and uses each id to look up the case's
+# metadata loaded from cases.json.
+EVALS = [
+    ("empty-repo-init", test_empty_repo_init),
+    ("frontend-analysis", test_frontend_analysis),
+    ("init-reconciles-existing-harness", test_init_reconciles_existing_harness),
+    ("clean-removes-runtime-state-and-untracks-artifacts", test_clean_removes_runtime_state_and_untracks_artifacts),
+    ("broad-task-intake-routes-repo-changes", test_broad_task_intake_routes_repo_changes),
+    ("closed-loop-plan", test_closed_loop_plan),
+    ("continuation-decision-workstream", test_continuation_decision_workstream),
+    ("plan-path-canonicalization", test_plan_path_canonicalization),
+    ("defect-recovery-loop", test_defect_recovery_loop),
+    ("quality-score-requires-notes", test_quality_score_requires_notes),
+    ("knowledge-evidence-verbatim", test_knowledge_evidence_verbatim),
+    ("structured-plan-sidecar-and-acceptance", test_structured_plan_sidecar_and_acceptance),
+    ("quality-score-requires-ready-acceptance", test_quality_score_requires_ready_acceptance),
+    ("plan-close-rejects-template-placeholders", test_plan_close_rejects_template_placeholders),
+    ("plan-close-returns-open-knowledge-json", test_plan_close_returns_open_knowledge_json),
+    ("plan-close-moves-sidecar-and-rejects-stale-score", test_plan_close_moves_sidecar_and_rejects_stale_score),
+    ("evidence-prune-generated-artifacts", test_evidence_prune_generated_artifacts),
+    ("eval-report-shape", test_eval_report_shape),
+    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
+    ("backend-init-skips-frontend-design-docs", test_backend_init_skips_frontend_design_docs),
+    ("frontend-design-control-plane", test_frontend_design_control_plane),
+    ("no-external-design-dependency", test_no_external_design_dependency),
+    ("pack-excludes-external-design-dependency", test_pack_excludes_external_design_dependency),
+]

package/skills/harness-engine/evals/harness_engine_evals/report.py ADDED Viewed

@@ -0,0 +1,36 @@
+def build_report(results):
+    """Aggregate per-case eval results into a ``harness-eval-report.v1`` dict.
+
+    Args:
+        results: List of per-case dicts. Each must have a ``"status"`` key;
+            any status other than ``"pass"`` counts as a failure. Non-passing
+            cases may carry a ``"recommended_actions"`` list of strings.
+
+    Returns:
+        A dict with ``schema_version``, ``status``, ``score`` (0-100),
+        ``summary``, ``metrics``, ``case_results``, ``user_message``, and
+        ``recommended_actions``. The top-level actions are deduplicated in
+        first-seen order.
+    """
+    total = len(results)
+    passed = sum(1 for result in results if result["status"] == "pass")
+    failed = total - passed
+    all_passed = passed == total
+    # Select on "not pass" so the action list covers the same cases as the
+    # failed count, whatever non-pass status a case reports.
+    failed_results = [result for result in results if result["status"] != "pass"]
+    # Every failed case can repeat the same generic action; dict.fromkeys
+    # removes duplicates while keeping first-seen order.
+    recommended_actions = list(
+        dict.fromkeys(
+            action
+            for result in failed_results
+            for action in result.get("recommended_actions", [])
+        )
+    )
+    return {
+        "schema_version": "harness-eval-report.v1",
+        "status": "pass" if all_passed else "fail",
+        "score": round((passed / total) * 100) if total else 0,
+        "summary": {
+            "passed": passed,
+            "failed": failed,
+            "total": total,
+            "message": (
+                f"All {total} harness eval cases passed."
+                if all_passed
+                else f"{failed} of {total} harness eval cases failed."
+            ),
+        },
+        "metrics": {
+            "case_pass_rate": round(passed / total, 4) if total else 0,
+            "case_fail_rate": round(failed / total, 4) if total else 0,
+            "failed_case_count": failed,
+        },
+        "case_results": results,
+        "user_message": (
+            "Harness evals passed. No release-blocking eval findings were detected."
+            if all_passed
+            else "Harness evals failed. Review `case_results` and fix the listed findings before handoff or release."
+        ),
+        "recommended_actions": recommended_actions,
+    }

package/skills/harness-engine/evals/harness_engine_evals/runner.py ADDED Viewed

@@ -0,0 +1,53 @@
+import json
+import sys
+import tempfile
+import time
+from pathlib import Path
+from .helpers import load_case_metadata
+from .registry import EVALS
+from .report import build_report
+def main():
+    """Run every registered eval, print the JSON report, and exit 1 on failure.
+
+    All cases receive the same temporary directory, which is removed when the
+    run ends. Any exception raised by a case marks it failed rather than
+    stopping the run.
+    """
+    results = []
+    case_metadata = load_case_metadata()
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp_root = Path(tmp)
+        for eval_id, test_func in EVALS:
+            started = time.monotonic()
+            metadata = case_metadata.get(eval_id, {})
+            try:
+                test_func(tmp_root)
+            except Exception as error:
+                passed = False
+                # A bare `assert` has an empty message; fall back to the
+                # exception type so the finding is never an empty string.
+                findings = [str(error) or type(error).__name__]
+                recommended_actions = [
+                    f"Reproduce `{eval_id}` locally with python3 skills/harness-engine/evals/run_evals.py.",
+                    "Treat the failing assertion as the next implementation input before release.",
+                ]
+            else:
+                passed = True
+                findings = []
+                recommended_actions = []
+            results.append(
+                {
+                    "id": eval_id,
+                    "status": "pass" if passed else "fail",
+                    "description": metadata.get("description", ""),
+                    "score": 1.0 if passed else 0.0,
+                    "duration_seconds": round(time.monotonic() - started, 3),
+                    "findings": findings,
+                    "recommended_actions": recommended_actions,
+                }
+            )
+    report = build_report(results)
+    print(json.dumps(report, indent=2) + "\n")
+    if report["status"] != "pass":
+        sys.exit(1)

package/skills/harness-engine/evals/run_evals.py ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+from pathlib import Path
+import sys
+# Directory containing this script; the harness_engine_evals package sits beside it.
+EVALS_DIR = Path(__file__).resolve().parent
+# Put that directory on sys.path before the harness_engine_evals import below.
+if str(EVALS_DIR) not in sys.path:
+    sys.path.insert(0, str(EVALS_DIR))
+from harness_engine_evals.runner import main
+if __name__ == "__main__":
+    main()

package/skills/{harness-repo-bootstrap → harness-engine}/references/evaluation-loop.md RENAMED Viewed

@@ -5,13 +5,19 @@ Use this loop when changing the skill, templates, scripts, or policy references:
 1. Draft the behavior in `SKILL.md`, `references/`, templates, or scripts.
 2. Test it with the deterministic commands in `scripts/manage_harness.py`.
 3. Evaluate it with `python3 evals/run_evals.py`.
-4. Iterate until the runner passes and the score stays at 100.
+4. Read the structured `harness-eval-report.v1` output: aggregate metrics, per-case results,
+   findings, user message, and recommended actions.
+5. Iterate until the runner passes, the score stays at 100, and failed-case output would be
+   actionable for a user if the eval regressed.
 ## What The Evals Cover
 - first-time initialization of an empty repository
 - frontend-aware repository analysis
-- execution-plan and knowledge-capture closure
+- execution-plan sidecars, acceptance contracts, and knowledge-capture closure
+- quality results that block closure and force rework when scores fail or become stale
+- continuation decisions and workstream recovery for resumable work
+- structured eval report output with per-case findings and recommended actions
 - preservation of unmanaged user-owned docs
 - local harness checks that do not require user-project CI

package/skills/harness-engine/references/evidence-first-evals.md ADDED Viewed

@@ -0,0 +1,187 @@
+# Evidence-First Evals
+Use this reference when a task needs stronger validation than an LLM-written quality estimate.
+The quality score is the final readiness summary, not the eval itself.
+## Core Rule
+Every eval must separate four layers:
+1. **Product contract checks**: machine-readable assertions derived from `product.md`,
+   product specs, acceptance criteria, or the user's prompt.
+2. **Runtime behavior checks**: tests, API smoke checks, CLI checks, browser interactions,
+   and state assertions that prove the implementation works.
+3. **Visual and UX evidence**: screenshots, DOM/accessibility snapshots, responsive viewport
+   checks, and layout invariants for user-facing surfaces.
+4. **Reviewer judgment**: LLM or human scoring only after the first three layers have produced
+   evidence and logged defects.
+If a requirement cannot be checked directly, write down why and replace it with the narrowest
+observable proxy. Do not silently convert it into a vague score.
+Repository-mutating eval work follows Harness Task Intake: create or reuse an active plan, set
+acceptance before implementation, validate with evidence, score with `quality-score`, close the
+plan, and run `check`. This applies to feature, bug, refactor, docs/policy, dependency/tooling,
+UI, test/eval, security, and performance changes.
+## Eval Case Shape
+Model each case like an OpenAI eval sample: stable id, input, expected behavior, recorded events,
+and aggregate metrics.
+Recommended fields:
+- `id`: stable case id, versioned when the case changes materially.
+- `source`: product spec, user request, bug report, design file, or regression source.
+- `risk`: what failure this case is meant to catch.
+- `setup`: fixtures, seed data, feature flags, viewport, network state, or browser route.
+- `actions`: exact commands, API calls, browser actions, or user flows.
+- `assertions`: deterministic checks that must pass.
+- `artifacts`: logs, screenshots, traces, DOM snapshots, accessibility snapshots, or diffs.
+- `defect_policy`: severity and `defect-log` summary to use if the case fails.
+- `metrics`: pass/fail fields and numeric measurements to aggregate.
+Do not accept an eval case whose only assertion is "LLM rates this highly".
+## Product Contract Checks
+Before implementation, extract product requirements into a checklist that can be tested:
+- required capabilities and forbidden capabilities
+- key user workflows and edge cases
+- copy, information architecture, and domain terminology that must appear
+- persistence, permissions, latency, error handling, and empty states
+- explicit non-goals such as "do not add CI" or "do not introduce auth"
+For every product claim in the final answer, there should be a matching command, test, browser
+assertion, artifact, or explicitly documented limitation.
+## Domain Issue Workflows
+Issue triage is one branch of Harness Task Intake and should be domain-routed before implementation.
+The generated `AGENTS.md` owns the current routing table; use it to decide which durable docs and
+SOPs to read first.
+Minimum expectations by domain:
+- Product contract: convert requirements, specs, and acceptance criteria into assertions.
+- Frontend/UI: capture browser or local-runtime evidence for the affected workflow and viewport.
+- Backend/runtime: reproduce the behavior narrowly and verify with tests, API smoke checks, logs,
+  or integration evidence.
+- Architecture: document boundary, dependency, data-flow, migration, and compatibility impact.
+- Data/state: verify fixtures, migrations, rollback or compatibility behavior, and data-loss risk.
+- Security/privacy: review sensitive data paths, permissions, auth boundaries, and secret handling.
+- Performance/reliability: collect baseline measurement, repeatable benchmark or smoke evidence,
+  and before/after comparison.
+Confirmed defects or evidence gaps should be logged into the active plan before quality scoring.
+Before implementation, write the concrete Acceptance Contract with `acceptance-set`. Each
+`quality-score` dimension must include a concrete evidence note tied to that contract. A numeric
+score without evidence is not a valid readiness signal.
+Use exact evidence when closing knowledge items: the text passed to `knowledge-mark-written`
+must already appear in the durable destination doc. If the destination uses different wording,
+copy a short phrase from that destination into an evidence file and pass `--evidence-file`.
+## Frontend Checks
+For frontend work, use browser evidence instead of relying on a screenshot glance:
+- Open the live route in a browser, not only static file inspection.
+- Capture at least one desktop and one mobile viewport for meaningful UI changes.
+- Assert important text, controls, selected state, loading state, empty state, error state,
+  and primary interaction outcomes from the DOM or accessibility tree.
+- Check layout invariants: no critical overlap, no clipped primary text, stable toolbar/grid
+  dimensions, usable tap targets, and visible focus/selected states.
+- For canvas/WebGL/game UIs, add pixel or scene-state checks so a blank canvas cannot pass.
+- Save screenshots or snapshot paths in the plan or `docs/generated/` when visual evidence
+  matters for later review.
+If the browser tool is unavailable, record the limitation as validation evidence and replace it
+with the strongest available fallback: static DOM checks, component tests, image snapshots, or
+API smoke checks. Do not mark UX as fully validated without saying what was missing.
+## Frontend Issue Reports
+Frontend feedback is an eval trigger even when the harness skill was not explicitly invoked.
+Handle any UI, layout, interaction, responsive behavior, visual state, canvas, or design fidelity
+question through the repository's frontend workflow.
+The correct response is:
+- read `docs/FRONTEND.md`, `docs/DESIGN.md`, and the relevant SOP
+- inspect the affected route, component, viewport, and user workflow
+- reproduce the behavior with browser or local-runtime evidence when possible
+- turn the finding into product/UX assertions or a regression case
+- log confirmed defects or missing evidence in the active plan
+- fix and validate against the same workflow before claiming the UI is acceptable
+Do not answer from memory or aesthetic judgment alone when the question is about a concrete
+frontend behavior.
+## Bug Discovery Evals
+Add regression cases for failures that were previously missed.
+A good bug-discovery eval proves two things:
+- the bad implementation fails a narrow test or observable assertion
+- the harness blocks closure through `acceptance-set`, `defect-log`, `quality-score`, `plan-close`, and `check`
+Track missed-bug classes separately from generic test pass rate. Examples:
+- product-spec drift not detected
+- browser layout defect not detected
+- generated app behavior bug not detected
+- unresolved defect allowed through handoff
+- missing visual evidence accepted as UX validation
+## Metrics
+Record sample-level events first, then aggregate.
+Useful aggregate metrics:
+- `case_pass_rate`: passed cases divided by total cases
+- `product_contract_pass_rate`: product assertions passed divided by product assertions
+- `visual_evidence_coverage`: frontend cases with required screenshots/snapshots
+- `defect_block_rate`: known defects that blocked closure when injected
+- `missed_defect_count`: known defects that reached a passing Quality Result
+- `artifact_completeness`: required logs/screenshots/traces present
+- `llm_judge_agreement`: optional reviewer score agreement with labeled cases
+Fail release or handoff when a P0/P1 defect is missed, required product assertions are untested,
+or frontend evidence is absent for meaningful UI work.
+## Report Output
+Eval runners should emit structured JSON that can be shown to users and consumed by tools.
+Use a stable schema name and include both aggregate and per-case results.
+Recommended top-level fields:
+- `schema_version`: stable report schema such as `harness-eval-report.v1`.
+- `status`: `pass` or `fail`.
+- `score`: whole-number aggregate score from `0` to `100`.
+- `summary`: passed, failed, total, and one concise message.
+- `metrics`: named aggregate metrics, not only one score.
+- `case_results`: one object per case with `id`, `description`, `status`, `score`,
+  `duration_seconds`, `findings`, and `recommended_actions`.
+- `user_message`: direct text the agent can relay to the user.
+- `recommended_actions`: deduplicated next actions for failed cases.
+Failure output must name the specific failed case, failed assertion or evidence gap, and the next
+action. Passing output should still include per-case scores so the user can see what was actually
+covered.
+## Meta-Eval Calibration
+When an LLM judge is used, keep a small labeled meta-eval set:
+- examples that should pass
+- examples that should fail product correctness
+- examples that should fail visual/UX evidence
+- examples with open defects that must block handoff
+Run the judge against these labels and treat disagreement as an eval bug. The judge may summarize
+evidence and suggest risks, but it must not override deterministic failures.

package/skills/harness-engine/references/exec-plans.md ADDED Viewed

@@ -0,0 +1,59 @@
+# Execution Plans
+Execution plans are required for every repository-mutating change. This includes code, docs, configuration, tests, dependencies, build/release scripts, generated templates, runtime behavior, migrations, cleanup, and fixes found during review.
+## When To Create One
+- any file will be edited or created
+- repository behavior, policy, generated templates, dependency state, build output, runtime behavior, or validation coverage will change
+- a review finding, user feedback item, bug, or regression requires a repository change
+- work will span enough time that another agent may resume it later
+Only skip a plan for pure question answering, read-only investigation, showing command output, or status reporting with no file changes. If the work moves from investigation to editing files, create or reuse an active plan before editing.
+## Location
+- Workstream recovery ledger: `docs/exec-plans/workstreams.md`
+- Active: `docs/exec-plans/active/`
+- Completed: `docs/exec-plans/completed/`
+Active plans, completed plans, JSON sidecars, and `workstreams.md` are durable project state and should be version-controlled.
+## Minimum Sections
+- goal
+- scope
+- constraints
+- steps
+- validation
+- acceptance contract
+- quality result
+- defects to resolve
+- rework required
+- continuation decision
+- durable knowledge to capture
+- completion notes
+## Operating Rule
+Update the active plan during the work. Define the Acceptance Contract before implementation, score the completed work against that contract, complete any required rework, record the continuation decision, move the plan to `completed`, and leave behind any durable facts in the right permanent docs.
+For small changes, keep the plan lightweight: narrow scope, short steps, and focused validation are acceptable. Do not skip `acceptance-set`, evidence-backed validation, `quality-score`, `plan-close`, or the final `check`.
+Before scoring or closing, replace generic starter text with task-specific content. Do not leave placeholders such as "Define in-scope work", "Add the first concrete step", or "Describe how the work will be verified". The default unused durable-knowledge line may remain open, but any real knowledge TODO must be logged, written, and marked complete.
+## Closed Loop
+Codex should use the script, not ad hoc manual edits, for the lifecycle. Users express intent in natural language; Codex translates that intent into these commands:
+- `plan-start`: create a new active execution plan
+- `acceptance-set`: write concrete product, UX, architecture, reliability, and security acceptance criteria before implementation; this updates the structured sidecar fingerprint
+- `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id; use `--fact-file` for shell-sensitive facts
+- `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; evidence must be exact text already present in the destination doc; prefer `--id <knowledge-id> --evidence-file <file>` for shell-sensitive evidence, and use `--append` only to append the exact fact first
+- `defect-log`: record a bug found by validation, evals, browser testing, or code review; this invalidates any existing quality result and makes the defect the next rework input
+- `defect-resolve`: mark a logged defect fixed with validation or code evidence; re-run validation and `quality-score` before closing
+- `quality-score`: write a scored Quality Result into the plan based on the ready Acceptance Contract; every dimension must include an evidence note; if it fails, the generated `## Rework Required` section becomes the next implementation input
+- `continuation-set`: declare whether the work is complete, continues, pauses, stops, or is deferred; `continue` and `pause` update `docs/exec-plans/workstreams.md` automatically after required fields validate, and `--goal` can set the resumable workstream goal
+- `workstream-upsert`: manually update `docs/exec-plans/workstreams.md` when repairing or migrating resumable workstream state
+- `plan-close`: refuse to close cleanly until the Acceptance Contract is ready, the Quality Result passes against the current contract fingerprint, the continuation decision is recorded, and the listed knowledge items are marked as written to durable docs; blocked closes return structured JSON with `status: "blocked"`, `reason`, `message`, and `details`
+- `check`: run a local handoff check without requiring target-repo CI

package/skills/{harness-repo-bootstrap → harness-engine}/references/file-map.md RENAMED Viewed

@@ -2,7 +2,7 @@
 - `AGENTS.md`: short router, reading order, repo-specific guardrails
 - `ARCHITECTURE.md`: domain boundaries, runtime topology, integration seams
-- `docs/PLANS.md`: plan lifecycle and storage rules
+- `docs/PLANS.md`: default repository-change plan lifecycle and storage rules
 - `docs/PRODUCT_SENSE.md`: product heuristics and tradeoff rules
 - `docs/QUALITY_SCORE.md`: quality rubric by domain and layer
 - `docs/RELIABILITY.md`: SLOs, failure modes, observability expectations
@@ -11,7 +11,7 @@
 - `docs/FRONTEND.md`: frontend stack conventions and validation loop
 - `docs/design-docs/`: durable design decisions
 - `docs/product-specs/`: durable product specs
-- `docs/exec-plans/`: active plans, completed plans, and tech debt tracker
+- `docs/exec-plans/`: durable active plans, completed plans, JSON sidecars, workstreams, and tech debt tracker
 - `docs/sops/`: mechanical procedures for recurring workflows and validation loops
-- `docs/generated/`: generated facts such as schemas
+- `docs/generated/`: generated evidence and facts such as schemas, browser screenshots, DOM snapshots, layout summaries, and smoke outputs; use `evidence-prune` to preview stale unreferenced artifacts before deleting
 - `docs/references/`: external references rewritten or linked for model-friendly discovery

package/skills/{harness-repo-bootstrap → harness-engine}/references/knowledge-capture.md RENAMED Viewed

@@ -27,9 +27,9 @@ Prefer the script workflow:
 1. Log the fact into the active execution plan with `knowledge-log`.
 2. Write the fact into its permanent destination doc.
-3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<text present in durable doc>"`.
+3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<verbatim text present in durable doc>"`.
 4. Close the plan with `plan-close`.
 `knowledge-log` returns a stable id. Prefer id-based closure so permanent docs can use concise, natural wording rather than duplicating the exact plan fact.
-`knowledge-mark-written` verifies that the destination file contains either the provided evidence text or, for legacy calls, the exact fact. Use `--append` only when the exact fact should be appended to the destination doc by the tool.
+`knowledge-mark-written` verifies that the destination file contains either the provided evidence text or the exact fact. Evidence must be copied from the destination doc; a summary such as "the doc now states this rule" is rejected unless that exact sentence is in the doc. Use `--append` only when the exact fact should be appended to the destination doc by the tool.

package/skills/{harness-repo-bootstrap → harness-engine}/references/sop-index.md RENAMED Viewed

@@ -6,5 +6,8 @@ Choose an SOP whenever the task touches one of these areas:
 - missing durable repository knowledge: `docs/sops/encode-unseen-knowledge.md`
 - runtime debugging or observability setup: `docs/sops/local-observability-feedback-loop.md`
 - user interface work: `docs/sops/chrome-devtools-ui-validation-loop.md`
+- product correctness, frontend layout, or bug-discovery evals: `docs/sops/evidence-first-eval-loop.md`
+- repository-mutating work: start from Harness Task Intake in `AGENTS.md`, then follow the matching SOP and domain docs listed there
+- backend behavior, architecture boundaries, data/state, security, or performance issue triage: use the Issue Workflows branch in `AGENTS.md`, then follow the domain docs listed there
 If no SOP exists for a recurring workflow, create one in `docs/sops/` as part of the task.

package/skills/harness-engine/references/template-policy.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Template Policy
+Every generated file starts with a managed marker:
+`<!-- harness-engine:managed -->`
+Init and clean behavior:
+- `init`: create missing files for new repositories; when an existing managed harness is detected, refresh managed files and create missing files while preserving unmanaged files
+- `clean` removes transient runtime state under `docs/generated/`
+- `clean` maintains `.gitignore` entries for `.codex/skills/` and `docs/generated/`
+- `clean` previews or stages removal of already tracked local skill installs and generated evidence from the git index; it requires `--apply` before running `git rm --cached`
+- execution plans, JSON sidecars, and `docs/exec-plans/workstreams.md` are durable project state and must not be cleaned, ignored, or untracked by default
+Use `init` as the normal workspace command so creation and reconciliation share one path. Use `--force` only when the human explicitly accepts overwriting.
+If a file exists without the managed marker, treat it as user-owned unless the human explicitly asks to replace it.