npm - @hallucination-studio/harness-engine - Versions diffs - 1.0.0 - Mend

@hallucination-studio/harness-engine 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/skills/harness-repo-bootstrap/evals/run_evals.py ADDED Viewed

@@ -0,0 +1,337 @@
+#!/usr/bin/env python3
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+# Skill root: the parent of the directory that contains this script.
+SKILL_DIR = Path(__file__).resolve().parents[1]
+# Path of the manager CLI script that the evals invoke as a subprocess.
+MANAGER = SKILL_DIR / "scripts" / "manage_harness.py"
+def run_manager(*args, expect_success=True):
+    """Run ``manage_harness.py`` with ``args`` and return its parsed JSON output.
+
+    Args:
+        *args: Command-line arguments passed after the script path.
+        expect_success: When true, a non-zero exit status is an error; when
+            false, a zero exit status is an error.
+
+    Returns:
+        The object decoded from stdout, or ``{}`` when stdout is blank.
+
+    Raises:
+        AssertionError: If the exit status contradicts ``expect_success``, or
+            if stdout is non-blank but is not valid JSON.
+    """
+    result = subprocess.run(
+        [sys.executable, str(MANAGER), *args],
+        text=True,
+        capture_output=True,
+        check=False,
+    )
+    if expect_success and result.returncode != 0:
+        # Fall back to the exit status so a silent failure still yields a usable message.
+        raise AssertionError(
+            result.stderr
+            or result.stdout
+            or f"Command exited with status {result.returncode}"
+        )
+    if not expect_success and result.returncode == 0:
+        raise AssertionError("Command succeeded unexpectedly")
+    if not result.stdout.strip():
+        return {}
+    try:
+        return json.loads(result.stdout)
+    except json.JSONDecodeError as error:
+        # Show the offending output; the decode error alone only reports a position.
+        raise AssertionError(
+            f"Command printed non-JSON output: {result.stdout!r}"
+        ) from error
+def write_answers(path, project_name="demo"):
+    """Write a canned answers document to ``path`` as indented JSON.
+
+    Only ``project_name`` varies between callers; every other answer is a
+    fixed string. The file ends with a trailing newline.
+    """
+    fixed_answers = {
+        "project_summary": "A developer tooling project used to install and maintain Codex harness docs.",
+        "primary_users": "Codex users and maintainers",
+        "deployment_targets": "npm package and local repositories",
+        "product_domain": "developer tooling",
+        "reliability_targets": "Repeatable local commands and safe update behavior",
+        "security_constraints": "Do not write secrets or overwrite user-owned docs without consent",
+        "frontend_stack_notes": "Frontend changes require browser validation when a UI is detected",
+        "quality_focus": "installer behavior, generated docs, plan closure, and knowledge capture",
+        "frontend_scope": "No frontend unless one is detected by analysis",
+    }
+    # project_name stays the first key so the serialized key order is unchanged.
+    payload = {"project_name": project_name, **fixed_answers}
+    serialized = json.dumps(payload, indent=2)
+    path.write_text(f"{serialized}\n")
+def assert_exists(repo, relative_path):
+    """Raise ``AssertionError`` unless ``relative_path`` exists under ``repo``."""
+    if (repo / relative_path).exists():
+        return
+    raise AssertionError(f"Expected {relative_path} to exist")
+def assert_contains(repo, relative_path, needle):
+    """Raise ``AssertionError`` unless the file at ``relative_path`` contains ``needle``.
+
+    The file is read as text relative to ``repo``; ``needle`` is matched as a
+    plain substring.
+    """
+    content = (repo / relative_path).read_text()
+    if needle in content:
+        return
+    raise AssertionError(f"Expected {relative_path} to contain {needle!r}")
+def test_empty_repo_init(tmp_root):
+    """Eval: an empty repository is analyzed as needing ``init`` and then scaffolded.
+
+    Checks the analysis flags first, runs ``init`` with canned answers, then
+    verifies that the expected files exist and that ``AGENTS.md`` points at
+    the plan directory, the SOP directory, and the local ``check`` command.
+    """
+    repo = tmp_root / "empty-repo"
+    repo.mkdir()
+    answers = tmp_root / "answers.json"
+    write_answers(answers)
+    analysis = run_manager("analyze", "--repo", str(repo))
+    if analysis["recommended_action"] != "init":
+        raise AssertionError("Empty repo should recommend init")
+    required_flags = (
+        ("missing_exec_plan_state", "Analysis should report missing exec-plan state"),
+        ("missing_sops", "Analysis should report missing SOPs"),
+    )
+    for key, message in required_flags:
+        if not analysis[key]:
+            raise AssertionError(message)
+    run_manager("init", "--repo", str(repo), "--answers", str(answers))
+    expected_files = (
+        "AGENTS.md",
+        "ARCHITECTURE.md",
+        "docs/PLANS.md",
+        "docs/QUALITY_SCORE.md",
+        "docs/exec-plans/active/_template.md",
+        "docs/exec-plans/completed/README.md",
+        "docs/sops/encode-unseen-knowledge.md",
+    )
+    for relative_path in expected_files:
+        assert_exists(repo, relative_path)
+    router_needles = (
+        "docs/exec-plans/active/",
+        "docs/sops/",
+        ".codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check",
+    )
+    for needle in router_needles:
+        assert_contains(repo, "AGENTS.md", needle)
+def test_frontend_analysis(tmp_root):
+    """Eval: a repository with React/Vite dependencies is analyzed as a frontend project.
+
+    Builds a minimal ``package.json`` plus ``src/App.tsx``, then expects the
+    analysis to flag a frontend, ask the ``frontend_stack_notes`` question,
+    and list React among the frameworks.
+    """
+    repo = tmp_root / "frontend-repo"
+    repo.mkdir()
+    manifest = {
+        "dependencies": {
+            "react": "^19.0.0",
+            "vite": "^6.0.0",
+        }
+    }
+    (repo / "package.json").write_text(json.dumps(manifest, indent=2) + "\n")
+    source_dir = repo / "src"
+    source_dir.mkdir()
+    (source_dir / "App.tsx").write_text("export default function App() { return null; }\n")
+    analysis = run_manager("analyze", "--repo", str(repo))
+    question_ids = set()
+    for item in analysis["human_confirmations"]:
+        question_ids.add(item["id"])
+    if not analysis["has_frontend"]:
+        raise AssertionError("Frontend repo should be detected")
+    if "frontend_stack_notes" not in question_ids:
+        raise AssertionError("Frontend repo should ask frontend confirmation questions")
+    if "React" not in analysis["frameworks"]:
+        raise AssertionError("React should be detected")
+def test_closed_loop_plan(tmp_root):
+    """Eval: a plan closes only after its logged knowledge is marked written.
+
+    Drives three plans through the manager CLI in order: one whose fact is
+    written with ``--append``, a hand-written plan whose item is marked by
+    fact text, and one closed with ``--id`` and ``--evidence``. Each step
+    depends on the repository state left by the previous one.
+    """
+    repo = tmp_root / "loop-repo"
+    repo.mkdir()
+
+    def manage(command, *extra, expect_success=True):
+        # Every command in this eval targets the same repository.
+        return run_manager(command, "--repo", str(repo), *extra, expect_success=expect_success)
+
+    def relative_to_repo(path):
+        return str(path.resolve().relative_to(repo.resolve()))
+
+    def append_to_architecture(text):
+        with (repo / "ARCHITECTURE.md").open("a") as handle:
+            handle.write(text)
+
+    # Target project: one shell script, plus a Python file under .codex that must be ignored.
+    (repo / "snake.sh").write_text("#!/usr/bin/env bash\nprintf 'snake\\n'\n")
+    skill_scripts = repo / ".codex" / "skills" / "demo" / "scripts"
+    skill_scripts.mkdir(parents=True)
+    (skill_scripts / "tool.py").write_text("print('ignore me')\n")
+    answers = tmp_root / "loop-answers.json"
+    write_answers(answers, project_name="loop-demo")
+    languages = manage("analyze")["languages"]
+    if "Shell" not in languages:
+        raise AssertionError("Shell should be detected from target project files")
+    if "Python" in languages:
+        raise AssertionError(".codex skill files should not affect target project language detection")
+    manage("init", "--answers", str(answers))
+
+    # Plan 1: log a fact, then close only after it is appended to its destination doc.
+    plan_result = manage(
+        "plan-start",
+        "--slug",
+        "knowledge-loop",
+        "--goal",
+        "Validate durable knowledge closure",
+    )
+    plan_path = Path(plan_result["plan"])
+    relative_plan = relative_to_repo(plan_path)
+    fact = "Install mode must distinguish local and global skill destinations"
+    fact_args = (
+        "--plan",
+        relative_plan,
+        "--fact",
+        fact,
+        "--destination",
+        "docs/PRODUCT_SENSE.md",
+    )
+    manage("knowledge-log", *fact_args)
+    # Closing is expected to fail while the logged item is still pending.
+    manage("plan-close", "--plan", relative_plan, "--summary", "done", expect_success=False)
+    # Marking without --append is expected to fail; the doc does not hold the fact yet.
+    manage("knowledge-mark-written", *fact_args, expect_success=False)
+    manage("knowledge-mark-written", *fact_args, "--append")
+    assert_contains(repo, "docs/PRODUCT_SENSE.md", fact)
+    close_result = manage(
+        "plan-close",
+        "--plan",
+        relative_plan,
+        "--summary",
+        "Closed after writing durable knowledge.",
+    )
+    if close_result["status"] != "closed":
+        raise AssertionError("Plan should close after knowledge is marked written")
+    if plan_path.exists():
+        raise AssertionError("Active plan should be moved after close")
+    assert_exists(repo, "docs/exec-plans/completed/" + plan_path.name)
+    check_result = manage("check")
+    if check_result["status"] != "pass":
+        raise AssertionError("Harness check should pass after plan closure")
+
+    # Plan 2: hand-written plan whose item and destination text carry backtick formatting.
+    formatted_plan = create_formatted_plan(repo)
+    formatted_relative_plan = relative_to_repo(formatted_plan)
+    formatted_fact = "snake.sh is the single runtime entrypoint and owns terminal control directly with stty and tput"
+    append_to_architecture(
+        "\n`snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`.\n"
+    )
+    manage(
+        "knowledge-mark-written",
+        "--plan",
+        formatted_relative_plan,
+        "--fact",
+        formatted_fact,
+        "--destination",
+        "ARCHITECTURE.md",
+    )
+
+    # Plan 3: close by knowledge id plus evidence text instead of the exact fact.
+    id_plan_result = manage(
+        "plan-start",
+        "--slug",
+        "id-knowledge-loop",
+        "--goal",
+        "Validate id-based durable knowledge closure",
+    )
+    id_plan_path = Path(id_plan_result["plan"])
+    id_relative_plan = relative_to_repo(id_plan_path)
+    id_fact = "Runtime input is owned by the terminal runner and core game logic remains independent of terminal packages"
+    log_result = manage(
+        "knowledge-log",
+        "--plan",
+        id_relative_plan,
+        "--fact",
+        id_fact,
+        "--destination",
+        "ARCHITECTURE.md",
+    )
+    append_to_architecture(
+        "\nThe `main` package owns keyboard input and rendering, while `game` contains pure state transitions.\n"
+    )
+    manage(
+        "knowledge-mark-written",
+        "--plan",
+        id_relative_plan,
+        "--id",
+        log_result["id"],
+        "--evidence",
+        "main package owns keyboard input and rendering",
+    )
+    plan_text = id_plan_path.read_text()
+    architecture_text = (repo / "ARCHITECTURE.md").read_text()
+    if id_fact in architecture_text:
+        raise AssertionError("Id/evidence closure should not require appending the exact fact to the destination")
+    if "| evidence: main package owns keyboard input and rendering" not in plan_text:
+        raise AssertionError("Closed knowledge item should record the verification evidence")
+    manage(
+        "plan-close",
+        "--plan",
+        id_relative_plan,
+        "--summary",
+        "Closed with id-based evidence.",
+    )
+def create_formatted_plan(repo):
+    """Write a hand-authored active plan with one pending knowledge item.
+
+    The item text uses backtick formatting and points at ``ARCHITECTURE.md``.
+    The ``docs/exec-plans/active`` directory must already exist.
+
+    Returns:
+        The path of the plan file that was written.
+    """
+    active_dir = repo / "docs" / "exec-plans" / "active"
+    target = active_dir / "formatted-plan.md"
+    target.write_text(
+        """# Execution Plan: Formatted Plan
+## Durable Knowledge To Capture
+- [ ] `snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`. -> `ARCHITECTURE.md`
+"""
+    )
+    return target
+def test_preserve_unmanaged_docs(tmp_root):
+    """Eval: ``init`` leaves an existing ``AGENTS.md`` untouched and reports it as skipped.
+
+    The pre-existing file has no managed marker. After ``init`` its custom
+    content must still be present, and missing docs such as ``docs/PLANS.md``
+    must have been created.
+    """
+    repo = tmp_root / "partial-repo"
+    repo.mkdir()
+    user_router = repo / "AGENTS.md"
+    user_router.write_text("# Existing user router\n\nKeep this custom content.\n")
+    answers = tmp_root / "partial-answers.json"
+    write_answers(answers)
+    skipped = run_manager("init", "--repo", str(repo), "--answers", str(answers))["skipped"]
+    if "AGENTS.md" not in skipped:
+        raise AssertionError("Unmanaged AGENTS.md should be skipped")
+    assert_contains(repo, "AGENTS.md", "Keep this custom content.")
+    assert_exists(repo, "docs/PLANS.md")
+# Registry of evals as (report id, callable) pairs. main() runs them in this
+# order and passes each callable the shared temporary root directory.
+EVALS = [
+    ("empty-repo-init", test_empty_repo_init),
+    ("frontend-analysis", test_frontend_analysis),
+    ("closed-loop-plan", test_closed_loop_plan),
+    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
+]
+def main():
+    """Run every registered eval and print a JSON score report.
+
+    All evals share one temporary directory. Any exception an eval raises is
+    recorded as a failure with its message instead of aborting the run. The
+    report holds the percentage score, the pass and total counts, and the
+    per-eval results.
+
+    Exits with status 1 unless at least one eval ran and every eval passed.
+    """
+    results = []
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp_root = Path(tmp)
+        for eval_id, test_func in EVALS:
+            try:
+                test_func(tmp_root)
+            except Exception as error:
+                results.append({"id": eval_id, "status": "fail", "error": str(error)})
+            else:
+                results.append({"id": eval_id, "status": "pass"})
+    passed = sum(1 for result in results if result["status"] == "pass")
+    total = len(results)
+    # An empty suite verified nothing: score it 0 rather than dividing by zero.
+    score = round((passed / total) * 100) if total else 0
+    report = {
+        "score": score,
+        "passed": passed,
+        "total": total,
+        "results": results,
+    }
+    print(json.dumps(report, indent=2) + "\n")
+    if total == 0 or passed != total:
+        sys.exit(1)
+# Run the eval suite only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

package/skills/harness-repo-bootstrap/references/evaluation-loop.md ADDED Viewed

@@ -0,0 +1,18 @@
+# Evaluation Loop
+Use this loop when changing the skill, templates, scripts, or policy references:
+1. Draft the behavior in `SKILL.md`, `references/`, templates, or scripts.
+2. Test it with the deterministic commands in `scripts/manage_harness.py`.
+3. Evaluate it with `python3 evals/run_evals.py`.
+4. Iterate until the runner passes and the score stays at 100.
+## What The Evals Cover
+- first-time initialization of an empty repository
+- frontend-aware repository analysis
+- execution-plan and knowledge-capture closure
+- preservation of unmanaged user-owned docs
+- local harness checks that do not require user-project CI
+Add a new eval case whenever a regression would be easy to miss by reading the files manually.

package/skills/harness-repo-bootstrap/references/exec-plans.md ADDED Viewed

@@ -0,0 +1,39 @@
+# Execution Plans
+Execution plans are required for multi-step work, risky changes, or tasks that need coordination across files.
+## When To Create One
+- more than one implementation step is required
+- validation is non-trivial
+- architecture, product, reliability, or security decisions are involved
+- work will span enough time that another agent may resume it later
+## Location
+- Active: `docs/exec-plans/active/`
+- Completed: `docs/exec-plans/completed/`
+## Minimum Sections
+- goal
+- scope
+- constraints
+- steps
+- validation
+- durable knowledge to capture
+- completion notes
+## Operating Rule
+Update the active plan during the work. When the work is done, move it to `completed` and leave behind any durable facts in the right permanent docs.
+## Closed Loop
+Use the script, not ad hoc manual edits, for the lifecycle:
+- `plan-start`: create a new active execution plan
+- `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id
+- `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; prefer `--id <knowledge-id> --evidence "<doc text>"`, and use `--append` only to append the exact fact first
+- `plan-close`: refuse to close cleanly until the listed knowledge items are marked as written to durable docs
+- `check`: run a local handoff check without requiring target-repo CI

package/skills/harness-repo-bootstrap/references/file-map.md ADDED Viewed

@@ -0,0 +1,17 @@
+# File Map
+- `AGENTS.md`: short router, reading order, repo-specific guardrails
+- `ARCHITECTURE.md`: domain boundaries, runtime topology, integration seams
+- `docs/PLANS.md`: plan lifecycle and storage rules
+- `docs/PRODUCT_SENSE.md`: product heuristics and tradeoff rules
+- `docs/QUALITY_SCORE.md`: quality rubric by domain and layer
+- `docs/RELIABILITY.md`: SLOs, failure modes, observability expectations
+- `docs/SECURITY.md`: security constraints, secrets, auth, data handling
+- `docs/DESIGN.md`: design principles and review heuristics
+- `docs/FRONTEND.md`: frontend stack conventions and validation loop
+- `docs/design-docs/`: durable design decisions
+- `docs/product-specs/`: durable product specs
+- `docs/exec-plans/`: active plans, completed plans, and tech debt tracker
+- `docs/sops/`: mechanical procedures for recurring workflows and validation loops
+- `docs/generated/`: generated facts such as schemas
+- `docs/references/`: external references rewritten or linked for model-friendly discovery

package/skills/harness-repo-bootstrap/references/knowledge-capture.md ADDED Viewed

@@ -0,0 +1,35 @@
+# Knowledge Capture
+Write durable knowledge into the repository whenever one of these is true:
+- the fact changed your implementation plan
+- the fact would likely be needed by another agent later
+- the fact came from a human answer rather than directly from code
+- the fact explains why a policy, architecture choice, or validation loop exists
+- the fact would be annoying to rediscover from scratch
+## Where To Write It
+- Product behavior or workflow intent: `docs/product-specs/`
+- Design rationale or UX rules: `docs/design-docs/`
+- Runtime validation, incidents, or observability loops: `docs/RELIABILITY.md` or `docs/sops/`
+- Security constraints or review gates: `docs/SECURITY.md`
+- Architecture boundaries or integration seams: `ARCHITECTURE.md`
+- Reusable external material: `docs/references/`
+## Minimum Rule
+If a useful fact would otherwise live only in chat, move it into the repo before closing the task.
+## Closed Loop
+Prefer the script workflow:
+1. Log the fact into the active execution plan with `knowledge-log`.
+2. Write the fact into its permanent destination doc.
+3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<text present in durable doc>"`.
+4. Close the plan with `plan-close`.
+`knowledge-log` returns a stable id. Prefer id-based closure so permanent docs can use concise, natural wording rather than duplicating the exact plan fact.
+`knowledge-mark-written` verifies that the destination file contains either the provided evidence text or, for legacy calls, the exact fact. Use `--append` only when the exact fact should be appended to the destination doc by the tool.

package/skills/harness-repo-bootstrap/references/question-catalog.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Question Catalog
+Use these prompts only when the repo analysis cannot answer them.
+## Product
+- What core user outcome does this repository serve?
+- Which flows matter enough to deserve explicit product specs first?
+- Which non-goals should the harness make visible?
+## Reliability
+- What failure is unacceptable in production?
+- What recovery time or uptime expectation matters most?
+- Which runtime environments must be validated locally before merge?
+## Security
+- Does the repo handle credentials, customer data, regulated data, or privileged actions?
+- Are there required review gates for authentication, authorization, or secrets handling?
+## Frontend
+- Is the product expected to have a polished user-facing interface, an internal tool UI, or no frontend?
+- Which browsers, devices, or accessibility expectations are non-negotiable?
+## References
+- Which external docs are worth copying into `docs/references/` because the team uses them repeatedly?

package/skills/harness-repo-bootstrap/references/sop-index.md ADDED Viewed

@@ -0,0 +1,10 @@
+# SOP Index
+Choose an SOP whenever the task touches one of these areas:
+- architecture or layering changes: `docs/sops/layered-domain-architecture-setup.md`
+- missing durable repository knowledge: `docs/sops/encode-unseen-knowledge.md`
+- runtime debugging or observability setup: `docs/sops/local-observability-feedback-loop.md`
+- user interface work: `docs/sops/chrome-devtools-ui-validation-loop.md`
+If no SOP exists for a recurring workflow, create one in `docs/sops/` as part of the task.

package/skills/harness-repo-bootstrap/references/template-policy.md ADDED Viewed

@@ -0,0 +1,12 @@
+# Template Policy
+Every generated file starts with a managed marker:
+`<!-- harness-repo-bootstrap:managed -->`
+Update behavior:
+- `init`: create missing files and skip existing files unless `--force`
+- `update`: create missing files, skip existing unmanaged files, and refresh managed files only when `--refresh-managed` or `--force` is passed
+If a file exists without the managed marker, treat it as user-owned unless the human explicitly asks to replace it.

package/skills/harness-repo-bootstrap/references/workflow.md ADDED Viewed

@@ -0,0 +1,47 @@
+# Workflow
+Use this skill in two passes.
+## Pass 1: Analyze and Confirm
+Run `analyze` before editing repository docs.
+Ask the human only about facts that cannot be derived safely from the repo, especially:
+- product domain and top-level outcomes
+- intended users or operators
+- production reliability expectations
+- security or compliance constraints
+- frontend experience bar
+- canonical external references worth pinning inside `docs/references/`
+Do not ask for facts that can be inferred from source layout, dependency manifests, or existing docs.
+Also inspect the analysis for:
+- missing durable knowledge that should be written during the task
+- missing execution-plan state
+- which SOPs should be referenced in the generated router docs
+## Pass 2: Scaffold or Refresh
+Run `sample-answers`, fill the answers, then run `init` or `update`.
+Use `init` for first-time adoption.
+Use `update` to add missing managed files; it refreshes existing managed files only when `--refresh-managed` or `--force` is passed.
+After the script runs, read the generated docs once and tighten weak generic phrases before handing off.
+## Ongoing Use
+After the scaffold exists:
+- create an execution plan before multi-step work
+- use `plan-start` instead of creating plan files manually when possible
+- log durable facts during execution instead of waiting until the end
+- follow the matching SOP for architecture, UI, observability, or knowledge capture work
+- encode durable knowledge back into the repository before closing the task
+- mark logged knowledge items as written after updating the permanent docs
+- use `plan-close` to verify no durable knowledge is left stranded in the active plan
+- run `.codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check --repo <target-repo>` before handoff
+- do not add CI to the target repository unless the human explicitly asks for it