@hallucination-studio/harness-engine 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +185 -27
- package/bin/install.js +29 -17
- package/package.json +10 -4
- package/skills/harness-engine/SKILL.md +97 -0
- package/skills/harness-engine/agents/openai.yaml +4 -0
- package/skills/harness-engine/evals/cases.json +94 -0
- package/skills/harness-engine/evals/harness_engine_evals/__init__.py +1 -0
- package/skills/harness-engine/evals/harness_engine_evals/cases_frontend.py +211 -0
- package/skills/harness-engine/evals/harness_engine_evals/cases_lifecycle.py +1616 -0
- package/skills/harness-engine/evals/harness_engine_evals/helpers.py +155 -0
- package/skills/harness-engine/evals/harness_engine_evals/registry.py +55 -0
- package/skills/harness-engine/evals/harness_engine_evals/report.py +36 -0
- package/skills/harness-engine/evals/harness_engine_evals/runner.py +53 -0
- package/skills/harness-engine/evals/run_evals.py +14 -0
- package/skills/{harness-repo-bootstrap → harness-engine}/references/evaluation-loop.md +8 -2
- package/skills/harness-engine/references/evidence-first-evals.md +187 -0
- package/skills/harness-engine/references/exec-plans.md +59 -0
- package/skills/{harness-repo-bootstrap → harness-engine}/references/file-map.md +3 -3
- package/skills/{harness-repo-bootstrap → harness-engine}/references/knowledge-capture.md +2 -2
- package/skills/{harness-repo-bootstrap → harness-engine}/references/sop-index.md +3 -0
- package/skills/harness-engine/references/template-policy.md +17 -0
- package/skills/harness-engine/references/workflow.md +62 -0
- package/skills/harness-engine/scripts/harness_engine/__init__.py +1 -0
- package/skills/harness-engine/scripts/harness_engine/analysis.py +240 -0
- package/skills/harness-engine/scripts/harness_engine/checks.py +287 -0
- package/skills/harness-engine/scripts/harness_engine/cli.py +656 -0
- package/skills/harness-engine/scripts/harness_engine/common.py +977 -0
- package/skills/harness-engine/scripts/harness_engine/continuation.py +520 -0
- package/skills/harness-engine/scripts/harness_engine/git_ops.py +88 -0
- package/skills/harness-engine/scripts/harness_engine/knowledge.py +329 -0
- package/skills/harness-engine/scripts/harness_engine/plans.py +630 -0
- package/skills/harness-engine/scripts/harness_engine/templates.py +124 -0
- package/skills/harness-engine/scripts/manage_harness.py +14 -0
- package/skills/harness-repo-bootstrap/SKILL.md +0 -68
- package/skills/harness-repo-bootstrap/agents/openai.yaml +0 -4
- package/skills/harness-repo-bootstrap/evals/cases.json +0 -18
- package/skills/harness-repo-bootstrap/evals/run_evals.py +0 -337
- package/skills/harness-repo-bootstrap/references/exec-plans.md +0 -39
- package/skills/harness-repo-bootstrap/references/template-policy.md +0 -12
- package/skills/harness-repo-bootstrap/references/workflow.md +0 -47
- package/skills/harness-repo-bootstrap/scripts/manage_harness.py +0 -1181
- /package/skills/{harness-repo-bootstrap → harness-engine}/assets/repo-template/.keep +0 -0
- /package/skills/{harness-repo-bootstrap → harness-engine}/assets/sops/.keep +0 -0
- /package/skills/{harness-repo-bootstrap → harness-engine}/references/question-catalog.md +0 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from .common import *
|
|
2
|
+
|
|
3
|
+
def make_default_answers(analysis):
    """Build the placeholder answers used when the human has not supplied any.

    Args:
        analysis: Mapping that must provide ``project_name``, ``frameworks``
            (an iterable of strings) and ``has_frontend``; the optional
            ``frontend_style_files`` entry lists detected style files.

    Returns:
        A new dict mapping every answer key to its default text.  The
        frontend-related entries depend on ``analysis["has_frontend"]``.
    """
    name = analysis["project_name"]
    framework_list = ", ".join(analysis["frameworks"]) or "Unknown"

    detected_styles = analysis.get("frontend_style_files") or []
    if detected_styles:
        style_notes = ", ".join(detected_styles)
    else:
        style_notes = "No shared style, theme, token, or component style files detected yet."

    # All four frontend-dependent defaults switch on the same flag, so they
    # are resolved together in one branch.
    if analysis["has_frontend"]:
        scope = "User-facing or operator-facing frontend work is expected."
        validation_loop = "\n".join(
            (
                "- Run local UI changes in a browser.",
                "- Check desktop and mobile layouts when relevant.",
                "- Verify key flows, empty states, and failure states.",
                "- Record reusable UI findings in `docs/design-docs/`.",
            )
        )
        stack_notes = (
            f"Detected frameworks: {framework_list}. "
            "Describe UX expectations, supported environments, and review rules."
        )
        style_direction = (
            "Describe the concrete visual direction before major UI work: "
            "reference point, mood, density, palette, typography, component shape, and hard don'ts."
        )
    else:
        scope = "No clear frontend surface was detected yet. Update this if a UI emerges."
        validation_loop = "\n".join(
            (
                "- Validate interface changes in the relevant local runtime.",
                "- Verify key flows, empty states, failure states, and cleanup behavior where applicable.",
                "- Record reusable interface findings in `docs/design-docs/`.",
            )
        )
        stack_notes = "No frontend detected. Replace this if the repo includes UI work."
        style_direction = "No frontend detected."

    return {
        "project_name": name,
        "project_summary": f"Summarize the main outcome that {name} should deliver.",
        "primary_users": "Describe the primary users, operators, or internal teams.",
        "deployment_targets": "Describe the main runtime or deployment targets.",
        "product_domain": "Describe the product domain in one line.",
        "reliability_targets": "Describe uptime, failure tolerance, recovery expectations, and required validation loops.",
        "security_constraints": "Describe auth, secrets, compliance, sensitive data, and review constraints.",
        "frontend_stack_notes": stack_notes,
        "design_style_direction": style_direction,
        "existing_frontend_style_notes": style_notes,
        "quality_focus": "List the product areas and architectural layers that deserve the strictest quality bar.",
        "frontend_scope": scope,
        "frontend_validation_loop": validation_loop,
    }
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fill_template(template, answers, analysis):
    """Render ``template`` with defaults, human answers and detected facts.

    Values are layered in increasing priority: the defaults derived from
    ``analysis``, then ``answers``, then the managed marker and the detected
    language / package-manager / framework summaries, which always win.

    Args:
        template: A ``str.format``-style template string.
        answers: Mapping of answer keys to text; overrides the defaults.
        analysis: Mapping providing ``languages``, ``package_managers`` and
            ``frameworks`` (plus whatever the default answers read).

    Returns:
        The formatted template text.
    """
    values = make_default_answers(analysis)
    values.update(answers)
    values["marker"] = MANAGED_MARKER
    # Detected facts are written last so that answers can never mask them.
    for field in ("languages", "package_managers", "frameworks"):
        values[field] = ", ".join(analysis[field]) or "Unknown"
    return template.format(**values)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def ensure_parent(path):
    """Create the directory that will contain ``path``, including ancestors.

    Does nothing when the directory already exists.
    """
    parent_dir = path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def is_managed_text(text):
    """Return True when ``text`` carries the current managed-file signature.

    Two signatures are accepted: the text begins with ``MANAGED_MARKER``, or
    it begins with ``---`` and the ``source: harness-engine-template`` line
    appears within its first 500 characters.
    """
    if text.startswith(MANAGED_MARKER):
        return True
    if not text.startswith("---"):
        return False
    head = text[:500]
    return "\nsource: harness-engine-template\n" in head
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_obsolete_managed_text(text):
    """Return True when ``text`` begins with any entry of ``OBSOLETE_MANAGED_MARKERS``."""
    for obsolete_marker in OBSOLETE_MANAGED_MARKERS:
        if text.startswith(obsolete_marker):
            return True
    return False
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def is_harness_owned_text(text):
    """Return True when ``text`` matches a current or an obsolete managed signature."""
    if is_managed_text(text):
        return True
    return is_obsolete_managed_text(text)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def should_write(path, refresh_managed, force):
    """Decide whether the scaffold is allowed to write ``path``.

    Args:
        path: ``pathlib.Path`` of the candidate target file.
        refresh_managed: When true, existing harness-owned files may be
            rewritten.
        force: When true, any existing file may be rewritten.

    Returns:
        ``True`` when the path does not exist, when ``force`` is set, or when
        ``refresh_managed`` is set and the existing content is harness-owned.
        ``False`` otherwise, including when the existing content cannot be
        read or decoded as text.
    """
    if not path.exists():
        return True
    if force:
        return True
    if not refresh_managed:
        # Ownership only matters for a refresh; without it the answer is
        # already "no", so the file is never read.
        return False
    try:
        text = path.read_text()
    except (UnicodeDecodeError, OSError):
        # Content that cannot be read as text (binary data, a directory at
        # the target path, a permission problem) cannot be proven to be
        # harness-owned, so it is left untouched.
        return False
    return is_harness_owned_text(text)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def write_scaffold(repo, analysis, answers, refresh_managed=False, force=False):
    """Render the scaffold templates into ``repo`` and report what happened.

    The template set is ``ROOT_FILES`` plus ``DOC_FILES``, extended with
    ``FRONTEND_DOC_FILES`` when ``analysis["has_frontend"]`` is truthy.  Each
    target is written only when ``should_write`` allows it.

    Args:
        repo: ``pathlib.Path`` of the repository root.
        analysis: Analysis mapping passed through to ``fill_template``.
        answers: Answer mapping passed through to ``fill_template``.
        refresh_managed: Forwarded to ``should_write``.
        force: Forwarded to ``should_write``.

    Returns:
        A ``(written, skipped, created, refreshed)`` tuple of relative-path
        lists.  ``written`` is every file written; ``created`` and
        ``refreshed`` split it by whether the target existed beforehand.
    """
    templates = dict(ROOT_FILES)
    templates.update(DOC_FILES)
    if analysis["has_frontend"]:
        templates.update(FRONTEND_DOC_FILES)

    written, skipped, created, refreshed = [], [], [], []
    for relative_path, template in templates.items():
        target = repo / relative_path
        # Captured before writing so the result can tell new from refreshed.
        already_present = target.exists()
        if not should_write(target, refresh_managed, force):
            skipped.append(relative_path)
            continue
        ensure_parent(target)
        rendered = fill_template(template, answers, analysis)
        target.write_text(rendered)
        written.append(relative_path)
        bucket = refreshed if already_present else created
        bucket.append(relative_path)
    return written, skipped, created, refreshed
|
|
123
|
+
|
|
124
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Launcher that runs ``harness_engine.cli.main`` from the script's own directory."""

import sys
from pathlib import Path

# Absolute directory containing this script.
SCRIPT_DIR = Path(__file__).resolve().parent

# Put that directory at the front of the import path (once) so the
# `harness_engine` import below resolves relative to this script.
_script_dir_entry = str(SCRIPT_DIR)
if _script_dir_entry not in sys.path:
    sys.path.insert(0, _script_dir_entry)

from harness_engine.cli import main  # must come after the sys.path setup


if __name__ == "__main__":
    main()
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: harness-repo-bootstrap
|
|
3
|
-
description: Bootstrap or refresh an advanced harness-engineering repository shape for Codex-driven projects. Use when Codex needs to analyze a repository, ask the human to confirm high-impact product and architecture facts, and then create or update AGENTS.md, architecture docs, policy docs, plan folders, reference folders, and SOP-backed starter files for the repository.
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
# Harness Repo Bootstrap
|
|
7
|
-
|
|
8
|
-
Run the packaged script to inspect the target repository before editing files. Use the generated analysis to decide what to ask the human, what durable knowledge is missing from the repo, and which execution-plan and SOP files must be created or updated.
|
|
9
|
-
|
|
10
|
-
## Workflow
|
|
11
|
-
|
|
12
|
-
1. Run `python3 scripts/manage_harness.py analyze --repo <target-repo> --output <analysis.json>`.
|
|
13
|
-
2. Read `analysis.json`.
|
|
14
|
-
3. Ask the human only the unresolved, high-impact questions from `human_confirmations`.
|
|
15
|
-
4. Run `python3 scripts/manage_harness.py sample-answers --analysis <analysis.json> --output <answers.json>`.
|
|
16
|
-
5. Fill the placeholders in `answers.json` from the repository and the human's confirmed answers.
|
|
17
|
-
6. Run one of:
|
|
18
|
-
- `python3 scripts/manage_harness.py init --repo <target-repo> --answers <answers.json>`
|
|
19
|
-
- `python3 scripts/manage_harness.py update --repo <target-repo> --answers <answers.json>`
|
|
20
|
-
7. If the task is multi-step, run `python3 scripts/manage_harness.py plan-start --repo <target-repo> --slug <task-name> --goal "<goal>"`.
|
|
21
|
-
8. If you learn durable facts during the work, run `python3 scripts/manage_harness.py knowledge-log --repo <target-repo> --plan <plan-file> --fact "<fact>" --destination <durable-doc>` and keep the returned `id`.
|
|
22
|
-
9. Before closing the task, write those facts into their durable docs.
|
|
23
|
-
10. Run `python3 scripts/manage_harness.py knowledge-mark-written --repo <target-repo> --plan <plan-file> --id <knowledge-id> --evidence "<text already in durable doc>"`; use `--append` only when the exact fact should be appended mechanically.
|
|
24
|
-
11. Close the plan with `python3 scripts/manage_harness.py plan-close --repo <target-repo> --plan <plan-file> --summary "<summary>"`.
|
|
25
|
-
12. Before handoff, run `python3 .codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check --repo <target-repo>` from an installed target repository.
|
|
26
|
-
13. After changing this skill, run `python3 evals/run_evals.py` and iterate until it passes.
|
|
27
|
-
|
|
28
|
-
## Reading Order
|
|
29
|
-
|
|
30
|
-
- Read [references/workflow.md](references/workflow.md) first for the operating model and question policy.
|
|
31
|
-
- Read [references/file-map.md](references/file-map.md) when deciding which generated file to update.
|
|
32
|
-
- Read [references/question-catalog.md](references/question-catalog.md) when the analysis surfaces ambiguous product, security, reliability, or frontend facts.
|
|
33
|
-
- Read [references/knowledge-capture.md](references/knowledge-capture.md) when you discover facts that should survive chat history.
|
|
34
|
-
- Read [references/exec-plans.md](references/exec-plans.md) before planning or updating any multi-step work.
|
|
35
|
-
- Read [references/sop-index.md](references/sop-index.md) to choose the right SOP for architecture, UI validation, observability, or knowledge capture work.
|
|
36
|
-
- Read [references/template-policy.md](references/template-policy.md) before overwriting existing files.
|
|
37
|
-
- Read [references/evaluation-loop.md](references/evaluation-loop.md) before changing the skill, templates, scripts, or policy references.
|
|
38
|
-
|
|
39
|
-
## Command Rules
|
|
40
|
-
|
|
41
|
-
- Prefer `analyze` before `init` or `update`.
|
|
42
|
-
- Prefer the draft, test, evaluate, iterate loop for changes to this skill.
|
|
43
|
-
- Prefer `init` when the target repo has none of the managed files.
|
|
44
|
-
- Prefer `update` when the repo already contains any managed file or a partial harness layout.
|
|
45
|
-
- Do not overwrite existing files unless the human asked for it or you pass `--force`.
|
|
46
|
-
- Treat the generated files as starting points. After generation, tighten them with repository-specific details instead of leaving placeholders behind.
|
|
47
|
-
- Treat `docs/exec-plans/` as required state for multi-step work, not optional notes.
|
|
48
|
-
- Treat `docs/sops/` as mechanical operating procedures, not background reading.
|
|
49
|
-
- When you answer a question using facts that are not yet in the repo but should be reusable, write them into a durable doc before finishing.
|
|
50
|
-
- Prefer `knowledge-mark-written --id ... --evidence ...` so durable docs can use natural wording instead of duplicated exact fact strings.
|
|
51
|
-
- Use `plan-close` as the final guardrail so plan state and durable docs stay synchronized.
|
|
52
|
-
- Use `check` as the local handoff guardrail for user repositories.
|
|
53
|
-
- Run `python3 evals/run_evals.py` after skill changes and treat failures as iteration input.
|
|
54
|
-
- Do not add CI to user repositories unless the human explicitly asks for it.
|
|
55
|
-
|
|
56
|
-
## Output Rules
|
|
57
|
-
|
|
58
|
-
- Keep `AGENTS.md` short and routing-oriented.
|
|
59
|
-
- Keep durable knowledge in repo docs, not in chat-only explanations.
|
|
60
|
-
- Keep plans under `docs/exec-plans/active/` and move finished plans to `docs/exec-plans/completed/`.
|
|
61
|
-
- Keep generated material under `docs/generated/`.
|
|
62
|
-
- Keep external, model-friendly references under `docs/references/`.
|
|
63
|
-
- Keep SOPs explicit and task-triggered so the next agent can follow the same path mechanically.
|
|
64
|
-
|
|
65
|
-
## Assets
|
|
66
|
-
|
|
67
|
-
- Scaffold templates live under [assets/repo-template](assets/repo-template).
|
|
68
|
-
- SOP starter docs live under [assets/sops](assets/sops).
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"id": "empty-repo-init",
|
|
4
|
-
"description": "Empty repositories should receive the full advanced harness scaffold."
|
|
5
|
-
},
|
|
6
|
-
{
|
|
7
|
-
"id": "frontend-analysis",
|
|
8
|
-
"description": "Frontend repositories should trigger frontend-specific confirmation and policy output."
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "closed-loop-plan",
|
|
12
|
-
"description": "Execution plans should refuse to close until durable knowledge is written back."
|
|
13
|
-
},
|
|
14
|
-
{
|
|
15
|
-
"id": "preserve-unmanaged-docs",
|
|
16
|
-
"description": "Existing user-owned harness files should be skipped unless explicitly forced."
|
|
17
|
-
}
|
|
18
|
-
]
|
|
@@ -1,337 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import subprocess
|
|
5
|
-
import sys
|
|
6
|
-
import tempfile
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
|
|
9
|
-
# Directory one level above the folder that contains this file.
SKILL_DIR = Path(__file__).resolve().parent.parent
# Path of the manager script that the evals invoke as a subprocess.
MANAGER = SKILL_DIR.joinpath("scripts", "manage_harness.py")
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def run_manager(*args, expect_success=True):
    """Run the manager script with ``args`` and return its parsed JSON output.

    Args:
        *args: Command-line arguments appended after the script path.
        expect_success: Whether a zero exit status is expected.

    Returns:
        The JSON document decoded from stdout, or ``{}`` when stdout is
        empty or whitespace only.

    Raises:
        AssertionError: If the exit status contradicts ``expect_success``.
            For an unexpected failure the message is stderr, falling back
            to stdout.
    """
    command = [sys.executable, str(MANAGER)]
    command.extend(args)
    completed = subprocess.run(command, check=False, capture_output=True, text=True)

    failed = completed.returncode != 0
    if expect_success and failed:
        raise AssertionError(completed.stderr or completed.stdout)
    if not (expect_success or failed):
        raise AssertionError("Command succeeded unexpectedly")

    output = completed.stdout
    return json.loads(output) if output.strip() else {}
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def write_answers(path, project_name="demo"):
    """Write a fixed answers document to ``path`` as indented JSON.

    Args:
        path: ``pathlib.Path`` of the file to create or overwrite.
        project_name: Value stored under the ``project_name`` key.
    """
    payload = dict(
        project_name=project_name,
        project_summary="A developer tooling project used to install and maintain Codex harness docs.",
        primary_users="Codex users and maintainers",
        deployment_targets="npm package and local repositories",
        product_domain="developer tooling",
        reliability_targets="Repeatable local commands and safe update behavior",
        security_constraints="Do not write secrets or overwrite user-owned docs without consent",
        frontend_stack_notes="Frontend changes require browser validation when a UI is detected",
        quality_focus="installer behavior, generated docs, plan closure, and knowledge capture",
        frontend_scope="No frontend unless one is detected by analysis",
    )
    serialized = json.dumps(payload, indent=2)
    path.write_text(f"{serialized}\n")
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def assert_exists(repo, relative_path):
    """Raise ``AssertionError`` unless ``repo / relative_path`` exists."""
    if (repo / relative_path).exists():
        return
    raise AssertionError(f"Expected {relative_path} to exist")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def assert_contains(repo, relative_path, needle):
    """Raise ``AssertionError`` unless the file's text contains ``needle``."""
    haystack = (repo / relative_path).read_text()
    if needle in haystack:
        return
    raise AssertionError(f"Expected {relative_path} to contain {needle!r}")
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def test_empty_repo_init(tmp_root):
    """Check that `analyze` recommends init for an empty repo and `init` scaffolds it."""
    repo = tmp_root / "empty-repo"
    repo.mkdir()
    answers = tmp_root / "answers.json"
    write_answers(answers)

    analysis = run_manager("analyze", "--repo", str(repo))
    if analysis["recommended_action"] != "init":
        raise AssertionError("Empty repo should recommend init")
    # Both fields must be truthy for a repository with no harness state.
    required_gaps = (
        ("missing_exec_plan_state", "Analysis should report missing exec-plan state"),
        ("missing_sops", "Analysis should report missing SOPs"),
    )
    for field, message in required_gaps:
        if not analysis[field]:
            raise AssertionError(message)

    run_manager("init", "--repo", str(repo), "--answers", str(answers))

    expected_files = (
        "AGENTS.md",
        "ARCHITECTURE.md",
        "docs/PLANS.md",
        "docs/QUALITY_SCORE.md",
        "docs/exec-plans/active/_template.md",
        "docs/exec-plans/completed/README.md",
        "docs/sops/encode-unseen-knowledge.md",
    )
    for relative_path in expected_files:
        assert_exists(repo, relative_path)

    router_needles = (
        "docs/exec-plans/active/",
        "docs/sops/",
        ".codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check",
    )
    for needle in router_needles:
        assert_contains(repo, "AGENTS.md", needle)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def test_frontend_analysis(tmp_root):
    """Check that a React/Vite repo is analysed as a frontend project."""
    repo = tmp_root / "frontend-repo"
    repo.mkdir()

    manifest = {"dependencies": {"react": "^19.0.0", "vite": "^6.0.0"}}
    (repo / "package.json").write_text(json.dumps(manifest, indent=2) + "\n")

    source_dir = repo / "src"
    source_dir.mkdir()
    (source_dir / "App.tsx").write_text("export default function App() { return null; }\n")

    analysis = run_manager("analyze", "--repo", str(repo))
    question_ids = set()
    for confirmation in analysis["human_confirmations"]:
        question_ids.add(confirmation["id"])

    if not analysis["has_frontend"]:
        raise AssertionError("Frontend repo should be detected")
    if "frontend_stack_notes" not in question_ids:
        raise AssertionError("Frontend repo should ask frontend confirmation questions")
    if "React" not in analysis["frameworks"]:
        raise AssertionError("React should be detected")
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def test_closed_loop_plan(tmp_root):
    """Exercise the plan / knowledge lifecycle commands end to end.

    Covers three flows in order: fact-based closure (including the expected
    failures before the fact is written), marking a fact in a hand-written
    plan file, and id/evidence-based closure.
    """
    repo = tmp_root / "loop-repo"
    repo.mkdir()
    (repo / "snake.sh").write_text("#!/usr/bin/env bash\nprintf 'snake\\n'\n")
    skill_scripts = repo / ".codex" / "skills" / "demo" / "scripts"
    skill_scripts.mkdir(parents=True)
    (skill_scripts / "tool.py").write_text("print('ignore me')\n")
    answers = tmp_root / "loop-answers.json"
    write_answers(answers, project_name="loop-demo")

    def manage(command, *extra, **options):
        # Every command in this test targets the same repository.
        return run_manager(command, "--repo", str(repo), *extra, **options)

    def relative_to_repo(path):
        return str(path.resolve().relative_to(repo.resolve()))

    architecture_doc = repo / "ARCHITECTURE.md"

    analysis = manage("analyze")
    if "Shell" not in analysis["languages"]:
        raise AssertionError("Shell should be detected from target project files")
    if "Python" in analysis["languages"]:
        raise AssertionError(".codex skill files should not affect target project language detection")
    manage("init", "--answers", str(answers))

    # --- Flow 1: fact-based closure -------------------------------------
    plan_result = manage(
        "plan-start", "--slug", "knowledge-loop", "--goal", "Validate durable knowledge closure"
    )
    plan_path = Path(plan_result["plan"])
    relative_plan = relative_to_repo(plan_path)
    fact = "Install mode must distinguish local and global skill destinations"
    fact_args = ("--plan", relative_plan, "--fact", fact, "--destination", "docs/PRODUCT_SENSE.md")

    manage("knowledge-log", *fact_args)
    # Closing must fail while the logged fact is still unwritten.
    manage("plan-close", "--plan", relative_plan, "--summary", "done", expect_success=False)
    # Marking must fail while the destination does not contain the fact.
    manage("knowledge-mark-written", *fact_args, expect_success=False)
    manage("knowledge-mark-written", *fact_args, "--append")
    assert_contains(repo, "docs/PRODUCT_SENSE.md", fact)

    close_result = manage(
        "plan-close",
        "--plan",
        relative_plan,
        "--summary",
        "Closed after writing durable knowledge.",
    )
    if close_result["status"] != "closed":
        raise AssertionError("Plan should close after knowledge is marked written")
    if plan_path.exists():
        raise AssertionError("Active plan should be moved after close")
    assert_exists(repo, "docs/exec-plans/completed/" + plan_path.name)
    check_result = manage("check")
    if check_result["status"] != "pass":
        raise AssertionError("Harness check should pass after plan closure")

    # --- Flow 2: fact inside a hand-formatted plan ----------------------
    formatted_plan = create_formatted_plan(repo)
    formatted_relative_plan = relative_to_repo(formatted_plan)
    formatted_fact = "snake.sh is the single runtime entrypoint and owns terminal control directly with stty and tput"
    with architecture_doc.open("a") as handle:
        handle.write("\n`snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`.\n")
    manage(
        "knowledge-mark-written",
        "--plan",
        formatted_relative_plan,
        "--fact",
        formatted_fact,
        "--destination",
        "ARCHITECTURE.md",
    )

    # --- Flow 3: id/evidence-based closure ------------------------------
    id_plan_result = manage(
        "plan-start",
        "--slug",
        "id-knowledge-loop",
        "--goal",
        "Validate id-based durable knowledge closure",
    )
    id_plan_path = Path(id_plan_result["plan"])
    id_relative_plan = relative_to_repo(id_plan_path)
    id_fact = "Runtime input is owned by the terminal runner and core game logic remains independent of terminal packages"
    log_result = manage(
        "knowledge-log",
        "--plan",
        id_relative_plan,
        "--fact",
        id_fact,
        "--destination",
        "ARCHITECTURE.md",
    )
    with architecture_doc.open("a") as handle:
        handle.write(
            "\nThe `main` package owns keyboard input and rendering, while `game` contains pure state transitions.\n"
        )
    manage(
        "knowledge-mark-written",
        "--plan",
        id_relative_plan,
        "--id",
        log_result["id"],
        "--evidence",
        "main package owns keyboard input and rendering",
    )
    plan_text = id_plan_path.read_text()
    if id_fact in architecture_doc.read_text():
        raise AssertionError("Id/evidence closure should not require appending the exact fact to the destination")
    if "| evidence: main package owns keyboard input and rendering" not in plan_text:
        raise AssertionError("Closed knowledge item should record the verification evidence")
    manage(
        "plan-close",
        "--plan",
        id_relative_plan,
        "--summary",
        "Closed with id-based evidence.",
    )
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def create_formatted_plan(repo):
    """Write a minimal hand-formatted active plan and return its path.

    The plan holds one unchecked knowledge item whose fact text uses
    backticks.  The ``docs/exec-plans/active`` directory must already exist.
    """
    active_dir = repo / "docs" / "exec-plans" / "active"
    plan_path = active_dir / "formatted-plan.md"
    plan_lines = (
        "# Execution Plan: Formatted Plan",
        "",
        "## Durable Knowledge To Capture",
        "",
        "- [ ] `snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`. -> `ARCHITECTURE.md`",
    )
    plan_path.write_text("\n".join(plan_lines) + "\n")
    return plan_path
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
def test_preserve_unmanaged_docs(tmp_root):
    """Check that `init` skips a pre-existing user AGENTS.md but still scaffolds the rest."""
    repo = tmp_root / "partial-repo"
    repo.mkdir()
    user_router = repo / "AGENTS.md"
    user_router.write_text("# Existing user router\n\nKeep this custom content.\n")
    answers = tmp_root / "partial-answers.json"
    write_answers(answers)

    init_report = run_manager("init", "--repo", str(repo), "--answers", str(answers))
    skipped_paths = init_report["skipped"]
    if "AGENTS.md" not in skipped_paths:
        raise AssertionError("Unmanaged AGENTS.md should be skipped")
    assert_contains(repo, "AGENTS.md", "Keep this custom content.")
    assert_exists(repo, "docs/PLANS.md")
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
# Ordered (eval id, eval function) pairs executed by main().
EVALS = [
    ("empty-repo-init", test_empty_repo_init),
    ("frontend-analysis", test_frontend_analysis),
    ("closed-loop-plan", test_closed_loop_plan),
    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
]
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
def main():
    """Run every entry in ``EVALS`` and print a JSON score report.

    Each eval receives the same temporary directory.  Any exception marks
    that eval as failed and its message is recorded.  The process exits with
    status 1 when at least one eval failed.
    """
    outcomes = []
    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)
        for case_id, case in EVALS:
            try:
                case(scratch_root)
            except Exception as failure:
                outcomes.append({"id": case_id, "status": "fail", "error": str(failure)})
            else:
                outcomes.append({"id": case_id, "status": "pass"})

    statuses = [outcome["status"] for outcome in outcomes]
    passed = statuses.count("pass")
    total = len(outcomes)
    report = {
        "score": round((passed / total) * 100),
        "passed": passed,
        "total": total,
        "results": outcomes,
    }
    print(json.dumps(report, indent=2) + "\n")
    if passed != total:
        sys.exit(1)
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
if __name__ == "__main__":
|
|
337
|
-
main()
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
# Execution Plans
|
|
2
|
-
|
|
3
|
-
Execution plans are required for multi-step work, risky changes, or tasks that need coordination across files.
|
|
4
|
-
|
|
5
|
-
## When To Create One
|
|
6
|
-
|
|
7
|
-
- more than one implementation step is required
|
|
8
|
-
- validation is non-trivial
|
|
9
|
-
- architecture, product, reliability, or security decisions are involved
|
|
10
|
-
- work will span enough time that another agent may resume it later
|
|
11
|
-
|
|
12
|
-
## Location
|
|
13
|
-
|
|
14
|
-
- Active: `docs/exec-plans/active/`
|
|
15
|
-
- Completed: `docs/exec-plans/completed/`
|
|
16
|
-
|
|
17
|
-
## Minimum Sections
|
|
18
|
-
|
|
19
|
-
- goal
|
|
20
|
-
- scope
|
|
21
|
-
- constraints
|
|
22
|
-
- steps
|
|
23
|
-
- validation
|
|
24
|
-
- durable knowledge to capture
|
|
25
|
-
- completion notes
|
|
26
|
-
|
|
27
|
-
## Operating Rule
|
|
28
|
-
|
|
29
|
-
Update the active plan during the work. When the work is done, move it to `completed` and leave behind any durable facts in the right permanent docs.
|
|
30
|
-
|
|
31
|
-
## Closed Loop
|
|
32
|
-
|
|
33
|
-
Use the script, not ad hoc manual edits, for the lifecycle:
|
|
34
|
-
|
|
35
|
-
- `plan-start`: create a new active execution plan
|
|
36
|
-
- `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id
|
|
37
|
-
- `knowledge-mark-written`: verify and mark a logged fact as written into its permanent doc; prefer `--id <knowledge-id> --evidence "<doc text>"`, and use `--append` only to append the exact fact first
|
|
38
|
-
- `plan-close`: refuse to close cleanly until the listed knowledge items are marked as written to durable docs
|
|
39
|
-
- `check`: run a local handoff check without requiring target-repo CI
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
# Template Policy
|
|
2
|
-
|
|
3
|
-
Every generated file starts with a managed marker:
|
|
4
|
-
|
|
5
|
-
`<!-- harness-repo-bootstrap:managed -->`
|
|
6
|
-
|
|
7
|
-
Update behavior:
|
|
8
|
-
|
|
9
|
-
- `init`: create missing files and skip existing files unless `--force`
|
|
10
|
-
- `update`: create missing files, skip existing unmanaged files, and refresh managed files only when `--refresh-managed` or `--force` is passed
|
|
11
|
-
|
|
12
|
-
If a file exists without the managed marker, treat it as user-owned unless the human explicitly asks to replace it.
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
# Workflow
|
|
2
|
-
|
|
3
|
-
Use this skill in two passes.
|
|
4
|
-
|
|
5
|
-
## Pass 1: Analyze and Confirm
|
|
6
|
-
|
|
7
|
-
Run `analyze` before editing repository docs.
|
|
8
|
-
|
|
9
|
-
Ask the human only about facts that cannot be derived safely from the repo, especially:
|
|
10
|
-
|
|
11
|
-
- product domain and top-level outcomes
|
|
12
|
-
- intended users or operators
|
|
13
|
-
- production reliability expectations
|
|
14
|
-
- security or compliance constraints
|
|
15
|
-
- frontend experience bar
|
|
16
|
-
- canonical external references worth pinning inside `docs/references/`
|
|
17
|
-
|
|
18
|
-
Do not ask for facts that can be inferred from source layout, dependency manifests, or existing docs.
|
|
19
|
-
|
|
20
|
-
Also inspect the analysis for:
|
|
21
|
-
|
|
22
|
-
- missing durable knowledge that should be written during the task
|
|
23
|
-
- missing execution-plan state
|
|
24
|
-
- which SOPs should be referenced in the generated router docs
|
|
25
|
-
|
|
26
|
-
## Pass 2: Scaffold or Refresh
|
|
27
|
-
|
|
28
|
-
Run `sample-answers`, fill the answers, then run `init` or `update`.
|
|
29
|
-
|
|
30
|
-
Use `init` for first-time adoption.
|
|
31
|
-
Use `update` to add missing managed files or refresh managed files when `--refresh-managed` is passed.
|
|
32
|
-
|
|
33
|
-
After the script runs, read the generated docs once and tighten weak generic phrases before handing off.
|
|
34
|
-
|
|
35
|
-
## Ongoing Use
|
|
36
|
-
|
|
37
|
-
After the scaffold exists:
|
|
38
|
-
|
|
39
|
-
- create an execution plan before multi-step work
|
|
40
|
-
- use `plan-start` instead of creating plan files manually when possible
|
|
41
|
-
- log durable facts during execution instead of waiting until the end
|
|
42
|
-
- follow the matching SOP for architecture, UI, observability, or knowledge capture work
|
|
43
|
-
- encode durable knowledge back into the repository before closing the task
|
|
44
|
-
- mark logged knowledge items as written after updating the permanent docs
|
|
45
|
-
- use `plan-close` to verify no durable knowledge is left stranded in the active plan
|
|
46
|
-
- run `.codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check --repo <target-repo>` before handoff
|
|
47
|
-
- do not add CI to the target repository unless the human explicitly asks for it
|