@hallucination-studio/harness-engine 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import json
4
+ import subprocess
5
+ import sys
6
+ import tempfile
7
+ from pathlib import Path
8
+
9
# Skill root: one directory above the folder that contains this script.
SKILL_DIR = Path(__file__).resolve().parents[1]
# Harness manager CLI script that run_manager() launches in a subprocess.
MANAGER = SKILL_DIR.joinpath("scripts", "manage_harness.py")
11
+
12
+
13
def run_manager(*args, expect_success=True):
    """Run the harness manager script with ``args`` and return its parsed output.

    The script at ``MANAGER`` is launched with the current interpreter and its
    stdout/stderr are captured as text.

    Args:
        *args: Command-line arguments forwarded to the manager script.
        expect_success: When true, a non-zero exit code is an error; when
            false, a zero exit code is an error.

    Returns:
        The JSON-decoded stdout, or an empty dict when stdout is blank.

    Raises:
        AssertionError: If the exit code contradicts ``expect_success``.
    """
    command = [sys.executable, str(MANAGER)]
    command.extend(args)
    completed = subprocess.run(command, text=True, capture_output=True, check=False)

    succeeded = completed.returncode == 0
    if expect_success:
        if not succeeded:
            # Prefer stderr for the failure message; fall back to stdout.
            raise AssertionError(completed.stderr or completed.stdout)
    elif succeeded:
        raise AssertionError("Command succeeded unexpectedly")

    output = completed.stdout
    return json.loads(output) if output.strip() else {}
27
+
28
+
29
def write_answers(path, project_name="demo"):
    """Write a fixed set of sample harness answers to ``path`` as JSON.

    Args:
        path: ``pathlib.Path`` of the answers file to create or overwrite.
        project_name: Value stored under the ``project_name`` key.

    The file is pretty-printed with a two-space indent and ends with a newline.
    """
    payload = dict(
        project_name=project_name,
        project_summary="A developer tooling project used to install and maintain Codex harness docs.",
        primary_users="Codex users and maintainers",
        deployment_targets="npm package and local repositories",
        product_domain="developer tooling",
        reliability_targets="Repeatable local commands and safe update behavior",
        security_constraints="Do not write secrets or overwrite user-owned docs without consent",
        frontend_stack_notes="Frontend changes require browser validation when a UI is detected",
        quality_focus="installer behavior, generated docs, plan closure, and knowledge capture",
        frontend_scope="No frontend unless one is detected by analysis",
    )
    serialized = json.dumps(payload, indent=2)
    path.write_text(f"{serialized}\n")
43
+
44
+
45
def assert_exists(repo, relative_path):
    """Raise ``AssertionError`` unless ``relative_path`` exists under ``repo``.

    Args:
        repo: ``pathlib.Path`` of the repository root.
        relative_path: Path, relative to ``repo``, that must exist.
    """
    if (repo / relative_path).exists():
        return
    raise AssertionError(f"Expected {relative_path} to exist")
49
+
50
+
51
def assert_contains(repo, relative_path, needle):
    """Raise ``AssertionError`` unless the file's text contains ``needle``.

    Args:
        repo: ``pathlib.Path`` of the repository root.
        relative_path: Path, relative to ``repo``, of the file to read.
        needle: Substring that must appear in the file's text.
    """
    content = (repo / relative_path).read_text()
    if needle in content:
        return
    raise AssertionError(f"Expected {relative_path} to contain {needle!r}")
55
+
56
+
57
def test_empty_repo_init(tmp_root):
    """Check that an empty repository is analyzed as needing init and then scaffolded.

    Args:
        tmp_root: ``pathlib.Path`` of a scratch directory for this run.

    Raises:
        AssertionError: If the analysis or the generated files are not as expected.
    """
    repo = tmp_root / "empty-repo"
    repo.mkdir()
    answers = tmp_root / "answers.json"
    write_answers(answers)
    repo_flag = ("--repo", str(repo))

    analysis = run_manager("analyze", *repo_flag)
    if analysis["recommended_action"] != "init":
        raise AssertionError("Empty repo should recommend init")
    if not analysis["missing_exec_plan_state"]:
        raise AssertionError("Analysis should report missing exec-plan state")
    if not analysis["missing_sops"]:
        raise AssertionError("Analysis should report missing SOPs")

    run_manager("init", *repo_flag, "--answers", str(answers))

    expected_files = (
        "AGENTS.md",
        "ARCHITECTURE.md",
        "docs/PLANS.md",
        "docs/QUALITY_SCORE.md",
        "docs/exec-plans/active/_template.md",
        "docs/exec-plans/completed/README.md",
        "docs/sops/encode-unseen-knowledge.md",
    )
    for expected in expected_files:
        assert_exists(repo, expected)

    # The generated router must point at plans, SOPs, and the local check command.
    router_needles = (
        "docs/exec-plans/active/",
        "docs/sops/",
        ".codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check",
    )
    for needle in router_needles:
        assert_contains(repo, "AGENTS.md", needle)
85
+
86
+
87
def test_frontend_analysis(tmp_root):
    """Check that a React/Vite repository is reported as a frontend project.

    Args:
        tmp_root: ``pathlib.Path`` of a scratch directory for this run.

    Raises:
        AssertionError: If the frontend, its confirmation question, or React
            is missing from the analysis.
    """
    repo = tmp_root / "frontend-repo"
    repo.mkdir()

    manifest = {
        "dependencies": {
            "react": "^19.0.0",
            "vite": "^6.0.0",
        }
    }
    (repo / "package.json").write_text(json.dumps(manifest, indent=2) + "\n")

    source_dir = repo / "src"
    source_dir.mkdir()
    (source_dir / "App.tsx").write_text("export default function App() { return null; }\n")

    analysis = run_manager("analyze", "--repo", str(repo))
    question_ids = set()
    for confirmation in analysis["human_confirmations"]:
        question_ids.add(confirmation["id"])

    if not analysis["has_frontend"]:
        raise AssertionError("Frontend repo should be detected")
    if "frontend_stack_notes" not in question_ids:
        raise AssertionError("Frontend repo should ask frontend confirmation questions")
    if "React" not in analysis["frameworks"]:
        raise AssertionError("React should be detected")
113
+
114
+
115
def test_closed_loop_plan(tmp_root):
    """Exercise the execution-plan and knowledge-capture lifecycle end to end.

    Three scenarios run against one repository:

    1. A plan started with ``plan-start`` cannot be closed while a logged fact
       is unwritten, and closes once the fact is marked written via ``--append``.
    2. A hand-written plan with backtick-formatted text can be marked written
       using the plain-text fact.
    3. A logged fact can be closed by ``--id`` plus ``--evidence`` without the
       exact fact text being present in the destination doc.

    Args:
        tmp_root: ``pathlib.Path`` of a scratch directory for this run.

    Raises:
        AssertionError: If any step deviates from the expected outcome.
    """
    repo = tmp_root / "loop-repo"
    repo.mkdir()
    (repo / "snake.sh").write_text("#!/usr/bin/env bash\nprintf 'snake\\n'\n")
    skill_scripts = repo / ".codex" / "skills" / "demo" / "scripts"
    skill_scripts.mkdir(parents=True)
    (skill_scripts / "tool.py").write_text("print('ignore me')\n")
    answers = tmp_root / "loop-answers.json"
    write_answers(answers, project_name="loop-demo")

    repo_flag = ("--repo", str(repo))
    architecture_doc = repo / "ARCHITECTURE.md"

    analysis = run_manager("analyze", *repo_flag)
    if "Shell" not in analysis["languages"]:
        raise AssertionError("Shell should be detected from target project files")
    if "Python" in analysis["languages"]:
        raise AssertionError(".codex skill files should not affect target project language detection")
    run_manager("init", *repo_flag, "--answers", str(answers))

    # Scenario 1: fact-based closure through --append.
    plan_result = run_manager(
        "plan-start", *repo_flag, "--slug", "knowledge-loop", "--goal", "Validate durable knowledge closure"
    )
    plan_path = Path(plan_result["plan"])
    relative_plan = str(plan_path.resolve().relative_to(repo.resolve()))
    plan_flags = (*repo_flag, "--plan", relative_plan)
    fact = "Install mode must distinguish local and global skill destinations"
    fact_flags = ("--fact", fact, "--destination", "docs/PRODUCT_SENSE.md")

    run_manager("knowledge-log", *plan_flags, *fact_flags)
    # Closing is expected to fail while the logged fact is still unwritten.
    run_manager("plan-close", *plan_flags, "--summary", "done", expect_success=False)
    # Marking written without --append is expected to fail at this point.
    run_manager("knowledge-mark-written", *plan_flags, *fact_flags, expect_success=False)
    run_manager("knowledge-mark-written", *plan_flags, *fact_flags, "--append")
    assert_contains(repo, "docs/PRODUCT_SENSE.md", fact)

    close_result = run_manager(
        "plan-close", *plan_flags, "--summary", "Closed after writing durable knowledge."
    )
    if close_result["status"] != "closed":
        raise AssertionError("Plan should close after knowledge is marked written")
    if plan_path.exists():
        raise AssertionError("Active plan should be moved after close")
    assert_exists(repo, "docs/exec-plans/completed/" + plan_path.name)
    check_result = run_manager("check", *repo_flag)
    if check_result["status"] != "pass":
        raise AssertionError("Harness check should pass after plan closure")

    # Scenario 2: plan item and doc use backticks, the fact argument does not.
    formatted_plan = create_formatted_plan(repo)
    formatted_relative_plan = str(formatted_plan.resolve().relative_to(repo.resolve()))
    formatted_fact = "snake.sh is the single runtime entrypoint and owns terminal control directly with stty and tput"
    with architecture_doc.open("a") as handle:
        handle.write("\n`snake.sh` is the single runtime entrypoint and owns terminal control directly with `stty` and `tput`.\n")
    run_manager(
        "knowledge-mark-written",
        *repo_flag,
        "--plan",
        formatted_relative_plan,
        "--fact",
        formatted_fact,
        "--destination",
        "ARCHITECTURE.md",
    )

    # Scenario 3: id/evidence-based closure with differently worded doc text.
    id_plan_result = run_manager(
        "plan-start",
        *repo_flag,
        "--slug",
        "id-knowledge-loop",
        "--goal",
        "Validate id-based durable knowledge closure",
    )
    id_plan_path = Path(id_plan_result["plan"])
    id_relative_plan = str(id_plan_path.resolve().relative_to(repo.resolve()))
    id_plan_flags = (*repo_flag, "--plan", id_relative_plan)
    id_fact = "Runtime input is owned by the terminal runner and core game logic remains independent of terminal packages"
    log_result = run_manager(
        "knowledge-log", *id_plan_flags, "--fact", id_fact, "--destination", "ARCHITECTURE.md"
    )
    with architecture_doc.open("a") as handle:
        handle.write(
            "\nThe `main` package owns keyboard input and rendering, while `game` contains pure state transitions.\n"
        )
    run_manager(
        "knowledge-mark-written",
        *id_plan_flags,
        "--id",
        log_result["id"],
        "--evidence",
        "main package owns keyboard input and rendering",
    )
    plan_text = id_plan_path.read_text()
    if id_fact in architecture_doc.read_text():
        raise AssertionError("Id/evidence closure should not require appending the exact fact to the destination")
    if "| evidence: main package owns keyboard input and rendering" not in plan_text:
        raise AssertionError("Closed knowledge item should record the verification evidence")
    run_manager("plan-close", *id_plan_flags, "--summary", "Closed with id-based evidence.")
275
+
276
+
277
def create_formatted_plan(repo):
    """Write a minimal hand-formatted active plan and return its path.

    The plan holds one unchecked durable-knowledge item whose text uses
    backtick formatting and targets ``ARCHITECTURE.md``.

    Args:
        repo: ``pathlib.Path`` of the repository root.

    Returns:
        Path of ``docs/exec-plans/active/formatted-plan.md`` under ``repo``.
    """
    plan_path = repo / "docs" / "exec-plans" / "active" / "formatted-plan.md"
    # Do not rely on a prior `init` having created the active-plans directory.
    plan_path.parent.mkdir(parents=True, exist_ok=True)
    plan_path.write_text(
        "# Execution Plan: Formatted Plan\n"
        "\n"
        "## Durable Knowledge To Capture\n"
        "\n"
        "- [ ] `snake.sh` is the single runtime entrypoint and owns terminal control"
        " directly with `stty` and `tput`. -> `ARCHITECTURE.md`\n"
    )
    return plan_path
288
+
289
+
290
def test_preserve_unmanaged_docs(tmp_root):
    """Check that ``init`` leaves a pre-existing ``AGENTS.md`` untouched.

    Args:
        tmp_root: ``pathlib.Path`` of a scratch directory for this run.

    Raises:
        AssertionError: If the file is not reported as skipped, its content
            changed, or other docs were not generated.
    """
    repo = tmp_root / "partial-repo"
    repo.mkdir()
    user_router = repo / "AGENTS.md"
    user_router.write_text("# Existing user router\n\nKeep this custom content.\n")
    answers = tmp_root / "partial-answers.json"
    write_answers(answers)

    init_report = run_manager("init", "--repo", str(repo), "--answers", str(answers))
    skipped = init_report["skipped"]
    if "AGENTS.md" not in skipped:
        raise AssertionError("Unmanaged AGENTS.md should be skipped")
    assert_contains(repo, "AGENTS.md", "Keep this custom content.")
    # Files that did not exist beforehand must still be created.
    assert_exists(repo, "docs/PLANS.md")
302
+
303
+
304
# Registry of eval cases as (report id, test callable) pairs. main() runs them
# in this order and records each id in the "id" field of its result entry.
EVALS = [
    ("empty-repo-init", test_empty_repo_init),
    ("frontend-analysis", test_frontend_analysis),
    ("closed-loop-plan", test_closed_loop_plan),
    ("preserve-unmanaged-docs", test_preserve_unmanaged_docs),
]
310
+
311
+
312
def main():
    """Run every case in ``EVALS`` and print a JSON score report.

    All cases share one temporary directory. A case that raises any
    ``Exception`` is recorded as a failure with the exception's message.
    The report contains ``score`` (rounded percentage of passing cases),
    ``passed``, ``total``, and per-case ``results``.

    Raises:
        SystemExit: With status 1 when at least one case failed.
    """
    outcomes = []
    with tempfile.TemporaryDirectory() as scratch:
        workspace = Path(scratch)
        for eval_id, test_func in EVALS:
            # Top-level boundary: one failing case must not abort the others.
            try:
                test_func(workspace)
            except Exception as error:
                outcome = {"id": eval_id, "status": "fail", "error": str(error)}
            else:
                outcome = {"id": eval_id, "status": "pass"}
            outcomes.append(outcome)

    total = len(outcomes)
    passed = len([entry for entry in outcomes if entry["status"] == "pass"])
    report = {
        "score": round((passed / total) * 100),
        "passed": passed,
        "total": total,
        "results": outcomes,
    }
    print(json.dumps(report, indent=2) + "\n")
    if passed != total:
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,18 @@
1
+ # Evaluation Loop
2
+
3
+ Use this loop when changing the skill, templates, scripts, or policy references:
4
+
5
+ 1. Draft the behavior in `SKILL.md`, `references/`, templates, or scripts.
6
+ 2. Test it with the deterministic commands in `scripts/manage_harness.py`.
7
+ 3. Evaluate it with `python3 evals/run_evals.py`.
8
+ 4. Iterate until the runner passes and the score stays at 100.
9
+
10
+ ## What The Evals Cover
11
+
12
+ - first-time initialization of an empty repository
13
+ - frontend-aware repository analysis
14
+ - execution-plan and knowledge-capture closure
15
+ - preservation of unmanaged user-owned docs
16
+ - local harness checks that do not require user-project CI
17
+
18
+ Add a new eval case whenever a regression would be easy to miss by reading the files manually.
@@ -0,0 +1,39 @@
1
+ # Execution Plans
2
+
3
+ Execution plans are required for multi-step work, risky changes, or tasks that need coordination across files.
4
+
5
+ ## When To Create One
6
+
7
+ - more than one implementation step is required
8
+ - validation is non-trivial
9
+ - architecture, product, reliability, or security decisions are involved
10
+ - work will span enough time that another agent may resume it later
11
+
12
+ ## Location
13
+
14
+ - Active: `docs/exec-plans/active/`
15
+ - Completed: `docs/exec-plans/completed/`
16
+
17
+ ## Minimum Sections
18
+
19
+ - goal
20
+ - scope
21
+ - constraints
22
+ - steps
23
+ - validation
24
+ - durable knowledge to capture
25
+ - completion notes
26
+
27
+ ## Operating Rule
28
+
29
+ Update the active plan during the work. When the work is done, move it to `completed` and leave behind any durable facts in the right permanent docs.
30
+
31
+ ## Closed Loop
32
+
33
+ Use the script, not ad hoc manual edits, for the lifecycle:
34
+
35
+ - `plan-start`: create a new active execution plan
36
+ - `knowledge-log`: append a durable fact that still needs to be written into permanent docs and return its stable id
37
+ - `knowledge-mark-written`: verify that a logged fact is present in its permanent doc and mark it as written; prefer `--id <knowledge-id> --evidence "<doc text>"`, and pass `--append` only when the tool should first append the exact fact to the destination doc
38
+ - `plan-close`: refuse to close cleanly until the listed knowledge items are marked as written to durable docs
39
+ - `check`: run a local handoff check without requiring target-repo CI
@@ -0,0 +1,17 @@
1
+ # File Map
2
+
3
+ - `AGENTS.md`: short router, reading order, repo-specific guardrails
4
+ - `ARCHITECTURE.md`: domain boundaries, runtime topology, integration seams
5
+ - `docs/PLANS.md`: plan lifecycle and storage rules
6
+ - `docs/PRODUCT_SENSE.md`: product heuristics and tradeoff rules
7
+ - `docs/QUALITY_SCORE.md`: quality rubric by domain and layer
8
+ - `docs/RELIABILITY.md`: SLOs, failure modes, observability expectations
9
+ - `docs/SECURITY.md`: security constraints, secrets, auth, data handling
10
+ - `docs/DESIGN.md`: design principles and review heuristics
11
+ - `docs/FRONTEND.md`: frontend stack conventions and validation loop
12
+ - `docs/design-docs/`: durable design decisions
13
+ - `docs/product-specs/`: durable product specs
14
+ - `docs/exec-plans/`: active plans, completed plans, and tech debt tracker
15
+ - `docs/sops/`: mechanical procedures for recurring workflows and validation loops
16
+ - `docs/generated/`: generated facts such as schemas
17
+ - `docs/references/`: external references rewritten or linked for model-friendly discovery
@@ -0,0 +1,35 @@
1
+ # Knowledge Capture
2
+
3
+ Write durable knowledge into the repository whenever one of these is true:
4
+
5
+ - the fact changed your implementation plan
6
+ - the fact would likely be needed by another agent later
7
+ - the fact came from a human answer rather than directly from code
8
+ - the fact explains why a policy, architecture choice, or validation loop exists
9
+ - the fact would be annoying to rediscover from scratch
10
+
11
+ ## Where To Write It
12
+
13
+ - Product behavior or workflow intent: `docs/product-specs/`
14
+ - Design rationale or UX rules: `docs/design-docs/`
15
+ - Runtime validation, incidents, or observability loops: `docs/RELIABILITY.md` or `docs/sops/`
16
+ - Security constraints or review gates: `docs/SECURITY.md`
17
+ - Architecture boundaries or integration seams: `ARCHITECTURE.md`
18
+ - Reusable external material: `docs/references/`
19
+
20
+ ## Minimum Rule
21
+
22
+ If a useful fact would otherwise live only in chat, move it into the repo before closing the task.
23
+
24
+ ## Closed Loop
25
+
26
+ Prefer the script workflow:
27
+
28
+ 1. Log the fact into the active execution plan with `knowledge-log`.
29
+ 2. Write the fact into its permanent destination doc.
30
+ 3. Mark the plan item complete with `knowledge-mark-written --id <knowledge-id> --evidence "<text present in durable doc>"`.
31
+ 4. Close the plan with `plan-close`.
32
+
33
+ `knowledge-log` returns a stable id. Prefer id-based closure so permanent docs can use concise, natural wording rather than duplicating the exact plan fact.
34
+
35
+ `knowledge-mark-written` verifies that the destination file contains either the provided evidence text or, for legacy calls, the exact fact. Use `--append` only when the exact fact should be appended to the destination doc by the tool.
@@ -0,0 +1,29 @@
1
+ # Question Catalog
2
+
3
+ Use these prompts only when the repo analysis cannot answer them.
4
+
5
+ ## Product
6
+
7
+ - What core user outcome does this repository serve?
8
+ - Which flows matter enough to deserve explicit product specs first?
9
+ - Which non-goals should the harness make visible?
10
+
11
+ ## Reliability
12
+
13
+ - What failure is unacceptable in production?
14
+ - What recovery time or uptime expectation matters most?
15
+ - Which runtime environments must be validated locally before merge?
16
+
17
+ ## Security
18
+
19
+ - Does the repo handle credentials, customer data, regulated data, or privileged actions?
20
+ - Are there required review gates for authentication, authorization, or secrets handling?
21
+
22
+ ## Frontend
23
+
24
+ - Is the product expected to have a polished user-facing interface, an internal tool UI, or no frontend?
25
+ - Which browsers, devices, or accessibility expectations are non-negotiable?
26
+
27
+ ## References
28
+
29
+ - Which external docs are worth copying into `docs/references/` because the team uses them repeatedly?
@@ -0,0 +1,10 @@
1
+ # SOP Index
2
+
3
+ Choose an SOP whenever the task touches one of these areas:
4
+
5
+ - architecture or layering changes: `docs/sops/layered-domain-architecture-setup.md`
6
+ - missing durable repository knowledge: `docs/sops/encode-unseen-knowledge.md`
7
+ - runtime debugging or observability setup: `docs/sops/local-observability-feedback-loop.md`
8
+ - user interface work: `docs/sops/chrome-devtools-ui-validation-loop.md`
9
+
10
+ If no SOP exists for a recurring workflow, create one in `docs/sops/` as part of the task.
@@ -0,0 +1,12 @@
1
+ # Template Policy
2
+
3
+ Every generated file starts with a managed marker:
4
+
5
+ `<!-- harness-repo-bootstrap:managed -->`
6
+
7
+ Update behavior:
8
+
9
+ - `init`: create missing files and skip existing files unless `--force` is passed
10
+ - `update`: create missing files, skip existing unmanaged files, and refresh managed files only when `--refresh-managed` or `--force` is passed
11
+
12
+ If a file exists without the managed marker, treat it as user-owned unless the human explicitly asks to replace it.
@@ -0,0 +1,47 @@
1
+ # Workflow
2
+
3
+ Use this skill in two passes.
4
+
5
+ ## Pass 1: Analyze and Confirm
6
+
7
+ Run `analyze` before editing repository docs.
8
+
9
+ Ask the human only about facts that cannot be derived safely from the repo, especially:
10
+
11
+ - product domain and top-level outcomes
12
+ - intended users or operators
13
+ - production reliability expectations
14
+ - security or compliance constraints
15
+ - frontend experience bar
16
+ - canonical external references worth pinning inside `docs/references/`
17
+
18
+ Do not ask for facts that can be inferred from source layout, dependency manifests, or existing docs.
19
+
20
+ Also inspect the analysis for:
21
+
22
+ - missing durable knowledge that should be written during the task
23
+ - missing execution-plan state
24
+ - which SOPs should be referenced in the generated router docs
25
+
26
+ ## Pass 2: Scaffold or Refresh
27
+
28
+ Run `sample-answers`, fill in the answers, then run `init` or `update`.
29
+
30
+ Use `init` for first-time adoption.
31
+ Use `update` to add missing managed files or refresh managed files when `--refresh-managed` is passed.
32
+
33
+ After the script runs, read the generated docs once and tighten weak generic phrases before handing off.
34
+
35
+ ## Ongoing Use
36
+
37
+ After the scaffold exists:
38
+
39
+ - create an execution plan before multi-step work
40
+ - use `plan-start` instead of creating plan files manually when possible
41
+ - log durable facts during execution instead of waiting until the end
42
+ - follow the matching SOP for architecture, UI, observability, or knowledge capture work
43
+ - encode durable knowledge back into the repository before closing the task
44
+ - mark logged knowledge items as written after updating the permanent docs
45
+ - use `plan-close` to verify no durable knowledge is left stranded in the active plan
46
+ - run `.codex/skills/harness-repo-bootstrap/scripts/manage_harness.py check --repo <target-repo>` before handoff
47
+ - do not add CI to the target repository unless the human explicitly asks for it