codex-harness-engineering 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/AGENTS.md +18 -6
  2. package/LICENSE +21 -0
  3. package/README.md +69 -6
  4. package/docs/harness-engineering/implementation-playbook.md +232 -286
  5. package/docs/harness-engineering/index.md +7 -4
  6. package/docs/harness-engineering/research-note.md +294 -274
  7. package/docs/harness-engineering/sources.md +166 -72
  8. package/package.json +9 -4
  9. package/scripts/install-skills.mjs +73 -15
  10. package/scripts/publish.sh +2 -2
  11. package/scripts/verify-harness.mjs +61 -4
  12. package/skills/acceptance-contract/SKILL.md +39 -49
  13. package/skills/acceptance-contract/agents/openai.yaml +2 -2
  14. package/skills/cleanup-harness/SKILL.md +48 -59
  15. package/skills/cleanup-harness/agents/openai.yaml +2 -2
  16. package/skills/creator-harness/SKILL.md +79 -95
  17. package/skills/creator-harness/agents/openai.yaml +2 -2
  18. package/skills/creator-harness/references/harness-artifacts.md +63 -62
  19. package/skills/lessons-harness/SKILL.md +68 -0
  20. package/skills/lessons-harness/agents/openai.yaml +4 -0
  21. package/templates/harness/AGENTS.md +77 -0
  22. package/templates/harness/feature_list.json +16 -0
  23. package/templates/harness/init.sh +15 -0
  24. package/templates/harness/lessons.md +18 -0
  25. package/templates/harness/memory/README.md +22 -0
  26. package/templates/harness/progress.md +33 -0
  27. package/templates/harness/rotate-state.mjs +131 -0
  28. package/templates/harness/verify-state.mjs +117 -0
  29. package/templates/team/roles/evaluator.md +43 -0
  30. package/templates/team/roles/implementer.md +29 -0
  31. package/templates/team/roles/planner.md +28 -0
  32. package/templates/team/sprint-template.md +36 -0
  33. package/templates/team/verify-team.mjs +71 -0
  34. package/templates/team/workflow.md +62 -0
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env node
2
+ // Mechanical state gate for the repository harness.
3
+ //
4
+ // Fails when the working tree contains behavior changes that lack matching
5
+ // updates to feature_list.json and progress.md, or when the latest progress
6
+ // entry does not name the changed files in backticks.
7
+ //
8
+ // Run it before committing: node verify-state.mjs
9
+ // Customize BEHAVIOR_IGNORE_PATTERNS for your project. By default every
10
+ // changed file counts as a behavior change except state files, Markdown
11
+ // docs, and common non-behavior files.
12
+
13
+ import { execFileSync } from "node:child_process";
14
+ import { access, readFile } from "node:fs/promises";
15
+ import path from "node:path";
16
+ import { fileURLToPath } from "node:url";
17
+
18
// Directory that holds this script; every harness path below is resolved
// against it and git commands run with it as their working directory.
const ROOT = path.dirname(fileURLToPath(import.meta.url));

// Artifacts that must exist next to this script for the gate to pass.
const REQUIRED_FILES = [
  "AGENTS.md",
  "progress.md",
  "feature_list.json",
  "lessons.md",
  "init.sh",
];

// State files that have to change whenever a behavior change is detected.
const STATE_FILES = [
  "feature_list.json",
  "progress.md",
];

// A changed path matching any of these patterns is not a behavior change.
// Customize this list for your project.
const BEHAVIOR_IGNORE_PATTERNS = [
  /\.md$/,
  /^\.agents\//,
  /^memory\//,
  /^team\//,
  /^\.gitignore$/,
  /^LICENSE$/,
  /(^|\/)\.DS_Store$/,
];

// Maximum number of lines allowed per file before the gate reports an error.
const LINE_BUDGETS = {
  "progress.md": 400,
  "lessons.md": 400,
};
34
+
35
/**
 * Report whether a path can be accessed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `access` succeeds, `false` when it
 *   fails for any reason.
 */
async function exists(filePath) {
  // Wrapping the call in an async arrow turns every failure into a rejection,
  // so both outcomes are handled by the two-argument `then` below.
  const probe = async () => access(filePath);
  return probe().then(
    () => true,
    () => false,
  );
}
43
+
44
/**
 * Run a git command in ROOT and return its non-empty output lines.
 *
 * @param {string[]} args - Arguments passed to the `git` executable.
 * @returns {string[]} Output split on "\n" with empty lines removed, or an
 *   empty array when the command fails.
 */
function gitLines(args) {
  let stdout;
  try {
    stdout = execFileSync("git", args, { cwd: ROOT, encoding: "utf8" });
  } catch {
    // Best effort: a failing git invocation is reported as "no lines".
    return [];
  }
  return stdout.split("\n").filter((line) => line.length > 0);
}
53
+
54
/**
 * List every path that differs from HEAD or is untracked, without duplicates.
 *
 * Both git commands run with ROOT as their working directory (see gitLines).
 * `git diff --name-only` prints paths relative to the repository root by
 * default, while `git ls-files --others` prints them relative to the working
 * directory. `--relative` makes the diff output relative to the working
 * directory too, so both lists use the same base and can be compared against
 * names such as "progress.md" even when the harness is not at the repo root.
 *
 * @returns {string[]} Changed and untracked paths, de-duplicated.
 */
function changedFiles() {
  const modified = gitLines(["diff", "--name-only", "--relative", "HEAD", "--"]);
  const untracked = gitLines(["ls-files", "--others", "--exclude-standard"]);
  return [...new Set([...modified, ...untracked])];
}
59
+
60
/**
 * Decide whether a changed path counts as a behavior change.
 *
 * @param {string} relativePath - Path as reported by git.
 * @returns {boolean} `false` for state files and for paths matching any
 *   ignore pattern; `true` otherwise.
 */
function isBehaviorChange(relativePath) {
  if (STATE_FILES.includes(relativePath)) {
    return false;
  }
  for (const pattern of BEHAVIOR_IGNORE_PATTERNS) {
    if (pattern.test(relativePath)) {
      return false;
    }
  }
  return true;
}
64
+
65
/**
 * Extract the text from the last line that starts with "## " to the end.
 *
 * @param {string} progress - Full contents of progress.md.
 * @returns {string} The slice beginning at the last "## " that follows a
 *   newline, or the whole input when no such heading exists.
 */
function latestProgressEntry(progress) {
  const headingMarker = "\n## ";
  const markerIndex = progress.lastIndexOf(headingMarker);
  if (markerIndex < 0) {
    return progress;
  }
  // Skip the newline so the result starts at the "##" itself.
  return progress.slice(markerIndex + 1);
}
69
+
70
// Failure messages collected by every check below; any entry fails the gate.
const errors = [];

// Check 1: each required harness artifact must be present under ROOT.
for (const relativePath of REQUIRED_FILES) {
  const isPresent = await exists(path.join(ROOT, relativePath));
  if (isPresent) {
    continue;
  }
  errors.push(`${relativePath}: required harness artifact is missing`);
}
77
+
78
// Check 2: budgeted files must not grow past their line limit.
// Files that do not exist are skipped here; the required-artifact check
// reports missing files.
for (const [relativePath, budget] of Object.entries(LINE_BUDGETS)) {
  const absolutePath = path.join(ROOT, relativePath);
  if (!await exists(absolutePath)) {
    continue;
  }

  const content = await readFile(absolutePath, "utf8");
  let lineCount = content.split("\n").length;
  // A trailing newline terminates the last line rather than starting a new
  // one, and an empty file has no lines; without this adjustment a file of
  // exactly `budget` lines would be reported as one line over.
  if (content === "" || content.endsWith("\n")) {
    lineCount -= 1;
  }

  if (lineCount > budget) {
    errors.push(
      `${relativePath}: ${lineCount} lines exceeds the ${budget}-line budget; run node rotate-state.mjs to archive older entries`
    );
  }
}
88
+
89
// Check 3: behavior changes must come with state-file updates, and the latest
// progress entry must name each changed file in backticks.
const changed = changedFiles();
const behaviorChanges = changed.filter((candidate) => isBehaviorChange(candidate));

if (behaviorChanges.length > 0) {
  // Every state file has to be part of the change set.
  const staleStateFiles = STATE_FILES.filter((stateFile) => !changed.includes(stateFile));
  for (const stateFile of staleStateFiles) {
    errors.push(`${stateFile}: must be updated when behavior changes`);
  }

  // Only inspect progress.md when it was touched and is present on disk.
  const progressPath = path.join(ROOT, "progress.md");
  if (changed.includes("progress.md") && await exists(progressPath)) {
    const latestEntry = latestProgressEntry(await readFile(progressPath, "utf8"));
    for (const relativePath of behaviorChanges) {
      const isReferenced = latestEntry.includes(`\`${relativePath}\``);
      if (isReferenced) {
        continue;
      }
      errors.push(`progress.md: latest entry must reference changed file ${relativePath}`);
    }
  }
}

// Report: print each failure and set a non-zero exit code, or confirm success.
if (errors.length === 0) {
  console.log("State gate passed.");
} else {
  errors.forEach((message) => {
    console.error(message);
  });
  process.exitCode = 1;
}
@@ -0,0 +1,43 @@
1
+ # Role: Evaluator
2
+
3
+ You grade the sprint against its contract with observed evidence. You are
4
+ deliberately skeptical: the implementer's self-report is a claim, not proof.
5
+ You never edit implementation code.
6
+
7
+ ## Inputs
8
+
9
+ 1. `team/sprints/S00X/sprint.md` — the only grading standard.
10
+ 2. The running application, tests, logs, API responses, database state.
11
+ 3. A fresh session: do not reuse the implementer's session or context.
12
+
13
+ ## Outputs
14
+
15
+ - `team/sprints/S00X/evaluation.md`:
16
+
17
+ ```markdown
18
+ # Evaluation: S00X
19
+
20
+ ## Checks Run
21
+ - Command/check:
22
+ - Result:
23
+ - Artifact: (output, screenshot, response body, log line)
24
+
25
+ ## Findings
26
+ - [ ] P0/P1/P2:
27
+ - Evidence:
28
+ - Repro:
29
+ - Suggested next step:
30
+
31
+ ## Verdict: pass | fail
32
+ - Reason:
33
+ ```
34
+
35
+ ## Rules
36
+
37
+ - Execute the contract's verification yourself; never trust reported results.
38
+ - Exercise the real runtime: user paths end to end, the negative cases, and
39
+ the API/data assertions in the contract — not just the unit tests.
40
+ - Every finding needs evidence and a repro; "feels wrong" is not a finding.
41
+ - Grade against the contract, not your taste. Out-of-contract ideas go in a
42
+ note to the planner, not into the verdict.
43
+ - `Verdict: pass` only when every "Done Means" item passed a check you ran.
@@ -0,0 +1,29 @@
1
+ # Role: Implementer
2
+
3
+ You build exactly what the sprint contract says, prove it with the contract's
4
+ checks, and hand off cleanly. You do not grade your own work beyond running
5
+ the listed verification.
6
+
7
+ ## Inputs
8
+
9
+ 1. `team/sprints/S00X/sprint.md` (the contract — read it first).
10
+ 2. `AGENTS.md`, `progress.md`, `feature_list.json`.
11
+ 3. `./init.sh` must pass before you edit anything.
12
+
13
+ ## Outputs
14
+
15
+ - Implementation scoped to the contract, one feature at a time.
16
+ - Updated `feature_list.json` statuses backed by the contract's verification
17
+ commands.
18
+ - A `progress.md` entry naming changed files, plus a descriptive commit.
19
+ - Sprint status moved `building` → `evaluating` at handoff.
20
+
21
+ ## Rules
22
+
23
+ - The contract is the scope authority. If you discover the contract is wrong,
24
+ stop and send it back to the planner; do not silently widen or shrink it.
25
+ - Run the narrowest check that proves each "Done Means" item before marking
26
+ it done. `node verify-state.mjs` must pass before you commit.
27
+ - When the evaluator returns `Verdict: fail`, work the findings list in
28
+ order; do not relitigate the contract.
29
+ - Do not refactor unrelated code.
@@ -0,0 +1,28 @@
1
+ # Role: Planner
2
+
3
+ You turn a short request into a sprint contract the implementer can build and
4
+ the evaluator can grade. You do not write implementation code.
5
+
6
+ ## Inputs
7
+
8
+ 1. The user's request.
9
+ 2. `feature_list.json`, latest `progress.md` entries, and the codebase.
10
+ 3. `team/sprint-template.md`.
11
+
12
+ ## Outputs
13
+
14
+ - `team/sprints/S00X/sprint.md` with status `planned`.
15
+ - New or updated entries in `feature_list.json` for capabilities the sprint
16
+ creates (status `not_started`).
17
+
18
+ ## Rules
19
+
20
+ - Scope one sprint smaller than what feels comfortable; the implementer can
21
+ always start the next sprint sooner.
22
+ - Every "Done Means" item must be checkable by a command or an observable
23
+ runtime signal. If you cannot name the check, the item is not ready.
24
+ - Name the user path, the API/data path, and at least one negative case.
25
+ - List what is explicitly out of scope; drift starts where the contract is
26
+ silent.
27
+ - If the request is ambiguous in a way that changes the contract, ask one
28
+ concise question instead of guessing.
@@ -0,0 +1,36 @@
1
+ # Sprint: S00X
2
+
3
+ Status: planned
4
+ <!-- planned | building | evaluating | done -->
5
+ <!-- done requires evaluation.md with Verdict: pass — enforced by team/verify-team.mjs -->
6
+
7
+ ## Scope
8
+
9
+ - Feature:
10
+ - User path:
11
+ - API/data path:
12
+ - Likely files/modules:
13
+
14
+ ## Done Means
15
+
16
+ - [ ] User can ...
17
+ - [ ] API or data reflects ...
18
+ - [ ] Error state handles ...
19
+ - [ ] No regression in ...
20
+
21
+ ## Verification
22
+
23
+ - Unit:
24
+ - Integration:
25
+ - Browser/API:
26
+ - Log/metric/trace:
27
+
28
+ ## Evaluator Focus
29
+
30
+ - Runtime behavior:
31
+ - Negative cases:
32
+ - UX or quality concerns:
33
+
34
+ ## Out of Scope
35
+
36
+ - ...
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env node
2
+ // Team pipeline gate.
3
+ //
4
+ // A sprint may only carry "Status: done" when its directory contains an
5
+ // evaluation.md with "Verdict: pass" written by the evaluator role. This
6
+ // keeps the generator from grading its own work.
7
+ //
8
+ // Run from the project root: node team/verify-team.mjs
9
+
10
+ import { access, readdir, readFile } from "node:fs/promises";
11
+ import path from "node:path";
12
+ import { fileURLToPath } from "node:url";
13
+
14
// Directory that contains this script (the team/ directory).
const TEAM_ROOT = path.dirname(
  fileURLToPath(import.meta.url),
);

// Directory whose sub-directories are the individual sprints.
const SPRINTS_ROOT = path.join(
  TEAM_ROOT,
  "sprints",
);
16
+
17
/**
 * Report whether a path can be accessed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `access` succeeds, otherwise `false`.
 */
async function exists(filePath) {
  let isAccessible = false;
  try {
    await access(filePath);
    isAccessible = true;
  } catch {
    // Any access failure means "not present"; the flag stays false.
  }
  return isAccessible;
}
25
+
26
// Failure messages collected while scanning sprints; any entry fails the gate.
const errors = [];

// A passing verdict line, optionally written as a Markdown heading. The
// negative lookahead rejects the unfilled template placeholder
// "Verdict: pass | fail", which would otherwise count as a pass.
const PASS_VERDICT = /^#{0,4}\s*Verdict:\s*pass\b(?![ \t]*\|)/m;

// When the sprints directory does not exist there is nothing to check.
if (await exists(SPRINTS_ROOT)) {
  const entries = await readdir(SPRINTS_ROOT, { withFileTypes: true });

  for (const entry of entries) {
    // Only directories are sprints; loose files are ignored.
    if (!entry.isDirectory()) {
      continue;
    }

    const sprintDir = path.join(SPRINTS_ROOT, entry.name);
    const sprintPath = path.join(sprintDir, "sprint.md");
    const label = `team/sprints/${entry.name}`;

    // Every sprint directory must carry its contract.
    if (!await exists(sprintPath)) {
      errors.push(`${label}/sprint.md: sprint contract is missing`);
      continue;
    }

    // Sprints that are not marked done need no evaluation yet.
    const sprint = await readFile(sprintPath, "utf8");
    const isDone = /^Status:\s*done\s*$/m.test(sprint);
    if (!isDone) {
      continue;
    }

    // A done sprint needs an evaluation file...
    const evaluationPath = path.join(sprintDir, "evaluation.md");
    if (!await exists(evaluationPath)) {
      errors.push(`${label}: marked done but evaluation.md is missing`);
      continue;
    }

    // ...and that file must contain an actual passing verdict.
    const evaluation = await readFile(evaluationPath, "utf8");
    if (!PASS_VERDICT.test(evaluation)) {
      errors.push(`${label}: marked done but evaluation.md has no "Verdict: pass"`);
    }
  }
}

// Report: print each failure and set a non-zero exit code, or confirm success.
if (errors.length > 0) {
  for (const error of errors) {
    console.error(error);
  }
  process.exitCode = 1;
} else {
  console.log("Team gate passed.");
}
@@ -0,0 +1,62 @@
1
+ # Team Workflow
2
+
3
+ This directory turns Codex sessions into a small production team. Each role is
4
+ one Codex session bound to one contract file; roles hand work to each other
5
+ through artifacts in `team/sprints/`, never through chat history.
6
+
7
+ ## Pipeline
8
+
9
+ ```
10
+ PLAN BUILD EVALUATE SHIP
11
+ planner session ──▶ implementer session ──▶ evaluator session ──▶ commit + release
12
+ writes sprint.md builds to contract writes evaluation.md status: done
13
+ status: planned status: building status: evaluating (gate enforced)
14
+ ```
15
+
16
+ ## When to use the team
17
+
18
+ Use the full pipeline only when the task spans multiple files, runtime
19
+ behavior, or subjective quality. For a small bug or feature, skip the team and
20
+ use `$acceptance-contract` in a single session — the simplest sufficient
21
+ structure wins.
22
+
23
+ ## Running a role
24
+
25
+ Open a fresh Codex session and start with one line:
26
+
27
+ ```
28
+ Act as the role defined in team/roles/planner.md for sprint S001.
29
+ ```
30
+
31
+ Same for `implementer.md` and `evaluator.md`. One session, one role, one
32
+ sprint. Do not let one session play two roles back to back; the evaluator
33
+ must start without the implementer's context.
34
+
35
+ ## Starting a sprint
36
+
37
+ 1. Planner copies `team/sprint-template.md` to `team/sprints/S00X/sprint.md`
38
+ and fills it in. Status: `planned`.
39
+ 2. Implementer sets status `building`, works, then sets `evaluating` and
40
+ hands off with a commit.
41
+ 3. Evaluator writes `team/sprints/S00X/evaluation.md` with evidence and a
42
+ verdict. On `Verdict: fail`, the implementer iterates on the findings.
43
+ 4. Only after `Verdict: pass` may the sprint status become `done`.
44
+
45
+ ## Gate
46
+
47
+ ```
48
+ node team/verify-team.mjs
49
+ ```
50
+
51
+ Fails when a sprint directory has no `sprint.md`, or when a sprint marked
52
+ `done` lacks an `evaluation.md` containing `Verdict: pass`. Run it together
53
+ with `node verify-state.mjs` before committing.
54
+
55
+ ## Rules
56
+
57
+ - The sprint contract is the only scope authority. Implementer does not widen
58
+ it; evaluator grades against it, not against taste.
59
+ - The evaluator never edits implementation code. Findings go to
60
+ `evaluation.md`; fixes belong to the implementer's next pass.
61
+ - Every handoff is a commit plus a `progress.md` entry, so any role can be
62
+ resumed by a brand-new session.