npm - codex-harness-engineering - Versions diffs - 0.1.4 → 0.1.6 - Mend

codex-harness-engineering 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/AGENTS.md +18 -6
package/LICENSE +21 -0
package/README.md +69 -6
package/docs/harness-engineering/implementation-playbook.md +232 -286
package/docs/harness-engineering/index.md +7 -4
package/docs/harness-engineering/research-note.md +294 -274
package/docs/harness-engineering/sources.md +166 -72
package/package.json +9 -4
package/scripts/install-skills.mjs +73 -15
package/scripts/publish.sh +2 -2
package/scripts/verify-harness.mjs +61 -4
package/skills/acceptance-contract/SKILL.md +39 -49
package/skills/acceptance-contract/agents/openai.yaml +2 -2
package/skills/cleanup-harness/SKILL.md +48 -59
package/skills/cleanup-harness/agents/openai.yaml +2 -2
package/skills/creator-harness/SKILL.md +79 -95
package/skills/creator-harness/agents/openai.yaml +2 -2
package/skills/creator-harness/references/harness-artifacts.md +63 -62
package/skills/lessons-harness/SKILL.md +68 -0
package/skills/lessons-harness/agents/openai.yaml +4 -0
package/templates/harness/AGENTS.md +77 -0
package/templates/harness/feature_list.json +16 -0
package/templates/harness/init.sh +15 -0
package/templates/harness/lessons.md +18 -0
package/templates/harness/memory/README.md +22 -0
package/templates/harness/progress.md +33 -0
package/templates/harness/rotate-state.mjs +131 -0
package/templates/harness/verify-state.mjs +117 -0
package/templates/team/roles/evaluator.md +43 -0
package/templates/team/roles/implementer.md +29 -0
package/templates/team/roles/planner.md +28 -0
package/templates/team/sprint-template.md +36 -0
package/templates/team/verify-team.mjs +71 -0
package/templates/team/workflow.md +62 -0

package/templates/harness/verify-state.mjs ADDED Viewed

@@ -0,0 +1,117 @@
+#!/usr/bin/env node
+// Mechanical state gate for the repository harness.
+//
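+// Fails when a required harness artifact is missing, when progress.md or
+// lessons.md exceeds its line budget, when the working tree contains behavior
+// changes that lack matching updates to feature_list.json and progress.md, or
+// when the latest progress entry does not name the changed files in backticks.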
+//
+// Run it before committing: node verify-state.mjs
+// Customize BEHAVIOR_IGNORE_PATTERNS for your project. By default every
+// changed file counts as a behavior change except state files, Markdown
+// docs, and common non-behavior files.
+import { execFileSync } from "node:child_process";
+import { access, readFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+const ROOT = path.dirname(fileURLToPath(import.meta.url));
+const REQUIRED_FILES = ["AGENTS.md", "progress.md", "feature_list.json", "lessons.md", "init.sh"];
+const STATE_FILES = ["feature_list.json", "progress.md"];
+const BEHAVIOR_IGNORE_PATTERNS = [
+  /\.md$/,
+  /^\.agents\//,
+  /^memory\//,
+  /^team\//,
+  /^\.gitignore$/,
+  /^LICENSE$/,
+  /(^|\/)\.DS_Store$/,
+];
+const LINE_BUDGETS = {
+  "progress.md": 400,
+  "lessons.md": 400,
+};
+async function exists(filePath) {
+  try {
+    await access(filePath);
+    return true;
+  } catch {
+    return false;
+  }
+}
+function gitLines(args) {
+  try {
+    return execFileSync("git", args, { cwd: ROOT, encoding: "utf8" })
+      .split("\n")
+      .filter(Boolean);
+  } catch {
+    return [];
+  }
+}
+function changedFiles() {
+  const modified = gitLines(["diff", "--name-only", "HEAD", "--"]);
+  const untracked = gitLines(["ls-files", "--others", "--exclude-standard"]);
+  return [...new Set([...modified, ...untracked])];
+}
+function isBehaviorChange(relativePath) {
+  return !STATE_FILES.includes(relativePath) &&
+    !BEHAVIOR_IGNORE_PATTERNS.some((pattern) => pattern.test(relativePath));
+}
+function latestProgressEntry(progress) {
+  const entryStart = progress.lastIndexOf("\n## ");
+  return entryStart === -1 ? progress : progress.slice(entryStart + 1);
+}
+const errors = [];
+for (const relativePath of REQUIRED_FILES) {
+  if (!await exists(path.join(ROOT, relativePath))) {
+    errors.push(`${relativePath}: required harness artifact is missing`);
+  }
+}
+for (const [relativePath, budget] of Object.entries(LINE_BUDGETS)) {
+  if (await exists(path.join(ROOT, relativePath))) {
+    const lineCount = (await readFile(path.join(ROOT, relativePath), "utf8")).split("\n").length;
+    if (lineCount > budget) {
+      errors.push(
+        `${relativePath}: ${lineCount} lines exceeds the ${budget}-line budget; run node rotate-state.mjs to archive older entries`
+      );
+    }
+  }
+}
+const changed = changedFiles();
+const behaviorChanges = changed.filter(isBehaviorChange);
+if (behaviorChanges.length > 0) {
+  for (const stateFile of STATE_FILES) {
+    if (!changed.includes(stateFile)) {
+      errors.push(`${stateFile}: must be updated when behavior changes`);
+    }
+  }
+  if (changed.includes("progress.md") && await exists(path.join(ROOT, "progress.md"))) {
+    const progress = await readFile(path.join(ROOT, "progress.md"), "utf8");
+    const latestEntry = latestProgressEntry(progress);
+    for (const relativePath of behaviorChanges) {
+      if (!latestEntry.includes(`\`${relativePath}\``)) {
+        errors.push(`progress.md: latest entry must reference changed file ${relativePath}`);
+      }
+    }
+  }
+}
+if (errors.length > 0) {
+  for (const error of errors) {
+    console.error(error);
+  }
+  process.exitCode = 1;
+} else {
+  console.log("State gate passed.");
+}

package/templates/team/roles/evaluator.md ADDED Viewed

@@ -0,0 +1,43 @@
+# Role: Evaluator
+You grade the sprint against its contract with observed evidence. You are
+deliberately skeptical: the implementer's self-report is a claim, not proof.
+You never edit implementation code.
+## Inputs
+1. `team/sprints/S00X/sprint.md` — the only grading standard.
+2. The running application, tests, logs, API responses, database state.
+3. A fresh session: do not reuse the implementer's session or context.
+## Outputs
+- `team/sprints/S00X/evaluation.md`:
+```markdown
+# Evaluation: S00X
+## Checks Run
+- Command/check:
+- Result:
+- Artifact: (output, screenshot, response body, log line)
+## Findings
+- [ ] P0/P1/P2:
+  - Evidence:
+  - Repro:
+  - Suggested next step:
+## Verdict: pass | fail
+- Reason:
+```
+## Rules
+- Execute the contract's verification yourself; never trust reported results.
+- Exercise the real runtime: user paths end to end, the negative cases, and
+  the API/data assertions in the contract — not just the unit tests.
+- Every finding needs evidence and a repro; "feels wrong" is not a finding.
+- Grade against the contract, not your taste. Out-of-contract ideas go in a
+  note to the planner, not into the verdict.
+- `Verdict: pass` only when every "Done Means" item passed a check you ran.

package/templates/team/roles/implementer.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Role: Implementer
+You build exactly what the sprint contract says, prove it with the contract's
+checks, and hand off cleanly. You do not grade your own work beyond running
+the listed verification.
+## Inputs
+1. `team/sprints/S00X/sprint.md` (the contract — read it first).
+2. `AGENTS.md`, `progress.md`, `feature_list.json`.
+3. `./init.sh` must pass before you edit anything.
+## Outputs
+- Implementation scoped to the contract, one feature at a time.
+- Updated `feature_list.json` statuses backed by the contract's verification
+  commands.
+- A `progress.md` entry naming changed files, plus a descriptive commit.
+- Sprint status moved `building` → `evaluating` at handoff.
+## Rules
+- The contract is the scope authority. If you discover the contract is wrong,
+  stop and send it back to the planner; do not silently widen or shrink it.
+- Run the narrowest check that proves each "Done Means" item before marking
+  it done. `node verify-state.mjs` must pass before you commit.
+- When the evaluator returns `Verdict: fail`, work the findings list in
+  order; do not relitigate the contract.
+- Do not refactor unrelated code.

package/templates/team/roles/planner.md ADDED Viewed

@@ -0,0 +1,28 @@
+# Role: Planner
+You turn a short request into a sprint contract the implementer can build and
+the evaluator can grade. You do not write implementation code.
+## Inputs
+1. The user's request.
+2. `feature_list.json`, latest `progress.md` entries, and the codebase.
+3. `team/sprint-template.md`.
+## Outputs
+- `team/sprints/S00X/sprint.md` with status `planned`.
+- New or updated entries in `feature_list.json` for capabilities the sprint
+  creates (status `not_started`).
+## Rules
+- Scope one sprint smaller than what feels comfortable; the implementer can
+  always start the next sprint sooner.
+- Every "Done Means" item must be checkable by a command or an observable
+  runtime signal. If you cannot name the check, the item is not ready.
+- Name the user path, the API/data path, and at least one negative case.
+- List what is explicitly out of scope; drift starts where the contract is
+  silent.
+- If the request is ambiguous in a way that changes the contract, ask one
+  concise question instead of guessing.

package/templates/team/sprint-template.md ADDED Viewed

@@ -0,0 +1,36 @@
+# Sprint: S00X
+Status: planned
+<!-- planned | building | evaluating | done -->
+<!-- done requires evaluation.md with Verdict: pass — enforced by team/verify-team.mjs -->
+## Scope
+- Feature:
+- User path:
+- API/data path:
+- Likely files/modules:
+## Done Means
+- [ ] User can ...
+- [ ] API or data reflects ...
+- [ ] Error state handles ...
+- [ ] No regression in ...
+## Verification
+- Unit:
+- Integration:
+- Browser/API:
+- Log/metric/trace:
+## Evaluator Focus
+- Runtime behavior:
+- Negative cases:
+- UX or quality concerns:
+## Out of Scope
+- ...

package/templates/team/verify-team.mjs ADDED Viewed

@@ -0,0 +1,71 @@
+#!/usr/bin/env node
+// Team pipeline gate.
+//
+// A sprint may only carry "Status: done" when its directory contains an
+// evaluation.md with "Verdict: pass" written by the evaluator role. This
+// keeps the generator from grading its own work. A sprint directory that
+// has no sprint.md is reported as an error as well.
+//
+// Run from the project root: node team/verify-team.mjs
+import { access, readdir, readFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+const TEAM_ROOT = path.dirname(fileURLToPath(import.meta.url));
+const SPRINTS_ROOT = path.join(TEAM_ROOT, "sprints");
+async function exists(filePath) {
+  try {
+    await access(filePath);
+    return true;
+  } catch {
+    return false;
+  }
+}
+const errors = [];
+if (await exists(SPRINTS_ROOT)) {
+  const entries = await readdir(SPRINTS_ROOT, { withFileTypes: true });
+  for (const entry of entries) {
+    if (!entry.isDirectory()) {
+      continue;
+    }
+    const sprintDir = path.join(SPRINTS_ROOT, entry.name);
+    const sprintPath = path.join(sprintDir, "sprint.md");
+    const label = `team/sprints/${entry.name}`;
+    if (!await exists(sprintPath)) {
+      errors.push(`${label}/sprint.md: sprint contract is missing`);
+      continue;
+    }
+    const sprint = await readFile(sprintPath, "utf8");
+    const isDone = /^Status:\s*done\s*$/m.test(sprint);
+    if (!isDone) {
+      continue;
+    }
+    const evaluationPath = path.join(sprintDir, "evaluation.md");
+    if (!await exists(evaluationPath)) {
+      errors.push(`${label}: marked done but evaluation.md is missing`);
+      continue;
+    }
+    const evaluation = await readFile(evaluationPath, "utf8");
+    if (!/^#{0,4}\s*Verdict:\s*pass\b/m.test(evaluation)) {
+      errors.push(`${label}: marked done but evaluation.md has no "Verdict: pass"`);
+    }
+  }
+}
+if (errors.length > 0) {
+  for (const error of errors) {
+    console.error(error);
+  }
+  process.exitCode = 1;
+} else {
+  console.log("Team gate passed.");
+}

package/templates/team/workflow.md ADDED Viewed

@@ -0,0 +1,62 @@
+# Team Workflow
+This directory turns Codex sessions into a small production team. Each role is
+one Codex session bound to one contract file; roles hand work to each other
+through artifacts in `team/sprints/`, never through chat history.
+## Pipeline
+```
+PLAN                BUILD                   EVALUATE              SHIP
+planner session ──▶ implementer session ──▶ evaluator session ──▶ commit + release
+writes sprint.md    builds to contract      writes evaluation.md  status: done
+status: planned     status: building        status: evaluating    (gate enforced)
+```
+## When to use the team
+Use the full pipeline only when the task spans multiple files, runtime
+behavior, or subjective quality. For a small bug or feature, skip the team and
+use `$acceptance-contract` in a single session — the simplest sufficient
+structure wins.
+## Running a role
+Open a fresh Codex session and start with one line:
+```
+Act as the role defined in team/roles/planner.md for sprint S001.
+```
+Use the same line with `implementer.md` or `evaluator.md` for the other
+roles. One session, one role, one sprint. Do not let one session play two
+roles back to back; the evaluator must start without the implementer's
+context.
+## Starting a sprint
+1. Planner copies `team/sprint-template.md` to `team/sprints/S00X/sprint.md`
+   and fills it in. Status: `planned`.
+2. Implementer sets status `building`, works, then sets `evaluating` and
+   hands off with a commit.
+3. Evaluator writes `team/sprints/S00X/evaluation.md` with evidence and a
+   verdict. On `Verdict: fail`, the implementer iterates on the findings.
+4. Only after `Verdict: pass` may the sprint status become `done`.
+## Gate
+```
+node team/verify-team.mjs
+```
+Fails when any sprint marked `done` lacks an `evaluation.md` containing
+`Verdict: pass`. Run it together with `node verify-state.mjs` before
+committing.
+## Rules
+- The sprint contract is the only scope authority. Implementer does not widen
+  it; evaluator grades against it, not against taste.
+- The evaluator never edits implementation code. Findings go to
+  `evaluation.md`; fixes belong to the implementer's next pass.
+- Every handoff is a commit plus a `progress.md` entry, so any role can be
+  resumed by a brand-new session.