npm - codex-harness-engineering - Versions diffs - 0.1.4 - Mend

codex-harness-engineering 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/AGENTS.md +73 -0
package/README.md +136 -0
package/docs/harness-engineering/implementation-playbook.md +370 -0
package/docs/harness-engineering/index.md +61 -0
package/docs/harness-engineering/research-note.md +318 -0
package/docs/harness-engineering/sources.md +126 -0
package/package.json +38 -0
package/scripts/install-skills.mjs +104 -0
package/scripts/publish.sh +139 -0
package/scripts/verify-harness.mjs +175 -0
package/skills/acceptance-contract/SKILL.md +78 -0
package/skills/acceptance-contract/agents/openai.yaml +4 -0
package/skills/cleanup-harness/SKILL.md +90 -0
package/skills/cleanup-harness/agents/openai.yaml +4 -0
package/skills/creator-harness/SKILL.md +124 -0
package/skills/creator-harness/agents/openai.yaml +4 -0
package/skills/creator-harness/references/harness-artifacts.md +302 -0

package/scripts/publish.sh ADDED Viewed

@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+# Bump version, verify package contents, publish to npm, then commit/tag/push.
+#
+# Usage:
+#   ./scripts/publish.sh                    # patch bump
+#   ./scripts/publish.sh minor              # minor bump
+#   ./scripts/publish.sh major              # major bump
+#   ./scripts/publish.sh 0.2.0              # exact version
+#   ./scripts/publish.sh patch --otp 123456 # npm 2FA
+set -euo pipefail
+cd "$(dirname "$0")/.."
+usage() {
+  echo "Usage: ./scripts/publish.sh [patch|minor|major|x.y.z] [--otp 123456]"
+}
+# Write all arguments as one message to stderr, then exit with status 1.
+die() {
+  local message="$*"
+  echo "$message" >&2
+  exit 1
+}
+# Defaults: a patch bump, and an OTP taken from NPM_CONFIG_OTP when it is set.
+BUMP="patch"
+OTP="${NPM_CONFIG_OTP:-}"
+# An explicit version must be a full x.y.z, optionally followed by
+# prerelease and/or build metadata.
+VERSION_PATTERN='^[0-9]+\.[0-9]+\.[0-9]+(-[0-9A-Za-z.-]+)?(\+[0-9A-Za-z.-]+)?$'
+# Arguments may appear in any order; a later bump or OTP overrides an earlier one.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --otp)
+      [[ $# -ge 2 ]] || die "Missing value for --otp"
+      OTP="$2"
+      shift 2
+      ;;
+    --otp=*)
+      OTP="${1#--otp=}"
+      shift
+      ;;
+    patch|minor|major|prepatch|preminor|premajor|prerelease)
+      BUMP="$1"
+      shift
+      ;;
+    [0-9]*)
+      # Reject malformed versions such as "1.2" or "1abc" here, before any git
+      # or npm command has run, instead of letting npm version fail later.
+      [[ "$1" =~ $VERSION_PATTERN ]] || die "Invalid version: $1"
+      BUMP="$1"
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      usage >&2
+      die "Unknown argument: $1"
+      ;;
+  esac
+done
+PACKAGE_NAME="$(node -p "require('./package.json').name")"
+CURRENT_VERSION="$(node -p "require('./package.json').version")"
+BRANCH="$(git branch --show-current)"
+NEW_VERSION=""
+PUBLISHED=0
+rollback_version() {
+  if [[ "$PUBLISHED" -eq 0 && -n "$NEW_VERSION" ]]; then
+    echo ""
+    echo "Publish did not complete. Rolling package.json back to $CURRENT_VERSION..."
+    npm version "$CURRENT_VERSION" --no-git-tag-version >/dev/null 2>&1 || true
+  fi
+}
+trap rollback_version ERR
+# Preflight: a named branch, a clean working tree, and an authenticated npm
+# session are all required before anything is modified.
+if [[ -z "$BRANCH" ]]; then
+  die "Cannot determine current git branch."
+fi
+if [[ "$(git status --porcelain)" != "" ]]; then
+  echo "Working tree is not clean. Commit or stash changes before publishing."
+  git status --short
+  exit 1
+fi
+npm whoami >/dev/null 2>&1 || die "Not logged in to npm. Run: npm login"
+# Summarize the release that is about to be made.
+printf '%s\n' \
+  "Package: $PACKAGE_NAME" \
+  "Current version: $CURRENT_VERSION" \
+  "Bump: $BUMP" \
+  "Branch: $BRANCH" \
+  ""
+# Apply the bump without letting npm create a commit or tag; when the requested
+# exact version is already in package.json there is nothing to change.
+if [[ "$BUMP" != "$CURRENT_VERSION" ]]; then
+  NEW_VERSION="$(npm version "$BUMP" --no-git-tag-version)"
+  # npm prints the new version with a leading "v"; keep the bare number.
+  NEW_VERSION="${NEW_VERSION#v}"
+else
+  NEW_VERSION="$CURRENT_VERSION"
+  echo "Version is already $NEW_VERSION; skipping npm version."
+fi
+printf '%s\n' "New version: $NEW_VERSION" ""
+# Print a step title; the blank line between steps is emitted elsewhere.
+announce() {
+  echo "$1"
+}
+# Gate the upload: the test suite must pass, and the tarball contents are
+# listed for review before anything is published.
+announce "Running tests..."
+npm test
+echo ""
+announce "Checking package contents..."
+npm pack --dry-run
+echo ""
+announce "Publishing $PACKAGE_NAME@$NEW_VERSION to npm..."
+# The array always holds at least the base command; --otp is appended only
+# when a one-time password was provided.
+PUBLISH_ARGS=(publish --access public)
+[[ -z "$OTP" ]] || PUBLISH_ARGS+=(--otp "$OTP")
+npm "${PUBLISH_ARGS[@]}"
+# From here on the release is live, so rollback_version must not undo the bump.
+PUBLISHED=1
+# Record the published release in git: commit the version files, tag the
+# commit, and push the branch together with the release tag.
+echo ""
+echo "Committing and tagging release..."
+git add package.json
+if [[ -f package-lock.json ]]; then
+  git add package-lock.json
+fi
+if ! git diff --cached --quiet; then
+  git commit -m "chore: publish $PACKAGE_NAME@$NEW_VERSION"
+else
+  echo "No version file changes to commit."
+fi
+RELEASE_TAG="v$NEW_VERSION"
+# Look the name up under refs/tags/ only, so a branch or other ref that happens
+# to be called v<version> is not mistaken for an existing tag.
+if git rev-parse --quiet --verify "refs/tags/$RELEASE_TAG" >/dev/null; then
+  echo "Tag $RELEASE_TAG already exists; skipping tag creation."
+else
+  git tag "$RELEASE_TAG"
+fi
+echo ""
+echo "Pushing $BRANCH and tag $RELEASE_TAG..."
+# Push only the release tag; --tags would also publish unrelated local tags.
+git push origin "$BRANCH" "refs/tags/$RELEASE_TAG"
+echo ""
+echo "Done: $PACKAGE_NAME@$NEW_VERSION"
+echo "https://www.npmjs.com/package/$PACKAGE_NAME"

package/scripts/verify-harness.mjs ADDED Viewed

@@ -0,0 +1,175 @@
+#!/usr/bin/env node
+import { execFileSync } from "node:child_process";
+import { access, readdir, readFile } from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath, pathToFileURL } from "node:url";
+// Absolute path of the package root: the parent of this script's directory.
+const PACKAGE_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
+// Artifacts that must exist, as paths relative to the verified root.
+const REQUIRED_FILES = [
+  "AGENTS.md",
+  "README.md",
+  "progress.md",
+  "feature_list.json",
+  "init.sh",
+  "docs/harness-engineering/index.md",
+  "docs/harness-engineering/research-note.md",
+  "docs/harness-engineering/implementation-playbook.md",
+  "docs/harness-engineering/sources.md",
+  "skills/creator-harness/SKILL.md",
+  "skills/acceptance-contract/SKILL.md",
+  "skills/cleanup-harness/SKILL.md",
+];
+// Required artifacts that README.md must mention; AGENTS.md and README.md
+// themselves are exempt.
+const README_MAPPED_FILES = REQUIRED_FILES.filter(
+  (relativePath) => relativePath !== "AGENTS.md" && relativePath !== "README.md"
+);
+// Files that must be part of the change set whenever a behavior-affecting
+// path (see BEHAVIOR_CHANGE_PATTERNS) is part of it.
+const STATE_FILES = ["feature_list.json", "progress.md"];
+// A changed path matching any of these patterns counts as an implementation
+// or harness behavior change.
+const BEHAVIOR_CHANGE_PATTERNS = [
+  /^(scripts|tests|skills)\//,
+  /^package(?:-lock)?\.json$/,
+  /^init\.sh$/,
+  /^AGENTS\.md$/,
+];
+async function exists(filePath) {
+  try {
+    await access(filePath);
+    return true;
+  } catch {
+    return false;
+  }
+}
+async function markdownFiles(directory, relativeRoot) {
+  if (!await exists(directory)) {
+    return [];
+  }
+  const found = [];
+  const entries = await readdir(directory, { withFileTypes: true });
+  for (const entry of entries) {
+    const absolutePath = path.join(directory, entry.name);
+    const relativePath = path.join(relativeRoot, entry.name);
+    if (entry.isDirectory()) {
+      found.push(...await markdownFiles(absolutePath, relativePath));
+    } else if (entry.isFile() && entry.name.endsWith(".md")) {
+      found.push(relativePath);
+    }
+  }
+  return found;
+}
+function changesHarnessBehavior(relativePath) {
+  return BEHAVIOR_CHANGE_PATTERNS.some((pattern) => pattern.test(relativePath));
+}
+function changedFilesInGit(root) {
+  try {
+    const modified = execFileSync("git", ["diff", "--name-only", "HEAD", "--"], {
+      cwd: root,
+      encoding: "utf8",
+    });
+    const untracked = execFileSync("git", ["ls-files", "--others", "--exclude-standard"], {
+      cwd: root,
+      encoding: "utf8",
+    });
+    return [...new Set(`${modified}\n${untracked}`.split("\n").filter(Boolean))];
+  } catch {
+    return [];
+  }
+}
+function latestProgressEntry(progress) {
+  const entryStart = progress.lastIndexOf("\n## ");
+  return entryStart === -1 ? progress : progress.slice(entryStart + 1);
+}
+/**
+ * Run every harness check against a directory and collect the violations.
+ *
+ * @param {string} [root=PACKAGE_ROOT] - Directory to verify.
+ * @param {object} [options]
+ * @param {string[]} [options.changedFiles=[]] - Paths, relative to `root`,
+ *   that are considered changed; drives the state-file checks.
+ * @returns {Promise<string[]>} One message per violation, in check order;
+ *   empty when every check passes.
+ */
+export async function verifyHarness(root = PACKAGE_ROOT, { changedFiles = [] } = {}) {
+  const errors = [];
+  // Check 1: every required artifact exists.
+  for (const relativePath of REQUIRED_FILES) {
+    if (!await exists(path.join(root, relativePath))) {
+      errors.push(`${relativePath}: required artifact is missing`);
+    }
+  }
+  // Check 2: README.md mentions each mapped artifact wrapped in backticks.
+  // A missing README.md is already reported by check 1.
+  const readmePath = path.join(root, "README.md");
+  if (await exists(readmePath)) {
+    const readme = await readFile(readmePath, "utf8");
+    for (const relativePath of README_MAPPED_FILES) {
+      if (!readme.includes(`\`${relativePath}\``)) {
+        errors.push(`README.md: required artifact ${relativePath} is not referenced`);
+      }
+    }
+  }
+  // Check 3: every [S<number>] citation in the scanned Markdown files is
+  // within 1..5.
+  const scannableFiles = [
+    "AGENTS.md",
+    "README.md",
+    ...await markdownFiles(path.join(root, "docs", "harness-engineering"), "docs/harness-engineering"),
+    ...await markdownFiles(path.join(root, "skills"), "skills"),
+  ];
+  for (const relativePath of scannableFiles) {
+    const filePath = path.join(root, relativePath);
+    if (!await exists(filePath)) {
+      continue;
+    }
+    const contents = await readFile(filePath, "utf8");
+    for (const match of contents.matchAll(/\[S(\d+)\]/g)) {
+      const sourceNumber = Number(match[1]);
+      if (sourceNumber < 1 || sourceNumber > 5) {
+        errors.push(`${relativePath}: citation ${match[0]} is outside permitted range [S1]-[S5]`);
+      }
+    }
+  }
+  // Check 4: when a behavior-affecting path changed, both state files must
+  // be part of the change set as well.
+  const behaviorChanges = changedFiles.filter(changesHarnessBehavior);
+  if (behaviorChanges.length > 0) {
+    for (const stateFile of STATE_FILES) {
+      if (!changedFiles.includes(stateFile)) {
+        errors.push(
+          `${stateFile}: must be updated when implementation or harness behavior changes`
+        );
+      }
+    }
+    // Check 5: a changed progress.md must name every changed behavior
+    // artifact, in backticks, inside its latest entry.
+    if (changedFiles.includes("progress.md") && await exists(path.join(root, "progress.md"))) {
+      const progress = await readFile(path.join(root, "progress.md"), "utf8");
+      const latestEntry = latestProgressEntry(progress);
+      for (const relativePath of behaviorChanges) {
+        if (!latestEntry.includes(`\`${relativePath}\``)) {
+          errors.push(
+            `progress.md: latest entry must reference changed behavior artifact ${relativePath}`
+          );
+        }
+      }
+    }
+  }
+  return errors;
+}
+function isDirectRun() {
+  return Boolean(process.argv[1]) &&
+    pathToFileURL(path.resolve(process.argv[1])).href === import.meta.url;
+}
+if (isDirectRun()) {
+  const errors = await verifyHarness(PACKAGE_ROOT, {
+    changedFiles: changedFilesInGit(PACKAGE_ROOT),
+  });
+  if (errors.length > 0) {
+    for (const error of errors) {
+      console.error(error);
+    }
+    process.exitCode = 1;
+  } else {
+    console.log("Harness verification passed.");
+  }
+}

package/skills/acceptance-contract/SKILL.md ADDED Viewed

@@ -0,0 +1,78 @@
+---
+name: acceptance-contract
+description: Use when a user asks to define success criteria, clarify scope, prevent premature done claims, or prepare an AI agent/coding agent task before implementation.
+---
+# Acceptance Contract
+## Core Principle
+Turn an unclear request into a small, verifiable contract before implementation.
+Use this skill when "done" is ambiguous, the task could drift, or an agent may
+claim completion without evidence.
+In this repository, follow the local source policy: use only `[S1]-[S5]` for
+harness claims. Read `docs/harness-engineering/sources.md` only when you need to
+check that policy. For templates, prefer the relevant section of
+`docs/harness-engineering/implementation-playbook.md` instead of loading the
+whole research note.
+## Workflow
+1. State assumptions in one short list.
+2. Name any ambiguity that changes implementation or verification.
+3. Keep the scope narrow: cover only what the request requires, not everything
+   the implementation could touch.
+4. Define user-visible or system-visible behavior.
+5. Define acceptance criteria that can be checked.
+6. Define verification commands or observable signals.
+7. Mark non-goals so the agent does not widen the task.
+8. Implement only after the contract is clear enough to verify.
+If the missing information cannot be inferred safely, ask one concise question
+before writing code.
+## Contract Template
+```markdown
+# Acceptance Contract
+## Assumptions
+- ...
+## Scope
+- Feature/fix:
+- User-visible behavior:
+- Likely files:
+## Acceptance Criteria
+- [ ] ...
+- [ ] ...
+## Verification
+- Unit:
+- Integration:
+- Browser/API:
+- Log/metric/trace:
+## Out of Scope
+- ...
+```
+## Verification Rules
+- Prefer an existing project command over a new script.
+- For code changes, run the narrowest test that proves the criteria.
+- For UI/runtime behavior, use browser, API, log, metric, trace, or screenshot
+  evidence when available.
+- Do not mark criteria done until verification has run or the skipped check is
+  explicitly explained.
+## Source Mapping
+- Small tasks should use the simplest sufficient workflow [S3].
+- Long-running agent tasks need state and verification to avoid early done
+  claims [S2].
+- Runtime-visible checks improve agent feedback loops [S1], [S2], [S4].
+- Sprint contracts and evaluator criteria help when task quality is subjective
+  or multi-step [S4].
+- Trajectory evaluation and LLM-as-a-judge monitor the quality of the execution
+  path; AutoHarness enforces constraints when manual rules are too complex [S5].

package/skills/acceptance-contract/agents/openai.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+interface:
+  display_name: "Acceptance Contract"
+  short_description: "Define scope, done criteria, and checks"
+  default_prompt: "Use $acceptance-contract to define scope, acceptance criteria, and verification for this task."

package/skills/cleanup-harness/SKILL.md ADDED Viewed

@@ -0,0 +1,90 @@
+---
+name: cleanup-harness
+description: Use when a user asks to design, scope, or run cleanup for agent-created code, documentation drift, repeated review defects, architecture drift, or accumulated harness debt.
+---
+# Cleanup Harness
+## Core Principle
+Treat cleanup as a scoped harness task, not opportunistic refactoring. Cleanup
+needs a trigger, acceptance criteria, verification, and rollback path because
+high agent throughput can spread weak patterns quickly.
+In this repository, follow the local source policy: use only `[S1]-[S5]` for
+harness claims. Read `docs/harness-engineering/sources.md` only when you need to
+check that policy. For cleanup templates, prefer the relevant section of
+`docs/harness-engineering/implementation-playbook.md` instead of loading the
+whole research note.
+## Cleanup Triggers
+Start a cleanup task only when at least one trigger is visible:
+- the same helper, workaround, or pattern appears repeatedly;
+- a feature bypasses an architecture boundary;
+- progress logs repeat the same failure;
+- evaluator or review feedback catches the same defect class multiple times;
+- docs, indexes, or `AGENTS.md` drift from the repository state;
+- new work adds workaround code instead of fixing the cause.
+If no trigger is visible, mention the potential issue but do not edit unrelated
+code.
+## Workflow
+1. Identify the concrete trigger and evidence.
+2. Define the smallest cleanup scope that removes the repeated problem.
+3. List files likely to change.
+4. Define acceptance criteria.
+5. Define verification commands or observable signals.
+6. Remove only debt inside the declared scope.
+7. Convert repeated judgment into a mechanical guardrail when practical.
+8. Record what was verified and any residual risk.
+## Cleanup Task Template
+```markdown
+# Cleanup Task
+## Trigger
+- Evidence:
+## Scope
+- Clean up:
+- Likely files:
+## Acceptance Criteria
+- [ ] Duplicate or drift source is removed.
+- [ ] Behavior remains unchanged unless explicitly requested.
+- [ ] Guardrail is added or the reason for not adding one is stated.
+## Verification
+- Tests:
+- Lint/structural check:
+- Runtime check:
+## Rollback
+- Safe restore point:
+```
+## Guardrail Guidance
+Prefer a mechanical check when the same issue is likely to recur:
+- lint or structural test for architecture boundaries;
+- doc/index freshness check for repository source of truth;
+- smoke test for setup or runtime drift;
+- evaluator rubric for repeated subjective quality failures.
+Do not add broad rules that protect no concrete invariant.
+## Source Mapping
+- Cleanup is part of repository-level harness maintenance when throughput
+  increases entropy [S1].
+- Mechanical guardrails are stronger than prose for repeated invariants [S1].
+- Keep the intervention as simple as the failure mode allows [S3].
+- Long-running work benefits from explicit state, verification, and recovery
+  points [S2], [S4].
+- AutoHarness can automatically enforce code constraints to reduce cleanup
+  debt, and trajectory evaluation tracks whether cleanup alters agent execution
+  paths [S5].

package/skills/cleanup-harness/agents/openai.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+interface:
+  display_name: "Cleanup Harness"
+  short_description: "Scope cleanup with triggers and checks"
+  default_prompt: "Use $cleanup-harness to scope a cleanup task with trigger evidence, acceptance criteria, and verification."

package/skills/creator-harness/SKILL.md ADDED Viewed

@@ -0,0 +1,124 @@
+---
+name: creator-harness
+description: Use when a user asks to create, design, audit, or improve a harness for AI agents, coding agents, long-running work, eval loops, repository workflows, or agent operating procedures.
+---
+# Creator Harness
+## Core Principle
+Create the smallest harness that changes agent behavior. A harness is the
+control plane around an agent: durable state, readable tools, verification
+loops, evaluator feedback when needed, and mechanical guardrails.
+Use only the local five-source research as the source of truth:
+- `docs/harness-engineering/sources.md`
+- `docs/harness-engineering/research-note.md`
+- `docs/harness-engineering/implementation-playbook.md`
+Do not introduce external harness resources unless the user explicitly asks to
+expand beyond the five OpenAI/Anthropic/Google articles.
+## Working Rules
+1. State assumptions before creating files. If the target agent, runtime, or
+   success criteria are unknowable, ask one concise question.
+2. Start with a single-agent harness plus state and verification. Add planner,
+   evaluator, telemetry, or cleanup automation only when a named failure mode
+   requires it.
+3. Touch only harness artifacts unless the user explicitly asks for product code
+   changes.
+4. Every harness artifact must answer at least one question: What should the
+   agent know? What state survives context loss? What can it observe? How does
+   it verify? What invariant is mechanically enforced?
+5. Convert important preferences into checks where practical: tests, lint,
+   scripts, CI jobs, evaluator rubrics, or reviewer contracts.
+6. For one-shot Markdown or research-note edits in this repository, do not start
+   autonomous loops unless the user explicitly requests them.
+## Design Workflow
+1. Inventory existing harness surface:
+   - `AGENTS.md`, `README.md`, architecture docs, product specs;
+   - setup scripts, task runner, CI, tests, smoke tests;
+   - progress logs, feature lists, todos, research state;
+   - eval prompts, evaluator rubrics, screenshots, traces, telemetry;
+   - tool contracts, permissions, escalation rules.
+2. Name the failure modes:
+   - lost context across sessions;
+   - early "done" claims;
+   - weak runtime observability;
+   - overbroad implementation;
+   - self-evaluation optimism;
+   - architecture drift;
+   - cleanup debt from high agent throughput.
+3. Pick the minimal intervention:
+   - unclear task: acceptance contract;
+   - lost context: `progress.md`, `feature_list.json`, git protocol;
+   - broken environment: `init.sh`, smoke test;
+   - invisible runtime: browser/API/log/metric/trace checks;
+   - weak self-review: evaluator rubric or separate evaluator pass;
+   - drift: structural lint or architecture test;
+   - throughput entropy: targeted cleanup task with verification;
+   - complex constraints: AutoHarness synthesized code wrapper [S5];
+   - agent trajectory drift: trajectory evaluation and LLM-as-a-judge [S5].
+4. Write a harness contract:
+   - agent role and allowed scope;
+   - durable state files;
+   - required tools and observable signals;
+   - verification commands;
+   - loop cadence;
+   - stop/escalation conditions;
+   - out-of-scope work.
+5. Create only the needed files. For templates, read
+   `references/harness-artifacts.md`.
+6. Verify the harness:
+   - run syntax/format validators for files created;
+   - run the declared smoke test if one exists;
+   - run the placeholder and citation scan from `AGENTS.md`;
+   - verify no recurring automation was created for a one-shot documentation
+     task;
+   - if editing this skill, validate the skill if a validator exists locally.
+## Harness Types
+| Situation                   | Default harness                                           |
+| --------------------------- | --------------------------------------------------------- |
+| Small bug or feature        | Acceptance criteria and a verification command            |
+| Multi-session coding        | `progress.md`, `feature_list.json`, `init.sh`, smoke test |
+| UI/runtime-heavy app        | Sprint contract, browser/API checks, evaluator notes      |
+| Long application build      | Planner, generator, evaluator, sprint contract            |
+| Architecture-sensitive repo | Dependency rules, structural tests, cleanup cadence       |
+| Complex or rule-heavy env   | AutoHarness (wrapper), Trajectory evaluation / VeRO       |
+## Output Shape
+When answering without file edits, produce:
+```markdown
+## Assumptions
+- ...
+## Failure Modes
+- ...
+## Minimal Harness
+- Artifact:
+- Purpose:
+- Verification:
+## Next Step
+- ...
+```
+When editing files, summarize changed files and verification run.

package/skills/creator-harness/agents/openai.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+interface:
+  display_name: "Creator Harness"
+  short_description: "Design practical agent harnesses"
+  default_prompt: "Use $creator-harness to design a minimal harness for this repository."