codex-harness-engineering 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env bash
2
+ # Bump version, run tests, verify package contents, publish to npm, then commit/tag/push.
3
+ #
4
+ # Usage:
5
+ # ./scripts/publish.sh # patch bump
6
+ # ./scripts/publish.sh minor # minor bump
7
+ # ./scripts/publish.sh major # major bump
8
+ # ./scripts/publish.sh 0.2.0 # exact version
9
+ # ./scripts/publish.sh patch --otp 123456 # npm 2FA
10
+
11
+ set -euo pipefail
12
+
13
+ cd "$(dirname "$0")/.."
14
+
15
# Print the one-line command synopsis to stdout.
usage() {
  printf '%s\n' "Usage: ./scripts/publish.sh [patch|minor|major|x.y.z] [--otp 123456]"
}
18
+
19
# Write all arguments as a single message to stderr and terminate the
# script with exit status 1.
die() {
  >&2 echo "$*"
  exit 1
}
23
+
24
# Command-line handling.
#   BUMP - bump keyword or explicit version handed to `npm version` (default: patch)
#   OTP  - one-time password for npm 2FA; seeded from NPM_CONFIG_OTP when set
OTP="${NPM_CONFIG_OTP:-}"
BUMP="patch"

while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --otp=*)
      # Inline form: strip the "--otp=" prefix and keep the remainder.
      OTP="${1#--otp=}"
      shift
      ;;
    --otp)
      # Separate form: the value is the next positional argument.
      if (( $# < 2 )); then
        die "Missing value for --otp"
      fi
      OTP="$2"
      shift 2
      ;;
    patch|minor|major|prepatch|preminor|premajor|prerelease|[0-9]*)
      # Bump keyword, or anything starting with a digit (treated as a version).
      BUMP="$1"
      shift
      ;;
    *)
      usage >&2
      die "Unknown argument: $1"
      ;;
  esac
done
52
+
53
# Release state captured before anything is modified.
PACKAGE_NAME="$(node -p "require('./package.json').name")"
CURRENT_VERSION="$(node -p "require('./package.json').version")"
BRANCH="$(git branch --show-current)"
NEW_VERSION=""   # filled in once the target version is known
PUBLISHED=0      # set to 1 only after `npm publish` has succeeded

# Restore package.json to CURRENT_VERSION when the script stops after the
# version was bumped but before the package was published.
# Does nothing when publish succeeded, when no bump happened yet, or when the
# requested version equals the current one (there is nothing to undo).
rollback_version() {
  if [[ "$PUBLISHED" -ne 0 || -z "$NEW_VERSION" || "$NEW_VERSION" == "$CURRENT_VERSION" ]]; then
    return 0
  fi

  echo ""
  echo "Publish did not complete. Rolling package.json back to $CURRENT_VERSION..."
  if ! npm version "$CURRENT_VERSION" --no-git-tag-version >/dev/null 2>&1; then
    # Best effort: never mask the original failure, but do not hide a failed rollback.
    echo "Warning: could not restore version $CURRENT_VERSION; check package.json manually." >&2
  fi
}

# Run the rollback on every exit path, not only on command failures: an EXIT
# trap also fires when `set -e` aborts the script. Interrupts are converted
# into regular exits so Ctrl-C or a kill during tests/publish still triggers it.
trap rollback_version EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM
68
+
69
# ---- Preflight checks (nothing has been modified at this point) ----
[[ -n "$BRANCH" ]] || die "Cannot determine current git branch."

# Refuse to publish with uncommitted or untracked changes. Diagnostics go to
# stderr because the script exits with a failure status.
if [[ -n "$(git status --porcelain)" ]]; then
  echo "Working tree is not clean. Commit or stash changes before publishing." >&2
  git status --short >&2
  exit 1
fi

if ! npm whoami >/dev/null 2>&1; then
  die "Not logged in to npm. Run: npm login"
fi

echo "Package: $PACKAGE_NAME"
echo "Current version: $CURRENT_VERSION"
echo "Bump: $BUMP"
echo "Branch: $BRANCH"
echo ""

# ---- Version bump ----
# `npm version` refuses to set the version it already has, so an exact version
# equal to the current one skips the bump instead of failing.
if [[ "$BUMP" == "$CURRENT_VERSION" ]]; then
  NEW_VERSION="$CURRENT_VERSION"
  echo "Version is already $NEW_VERSION; skipping npm version."
else
  NEW_VERSION="$(npm version "$BUMP" --no-git-tag-version)"
  NEW_VERSION="${NEW_VERSION#v}"   # npm prints "vX.Y.Z"; keep the bare version
fi

echo "New version: $NEW_VERSION"
echo ""

# ---- Verify, then publish ----
echo "Running tests..."
npm test

echo ""
echo "Checking package contents..."
npm pack --dry-run

echo ""
echo "Publishing $PACKAGE_NAME@$NEW_VERSION to npm..."
PUBLISH_ARGS=(publish --access public)
if [[ -n "$OTP" ]]; then
  PUBLISH_ARGS+=(--otp "$OTP")
fi
npm "${PUBLISH_ARGS[@]}"
PUBLISHED=1   # from here on a failure must not roll the version back

# ---- Record the release in git ----
echo ""
echo "Committing and tagging release..."
git add package.json
if [[ -f package-lock.json ]]; then
  git add package-lock.json
fi

if ! git diff --cached --quiet; then
  git commit -m "chore: publish $PACKAGE_NAME@$NEW_VERSION"
else
  echo "No version file changes to commit."
fi

# Look the tag up under refs/tags/ so a branch or other ref with the same
# name is not mistaken for an existing release tag.
RELEASE_TAG="v$NEW_VERSION"
if git rev-parse --verify --quiet "refs/tags/$RELEASE_TAG" >/dev/null; then
  echo "Tag $RELEASE_TAG already exists; skipping tag creation."
else
  git tag "$RELEASE_TAG"
fi

# Push only the release tag; `--tags` would also publish every unrelated
# local tag that happens to exist in this clone.
echo ""
echo "Pushing $BRANCH and tag $RELEASE_TAG..."
git push origin "$BRANCH" "refs/tags/$RELEASE_TAG"

echo ""
echo "Done: $PACKAGE_NAME@$NEW_VERSION"
echo "https://www.npmjs.com/package/$PACKAGE_NAME"
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env node
2
+
3
import { execFileSync } from "node:child_process";
import { realpathSync } from "node:fs";
import { access, readdir, readFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
7
+
8
// Directory one level above the directory that contains this module.
const PACKAGE_ROOT = path.resolve(fileURLToPath(import.meta.url), "..", "..");

// Artifacts that must exist, expressed relative to the root being verified.
const REQUIRED_FILES = [
  "AGENTS.md",
  "README.md",
  "progress.md",
  "feature_list.json",
  "init.sh",
  "docs/harness-engineering/index.md",
  "docs/harness-engineering/research-note.md",
  "docs/harness-engineering/implementation-playbook.md",
  "docs/harness-engineering/sources.md",
  "skills/creator-harness/SKILL.md",
  "skills/acceptance-contract/SKILL.md",
  "skills/cleanup-harness/SKILL.md",
];

// Required artifacts that README.md has to mention: everything except
// AGENTS.md and README.md themselves.
const README_MAPPED_FILES = REQUIRED_FILES.filter(
  (relativePath) => !["AGENTS.md", "README.md"].includes(relativePath)
);

// State files that must be touched whenever a behavior-changing path changes.
const STATE_FILES = ["feature_list.json", "progress.md"];

// A changed path matching any of these patterns counts as a behavior change.
const BEHAVIOR_CHANGE_PATTERNS = [
  /^(scripts|tests|skills)\//,
  /^package(?:-lock)?\.json$/,
  /^init\.sh$/,
  /^AGENTS\.md$/,
];
33
+
34
/**
 * Report whether `filePath` can be accessed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `access` succeeds, `false` when it
 *   rejects for any reason.
 */
async function exists(filePath) {
  return access(filePath).then(
    () => true,
    () => false
  );
}
42
+
43
/**
 * Recursively collect the Markdown files (`*.md`) below `directory`.
 *
 * @param {string} directory - Directory to scan on disk.
 * @param {string} relativeRoot - Prefix for the returned paths, i.e. the
 *   location of `directory` relative to the root being verified.
 * @returns {Promise<string[]>} Paths under `relativeRoot`, always joined with
 *   `/` and listed in name order so results are identical on every platform
 *   and filesystem. A missing directory yields an empty list; any other
 *   `readdir` failure is rethrown.
 */
async function markdownFiles(directory, relativeRoot) {
  let entries;
  try {
    // Read directly instead of probing first: avoids a check-then-read race.
    entries = await readdir(directory, { withFileTypes: true });
  } catch (error) {
    if (error?.code === "ENOENT") {
      return [];
    }
    throw error;
  }

  // readdir order is filesystem dependent; sort for deterministic output.
  entries.sort((left, right) => {
    if (left.name < right.name) {
      return -1;
    }
    return left.name > right.name ? 1 : 0;
  });

  const found = [];
  for (const entry of entries) {
    // Reported paths use "/" like the roots callers pass in, on every OS.
    const relativePath = path.posix.join(relativeRoot, entry.name);

    if (entry.isDirectory()) {
      found.push(...await markdownFiles(path.join(directory, entry.name), relativePath));
    } else if (entry.isFile() && entry.name.endsWith(".md")) {
      found.push(relativePath);
    }
  }

  return found;
}
64
+
65
/**
 * Tell whether a changed path counts as a harness behavior change.
 *
 * @param {string} relativePath - Path relative to the repository root.
 * @returns {boolean} `true` when the path matches at least one entry of
 *   `BEHAVIOR_CHANGE_PATTERNS`.
 */
function changesHarnessBehavior(relativePath) {
  for (const pattern of BEHAVIOR_CHANGE_PATTERNS) {
    if (pattern.test(relativePath)) {
      return true;
    }
  }
  return false;
}
68
+
69
/**
 * List the files that differ from `HEAD` plus untracked, non-ignored files.
 *
 * Both git commands run with `-z`, so paths arrive NUL-separated and
 * verbatim. Without it git C-quotes paths containing non-ASCII or special
 * characters (for example `"scripts/t\303\251st.js"`), and such entries would
 * never match the plain-path checks applied to the result.
 *
 * @param {string} root - Working directory in which git is invoked.
 * @returns {string[]} De-duplicated paths as git reports them (relative to the
 *   repository, `/`-separated). Returns an empty list when either git command
 *   cannot be run or fails; this lookup is deliberately best effort.
 */
function changedFilesInGit(root) {
  const runGit = (args) => execFileSync("git", args, { cwd: root, encoding: "utf8" });

  try {
    const modified = runGit(["diff", "--name-only", "-z", "HEAD", "--"]);
    const untracked = runGit(["ls-files", "--others", "--exclude-standard", "-z"]);
    return [...new Set(`${modified}\0${untracked}`.split("\0").filter(Boolean))];
  } catch {
    // git missing, not a repository, no commits yet, ...: report no changes.
    return [];
  }
}
84
+
85
/**
 * Extract the last `## ` section of a progress log.
 *
 * @param {string} progress - Full text of the progress file.
 * @returns {string} Text from the last line that starts with `## ` to the end
 *   of the input, or the whole input when no such heading follows a newline.
 */
function latestProgressEntry(progress) {
  const headingMarker = "\n## ";
  const markerIndex = progress.lastIndexOf(headingMarker);
  if (markerIndex < 0) {
    return progress;
  }
  // Skip the newline itself so the result begins at the heading.
  return progress.slice(markerIndex + 1);
}
89
+
90
/** Collect one error per `REQUIRED_FILES` entry that is absent under `root`. */
async function missingArtifactErrors(root) {
  const errors = [];
  for (const relativePath of REQUIRED_FILES) {
    if (!await exists(path.join(root, relativePath))) {
      errors.push(`${relativePath}: required artifact is missing`);
    }
  }
  return errors;
}

/** Collect one error per `README_MAPPED_FILES` entry that README.md does not mention in backticks. */
async function readmeReferenceErrors(root) {
  const readmePath = path.join(root, "README.md");
  if (!await exists(readmePath)) {
    // A missing README is reported by the required-artifact check.
    return [];
  }

  const readme = await readFile(readmePath, "utf8");
  return README_MAPPED_FILES
    .filter((relativePath) => !readme.includes(`\`${relativePath}\``))
    .map((relativePath) => `README.md: required artifact ${relativePath} is not referenced`);
}

/** Collect one error per `[S<n>]` citation outside 1..5 in AGENTS.md, README.md, and the docs/skills Markdown files. */
async function citationErrors(root) {
  const scannableFiles = [
    "AGENTS.md",
    "README.md",
    ...await markdownFiles(path.join(root, "docs", "harness-engineering"), "docs/harness-engineering"),
    ...await markdownFiles(path.join(root, "skills"), "skills"),
  ];

  const errors = [];
  for (const relativePath of scannableFiles) {
    const filePath = path.join(root, relativePath);
    if (!await exists(filePath)) {
      continue;
    }

    const contents = await readFile(filePath, "utf8");
    for (const match of contents.matchAll(/\[S(\d+)\]/g)) {
      const sourceNumber = Number(match[1]);
      if (sourceNumber < 1 || sourceNumber > 5) {
        errors.push(`${relativePath}: citation ${match[0]} is outside permitted range [S1]-[S5]`);
      }
    }
  }
  return errors;
}

/** Collect errors for behavior changes that lack matching state-file updates or progress.md references. */
async function stateUpdateErrors(root, changedFiles) {
  const behaviorChanges = changedFiles.filter(changesHarnessBehavior);
  if (behaviorChanges.length === 0) {
    return [];
  }

  const changed = new Set(changedFiles);
  const errors = [];
  for (const stateFile of STATE_FILES) {
    if (!changed.has(stateFile)) {
      errors.push(
        `${stateFile}: must be updated when implementation or harness behavior changes`
      );
    }
  }

  const progressPath = path.join(root, "progress.md");
  if (changed.has("progress.md") && await exists(progressPath)) {
    const latestEntry = latestProgressEntry(await readFile(progressPath, "utf8"));
    for (const relativePath of behaviorChanges) {
      // The latest entry has to name each changed artifact in backticks.
      if (!latestEntry.includes(`\`${relativePath}\``)) {
        errors.push(
          `progress.md: latest entry must reference changed behavior artifact ${relativePath}`
        );
      }
    }
  }
  return errors;
}

/**
 * Verify the harness artifacts below `root`.
 *
 * Runs four checks in this order and concatenates their findings:
 * 1. every required artifact exists;
 * 2. README.md references each mapped artifact in backticks;
 * 3. `[S<n>]` citations in the scanned Markdown files stay within 1..5;
 * 4. when `changedFiles` contains behavior-changing paths, both state files
 *    are among the changed files and the latest progress.md entry references
 *    each changed behavior artifact.
 *
 * @param {string} [root=PACKAGE_ROOT] - Directory to verify.
 * @param {{ changedFiles?: string[] }} [options] - `changedFiles` lists changed
 *   paths relative to `root`; with the default empty list check 4 reports nothing.
 * @returns {Promise<string[]>} Human-readable error messages; an empty array
 *   means no problems were found.
 */
export async function verifyHarness(root = PACKAGE_ROOT, { changedFiles = [] } = {}) {
  return [
    ...await missingArtifactErrors(root),
    ...await readmeReferenceErrors(root),
    ...await citationErrors(root),
    ...await stateUpdateErrors(root, changedFiles),
  ];
}
156
+
157
/**
 * Tell whether this module is the script Node was started with, as opposed to
 * being imported by another module.
 *
 * `process.argv[1]` keeps the path exactly as it was invoked, while
 * `import.meta.url` normally refers to the real file. When the script is
 * launched through a symlink the two differ, so the entry path is compared
 * both as given and with symlinks resolved.
 *
 * @returns {boolean} `true` when `process.argv[1]` designates this module.
 */
function isDirectRun() {
  const entryPoint = process.argv[1];
  if (!entryPoint) {
    return false;
  }

  const invokedPath = path.resolve(entryPoint);
  if (pathToFileURL(invokedPath).href === import.meta.url) {
    return true;
  }

  try {
    return pathToFileURL(realpathSync(invokedPath)).href === import.meta.url;
  } catch {
    // The entry path cannot be resolved (e.g. it does not exist): not this module.
    return false;
  }
}
161
+
162
// Command-line entry: verify the package this script belongs to, print every
// finding to stderr, and signal failure through the process exit code.
if (isDirectRun()) {
  const changedFiles = changedFilesInGit(PACKAGE_ROOT);
  const errors = await verifyHarness(PACKAGE_ROOT, { changedFiles });

  if (errors.length === 0) {
    console.log("Harness verification passed.");
  } else {
    errors.forEach((message) => {
      console.error(message);
    });
    process.exitCode = 1;
  }
}
@@ -0,0 +1,78 @@
1
+ ---
2
+ name: acceptance-contract
3
+ description: Use when a user asks to define success criteria, clarify scope, prevent premature done claims, or prepare an AI agent/coding agent task before implementation.
4
+ ---
5
+
6
+ # Acceptance Contract
7
+
8
+ ## Core Principle
9
+
10
+ Turn an unclear request into a small, verifiable contract before implementation.
11
+ Use this skill when "done" is ambiguous, the task could drift, or an agent may
12
+ claim completion without evidence.
13
+
14
+ In this repository, follow the local source policy: use only `[S1]-[S5]` for
15
+ harness claims. Read `docs/harness-engineering/sources.md` only when you need to
16
+ check that policy. For templates, prefer the relevant section of
17
+ `docs/harness-engineering/implementation-playbook.md` instead of loading the
18
+ whole research note.
19
+
20
+ ## Workflow
21
+
22
+ 1. State assumptions in one short list.
23
+ 2. Name any ambiguity that changes implementation or verification.
24
+ 3. Keep the scope smaller than the implementation work.
25
+ 4. Define user-visible or system-visible behavior.
26
+ 5. Define acceptance criteria that can be checked.
27
+ 6. Define verification commands or observable signals.
28
+ 7. Mark non-goals so the agent does not widen the task.
29
+ 8. Implement only after the contract is clear enough to verify.
30
+
31
+ If the missing information cannot be inferred safely, ask one concise question
32
+ before writing code.
33
+
34
+ ## Contract Template
35
+
36
+ ```markdown
37
+ # Acceptance Contract
38
+
39
+ ## Assumptions
40
+ - ...
41
+
42
+ ## Scope
43
+ - Feature/fix:
44
+ - User-visible behavior:
45
+ - Likely files:
46
+
47
+ ## Acceptance Criteria
48
+ - [ ] ...
49
+ - [ ] ...
50
+
51
+ ## Verification
52
+ - Unit:
53
+ - Integration:
54
+ - Browser/API:
55
+ - Log/metric/trace:
56
+
57
+ ## Out of Scope
58
+ - ...
59
+ ```
60
+
61
+ ## Verification Rules
62
+
63
+ - Prefer an existing project command over a new script.
64
+ - For code changes, run the narrowest test that proves the criteria.
65
+ - For UI/runtime behavior, use browser, API, log, metric, trace, or screenshot
66
+ evidence when available.
67
+ - Do not mark criteria done until verification has run or the skipped check is
68
+ explicitly explained.
69
+
70
+ ## Source Mapping
71
+
72
+ - Small tasks should use the simplest sufficient workflow [S3].
73
+ - Long-running agent tasks need state and verification to avoid early done
74
+ claims [S2].
75
+ - Runtime-visible checks improve agent feedback loops [S1], [S2], [S4].
76
+ - Sprint contracts and evaluator criteria help when task quality is subjective
77
+ or multi-step [S4].
78
+ - Trajectory evaluation and LLM-as-a-judge monitor execution path quality, and AutoHarness enforces constraints when manual rules are too complex [S5].
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "Acceptance Contract"
3
+ short_description: "Define scope, done criteria, and checks"
4
+ default_prompt: "Use $acceptance-contract to define scope, acceptance criteria, and verification for this task."
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: cleanup-harness
3
+ description: Use when a user asks to design, scope, or run cleanup for agent-created code, documentation drift, repeated review defects, architecture drift, or accumulated harness debt.
4
+ ---
5
+
6
+ # Cleanup Harness
7
+
8
+ ## Core Principle
9
+
10
+ Treat cleanup as a scoped harness task, not opportunistic refactoring. Cleanup
11
+ needs a trigger, acceptance criteria, verification, and rollback path because
12
+ high agent throughput can spread weak patterns quickly.
13
+
14
+ In this repository, follow the local source policy: use only `[S1]-[S5]` for
15
+ harness claims. Read `docs/harness-engineering/sources.md` only when you need to
16
+ check that policy. For cleanup templates, prefer the relevant section of
17
+ `docs/harness-engineering/implementation-playbook.md` instead of loading the
18
+ whole research note.
19
+
20
+ ## Cleanup Triggers
21
+
22
+ Start a cleanup task only when at least one trigger is visible:
23
+
24
+ - the same helper, workaround, or pattern appears repeatedly;
25
+ - a feature bypasses an architecture boundary;
26
+ - progress logs repeat the same failure;
27
+ - evaluator or review feedback catches the same defect class multiple times;
28
+ - docs, indexes, or `AGENTS.md` drift from the repository state;
29
+ - new work adds workaround code instead of fixing the cause.
30
+
31
+ If no trigger is visible, mention the potential issue but do not edit unrelated
32
+ code.
33
+
34
+ ## Workflow
35
+
36
+ 1. Identify the concrete trigger and evidence.
37
+ 2. Define the smallest cleanup scope that removes the repeated problem.
38
+ 3. List files likely to change.
39
+ 4. Define acceptance criteria.
40
+ 5. Define verification commands or observable signals.
41
+ 6. Remove only debt inside the declared scope.
42
+ 7. Convert repeated judgment into a mechanical guardrail when practical.
43
+ 8. Record what was verified and any residual risk.
44
+
45
+ ## Cleanup Task Template
46
+
47
+ ```markdown
48
+ # Cleanup Task
49
+
50
+ ## Trigger
51
+ - Evidence:
52
+
53
+ ## Scope
54
+ - Clean up:
55
+ - Likely files:
56
+
57
+ ## Acceptance Criteria
58
+ - [ ] Duplicate or drift source is removed.
59
+ - [ ] Behavior remains unchanged unless explicitly requested.
60
+ - [ ] Guardrail is added or the reason for not adding one is stated.
61
+
62
+ ## Verification
63
+ - Tests:
64
+ - Lint/structural check:
65
+ - Runtime check:
66
+
67
+ ## Rollback
68
+ - Safe restore point:
69
+ ```
70
+
71
+ ## Guardrail Guidance
72
+
73
+ Prefer a mechanical check when the same issue is likely to recur:
74
+
75
+ - lint or structural test for architecture boundaries;
76
+ - doc/index freshness check for repository source of truth;
77
+ - smoke test for setup or runtime drift;
78
+ - evaluator rubric for repeated subjective quality failures.
79
+
80
+ Do not add broad rules that protect no concrete invariant.
81
+
82
+ ## Source Mapping
83
+
84
+ - Cleanup is part of repository-level harness maintenance when throughput
85
+ increases entropy [S1].
86
+ - Mechanical guardrails are stronger than prose for repeated invariants [S1].
87
+ - Keep the intervention as simple as the failure mode allows [S3].
88
+ - Long-running work benefits from explicit state, verification, and recovery
89
+ points [S2], [S4].
90
+ - AutoHarness can automatically enforce code constraints to reduce cleanup debt, and trajectory evaluation tracks whether cleanup alters agent execution paths [S5].
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "Cleanup Harness"
3
+ short_description: "Scope cleanup with triggers and checks"
4
+ default_prompt: "Use $cleanup-harness to scope a cleanup task with trigger evidence, acceptance criteria, and verification."
@@ -0,0 +1,124 @@
1
+ ---
2
+ name: creator-harness
3
+ description: Use when a user asks to create, design, audit, or improve a harness for AI agents, coding agents, long-running work, eval loops, repository workflows, or agent operating procedures.
4
+ ---
5
+
6
+ # Creator Harness
7
+
8
+ ## Core Principle
9
+
10
+ Create the smallest harness that changes agent behavior. A harness is the
11
+ control plane around an agent: durable state, readable tools, verification
12
+ loops, evaluator feedback when needed, and mechanical guardrails.
13
+
14
+ Use only the local five-source research as the source of truth:
15
+
16
+ - `docs/harness-engineering/sources.md`
17
+ - `docs/harness-engineering/research-note.md`
18
+ - `docs/harness-engineering/implementation-playbook.md`
19
+
20
+ Do not introduce external harness resources unless the user explicitly asks to
21
+ expand beyond the five OpenAI/Anthropic/Google articles.
22
+
23
+ ## Working Rules
24
+
25
+ 1. State assumptions before creating files. If the target agent, runtime, or
26
+ success criteria are unknown and cannot be safely inferred, ask one concise question.
27
+ 2. Start with a single-agent harness plus state and verification. Add planner,
28
+ evaluator, telemetry, or cleanup automation only when a named failure mode
29
+ requires it.
30
+ 3. Touch only harness artifacts unless the user explicitly asks for product code
31
+ changes.
32
+ 4. Every harness artifact must answer at least one question: What should the
33
+ agent know? What state survives context loss? What can it observe? How does
34
+ it verify? What invariant is mechanically enforced?
35
+ 5. Convert important preferences into checks where practical: tests, lint,
36
+ scripts, CI jobs, evaluator rubrics, or reviewer contracts.
37
+ 6. For one-shot Markdown or research-note edits in this repository, do not start
38
+ autonomous loops unless the user explicitly requests them.
39
+
40
+ ## Design Workflow
41
+
42
+ 1. Inventory existing harness surface:
43
+ - `AGENTS.md`, `README.md`, architecture docs, product specs;
44
+ - setup scripts, task runner, CI, tests, smoke tests;
45
+ - progress logs, feature lists, todos, research state;
46
+ - eval prompts, evaluator rubrics, screenshots, traces, telemetry;
47
+ - tool contracts, permissions, escalation rules.
48
+
49
+ 2. Name the failure modes:
50
+ - lost context across sessions;
51
+ - early "done" claims;
52
+ - weak runtime observability;
53
+ - overbroad implementation;
54
+ - self-evaluation optimism;
55
+ - architecture drift;
56
+ - cleanup debt from high agent throughput.
57
+
58
+ 3. Pick the minimal intervention:
59
+ - unclear task: acceptance contract;
60
+ - lost context: `progress.md`, `feature_list.json`, git protocol;
61
+ - broken environment: `init.sh`, smoke test;
62
+ - invisible runtime: browser/API/log/metric/trace checks;
63
+ - weak self-review: evaluator rubric or separate evaluator pass;
64
+ - drift: structural lint or architecture test;
65
+ - throughput entropy: targeted cleanup task with verification;
66
+ - complex constraints: AutoHarness synthesized code wrapper [S5];
67
+ - agent trajectory drift: Trajectory Evaluation and LLM-as-a-judge [S5].
68
+
69
+ 4. Write a harness contract:
70
+ - agent role and allowed scope;
71
+ - durable state files;
72
+ - required tools and observable signals;
73
+ - verification commands;
74
+ - loop cadence;
75
+ - stop/escalation conditions;
76
+ - out-of-scope work.
77
+
78
+ 5. Create only the needed files. For templates, read
79
+ `references/harness-artifacts.md`.
80
+
81
+ 6. Verify the harness:
82
+ - run syntax/format validators for files created;
83
+ - run the declared smoke test if one exists;
84
+ - run the placeholder and citation scan from `AGENTS.md`;
85
+ - verify no recurring automation was created for a one-shot documentation
86
+ task;
87
+ - if editing this skill, validate the skill if a validator exists locally.
88
+
89
+ ## Harness Types
90
+
91
+ | Situation | Default harness |
92
+ | --------------------------- | --------------------------------------------------------- |
93
+ | Small bug or feature | Acceptance criteria and a verification command |
94
+ | Multi-session coding | `progress.md`, `feature_list.json`, `init.sh`, smoke test |
95
+ | UI/runtime-heavy app | Sprint contract, browser/API checks, evaluator notes |
96
+ | Long application build | Planner, generator, evaluator, sprint contract |
97
+ | Architecture-sensitive repo | Dependency rules, structural tests, cleanup cadence |
98
+ | Complex or rule-heavy env | AutoHarness (wrapper), Trajectory evaluation / VeRO |
99
+
100
+ ## Output Shape
101
+
102
+ When answering without file edits, produce:
103
+
104
+ ```markdown
105
+ ## Assumptions
106
+
107
+ - ...
108
+
109
+ ## Failure Modes
110
+
111
+ - ...
112
+
113
+ ## Minimal Harness
114
+
115
+ - Artifact:
116
+ - Purpose:
117
+ - Verification:
118
+
119
+ ## Next Step
120
+
121
+ - ...
122
+ ```
123
+
124
+ When editing files, summarize changed files and verification run.
@@ -0,0 +1,4 @@
1
+ interface:
2
+ display_name: "Creator Harness"
3
+ short_description: "Design practical agent harnesses"
4
+ default_prompt: "Use $creator-harness to design a minimal harness for this repository."