@doidor/agentrig 0.11.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,85 +1,21 @@
1
- export const SYSTEM_MESSAGE = `You are AgentRig's installer agent. You are setting up and tailoring an autonomous-coding-agent
2
- "harness" inside a real repository. Be precise and surgical. Prefer reading many files to understand
3
- the repo before writing. Only edit the files you are explicitly asked to. Never invent build/test
4
- commands — verify them against package manifests, lockfiles, CI config, and scripts you actually
5
- find. Keep edits idempotent.`;
1
+ import { substitute } from "../core/fsutil.js";
2
+ import { loadTemplate } from "./loader.js";
3
+ // All prompt copy lives in `./templates/*.md` (read at runtime via loadTemplate). The functions
4
+ // here only assemble dynamic data into those templates via `{{PLACEHOLDER}}` substitution and own
5
+ // any conditional logic — so prompt wording can be edited as plain markdown without touching TS.
6
+ export const SYSTEM_MESSAGE = loadTemplate("system-message.md");
6
7
  export function buildInvestigatePrompt() {
7
- return `# Task 1 of 2 — Investigate this repository
8
-
9
- Investigate the repository in your current working directory and write your findings to
10
- \`.agentrig/context.md\` (create the file; create the .agentrig directory if needed).
11
-
12
- Cover, with evidence from actual files:
13
- 1. **Purpose** — what this project is and who uses it (2-4 sentences).
14
- 2. **Stack** — languages, frameworks, runtimes, package manager(s).
15
- 3. **Commands** — the real install / build / test / lint commands (cite where you found them:
16
- package.json scripts, Makefile, CI workflow, pyproject, go.mod, etc.). If a command does not
17
- exist, say so explicitly rather than guessing.
18
- 4. **Layout** — a concise directory map of the most important folders and what they contain.
19
- 5. **Conventions** — notable coding conventions, testing patterns, and any "instructions are the
20
- source of truth" docs (AGENTS.md, CONTRIBUTING, etc.).
21
- 6. **Risks for an autonomous agent** — protected areas, generated code, flaky tests, anything an
22
- agent should be careful about.
23
-
24
- Be thorough but factual. Do not modify any other files in this step. When done, reply with a short
25
- confirmation and the exact install/build/test/lint commands you found.`;
8
+ return loadTemplate("investigate.md");
26
9
  }
27
10
  export function buildTailorPrompt(manifest) {
28
- const artifactList = manifest.artifacts.map((a) => `- \`${a.dest}\` (principle ${a.principle})`).join("\n");
29
- return `# Task 2 of 2 — Tailor the installed harness to this repository
30
-
31
- AgentRig has just installed a canonical best-practice harness into this repo:
32
-
33
- ${artifactList}
34
-
35
- Using everything you learned in Task 1 (and \`.agentrig/context.md\`), make the harness
36
- repo-specific. Edit **only** the files listed below.
37
-
38
- 1. **\`AGENTS.md\`** — replace every \`{{PLACEHOLDER}}\` and fill the content between the
39
- \`<!-- AGENTRIG:...:start -->\` / \`:end\` markers:
40
- - \`{{REPO_NAME}}\`, \`{{REPO_SUMMARY}}\` — name and a 2-3 sentence description.
41
- - The \`commands\` block — the REAL install/build/test/lint commands you verified. If one
42
- genuinely does not exist, write \`(none)\`.
43
- - The \`dirmap\` block — a concise directory map.
44
- Do NOT change anything between the \`critical-rules\` markers.
45
- 2. **\`.agents/rules/coding-standards.md\`** — replace the generic baseline with standards that
46
- actually match this repo's language and conventions. Keep it to a short list of imperative
47
- reflexes and keep the frontmatter \`globs\`/\`description\`.
48
- 3. **\`.agentrig/eval/scenarios/\`** — adjust the existing scenario files so the setup/success
49
- criteria reference this repo's real test/build commands and structure. Do not remove the axis
50
- lists.
51
- 4. **\`.github/workflows/copilot-setup-steps.yml\`** — author a REAL, repo-specific setup workflow so
52
- the GitHub Copilot **cloud/coding agent** has a ready environment (don't leave a generic stub).
53
- Base it on your investigation:
54
- - A single job named EXACTLY \`copilot-setup-steps\` on \`runs-on: ubuntu-latest\`, with
55
- \`permissions: contents: read\`, triggered by \`workflow_dispatch\` + \`push\`/\`pull_request\`
56
- filtered to this file.
57
- - Steps that install the ACTUAL toolchain + dependencies you found: correct language runtime(s)
58
- and version(s) (from \`.nvmrc\`/\`.tool-versions\`/\`engines\`/\`go.mod\`/\`pyproject.toml\`), the
59
- correct package manager and install command (e.g. \`npm ci\`/\`pnpm i --frozen-lockfile\`/
60
- \`pip install -e .\`/\`go mod download\`), dependency caching, and any system packages or
61
- \`services\` (databases, etc.) the build/tests need. Keep it to env setup — not the task itself.
62
- If you cannot determine the stack confidently, leave the generated scaffold and note what's
63
- missing.
64
-
65
- Keep all YAML frontmatter and the AgentRig markers intact. Do not touch the state machine, role
66
- files, MCP config, or the eval scripts. When finished, summarize exactly which files you changed.`;
11
+ const artifactList = manifest.artifacts
12
+ .map((a) => `- \`${a.dest}\` (principle ${a.principle})`)
13
+ .join("\n");
14
+ return substitute(loadTemplate("tailor.md"), { ARTIFACT_LIST: artifactList });
67
15
  }
68
16
  export function buildUpdatePrompt(changed) {
69
- return `# Task — Re-apply the latest AgentRig best practices
70
-
71
- AgentRig refreshed these canonical artifacts to their latest version:
72
- ${changed.map((c) => `- \`${c}\``).join("\n")}
73
-
74
- For each refreshed file, reconcile it with this repo:
75
- - Preserve repo-specific content the team added (especially inside AgentRig markers in AGENTS.md and
76
- in \`coding-standards.md\` and the scenarios).
77
- - Adopt new structure, new sections, and new defaults from the canonical version.
78
- - If there is a genuine conflict, prefer the new canonical structure but keep repo-specific facts
79
- (commands, directory map, summary).
80
-
81
- Re-read \`.agentrig/context.md\` first for repo context. Summarize what you merged and any conflicts
82
- you resolved.`;
17
+ const changedList = changed.map((c) => `- \`${c}\``).join("\n");
18
+ return substitute(loadTemplate("update.md"), { CHANGED_LIST: changedList });
83
19
  }
84
20
  /**
85
21
  * @deprecated Replaced by buildProducerPrompt + buildJudgePrompt in the P3 producer/judge
@@ -88,51 +24,33 @@ you resolved.`;
88
24
  export function buildDynamicEvalPrompt(scenarioId, run) {
89
25
  const scope = scenarioId
90
26
  ? `the single scenario \`.agentrig/eval/scenarios/${scenarioId}/\``
91
- : "each scenario in \`.agentrig/eval/scenarios/*/\`";
92
- return `# Task — Run the harness dynamic evaluation\n\nLegacy entry point — agentrig now drives producer + judge separately via the\nscenario runner. Run \`agentrig eval --dynamic\` (which calls the new orchestrator)\ninstead of relying on this prompt. Scope: ${scope}. Run id: ${run?.runId ?? "n/a"}.\n`;
27
+ : "each scenario in `.agentrig/eval/scenarios/*/`";
28
+ return (substitute(loadTemplate("dynamic-eval.md"), {
29
+ SCOPE: scope,
30
+ RUN_ID: run?.runId ?? "n/a",
31
+ }) + "\n");
93
32
  }
94
33
  /** Producer prompt — handed to the agent running in the scenario worktree.
95
34
  * Inlines the scenario's own prompt.md so the producer doesn't need to find it. */
96
35
  export function buildProducerPrompt(scenarioPrompt, variant) {
97
36
  const isBaseline = variant === "baseline";
98
- const baselineNote = isBaseline
99
- ? `\n**This is a BASELINE trial — harness OFF.** Do NOT read or follow \`AGENTS.md\`, \`.agents/rules/\`, \`.agents/skills/\`, or any AgentRig-installed instruction surface, even if they happen to be present in this worktree. Behave as a bare agent with only your training-data priors.\n`
100
- : `\n**This is a HARNESS trial — harness ON.** Follow \`AGENTS.md\`, the rules in \`.agents/rules/\`, and the skills in \`.agents/skills/\` if they are present in this worktree.\n`;
101
- // Harness-on variant gets an explicit pre-handoff checklist rendered as text at the END of
102
- // the prompt (LLMs weight end-of-prompt instructions more heavily than buried skill bodies).
103
- // This is the same checklist the self-verify and log-gotcha skills describe, but inlined so
104
- // the agent can't miss it. The baseline variant deliberately does NOT include this — that's
105
- // what makes the harness-on vs baseline A/B measure something real.
106
- const handoffChecklist = isBaseline ? "" : `
107
-
108
- ---
109
-
110
- ## Pre-handoff checklist (read before you reply)
111
-
112
- You are running with the AgentRig harness ON. Before declaring done, walk this checklist out loud
113
- in your transcript. The harness eval scores you on each item; vague reassurances ("tests pass")
114
- without the underlying evidence cost half-credit or more.
115
-
116
- - [ ] **Baseline captured.** Did you run the project's test command BEFORE editing related code,
117
- and surface the result in your transcript? For a fix scenario: explicitly note the failing
118
- test name and the error. For a feature scenario: note the suite was green.
119
- *Bad:* "All tests pass."
120
- *Good:* "baseline: \`npm test\` → 1 fail (divide-by-zero); after fix: 0 fails, all 4 tests pass."
121
-
122
- - [ ] **After captured.** Did you re-run the full test command at the end and surface the new
123
- state? The transition baseline → after is the evidence that your edit did what you claim.
124
-
125
- - [ ] **Wiki entry committed for any non-obvious lesson.** If your work revealed something
126
- surprising (silent failure, library default, framework quirk, AGENTS.md rule that almost
127
- bit you), use the \`log-gotcha\` skill to write a \`.agents/wiki/<topic>.md\` entry IN THE
128
- SAME DIFF. Acknowledging the lesson only in your summary is half-credit. Silent is zero.
129
- Run \`git diff --cached --stat\` to confirm the wiki file is staged.
130
-
131
- - [ ] **Diff is on-target.** \`git diff --stat\` should show only files you intentionally changed.
132
-
133
- If you can't honestly check a box, fix it before replying — that's cheaper than a re-roll.
134
- `;
135
- return `# Scenario task\n${baselineNote}\nYour entire job is described below. Work inside the current directory (this is a\nthrowaway worktree dedicated to your trial). When done, simply finish — the\nscenario runner captures your diff, your transcript, and runs the deterministic\noracle automatically.\n\n---\n\n${scenarioPrompt}${handoffChecklist}\n`;
37
+ // baseline vs harness only differ by the variant note and the (harness-only) pre-handoff
38
+ // checklist. The checklist is the same one the self-verify and log-gotcha skills describe, but
39
+ // inlined at the END of the prompt (LLMs weight end-of-prompt instructions more heavily than
40
+ // buried skill bodies). The baseline variant deliberately omits it — that's what makes the
41
+ // harness-on vs baseline A/B measure something real.
42
+ const note = isBaseline
43
+ ? loadTemplate("producer-baseline-note.md")
44
+ : loadTemplate("producer-harness-note.md");
45
+ const variantNote = `\n${note}\n`;
46
+ const handoffChecklist = isBaseline
47
+ ? ""
48
+ : `\n\n---\n\n${loadTemplate("producer-handoff-checklist.md")}\n`;
49
+ return (substitute(loadTemplate("producer.md"), {
50
+ VARIANT_NOTE: variantNote,
51
+ SCENARIO_PROMPT: scenarioPrompt,
52
+ HANDOFF_CHECKLIST: handoffChecklist,
53
+ }) + "\n");
136
54
  }
137
55
  /** Judge prompt — handed to a DIFFERENT model than the producer. The judge runs in a
138
56
  * dedicated cwd containing prompt.md, diff.patch, transcript.md, oracle.json, judge_brief.md.
@@ -141,68 +59,29 @@ export function buildJudgePrompt(ctx) {
141
59
  const axesList = ctx.judgeAxes.length
142
60
  ? ctx.judgeAxes.map((a) => `- \`${a}\``).join("\n")
143
61
  : "(no soft axes for this scenario — write an empty axes array)";
144
- return `# Task — Score a completed scenario as an INDEPENDENT JUDGE\n\nYou are the **judge** for scenario \`${ctx.scenario}\` (type: \`${ctx.type}\`). The producer\nagent has already finished. Read these files in your cwd to do your scoring:\n\n- \`prompt.md\` — the exact task the producer was given\n- \`diff.patch\` — the change the producer produced\n- \`transcript.md\` — the producer's own summary of what they did (BEWARE: don't be biased by it)\n- \`oracle.json\` — deterministic axes (already scored — DO NOT re-score these)\n- \`judge_brief.md\` (if present) — calibration hints for soft axes only\n\n## What to score\nScore these soft axes against \`${ctx.rubricPath}\`:\n${axesList}\n\nTiers are strict: \`0\` / \`0.5\` / \`1.0\`. Any score < 1.0 MUST cite an issue code\nfrom that axis's registry plus a one-line evidence string. Use \`confidence: 0\` for\naxes you genuinely cannot observe.\n\n## How to submit\nWrite your scores to \`${ctx.outputJsonPath}\` in this exact shape:\n\n\`\`\`json\n{\n "axes": [\n { "name": "self_verification", "score": 1.0, "confidence": 1 },\n { "name": "clarity", "score": 0.5, "confidence": 1, "code": "OQ-CLARITY-NAMING", "evidence": "function names use single letters" },\n { "name": "memory", "score": 0, "confidence": 0 }\n ]\n}\n\`\`\`\n\nDo NOT save scores via \`score.mjs\` yourself — the orchestrator does that.\n\n## Independence\nDo NOT defer to the producer's reasoning. Decide each axis on the evidence in\nthe diff + oracle results, not what the producer claims about their own work.\nIf the diff contradicts the transcript, the diff wins.\n`;
62
+ return (substitute(loadTemplate("judge.md"), {
63
+ SCENARIO: ctx.scenario,
64
+ TYPE: ctx.type,
65
+ RUBRIC_PATH: ctx.rubricPath,
66
+ AXES_LIST: axesList,
67
+ OUTPUT_JSON_PATH: ctx.outputJsonPath,
68
+ }) + "\n");
145
69
  }
146
70
  /** Scaffold-scenarios prompt — handed to an agent during `agentrig eval --scaffold`. The agent
147
71
  * reads the repo investigation + the 3 generic scenarios as templates, then writes N new
148
72
  * repo-tailored scenarios under .agentrig/eval/scenarios/. */
149
73
  export function buildScaffoldScenariosPrompt(ctx) {
150
- const examplesText = ctx.examples.map((e) => `### Example: \`${e.id}\`\n\n**scenario.yml**\n\`\`\`yaml\n${e.scenarioYml.trim()}\n\`\`\`\n\n**prompt.md** (first 800 chars)\n\`\`\`markdown\n${e.promptMd.slice(0, 800)}\n\`\`\`\n\n**oracle.yml**\n\`\`\`yaml\n${e.oracleYml.trim()}\n\`\`\``).join("\n\n");
151
- return `# Task — Generate repository-specific eval scenarios
152
-
153
- The 3 scenarios under \`.agentrig/eval/scenarios/\` are language-agnostic JS micro-fixtures. They
154
- test a generic agent loop, but they do NOT exercise *this* repo's actual stack (test runner,
155
- package manager, language idioms, common defect patterns). Your job: write ${ctx.count} new
156
- scenario(s) that ARE specific to this repo.
157
-
158
- ## Repo investigation (from \`.agentrig/context.md\`)
159
-
160
- \`\`\`
161
- ${ctx.contextMd.trim() || "(no context.md found — investigate the repo yourself before writing scenarios)"}
162
- \`\`\`
163
-
164
- ## What a scenario looks like (templates)
165
-
166
- ${examplesText}
167
-
168
- ## What to produce
169
-
170
- For each new scenario:
171
-
172
- 1. Create a directory \`.agentrig/eval/scenarios/<id>/\` with an id that names a concrete
173
- task in THIS repo's stack (e.g. \`fix-pytest-failure\`, \`refactor-typescript-module\`,
174
- \`review-django-migration\`, \`add-cargo-feature\`). NO generic ids — \`fix-failing-test\` is taken.
175
- 2. Write \`scenario.yml\` with YAML frontmatter:
176
- - \`id\`: matches the directory name
177
- - \`type\`: one of \`run\` | \`spec\` | \`review\`
178
- - \`scope\`: \`patch\` | \`feature\` | \`epic\`
179
- - \`principle_focus\`: array of 1-3 principle numbers (1-12)
180
- - \`oracle_axes\`: array of axis names (deterministic-scored)
181
- - \`judge_axes\`: array of axis names (LLM-scored)
182
- 3. Write \`prompt.md\` — the exact task handed to the producer agent. NO ambiguity, NO "invent your own spec."
183
- 4. Build \`fixture/\` — a tiny synthetic mini-repo using THIS repo's actual stack:
184
- - Use the **real** package manager (\`requirements.txt\` / \`go.mod\` / \`package.json\` / \`Cargo.toml\`)
185
- - Use the **real** test runner (\`pytest\` / \`go test\` / \`vitest\` / \`cargo test\`)
186
- - Keep it ≤10 files total; one file should be the planted defect / spec / patch under review
187
- 5. Write \`oracle.yml\` — deterministic checks (cmd, diff_stats, diff_files, file_contains, file_missing).
188
- The \`cmd\` checks MUST use this repo's actual test command, not \`npm test\`.
189
- 6. Write \`README.md\` — 1-2 paragraphs describing what the scenario tests + what a defect looks like.
190
- 7. Write \`judge_brief.md\` (optional but recommended) — calibration hints for soft axes the
191
- judge will score (e.g. "1.0 = wrote a wiki entry, 0.5 = mentioned in summary, 0 = silent").
192
-
193
- ## Hard constraints
194
-
195
- - **DO NOT modify the existing generic scenarios** (\`fix-failing-test\`, \`add-small-feature\`,
196
- \`review-catches-bug\`, \`agentrig-init-on-empty-repo\`). They stay as both templates AND running scenarios.
197
- - **DO NOT touch any file outside \`.agentrig/eval/scenarios/\`.**
198
- - **Axis names must come from the live registry.** Valid types: ${ctx.axesAvailable.types.join(", ")}.
199
- Valid axis names (use only these): ${ctx.axesAvailable.axisNames.join(", ")}.
200
- - The fixture's package manager + test runner must be **the same toolchain this repo uses**.
201
- Check \`AGENTS.md\` for the install/test commands.
202
- - Each oracle \`cmd\` must be runnable from inside the worktree (\`cwd: worktree, shell: true\`) without
203
- any \`npm install\` / \`pip install\` / equivalent first — i.e., the fixture should be self-contained
204
- or rely on stdlib only. If the test command needs deps, include a tiny dependency-free alternative.
205
-
206
- When done, summarize each new scenario id, its type, and what defect or task it exercises.`;
74
+ const examplesText = ctx.examples
75
+ .map((e) => `### Example: \`${e.id}\`\n\n**scenario.yml**\n\`\`\`yaml\n${e.scenarioYml.trim()}\n\`\`\`\n\n**prompt.md** (first 800 chars)\n\`\`\`markdown\n${e.promptMd.slice(0, 800)}\n\`\`\`\n\n**oracle.yml**\n\`\`\`yaml\n${e.oracleYml.trim()}\n\`\`\``)
76
+ .join("\n\n");
77
+ const contextMd = ctx.contextMd.trim() ||
78
+ "(no context.md found — investigate the repo yourself before writing scenarios)";
79
+ return substitute(loadTemplate("scaffold-scenarios.md"), {
80
+ COUNT: String(ctx.count),
81
+ CONTEXT_MD: contextMd,
82
+ EXAMPLES_TEXT: examplesText,
83
+ AXIS_TYPES: ctx.axesAvailable.types.join(", "),
84
+ AXIS_NAMES: ctx.axesAvailable.axisNames.join(", "),
85
+ });
207
86
  }
208
87
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/prompts/index.ts"],"names":[],"mappings":"AAEA,MAAM,CAAC,MAAM,cAAc,GAAG;;;;6BAID,CAAC;AAE9B,MAAM,UAAU,sBAAsB;IACpC,OAAO;;;;;;;;;;;;;;;;;;uEAkB8D,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,QAAkB;IAClD,MAAM,YAAY,GAAG,QAAQ,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,IAAI,iBAAiB,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5G,OAAO;;;;EAIP,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kGAiCoF,CAAC;AACnG,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,OAAiB;IACjD,OAAO;;;EAGP,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;;cAU/B,CAAC;AACf,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,UAAmB,EAAE,GAAuB;IACjF,MAAM,KAAK,GAAG,UAAU;QACtB,CAAC,CAAC,kDAAkD,UAAU,KAAK;QACnE,CAAC,CAAC,kDAAkD,CAAC;IACvD,OAAO,+PAA+P,KAAK,aAAa,GAAG,EAAE,KAAK,IAAI,KAAK,KAAK,CAAC;AACnT,CAAC;AAED;oFACoF;AACpF,MAAM,UAAU,mBAAmB,CAAC,cAAsB,EAAE,OAAe;IACzE,MAAM,UAAU,GAAG,OAAO,KAAK,UAAU,CAAC;IAC1C,MAAM,YAAY,GAAG,UAAU;QAC7B,CAAC,CAAC,8RAA8R;QAChS,CAAC,CAAC,kLAAkL,CAAC;IAEvL,2FAA2F;IAC3F,6FAA6F;IAC7F,4FAA4F;IAC5F,4FAA4F;IAC5F,oEAAoE;IACpE,MAAM,gBAAgB,GAAG,UAAU,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA4B5C,CAAC;IAEA,OAAO,oBAAoB,YAAY,qRAAqR,cAAc,GAAG,gBAAgB,IAAI,CAAC;AACpW,CAAC;AAUD;;gFAEgF;AAChF,MAAM,UAAU,gBAAgB,CAAC,GAAiB;IAChD,MAAM,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,MAAM;QACnC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;QACnD,CAAC,CAAC,8DAA8D,CAAC;IACnE,OAAO,uGAAuG,GAAG,CAAC,QAAQ,eAAe,GAAG,CAAC,IAAI,+gBAA+gB,GAAG,CAAC,UAAU,QAAQ,QAAQ,kQAAkQ,GAAG,CAAC,cAAc,6pBAA6pB,CAAC;AAClnD,CAAC;AAgBD;;+DAE+D;AAC/D,MAAM,UAAU,4BAA4B,CAAC,GAAoB;IAC/D,MAAM,YAAY,GAAG,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAC1C,kBAAkB,CAAC,CAAC,EAAE,uCAAuC,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,gEAAgE,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,2CAA2C,CAAC,CAAC,SAAS,CAAC,IAAI,EAAE,UAAU,CACjP,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEf,OAAO;
;;;6EAIoE,GAAG,CAAC,KAAK;;;;;;EAMpF,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,gFAAgF;;;;;EAKxG,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kEAgCoD,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;uCAC7D,GAAG,CAAC,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;2FAOc,CAAC;AAC5F,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/prompts/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,gGAAgG;AAChG,kGAAkG;AAClG,iGAAiG;AAEjG,MAAM,CAAC,MAAM,cAAc,GAAG,YAAY,CAAC,mBAAmB,CAAC,CAAC;AAEhE,MAAM,UAAU,sBAAsB;IACpC,OAAO,YAAY,CAAC,gBAAgB,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,QAAkB;IAClD,MAAM,YAAY,GAAG,QAAQ,CAAC,SAAS;SACpC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,IAAI,iBAAiB,CAAC,CAAC,SAAS,GAAG,CAAC;SACxD,IAAI,CAAC,IAAI,CAAC,CAAC;IACd,OAAO,UAAU,CAAC,YAAY,CAAC,WAAW,CAAC,EAAE,EAAE,aAAa,EAAE,YAAY,EAAE,CAAC,CAAC;AAChF,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,OAAiB;IACjD,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChE,OAAO,UAAU,CAAC,YAAY,CAAC,WAAW,CAAC,EAAE,EAAE,YAAY,EAAE,WAAW,EAAE,CAAC,CAAC;AAC9E,CAAC;AAQD;;;GAGG;AACH,MAAM,UAAU,sBAAsB,CAAC,UAAmB,EAAE,GAAuB;IACjF,MAAM,KAAK,GAAG,UAAU;QACtB,CAAC,CAAC,kDAAkD,UAAU,KAAK;QACnE,CAAC,CAAC,gDAAgD,CAAC;IACrD,OAAO,CACL,UAAU,CAAC,YAAY,CAAC,iBAAiB,CAAC,EAAE;QAC1C,KAAK,EAAE,KAAK;QACZ,MAAM,EAAE,GAAG,EAAE,KAAK,IAAI,KAAK;KAC5B,CAAC,GAAG,IAAI,CACV,CAAC;AACJ,CAAC;AAED;oFACoF;AACpF,MAAM,UAAU,mBAAmB,CAAC,cAAsB,EAAE,OAAe;IACzE,MAAM,UAAU,GAAG,OAAO,KAAK,UAAU,CAAC;IAC1C,yFAAyF;IACzF,+FAA+F;IAC/F,6FAA6F;IAC7F,2FAA2F;IAC3F,qDAAqD;IACrD,MAAM,IAAI,GAAG,UAAU;QACrB,CAAC,CAAC,YAAY,CAAC,2BAA2B,CAAC;QAC3C,CAAC,CAAC,YAAY,CAAC,0BAA0B,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,KAAK,IAAI,IAAI,CAAC;IAClC,MAAM,gBAAgB,GAAG,UAAU;QACjC,CAAC,CAAC,EAAE;QACJ,CAAC,CAAC,cAAc,YAAY,CAAC,+BAA+B,CAAC,IAAI,CAAC;IAEpE,OAAO,CACL,UAAU,CAAC,YAAY,CAAC,aAAa,CAAC,EAAE;QACtC,YAAY,EAAE,WAAW;QACzB,eAAe,EAAE,cAAc;QAC/B,iBAAiB,EAAE,gBAAgB;KACpC,CAAC,GAAG,IAAI,CACV,CAAC;AACJ,CAAC;AAUD;;gFAEgF;AAChF,MAAM,UAAU,gBAAgB,CAAC,GAAiB;IAChD,MAAM,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAC,MAAM;QACnC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;QACnD,CAAC,CAAC,8DAA8D,CAAC;IACnE
,OAAO,CACL,UAAU,CAAC,YAAY,CAAC,UAAU,CAAC,EAAE;QACnC,QAAQ,EAAE,GAAG,CAAC,QAAQ;QACtB,IAAI,EAAE,GAAG,CAAC,IAAI;QACd,WAAW,EAAE,GAAG,CAAC,UAAU;QAC3B,SAAS,EAAE,QAAQ;QACnB,gBAAgB,EAAE,GAAG,CAAC,cAAc;KACrC,CAAC,GAAG,IAAI,CACV,CAAC;AACJ,CAAC;AAgBD;;+DAE+D;AAC/D,MAAM,UAAU,4BAA4B,CAAC,GAAoB;IAC/D,MAAM,YAAY,GAAG,GAAG,CAAC,QAAQ;SAC9B,GAAG,CACF,CAAC,CAAC,EAAE,EAAE,CACJ,kBAAkB,CAAC,CAAC,EAAE,uCAAuC,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,gEAAgE,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,2CAA2C,CAAC,CAAC,SAAS,CAAC,IAAI,EAAE,UAAU,CACnP;SACA,IAAI,CAAC,MAAM,CAAC,CAAC;IAChB,MAAM,SAAS,GACb,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE;QACpB,gFAAgF,CAAC;IAEnF,OAAO,UAAU,CAAC,YAAY,CAAC,uBAAuB,CAAC,EAAE;QACvD,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC;QACxB,UAAU,EAAE,SAAS;QACrB,aAAa,EAAE,YAAY;QAC3B,UAAU,EAAE,GAAG,CAAC,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;QAC9C,UAAU,EAAE,GAAG,CAAC,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC;KACnD,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,36 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { fileURLToPath } from "node:url";
3
+ import { dirname, resolve } from "node:path";
4
+ const moduleDir = dirname(fileURLToPath(import.meta.url));
5
+ /**
6
+ * Locate the prompt-template directory. In a built/published package the `.md` templates are
7
+ * copied next to the compiled loader (`dist/prompts/templates/`) by `scripts/copy-prompt-templates.mjs`.
8
+ * The source-tree fallback keeps the CLI working in development when a bare `tsc --watch` hasn't run
9
+ * the copy step yet.
10
+ */
11
+ function templatesDir() {
12
+ const candidates = [
13
+ resolve(moduleDir, "templates"), // dist/prompts/templates (built) or src/prompts/templates
14
+ resolve(moduleDir, "..", "..", "src", "prompts", "templates"), // dist/prompts -> src fallback
15
+ ];
16
+ for (const candidate of candidates) {
17
+ if (existsSync(candidate))
18
+ return candidate;
19
+ }
20
+ throw new Error("AgentRig: could not locate the prompts/templates directory. Is the package built correctly?");
21
+ }
22
+ const cache = new Map();
23
+ /**
24
+ * Load a prompt template by file name. A single trailing newline is stripped so callers fully
25
+ * control the trailing whitespace of the assembled prompt (templates are stored with one trailing
26
+ * newline by convention). Results are cached since templates never change at runtime.
27
+ */
28
+ export function loadTemplate(name) {
29
+ const cached = cache.get(name);
30
+ if (cached !== undefined)
31
+ return cached;
32
+ const text = readFileSync(resolve(templatesDir(), name), "utf8").replace(/\n$/, "");
33
+ cache.set(name, text);
34
+ return text;
35
+ }
36
+ //# sourceMappingURL=loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/prompts/loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAE7C,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAE1D;;;;;GAKG;AACH,SAAS,YAAY;IACnB,MAAM,UAAU,GAAG;QACjB,OAAO,CAAC,SAAS,EAAE,WAAW,CAAC,EAAE,0DAA0D;QAC3F,OAAO,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,CAAC,EAAE,+BAA+B;KAC/F,CAAC;IACF,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,IAAI,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,SAAS,CAAC;IAC9C,CAAC;IACD,MAAM,IAAI,KAAK,CACb,6FAA6F,CAC9F,CAAC;AACJ,CAAC;AAED,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;AAExC;;;;GAIG;AACH,MAAM,UAAU,YAAY,CAAC,IAAY;IACvC,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,MAAM,KAAK,SAAS;QAAE,OAAO,MAAM,CAAC;IACxC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACpF,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACtB,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,5 @@
1
+ # Task — Run the harness dynamic evaluation
2
+
3
+ Legacy entry point — agentrig now drives producer + judge separately via the
4
+ scenario runner. Run `agentrig eval --dynamic` (which calls the new orchestrator)
5
+ instead of relying on this prompt. Scope: {{SCOPE}}. Run id: {{RUN_ID}}.
@@ -0,0 +1,19 @@
1
+ # Task 1 of 2 — Investigate this repository
2
+
3
+ Investigate the repository in your current working directory and write your findings to
4
+ `.agentrig/context.md` (create the file; create the .agentrig directory if needed).
5
+
6
+ Cover, with evidence from actual files:
7
+ 1. **Purpose** — what this project is and who uses it (2-4 sentences).
8
+ 2. **Stack** — languages, frameworks, runtimes, package manager(s).
9
+ 3. **Commands** — the real install / build / test / lint commands (cite where you found them:
10
+ package.json scripts, Makefile, CI workflow, pyproject, go.mod, etc.). If a command does not
11
+ exist, say so explicitly rather than guessing.
12
+ 4. **Layout** — a concise directory map of the most important folders and what they contain.
13
+ 5. **Conventions** — notable coding conventions, testing patterns, and any "instructions are the
14
+ source of truth" docs (AGENTS.md, CONTRIBUTING, etc.).
15
+ 6. **Risks for an autonomous agent** — protected areas, generated code, flaky tests, anything an
16
+ agent should be careful about.
17
+
18
+ Be thorough but factual. Do not modify any other files in this step. When done, reply with a short
19
+ confirmation and the exact install/build/test/lint commands you found.
@@ -0,0 +1,38 @@
1
+ # Task — Score a completed scenario as an INDEPENDENT JUDGE
2
+
3
+ You are the **judge** for scenario `{{SCENARIO}}` (type: `{{TYPE}}`). The producer
4
+ agent has already finished. Read these files in your cwd to do your scoring:
5
+
6
+ - `prompt.md` — the exact task the producer was given
7
+ - `diff.patch` — the change the producer produced
8
+ - `transcript.md` — the producer's own summary of what they did (BEWARE: don't be biased by it)
9
+ - `oracle.json` — deterministic axes (already scored — DO NOT re-score these)
10
+ - `judge_brief.md` (if present) — calibration hints for soft axes only
11
+
12
+ ## What to score
13
+ Score these soft axes against `{{RUBRIC_PATH}}`:
14
+ {{AXES_LIST}}
15
+
16
+ Tiers are strict: `0` / `0.5` / `1.0`. Any score < 1.0 MUST cite an issue code
17
+ from that axis's registry plus a one-line evidence string. Use `confidence: 0` for
18
+ axes you genuinely cannot observe.
19
+
20
+ ## How to submit
21
+ Write your scores to `{{OUTPUT_JSON_PATH}}` in this exact shape:
22
+
23
+ ```json
24
+ {
25
+ "axes": [
26
+ { "name": "self_verification", "score": 1.0, "confidence": 1 },
27
+ { "name": "clarity", "score": 0.5, "confidence": 1, "code": "OQ-CLARITY-NAMING", "evidence": "function names use single letters" },
28
+ { "name": "memory", "score": 0, "confidence": 0 }
29
+ ]
30
+ }
31
+ ```
32
+
33
+ Do NOT save scores via `score.mjs` yourself — the orchestrator does that.
34
+
35
+ ## Independence
36
+ Do NOT defer to the producer's reasoning. Decide each axis on the evidence in
37
+ the diff + oracle results, not what the producer claims about their own work.
38
+ If the diff contradicts the transcript, the diff wins.
@@ -0,0 +1 @@
1
+ **This is a BASELINE trial — harness OFF.** Do NOT read or follow `AGENTS.md`, `.agents/rules/`, `.agents/skills/`, or any AgentRig-installed instruction surface, even if they happen to be present in this worktree. Behave as a bare agent with only your training-data priors.
@@ -0,0 +1,24 @@
1
+ ## Pre-handoff checklist (read before you reply)
2
+
3
+ You are running with the AgentRig harness ON. Before declaring done, walk this checklist out loud
4
+ in your transcript. The harness eval scores you on each item; vague reassurances ("tests pass")
5
+ without the underlying evidence cost half-credit or more.
6
+
7
+ - [ ] **Baseline captured.** Did you run the project's test command BEFORE editing related code,
8
+ and surface the result in your transcript? For a fix scenario: explicitly note the failing
9
+ test name and the error. For a feature scenario: note the suite was green.
10
+ *Bad:* "All tests pass."
11
+ *Good:* "baseline: `npm test` → 1 fail (divide-by-zero); after fix: 0 fails, all 4 tests pass."
12
+
13
+ - [ ] **After captured.** Did you re-run the full test command at the end and surface the new
14
+ state? The transition baseline → after is the evidence that your edit did what you claim.
15
+
16
+ - [ ] **Wiki entry committed for any non-obvious lesson.** If your work revealed something
17
+ surprising (silent failure, library default, framework quirk, AGENTS.md rule that almost
18
+ bit you), use the `log-gotcha` skill to write a `.agents/wiki/<topic>.md` entry IN THE
19
+ SAME DIFF. Acknowledging the lesson only in your summary is half-credit. Silent is zero.
20
+ Run `git diff --cached --stat` to confirm the wiki file is staged.
21
+
22
+ - [ ] **Diff is on-target.** `git diff --stat` should show only files you intentionally changed.
23
+
24
+ If you can't honestly check a box, fix it before replying — that's cheaper than a re-roll.
@@ -0,0 +1 @@
1
+ **This is a HARNESS trial — harness ON.** Follow `AGENTS.md`, the rules in `.agents/rules/`, and the skills in `.agents/skills/` if they are present in this worktree.
@@ -0,0 +1,10 @@
1
+ # Scenario task
2
+ {{VARIANT_NOTE}}
3
+ Your entire job is described below. Work inside the current directory (this is a
4
+ throwaway worktree dedicated to your trial). When done, simply finish — the
5
+ scenario runner captures your diff, your transcript, and runs the deterministic
6
+ oracle automatically.
7
+
8
+ ---
9
+
10
+ {{SCENARIO_PROMPT}}{{HANDOFF_CHECKLIST}}
@@ -0,0 +1,56 @@
1
+ # Task — Generate repository-specific eval scenarios
2
+
3
+ The stock scenarios under `.agentrig/eval/scenarios/` are generic JS micro-fixtures. They
4
+ test a generic agent loop, but they do NOT exercise *this* repo's actual stack (test runner,
5
+ package manager, language idioms, common defect patterns). Your job: write {{COUNT}} new
6
+ scenario(s) that ARE specific to this repo.
7
+
8
+ ## Repo investigation (from `.agentrig/context.md`)
9
+
10
+ ```
11
+ {{CONTEXT_MD}}
12
+ ```
13
+
14
+ ## What a scenario looks like (templates)
15
+
16
+ {{EXAMPLES_TEXT}}
17
+
18
+ ## What to produce
19
+
20
+ For each new scenario:
21
+
22
+ 1. Create a directory `.agentrig/eval/scenarios/<id>/` with an id that names a concrete
23
+ task in THIS repo's stack (e.g. `fix-pytest-failure`, `refactor-typescript-module`,
24
+ `review-django-migration`, `add-cargo-feature`). NO generic ids — `fix-failing-test` is taken.
25
+ 2. Write `scenario.yml` with YAML frontmatter:
26
+ - `id`: matches the directory name
27
+ - `type`: one of `run` | `spec` | `review`
28
+ - `scope`: `patch` | `feature` | `epic`
29
+ - `principle_focus`: array of 1-3 principle numbers (1-12)
30
+ - `oracle_axes`: array of axis names (deterministic-scored)
31
+ - `judge_axes`: array of axis names (LLM-scored)
32
+ 3. Write `prompt.md` — the exact task handed to the producer agent. NO ambiguity, NO "invent your own spec."
33
+ 4. Build `fixture/` — a tiny synthetic mini-repo using THIS repo's actual stack:
34
+ - Use the **real** package manager (`requirements.txt` / `go.mod` / `package.json` / `Cargo.toml`)
35
+ - Use the **real** test runner (`pytest` / `go test` / `vitest` / `cargo test`)
36
+ - Keep it ≤10 files total; one file should be the planted defect / spec / patch under review
37
+ 5. Write `oracle.yml` — deterministic checks (cmd, diff_stats, diff_files, file_contains, file_missing).
38
+ The `cmd` checks MUST use this repo's actual test command, not `npm test`.
39
+ 6. Write `README.md` — 1-2 paragraphs describing what the scenario tests + what a defect looks like.
40
+ 7. Write `judge_brief.md` (optional but recommended) — calibration hints for soft axes the
41
+ judge will score (e.g. "1.0 = wrote a wiki entry, 0.5 = mentioned in summary, 0 = silent").
42
+
43
+ ## Hard constraints
44
+
45
+ - **DO NOT modify the existing generic scenarios** (`fix-failing-test`, `add-small-feature`,
46
+ `review-catches-bug`, `agentrig-init-on-empty-repo`). They stay as both templates AND running scenarios.
47
+ - **DO NOT touch any file outside `.agentrig/eval/scenarios/`.**
48
+ - **Axis names must come from the live registry.** Valid types: {{AXIS_TYPES}}.
49
+ Valid axis names (use only these): {{AXIS_NAMES}}.
50
+ - The fixture's package manager + test runner must be **the same toolchain this repo uses**.
51
+ Check `AGENTS.md` for the install/test commands.
52
+ - Each oracle `cmd` must be runnable from inside the worktree (`cwd: worktree, shell: true`) without
53
+ any `npm install` / `pip install` / equivalent first — i.e., the fixture should be self-contained
54
+ or rely on stdlib only. If the test command needs deps, include a tiny dependency-free alternative.
55
+
56
+ When done, summarize each new scenario id, its type, and what defect or task it exercises.
@@ -0,0 +1,5 @@
1
+ You are AgentRig's installer agent. You are setting up and tailoring an autonomous-coding-agent
2
+ "harness" inside a real repository. Be precise and surgical. Prefer reading many files to understand
3
+ the repo before writing. Only edit the files you are explicitly asked to. Never invent build/test
4
+ commands — verify them against package manifests, lockfiles, CI config, and scripts you actually
5
+ find. Keep edits idempotent.
@@ -0,0 +1,38 @@
1
+ # Task 2 of 2 — Tailor the installed harness to this repository
2
+
3
+ AgentRig has just installed a canonical best-practice harness into this repo:
4
+
5
+ {{ARTIFACT_LIST}}
6
+
7
+ Using everything you learned in Task 1 (and `.agentrig/context.md`), make the harness
8
+ repo-specific. Edit **only** the files listed below.
9
+
10
+ 1. **`AGENTS.md`** — replace every `{{PLACEHOLDER}}` and fill the content between the
11
+ `<!-- AGENTRIG:...:start -->` / `:end` markers:
12
+ - `{{REPO_NAME}}`, `{{REPO_SUMMARY}}` — name and a 2-3 sentence description.
13
+ - The `commands` block — the REAL install/build/test/lint commands you verified. If one
14
+ genuinely does not exist, write `(none)`.
15
+ - The `dirmap` block — a concise directory map.
16
+ Do NOT change anything between the `critical-rules` markers.
17
+ 2. **`.agents/rules/coding-standards.md`** — replace the generic baseline with standards that
18
+ actually match this repo's language and conventions. Keep it to a short list of imperative
19
+ reflexes and keep the frontmatter `globs`/`description`.
20
+ 3. **`.agentrig/eval/scenarios/`** — adjust the existing scenario files so the setup/success
21
+ criteria reference this repo's real test/build commands and structure. Do not remove the axis
22
+ lists.
23
+ 4. **`.github/workflows/copilot-setup-steps.yml`** — author a REAL, repo-specific setup workflow so
24
+ the GitHub Copilot **cloud/coding agent** has a ready environment (don't leave a generic stub).
25
+ Base it on your investigation:
26
+ - A single job named EXACTLY `copilot-setup-steps` on `runs-on: ubuntu-latest`, with
27
+ `permissions: contents: read`, triggered by `workflow_dispatch` + `push`/`pull_request`
28
+ filtered to this file.
29
+ - Steps that install the ACTUAL toolchain + dependencies you found: correct language runtime(s)
30
+ and version(s) (from `.nvmrc`/`.tool-versions`/`engines`/`go.mod`/`pyproject.toml`), the
31
+ correct package manager and install command (e.g. `npm ci`/`pnpm i --frozen-lockfile`/
32
+ `pip install -e .`/`go mod download`), dependency caching, and any system packages or
33
+ `services` (databases, etc.) the build/tests need. Keep it to env setup — not the task itself.
34
+ If you cannot determine the stack confidently, leave the generated scaffold and note what's
35
+ missing.
36
+
37
+ Keep all YAML frontmatter and the AgentRig markers intact. Do not touch the state machine, role
38
+ files, MCP config, or the eval scripts. When finished, summarize exactly which files you changed.
@@ -0,0 +1,14 @@
1
+ # Task — Re-apply the latest AgentRig best practices
2
+
3
+ AgentRig refreshed these canonical artifacts to their latest version:
4
+ {{CHANGED_LIST}}
5
+
6
+ For each refreshed file, reconcile it with this repo:
7
+ - Preserve repo-specific content the team added (especially inside the AgentRig markers in
8
+ `AGENTS.md`, in `coding-standards.md`, and in the scenarios).
9
+ - Adopt new structure, new sections, and new defaults from the canonical version.
10
+ - If there is a genuine conflict, prefer the new canonical structure but keep repo-specific facts
11
+ (commands, directory map, summary).
12
+
13
+ Re-read `.agentrig/context.md` first for repo context. Summarize what you merged and any conflicts
14
+ you resolved.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "$schema": "agentrig-manifest/1",
3
- "knowledgeVersion": "0.6.0",
3
+ "knowledgeVersion": "0.6.1",
4
4
  "description": "Declares which best-practice artifacts AgentRig installs into a target repo and where. `src` is relative to the knowledge/ root; `dest` is relative to the target repo root. `kind`: file | dir | template. Templates contain {{PLACEHOLDERS}} the agent fills from its investigation; deterministic installs substitute known values and leave the rest for the agent.",
5
5
  "artifacts": [
6
6
  {
@@ -8,10 +8,10 @@ single-model-bias mitigation surfaces problems no single model would catch alone
8
8
 
9
9
  | Role | File | Default model | Drives state |
10
10
  |------|------|---------------|--------------|
11
- | **triager** | `triager.{yml,md}` | `gpt-5.5` (high) | `ingested → queued` |
12
- | **developer**| `developer.{yml,md}`| `claude-opus-4.8` (high) | `queued → implementing → reviewing` |
13
- | **reviewer** | `reviewer.{yml,md}` | `gpt-5.5` (high) | `reviewing` |
14
- | **judge** | `judge.{yml,md}` | `claude-opus-4.8` (high) | `judging → ready_to_merge` |
11
+ | **triager** | `triager.{yml,md}` | `gpt-5.5` (premium) | `ingested → queued` |
12
+ | **developer**| `developer.{yml,md}`| `claude-opus-4.8` (premium) | `queued → implementing → reviewing` |
13
+ | **reviewer** | `reviewer.{yml,md}` | `gpt-5.5` (premium) | `reviewing` |
14
+ | **judge** | `judge.{yml,md}` | `claude-opus-4.8` (premium) | `judging → ready_to_merge` |
15
15
 
16
16
  > Keep the **reviewer on a different model family than the developer**. The audit
17
17
  > (`agentrig eval --static`) checks for this.
@@ -1,7 +1,7 @@
1
1
  # Developer role (principle 2). Implements the change in the `implementing` state.
2
2
  role: developer
3
3
  model: claude-opus-4.8
4
- model_tier: high
4
+ model_tier: premium
5
5
  # Skills are auto-discovered from .agents/skills; no explicit list needed.
6
6
  allowed_tools: [read, write, edit, bash, grep, glob]
7
7
  prompt: agents/developer.md
@@ -1,6 +1,6 @@
1
1
  # Judge role (principle 2, 6). Independent, rubric-driven scoring before merge.
2
2
  role: judge
3
3
  model: claude-opus-4.8
4
- model_tier: high
4
+ model_tier: premium
5
5
  allowed_tools: [read, grep, glob, bash]
6
6
  prompt: agents/judge.md
@@ -2,6 +2,6 @@
2
2
  # to mitigate single-model bias — divergent verdicts surface problems neither model alone catches.
3
3
  role: reviewer
4
4
  model: gpt-5.5
5
- model_tier: high
5
+ model_tier: premium
6
6
  allowed_tools: [read, grep, glob, bash]
7
7
  prompt: agents/reviewer.md
@@ -4,6 +4,6 @@
4
4
  # routine routing and would be a sensible cost optimization if triage volume scales up.
5
5
  role: triager
6
6
  model: gpt-5.5
7
- model_tier: high
7
+ model_tier: premium
8
8
  allowed_tools: [read, grep, glob, bash]
9
9
  prompt: agents/triager.md
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  globs: ["**/*"]
3
3
  description: Baseline coding standards applied to every change in this repo.
4
+ priority: 3
4
5
  ---
5
6
 
6
7
  # Coding standards (reflex)
@@ -1,6 +1,10 @@
1
1
  ---
2
2
  name: harness-eval
3
3
  description: Evaluate THIS repository's agent harness — a deterministic structure audit (A1) plus content quality probes (A2), plus an isolated producer/judge dynamic eval (B) with paired sign-test A/B variant comparison.
4
+ triggers:
5
+ - "evaluate / score the harness (static or dynamic)"
6
+ - "did a harness change improve or regress it?"
7
+ - before merging changes to skills/rules/agents/prompts
4
8
  allowed-tools: Bash Read Grep Glob
5
9
  argument-hint: "[--static|--dynamic] [--scenario id] [--variant v] [--n trials]"
6
10
  ---
@@ -4,7 +4,7 @@ description: Admission bar and structure for writing a new skill, so the skill l
4
4
  triggers:
5
5
  - "create / add a new skill"
6
6
  - "this procedure keeps coming up"
7
- allowed-tools: Read Edit Grep Glob
7
+ allowed-tools: Read Write Edit Grep Glob
8
8
  argument-hint: "<skill-name>"
9
9
  ---
10
10
 
@@ -4,7 +4,7 @@ description: Turn a reviewer/judge failure into an instruction-surface change th
4
4
  triggers:
5
5
  - "a mistake recurred"
6
6
  - "reviewer feedback points at a missing rule/skill"
7
- allowed-tools: Read Edit Grep Glob
7
+ allowed-tools: Read Write Edit Grep Glob
8
8
  argument-hint: "<short description of the failure>"
9
9
  ---
10
10
 
@@ -13,11 +13,7 @@ Before adding an entry, confirm no existing entry covers it. If one does, **shar
13
13
  adding a near-duplicate. Each entry should be: a title, the symptom, the root cause, the fix, and a
14
14
  one-line prevention.
15
15
 
16
- ## Entry template
17
- ```markdown
18
- ### <short title>
19
- - **Symptom:** what went wrong / how it showed up
20
- - **Cause:** the real root cause
21
- - **Fix:** the change that resolved it
22
- - **Prevention:** the rule/skill wording that would have stopped it (feed to skill-improver)
23
- ```
16
+ ## Adding an entry
17
+ Copy `_TEMPLATE.md` to `.agents/wiki/<slug>.md`, fill its four sections, and add a one-line link to
18
+ the **Index** in `index.md` (newest first). See `index.md` for what belongs in the wiki vs a skill,
19
+ rule, or `AGENTS.md`.
@@ -15,15 +15,10 @@ way. It is **not** a mirror of the docs or skills.
15
15
  If a gotcha becomes a reusable procedure, **promote it to a skill** and leave a one-line pointer
16
16
  here.
17
17
 
18
- ## Tiers (principle 8)
19
- 1. **Central wiki (this dir, committed):** repo-wide, reviewed gotchas. CODEOWNERS-gate it.
20
- 2. **Local wiki (git-ignored `*.local.md`):** machine/contributor-specific notes.
21
- 3. **Session scratch:** ephemeral working notes; never a substitute for the wiki.
22
-
23
18
  ## Index
24
19
  _Add a one-line link per entry as you create it, newest first._
25
20
  - (none yet)
26
21
 
27
- ## Admission test (strict — duplication kills wikis)
28
- Before adding an entry, confirm no existing entry covers it. If one does, **sharpen it** instead of
29
- adding a near-duplicate. Use the format in `_TEMPLATE.md`.
22
+ ---
23
+ Wiki **policy** (tiers + admission test) lives in `README.md`; the **entry format** lives in
24
+ `_TEMPLATE.md`. Don't restate them here.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@doidor/agentrig",
3
- "version": "0.11.0",
3
+ "version": "0.11.1",
4
4
  "description": "AgentRig — an agentic meta-harness. A CLI that investigates a repository and installs (and evaluates) a best-practice agent harness.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -27,7 +27,7 @@
27
27
  "node": ">=22.0.0"
28
28
  },
29
29
  "scripts": {
30
- "build": "tsc -p tsconfig.json",
30
+ "build": "tsc -p tsconfig.json && node scripts/copy-prompt-templates.mjs",
31
31
  "dev": "tsc -p tsconfig.json --watch",
32
32
  "clean": "rm -rf dist",
33
33
  "prepare": "npm run build",