ralphctl 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-MRN3Z2XC.mjs → chunk-GL7MKLLS.mjs} +2 -2
- package/dist/{chunk-JYCGQA2D.mjs → chunk-TKPTT2UG.mjs} +101 -8
- package/dist/cli.mjs +5 -5
- package/dist/{mount-XMN3S4W6.mjs → mount-ISHZM36X.mjs} +2 -2
- package/dist/prompts/ideate-auto.md +26 -1
- package/dist/prompts/ideate.md +5 -1
- package/dist/prompts/plan-auto.md +30 -2
- package/dist/prompts/plan-common-examples.md +82 -0
- package/dist/prompts/plan-common.md +26 -78
- package/dist/prompts/plan-interactive.md +6 -2
- package/dist/prompts/sprint-feedback.md +12 -3
- package/dist/prompts/task-evaluation.md +32 -11
- package/dist/prompts/task-execution.md +13 -13
- package/dist/prompts/ticket-refine.md +4 -0
- package/dist/prompts/validation-checklist.md +4 -0
- package/dist/{start-D35SOXMM.mjs → start-76JKJQIH.mjs} +1 -1
- package/package.json +1 -1
|
@@ -52,7 +52,7 @@ import {
|
|
|
52
52
|
updateTask,
|
|
53
53
|
updateTaskStatus,
|
|
54
54
|
validateImportTasks
|
|
55
|
-
} from "./chunk-JYCGQA2D.mjs";
|
|
55
|
+
} from "./chunk-TKPTT2UG.mjs";
|
|
56
56
|
import {
|
|
57
57
|
fetchIssueFromUrl,
|
|
58
58
|
formatIssueContext,
|
|
@@ -177,7 +177,7 @@ import {
|
|
|
177
177
|
// package.json
|
|
178
178
|
var package_default = {
|
|
179
179
|
name: "ralphctl",
|
|
180
|
-
version: "0.4.0",
|
|
180
|
+
version: "0.4.2",
|
|
181
181
|
description: "Agent harness for long-running AI coding tasks \u2014 orchestrates Claude Code & GitHub Copilot across repositories",
|
|
182
182
|
homepage: "https://github.com/lukas-grigis/ralphctl",
|
|
183
183
|
type: "module",
|
|
@@ -1681,6 +1681,37 @@ function collectRepoIds(tasks) {
|
|
|
1681
1681
|
|
|
1682
1682
|
// src/business/usecases/execute.ts
|
|
1683
1683
|
import { basename } from "path";
|
|
1684
|
+
|
|
1685
|
+
// src/business/usecases/recover-dirty-tree.ts
|
|
1686
|
+
async function recoverDirtyTree(deps, params) {
|
|
1687
|
+
const { external, logger, signalBus } = deps;
|
|
1688
|
+
const { sprintId, taskId, taskName, repoPath } = params;
|
|
1689
|
+
if (!external.hasUncommittedChanges(repoPath)) return;
|
|
1690
|
+
logger.warn(
|
|
1691
|
+
`Dirty tree after "${taskName}" \u2014 auto-committing on the harness's behalf. The agent should commit its own work; see prompt guidance.`,
|
|
1692
|
+
{ taskId, projectPath: repoPath }
|
|
1693
|
+
);
|
|
1694
|
+
signalBus.emit({
|
|
1695
|
+
type: "signal",
|
|
1696
|
+
signal: {
|
|
1697
|
+
type: "note",
|
|
1698
|
+
text: `harness auto-commit: dirty tree after task "${taskName}" settlement`,
|
|
1699
|
+
timestamp: /* @__PURE__ */ new Date()
|
|
1700
|
+
},
|
|
1701
|
+
ctx: { sprintId, taskId, projectPath: repoPath }
|
|
1702
|
+
});
|
|
1703
|
+
const message = `chore(harness): auto-commit leftover changes from "${taskName}"`;
|
|
1704
|
+
try {
|
|
1705
|
+
await external.autoCommit(repoPath, message);
|
|
1706
|
+
} catch (err) {
|
|
1707
|
+
logger.error(`Auto-commit failed in ${repoPath}: ${err instanceof Error ? err.message : String(err)}`, {
|
|
1708
|
+
taskId,
|
|
1709
|
+
projectPath: repoPath
|
|
1710
|
+
});
|
|
1711
|
+
}
|
|
1712
|
+
}
|
|
1713
|
+
|
|
1714
|
+
// src/business/usecases/execute.ts
|
|
1684
1715
|
var ExecuteTasksUseCase = class {
|
|
1685
1716
|
constructor(persistence, aiSession, promptBuilder, parser, ui, logger, external, fs, signalParser2, signalHandler, signalBus) {
|
|
1686
1717
|
this.persistence = persistence;
|
|
@@ -1940,6 +1971,10 @@ ${instructions}`;
|
|
|
1940
1971
|
);
|
|
1941
1972
|
if (finishStatus === "done") finishStatus = "failed";
|
|
1942
1973
|
}
|
|
1974
|
+
await recoverDirtyTree(
|
|
1975
|
+
{ external: this.external, logger: this.logger, signalBus: this.signalBus },
|
|
1976
|
+
{ sprintId: sprint.id, taskId: syntheticTask.id, taskName: syntheticTask.name, repoPath }
|
|
1977
|
+
);
|
|
1943
1978
|
this.signalBus.emit({
|
|
1944
1979
|
type: "task-finished",
|
|
1945
1980
|
sprintId: sprint.id,
|
|
@@ -2760,6 +2795,19 @@ function evaluateTask(deps) {
|
|
|
2760
2795
|
});
|
|
2761
2796
|
}
|
|
2762
2797
|
|
|
2798
|
+
// src/business/pipelines/execute/steps/recover-dirty-tree.ts
|
|
2799
|
+
function recoverDirtyTree2(deps) {
|
|
2800
|
+
return step("recover-dirty-tree", async (ctx) => {
|
|
2801
|
+
const { task, sprint } = ctx;
|
|
2802
|
+
const repoPath = await deps.persistence.resolveRepoPath(task.repoId);
|
|
2803
|
+
await recoverDirtyTree(
|
|
2804
|
+
{ external: deps.external, logger: deps.logger, signalBus: deps.signalBus },
|
|
2805
|
+
{ sprintId: sprint.id, taskId: task.id, taskName: task.name, repoPath }
|
|
2806
|
+
);
|
|
2807
|
+
return Result.ok({});
|
|
2808
|
+
});
|
|
2809
|
+
}
|
|
2810
|
+
|
|
2763
2811
|
// src/business/pipelines/execute/steps/mark-done.ts
|
|
2764
2812
|
function markDone(deps) {
|
|
2765
2813
|
return step("mark-done", async (ctx) => {
|
|
@@ -2823,6 +2871,14 @@ function createPerTaskPipeline(deps, useCase, options = {}) {
|
|
|
2823
2871
|
options
|
|
2824
2872
|
})
|
|
2825
2873
|
),
|
|
2874
|
+
trace(
|
|
2875
|
+
recoverDirtyTree2({
|
|
2876
|
+
persistence: deps.persistence,
|
|
2877
|
+
external: deps.external,
|
|
2878
|
+
logger: deps.logger,
|
|
2879
|
+
signalBus: deps.signalBus
|
|
2880
|
+
})
|
|
2881
|
+
),
|
|
2826
2882
|
trace(markDone({ persistence: deps.persistence, logger: deps.logger, signalBus: deps.signalBus }))
|
|
2827
2883
|
]);
|
|
2828
2884
|
}
|
|
@@ -4183,9 +4239,12 @@ function composePrompt(template, substitutions) {
|
|
|
4183
4239
|
}
|
|
4184
4240
|
return result;
|
|
4185
4241
|
}
|
|
4242
|
+
var CHECK_GATE_EXAMPLE = "Run the project's check gate \u2014 all pass (omit this step when the project has no check script)";
|
|
4186
4243
|
function buildPlanCommon(projectToolingSection) {
|
|
4187
4244
|
return composePrompt(loadPartial("plan-common"), {
|
|
4188
|
-
|
|
4245
|
+
PLAN_COMMON_EXAMPLES: loadPartial("plan-common-examples"),
|
|
4246
|
+
PROJECT_TOOLING: projectToolingSection,
|
|
4247
|
+
CHECK_GATE_EXAMPLE
|
|
4189
4248
|
});
|
|
4190
4249
|
}
|
|
4191
4250
|
function buildPlannerBase(projectToolingSection) {
|
|
@@ -4193,7 +4252,8 @@ function buildPlannerBase(projectToolingSection) {
|
|
|
4193
4252
|
HARNESS_CONTEXT: loadPartial("harness-context"),
|
|
4194
4253
|
COMMON: buildPlanCommon(projectToolingSection),
|
|
4195
4254
|
VALIDATION: loadPartial("validation-checklist"),
|
|
4196
|
-
SIGNALS: loadPartial("signals-planning")
|
|
4255
|
+
SIGNALS: loadPartial("signals-planning"),
|
|
4256
|
+
CHECK_GATE_EXAMPLE
|
|
4197
4257
|
};
|
|
4198
4258
|
}
|
|
4199
4259
|
function buildInteractivePrompt(context, outputFile, schema, projectToolingSection) {
|
|
@@ -4212,9 +4272,13 @@ function buildAutoPrompt(context, schema, projectToolingSection) {
|
|
|
4212
4272
|
});
|
|
4213
4273
|
}
|
|
4214
4274
|
function buildTaskExecutionPrompt(progressFilePath, noCommit, contextFileName, projectToolingSection = "") {
|
|
4215
|
-
|
|
4216
|
-
|
|
4217
|
-
|
|
4275
|
+
let template = loadTemplate("task-execution");
|
|
4276
|
+
if (noCommit) {
|
|
4277
|
+
template = template.replace(/^[ \t]*\{\{COMMIT_STEP\}\}\n/m, "\n");
|
|
4278
|
+
template = template.replace(/^[ \t]*\{\{COMMIT_CONSTRAINT\}\}\n/m, "");
|
|
4279
|
+
}
|
|
4280
|
+
const commitStep = noCommit ? "" : " - **Before continuing:** Create a git commit with a descriptive message for the changes made.";
|
|
4281
|
+
const commitConstraint = noCommit ? "" : "- **Must commit** \u2014 Create a git commit before signaling completion.";
|
|
4218
4282
|
return composePrompt(template, {
|
|
4219
4283
|
HARNESS_CONTEXT: loadPartial("harness-context"),
|
|
4220
4284
|
SIGNALS: loadPartial("signals-task"),
|
|
@@ -4227,11 +4291,16 @@ function buildTaskExecutionPrompt(progressFilePath, noCommit, contextFileName, p
|
|
|
4227
4291
|
}
|
|
4228
4292
|
function buildTicketRefinePrompt(ticketContent, outputFile, schema, issueContext = "") {
|
|
4229
4293
|
const template = loadTemplate("ticket-refine");
|
|
4294
|
+
const issueContextSection = issueContext ? `<context>
|
|
4295
|
+
|
|
4296
|
+
${issueContext}
|
|
4297
|
+
|
|
4298
|
+
</context>` : "";
|
|
4230
4299
|
return composePrompt(template, {
|
|
4231
4300
|
TICKET: ticketContent,
|
|
4232
4301
|
OUTPUT_FILE: outputFile,
|
|
4233
4302
|
SCHEMA: schema,
|
|
4234
|
-
ISSUE_CONTEXT:
|
|
4303
|
+
ISSUE_CONTEXT: issueContextSection
|
|
4235
4304
|
});
|
|
4236
4305
|
}
|
|
4237
4306
|
function buildIdeatePrompt(ideaTitle, ideaDescription, projectName, repositories, outputFile, schema, projectToolingSection) {
|
|
@@ -4260,9 +4329,10 @@ function renderExtraDimensions(extras) {
|
|
|
4260
4329
|
return { section: "", passBar: "", assessment: "" };
|
|
4261
4330
|
}
|
|
4262
4331
|
const section = extras.map(
|
|
4263
|
-
(name
|
|
4264
|
-
|
|
4332
|
+
(name) => `
|
|
4333
|
+
<dimension name="${name}" floor="false">
|
|
4265
4334
|
Additional task-specific dimension flagged by the planner. Apply judgment to whether the implementation satisfies this dimension given the task's verification criteria and steps.
|
|
4335
|
+
</dimension>
|
|
4266
4336
|
`
|
|
4267
4337
|
).join("");
|
|
4268
4338
|
const passBar = extras.map((name) => `
|
|
@@ -5217,6 +5287,25 @@ function hasUncommittedChanges(cwd) {
|
|
|
5217
5287
|
}
|
|
5218
5288
|
return result.stdout.trim().length > 0;
|
|
5219
5289
|
}
|
|
5290
|
+
function autoCommit(cwd, message) {
|
|
5291
|
+
assertSafeCwd(cwd);
|
|
5292
|
+
const add = spawnSync3("git", ["add", "-A"], {
|
|
5293
|
+
cwd,
|
|
5294
|
+
encoding: "utf-8",
|
|
5295
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
5296
|
+
});
|
|
5297
|
+
if (add.status !== 0) {
|
|
5298
|
+
throw new Error(`Failed to stage changes in ${cwd}: ${add.stderr.trim()}`);
|
|
5299
|
+
}
|
|
5300
|
+
const commit = spawnSync3("git", ["commit", "-m", message], {
|
|
5301
|
+
cwd,
|
|
5302
|
+
encoding: "utf-8",
|
|
5303
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
5304
|
+
});
|
|
5305
|
+
if (commit.status !== 0) {
|
|
5306
|
+
throw new Error(`Failed to commit in ${cwd}: ${commit.stderr.trim() || commit.stdout.trim()}`);
|
|
5307
|
+
}
|
|
5308
|
+
}
|
|
5220
5309
|
function generateBranchName(sprintId) {
|
|
5221
5310
|
return `ralphctl/${sprintId}`;
|
|
5222
5311
|
}
|
|
@@ -5276,6 +5365,10 @@ var DefaultExternalAdapter = class {
|
|
|
5276
5365
|
hasUncommittedChanges(projectPath) {
|
|
5277
5366
|
return hasUncommittedChanges(projectPath);
|
|
5278
5367
|
}
|
|
5368
|
+
autoCommit(projectPath, message) {
|
|
5369
|
+
autoCommit(projectPath, message);
|
|
5370
|
+
return Promise.resolve();
|
|
5371
|
+
}
|
|
5279
5372
|
createAndCheckoutBranch(projectPath, branchName) {
|
|
5280
5373
|
createAndCheckoutBranch(projectPath, branchName);
|
|
5281
5374
|
}
|
package/dist/cli.mjs
CHANGED
|
@@ -41,7 +41,7 @@ import {
|
|
|
41
41
|
ticketRefineCommand,
|
|
42
42
|
ticketRemoveCommand,
|
|
43
43
|
ticketShowCommand
|
|
44
|
-
} from "./chunk-MRN3Z2XC.mjs";
|
|
44
|
+
} from "./chunk-GL7MKLLS.mjs";
|
|
45
45
|
import {
|
|
46
46
|
projectAddCommand
|
|
47
47
|
} from "./chunk-D2YGPLIV.mjs";
|
|
@@ -55,7 +55,7 @@ import "./chunk-NUYQK5MN.mjs";
|
|
|
55
55
|
import {
|
|
56
56
|
getTasks,
|
|
57
57
|
sprintStartCommand
|
|
58
|
-
} from "./chunk-JYCGQA2D.mjs";
|
|
58
|
+
} from "./chunk-TKPTT2UG.mjs";
|
|
59
59
|
import {
|
|
60
60
|
truncate
|
|
61
61
|
} from "./chunk-JOQO4HMM.mjs";
|
|
@@ -705,7 +705,7 @@ async function main() {
|
|
|
705
705
|
const isBare = argv.length <= 2;
|
|
706
706
|
const isInteractive = argv[2] === "interactive";
|
|
707
707
|
if (isBare || isInteractive) {
|
|
708
|
-
const { mountInkApp } = await import("./mount-XMN3S4W6.mjs");
|
|
708
|
+
const { mountInkApp } = await import("./mount-ISHZM36X.mjs");
|
|
709
709
|
const { fallback } = await mountInkApp({ initialView: "repl" });
|
|
710
710
|
if (!fallback) return;
|
|
711
711
|
printBanner();
|
|
@@ -716,10 +716,10 @@ async function main() {
|
|
|
716
716
|
return;
|
|
717
717
|
}
|
|
718
718
|
if (argv[2] === "sprint" && argv[3] === "start") {
|
|
719
|
-
const { parseSprintStartArgs } = await import("./start-D35SOXMM.mjs");
|
|
719
|
+
const { parseSprintStartArgs } = await import("./start-76JKJQIH.mjs");
|
|
720
720
|
const parsed = parseSprintStartArgs(argv.slice(4));
|
|
721
721
|
if (parsed.ok) {
|
|
722
|
-
const { mountInkApp } = await import("./mount-XMN3S4W6.mjs");
|
|
722
|
+
const { mountInkApp } = await import("./mount-ISHZM36X.mjs");
|
|
723
723
|
const { getSharedDeps } = await import("./bootstrap-FMHG6DRY.mjs");
|
|
724
724
|
let sprintId;
|
|
725
725
|
try {
|
|
@@ -62,7 +62,7 @@ import {
|
|
|
62
62
|
ticketShowCommand,
|
|
63
63
|
useCurrentPrompt,
|
|
64
64
|
validateConfigValue
|
|
65
|
-
} from "./chunk-MRN3Z2XC.mjs";
|
|
65
|
+
} from "./chunk-GL7MKLLS.mjs";
|
|
66
66
|
import {
|
|
67
67
|
PromptCancelledError,
|
|
68
68
|
projectAddCommand
|
|
@@ -109,7 +109,7 @@ import {
|
|
|
109
109
|
sprintStartCommand,
|
|
110
110
|
updateTaskStatus,
|
|
111
111
|
withSuspendedTui
|
|
112
|
-
} from "./chunk-JYCGQA2D.mjs";
|
|
112
|
+
} from "./chunk-TKPTT2UG.mjs";
|
|
113
113
|
import {
|
|
114
114
|
addTicket,
|
|
115
115
|
allRequirementsApproved,
|
|
@@ -11,6 +11,27 @@ When finished, emit a signal from the `<signals>` block below.
|
|
|
11
11
|
|
|
12
12
|
## Two-Phase Protocol
|
|
13
13
|
|
|
14
|
+
### Phase 0: Think Before Writing
|
|
15
|
+
|
|
16
|
+
Before emitting any JSON, write your reasoning in a `<thinking>…</thinking>` block. Use it to interrogate the idea —
|
|
17
|
+
surface hidden assumptions, identify the real user problem, sketch requirements, and reason about which repositories
|
|
18
|
+
and dependencies the work touches. Explicit reasoning produces sharper output than jumping straight to JSON.
|
|
19
|
+
|
|
20
|
+
The harness's JSON extractor skips everything before the first `{`, so the `<thinking>` block is stripped
|
|
21
|
+
automatically — but the JSON object itself must still be emitted without markdown fences or commentary after it.
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
<thinking>
|
|
25
|
+
The idea says "webhook notifications" but doesn't say which events. Reviewing the API, the natural candidates are
|
|
26
|
+
task-status transitions. Scope = status-change webhooks only; other event types are out of scope.
|
|
27
|
+
Acceptance: POST to configured URL with JSON payload on task status change; retries on 5xx.
|
|
28
|
+
…
|
|
29
|
+
</thinking>
|
|
30
|
+
{
|
|
31
|
+
… JSON object …
|
|
32
|
+
}
|
|
33
|
+
```
|
|
34
|
+
|
|
14
35
|
### Phase 1: Refine Requirements (WHAT)
|
|
15
36
|
|
|
16
37
|
Analyze the idea and produce complete, implementation-agnostic requirements:
|
|
@@ -87,6 +108,8 @@ If you cannot produce a valid plan, signal the issue instead of outputting incom
|
|
|
87
108
|
|
|
88
109
|
- `<planning-blocked>reason</planning-blocked>`
|
|
89
110
|
|
|
111
|
+
<context>
|
|
112
|
+
|
|
90
113
|
## Idea to Implement
|
|
91
114
|
|
|
92
115
|
**Title:** {{IDEA_TITLE}}
|
|
@@ -107,6 +130,8 @@ You have access to these repositories:
|
|
|
107
130
|
|
|
108
131
|
{{COMMON}}
|
|
109
132
|
|
|
133
|
+
</context>
|
|
134
|
+
|
|
110
135
|
{{VALIDATION}}
|
|
111
136
|
|
|
112
137
|
## Output Format
|
|
@@ -148,7 +173,7 @@ If you cannot produce a valid plan, output `<planning-blocked>reason</planning-b
|
|
|
148
173
|
"Update src/repositories/export.ts findExports() to add WHERE clause for date filtering",
|
|
149
174
|
"Add unit tests in src/schemas/__tests__/date-range.test.ts covering valid ranges, invalid formats, and reversed dates",
|
|
150
175
|
"Add integration test in src/controllers/__tests__/export.test.ts for filtered and unfiltered queries",
|
|
151
|
-
"
|
|
176
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
152
177
|
],
|
|
153
178
|
"verificationCriteria": [
|
|
154
179
|
"TypeScript compiles with no errors",
|
package/dist/prompts/ideate.md
CHANGED
|
@@ -118,6 +118,8 @@ Focus: Determine HOW to implement the approved requirements
|
|
|
118
118
|
|
|
119
119
|
{{VALIDATION}}
|
|
120
120
|
|
|
121
|
+
<context>
|
|
122
|
+
|
|
121
123
|
## Idea to Refine and Plan
|
|
122
124
|
|
|
123
125
|
**Title:** {{IDEA_TITLE}}
|
|
@@ -141,6 +143,8 @@ mention it as an observation.
|
|
|
141
143
|
|
|
142
144
|
{{COMMON}}
|
|
143
145
|
|
|
146
|
+
</context>
|
|
147
|
+
|
|
144
148
|
## Output Format
|
|
145
149
|
|
|
146
150
|
When BOTH phases are approved by the user, write the JSON to: {{OUTPUT_FILE}}
|
|
@@ -169,7 +173,7 @@ Use this exact JSON Schema:
|
|
|
169
173
|
"Update ExportController.getExport() in src/controllers/export.ts to parse and validate date range params",
|
|
170
174
|
"Add date range filtering to ExportRepository.findRecords() in src/repositories/export.ts",
|
|
171
175
|
"Write tests in src/controllers/__tests__/export.test.ts for: no dates, valid range, invalid range, start > end",
|
|
172
|
-
"
|
|
176
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
173
177
|
],
|
|
174
178
|
"verificationCriteria": [
|
|
175
179
|
"TypeScript compiles with no errors",
|
|
@@ -12,6 +12,27 @@ When finished, emit a signal from the `<signals>` block below.
|
|
|
12
12
|
|
|
13
13
|
## Protocol
|
|
14
14
|
|
|
15
|
+
### Step 0: Think Before Writing
|
|
16
|
+
|
|
17
|
+
Before emitting any JSON, write your reasoning in a `<thinking>…</thinking>` block. Use it to work through the problem
|
|
18
|
+
— map tickets to repositories, reason about dependencies, identify risks, and decide on task boundaries. Explicit
|
|
19
|
+
reasoning produces sharper plans than jumping straight to output.
|
|
20
|
+
|
|
21
|
+
The harness's JSON extractor skips everything before the first `[`, so the `<thinking>` block is stripped
|
|
22
|
+
automatically — but the JSON array itself must still be emitted without markdown fences or commentary after it.
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
<thinking>
|
|
26
|
+
Ticket 1 touches both the API and the worker repo — split into two tasks with a blockedBy edge.
|
|
27
|
+
The shared schema change must land first so the worker can import it.
|
|
28
|
+
Verification criterion for the API task: a contract test against the new schema.
|
|
29
|
+
…
|
|
30
|
+
</thinking>
|
|
31
|
+
[
|
|
32
|
+
{ … JSON array … }
|
|
33
|
+
]
|
|
34
|
+
```
|
|
35
|
+
|
|
15
36
|
### Step 1: Explore the Project
|
|
16
37
|
|
|
17
38
|
Scope exploration to what will change the plan — read instruction files first, then only the specific files you need
|
|
@@ -55,10 +76,14 @@ The sprint contains:
|
|
|
55
76
|
- **Existing Tasks**: Tasks from a previous planning run (your output replaces all existing tasks)
|
|
56
77
|
- **Projects**: Each ticket belongs to a project which may have multiple repository paths
|
|
57
78
|
|
|
79
|
+
<context>
|
|
80
|
+
|
|
58
81
|
{{CONTEXT}}
|
|
59
82
|
|
|
60
83
|
{{COMMON}}
|
|
61
84
|
|
|
85
|
+
</context>
|
|
86
|
+
|
|
62
87
|
### Step 5: Handle Blockers
|
|
63
88
|
|
|
64
89
|
If you cannot produce a valid task breakdown, signal the issue instead of outputting incomplete JSON:
|
|
@@ -73,6 +98,9 @@ If you cannot produce a valid task breakdown, signal the issue instead of output
|
|
|
73
98
|
|
|
74
99
|
## Output
|
|
75
100
|
|
|
101
|
+
Your output MAY begin with a `<thinking>…</thinking>` block — the harness's JSON extractor skips everything before the
|
|
102
|
+
first `[`. The JSON array itself must still be emitted without markdown fences or surrounding prose.
|
|
103
|
+
|
|
76
104
|
Output only the JSON document matching the schema below — the harness parses your raw output directly as JSON, so emit
|
|
77
105
|
it without markdown fences, commentary, or surrounding prose. If you cannot produce tasks, output a
|
|
78
106
|
`<planning-blocked>` signal instead.
|
|
@@ -102,7 +130,7 @@ JSON Schema:
|
|
|
102
130
|
"steps": [
|
|
103
131
|
"Create src/utils/validation.ts with validateEmail(), validatePhone(), validateDateRange()",
|
|
104
132
|
"Add corresponding unit tests in src/utils/__tests__/validation.test.ts covering valid inputs, invalid inputs, and edge cases (empty strings, unicode)",
|
|
105
|
-
"
|
|
133
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
106
134
|
],
|
|
107
135
|
"verificationCriteria": [
|
|
108
136
|
"TypeScript compiles with no errors",
|
|
@@ -123,7 +151,7 @@ JSON Schema:
|
|
|
123
151
|
"Wire up validation from src/utils/validation.ts with inline error messages",
|
|
124
152
|
"Add form submission handler that calls POST /api/users",
|
|
125
153
|
"Write component tests in src/components/__tests__/RegistrationForm.test.ts for valid submission, validation errors, and API failure",
|
|
126
|
-
"
|
|
154
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
127
155
|
],
|
|
128
156
|
"verificationCriteria": [
|
|
129
157
|
"TypeScript compiles with no errors",
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
<examples>
|
|
2
|
+
|
|
3
|
+
The illustrations below are non-normative — they show good/bad shapes for the rules stated in `plan-common.md`. Use
|
|
4
|
+
them as calibration, not templates to copy literally.
|
|
5
|
+
|
|
6
|
+
## Verification Criteria — good vs bad
|
|
7
|
+
|
|
8
|
+
> **Good criteria (verifiable, unambiguous):**
|
|
9
|
+
>
|
|
10
|
+
> - "TypeScript compiles with no errors"
|
|
11
|
+
> - "All existing tests pass plus new tests for the added feature"
|
|
12
|
+
> - "GET /api/users returns 200 with paginated user list"
|
|
13
|
+
> - "GET /api/users?page=-1 returns 400 with validation error"
|
|
14
|
+
> - "Component renders without console errors in browser"
|
|
15
|
+
> - "Playwright e2e: login flow completes without errors" _(UI tasks with Playwright configured)_
|
|
16
|
+
|
|
17
|
+
> **Bad criteria (vague, not independently verifiable):**
|
|
18
|
+
>
|
|
19
|
+
> - "Code is clean and well-structured"
|
|
20
|
+
> - "Error handling is appropriate"
|
|
21
|
+
> - "Performance is acceptable"
|
|
22
|
+
|
|
23
|
+
## Dependency Graph — good vs bad
|
|
24
|
+
|
|
25
|
+
### Good Dependency Graph
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
Task 1: Add shared validation utilities (no deps)
|
|
29
|
+
Task 2: Implement user registration form (blockedBy: [1])
|
|
30
|
+
Task 3: Implement user profile editor (blockedBy: [1])
|
|
31
|
+
Task 4: Add form submission analytics (blockedBy: [2, 3])
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Tasks 2 and 3 run in parallel (both depend only on 1). Task 4 waits for both.
|
|
35
|
+
|
|
36
|
+
### Bad Dependency Graph
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
Task 1: Add validation utilities (no deps)
|
|
40
|
+
Task 2: Implement registration form (blockedBy: [1])
|
|
41
|
+
Task 3: Implement profile editor (blockedBy: [2]) <-- WRONG
|
|
42
|
+
Task 4: Add submission analytics (blockedBy: [3]) <-- WRONG
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Task 3 does not actually need Task 2 — it only needs Task 1. This creates a false serial chain that prevents parallel
|
|
46
|
+
execution.
|
|
47
|
+
|
|
48
|
+
## Precise Steps — good vs bad
|
|
49
|
+
|
|
50
|
+
Bad — vague steps that force the agent to guess:
|
|
51
|
+
|
|
52
|
+
```json
|
|
53
|
+
{
|
|
54
|
+
"name": "Add user authentication",
|
|
55
|
+
"steps": ["Implement auth", "Add tests", "Update docs"]
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Good — precise steps with file paths and pattern references:
|
|
60
|
+
|
|
61
|
+
```json
|
|
62
|
+
{
|
|
63
|
+
"name": "Add user authentication",
|
|
64
|
+
"projectPath": "/Users/dev/my-app",
|
|
65
|
+
"steps": [
|
|
66
|
+
"Create auth service in src/services/auth.ts with login(), logout(), getCurrentUser() — follow the pattern in src/services/user.ts for error handling and return types",
|
|
67
|
+
"Add AuthContext provider in src/contexts/AuthContext.tsx wrapping the app — follow existing ThemeContext pattern",
|
|
68
|
+
"Create useAuth hook in src/hooks/useAuth.ts exposing auth state and actions",
|
|
69
|
+
"Add ProtectedRoute wrapper component in src/components/ProtectedRoute.tsx",
|
|
70
|
+
"Write unit tests in src/services/__tests__/auth.test.ts — follow test patterns in src/services/__tests__/user.test.ts",
|
|
71
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
72
|
+
],
|
|
73
|
+
"verificationCriteria": [
|
|
74
|
+
"TypeScript compiles with no errors",
|
|
75
|
+
"All existing tests pass plus new auth tests",
|
|
76
|
+
"ProtectedRoute redirects unauthenticated users to /login",
|
|
77
|
+
"useAuth hook exposes isAuthenticated, user, login, and logout"
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
</examples>
|
|
@@ -1,17 +1,22 @@
|
|
|
1
1
|
## Project Resources
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
During exploration, check for project instruction files if present. Treat whichever files exist as authoritative for
|
|
4
|
+
that codebase; skip silently when absent.
|
|
5
|
+
|
|
6
|
+
**Instruction files (any ecosystem):**
|
|
7
|
+
|
|
8
|
+
- **`CLAUDE.md` / `AGENTS.md`** — when present: project-level rules, conventions, and persistent memory
|
|
9
|
+
- **`.github/copilot-instructions.md`** — when present: GitHub Copilot-specific repository instructions
|
|
10
|
+
- **`README.md`** and manifest files (`package.json`, `pyproject.toml`, `Cargo.toml`, `go.mod`, `pom.xml`, …) — setup,
|
|
11
|
+
scripts, and dependencies
|
|
12
|
+
|
|
13
|
+
**Claude-specific configuration (only when the repo has a `.claude/` directory):**
|
|
5
14
|
|
|
6
|
-
- **`CLAUDE.md` / `AGENTS.md`** — project-level rules, conventions, and persistent memory
|
|
7
|
-
- **`.github/copilot-instructions.md`** — GitHub Copilot-specific repository instructions, when present
|
|
8
15
|
- **`.mcp.json`** — MCP servers the project ships with (Playwright, database inspection, etc.)
|
|
9
16
|
- **`.claude/agents/`** — subagent definitions for Task-tool delegation
|
|
10
17
|
- **`.claude/skills/`** — custom skills invokable with the Skill tool for project-specific workflows
|
|
11
18
|
- **`.claude/settings.json`** / **`.claude/settings.local.json`** — tool permissions, model preferences, hooks
|
|
12
19
|
|
|
13
|
-
When repository instruction files exist, treat their instructions as authoritative for that codebase.
|
|
14
|
-
|
|
15
20
|
## What Makes a Great Task
|
|
16
21
|
|
|
17
22
|
A great task can be picked up cold by an AI agent, implemented independently, and verified as done — by a _different_ AI
|
|
@@ -63,6 +68,8 @@ Right size (one task covering the full change):
|
|
|
63
68
|
|
|
64
69
|
### Verification Criteria (The Evaluator Contract)
|
|
65
70
|
|
|
71
|
+
_See the `<examples>` block at the end of this page for good/bad pairs._
|
|
72
|
+
|
|
66
73
|
Every task must include a `verificationCriteria` array — these are the **done contract** between the generator (task
|
|
67
74
|
executor) and the evaluator (independent reviewer). The evaluator grades each criterion as pass/fail across four
|
|
68
75
|
floor dimensions: correctness, completeness, safety, and consistency. If ANY dimension fails, the task fails
|
|
@@ -86,21 +93,6 @@ Write criteria that are:
|
|
|
86
93
|
- **Unambiguous** — two reviewers would agree on pass/fail
|
|
87
94
|
- **Outcome-oriented** — describe WHAT is true when done, not HOW to get there
|
|
88
95
|
|
|
89
|
-
> **Good criteria (verifiable, unambiguous):**
|
|
90
|
-
>
|
|
91
|
-
> - "TypeScript compiles with no errors"
|
|
92
|
-
> - "All existing tests pass plus new tests for the added feature"
|
|
93
|
-
> - "GET /api/users returns 200 with paginated user list"
|
|
94
|
-
> - "GET /api/users?page=-1 returns 400 with validation error"
|
|
95
|
-
> - "Component renders without console errors in browser"
|
|
96
|
-
> - "Playwright e2e: login flow completes without errors" _(UI tasks with Playwright configured)_
|
|
97
|
-
|
|
98
|
-
> **Bad criteria (vague, not independently verifiable):**
|
|
99
|
-
>
|
|
100
|
-
> - "Code is clean and well-structured"
|
|
101
|
-
> - "Error handling is appropriate"
|
|
102
|
-
> - "Performance is acceptable"
|
|
103
|
-
|
|
104
96
|
Aim for 2-4 criteria per task. Include at least one criterion that is computationally checkable (test pass, type check,
|
|
105
97
|
lint clean). For **UI/frontend tasks**, if the project has Playwright configured, add a browser-verifiable criterion —
|
|
106
98
|
the evaluator will attempt visual verification using Playwright or browser tools when the project supports it.
|
|
@@ -108,7 +100,8 @@ the evaluator will attempt visual verification using Playwright or browser tools
|
|
|
108
100
|
### Guidelines
|
|
109
101
|
|
|
110
102
|
1. **Outcome-oriented** — Each task delivers a testable result
|
|
111
|
-
2. **Merge create+use** —
|
|
103
|
+
2. **Merge create+use** — Keep "create X" and "use X" in one task — except when a stable contract makes them
|
|
104
|
+
independently testable (e.g. schema + migration lands first, consumer wiring lands after)
|
|
112
105
|
3. **Let scope drive task count** — do not aim for a specific number. Fewer, larger coherent tasks beat many
|
|
113
106
|
micro-tasks; split only when parallelism or a clean boundary justifies it
|
|
114
107
|
4. **Merge serial chains** — If tasks only make sense when run in sequence, fold them into one task
|
|
@@ -134,6 +127,8 @@ the evaluator will attempt visual verification using Playwright or browser tools
|
|
|
134
127
|
|
|
135
128
|
## Dependency Graph
|
|
136
129
|
|
|
130
|
+
_See the `<examples>` block at the end of this page for good/bad pairs._
|
|
131
|
+
|
|
137
132
|
Tasks execute in dependency order — foundations before dependents.
|
|
138
133
|
|
|
139
134
|
### Guidelines
|
|
@@ -143,29 +138,6 @@ Tasks execute in dependency order — foundations before dependents.
|
|
|
143
138
|
3. **Maximize parallelism** — Only add `blockedBy` when there is a real code dependency
|
|
144
139
|
4. **Validate the DAG** — No cycles; earlier tasks cannot depend on later ones
|
|
145
140
|
|
|
146
|
-
### Good Dependency Graph
|
|
147
|
-
|
|
148
|
-
```
|
|
149
|
-
Task 1: Add shared validation utilities (no deps)
|
|
150
|
-
Task 2: Implement user registration form (blockedBy: [1])
|
|
151
|
-
Task 3: Implement user profile editor (blockedBy: [1])
|
|
152
|
-
Task 4: Add form submission analytics (blockedBy: [2, 3])
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
Tasks 2 and 3 run in parallel (both depend only on 1). Task 4 waits for both.
|
|
156
|
-
|
|
157
|
-
### Bad Dependency Graph
|
|
158
|
-
|
|
159
|
-
```
|
|
160
|
-
Task 1: Add validation utilities (no deps)
|
|
161
|
-
Task 2: Implement registration form (blockedBy: [1])
|
|
162
|
-
Task 3: Implement profile editor (blockedBy: [2]) <-- WRONG
|
|
163
|
-
Task 4: Add submission analytics (blockedBy: [3]) <-- WRONG
|
|
164
|
-
```
|
|
165
|
-
|
|
166
|
-
Task 3 does not actually need Task 2 — it only needs Task 1. This creates a false serial chain that prevents parallel
|
|
167
|
-
execution.
|
|
168
|
-
|
|
169
141
|
**Dependency test**: For each `blockedBy` entry, ask: "Does this task literally use code produced by the blocker?" If
|
|
170
142
|
not, remove the dependency.
|
|
171
143
|
|
|
@@ -177,10 +149,14 @@ Each task must specify which repository it executes in via `projectPath`:
|
|
|
177
149
|
2. **Split by repo** — If a ticket affects multiple repos, create separate tasks per repo with dependencies
|
|
178
150
|
3. **Use exact paths** — `projectPath` must be one of the absolute paths from the project's Repositories section
|
|
179
151
|
|
|
180
|
-
|
|
152
|
+
Split cross-repo work into one task per repo with `blockedBy` — except when atomicity is genuinely required (a
|
|
153
|
+
single commit must land in both repos to avoid broken state), in which case flag the task and surface the need for
|
|
154
|
+
human coordination.
|
|
181
155
|
|
|
182
156
|
## Precise Step Declarations
|
|
183
157
|
|
|
158
|
+
_See the `<examples>` block at the end of this page for good/bad pairs._
|
|
159
|
+
|
|
184
160
|
Every task must include explicit, actionable steps — the implementation checklist.
|
|
185
161
|
|
|
186
162
|
### Step Requirements
|
|
@@ -194,38 +170,6 @@ Every task must include explicit, actionable steps — the implementation checkl
|
|
|
194
170
|
instruction files
|
|
195
171
|
5. **No ambiguity** — Another developer should be able to follow steps without guessing
|
|
196
172
|
|
|
197
|
-
Bad — vague steps that force the agent to guess:
|
|
198
|
-
|
|
199
|
-
```json
|
|
200
|
-
{
|
|
201
|
-
"name": "Add user authentication",
|
|
202
|
-
"steps": ["Implement auth", "Add tests", "Update docs"]
|
|
203
|
-
}
|
|
204
|
-
```
|
|
205
|
-
|
|
206
|
-
Good — precise steps with file paths and pattern references:
|
|
207
|
-
|
|
208
|
-
```json
|
|
209
|
-
{
|
|
210
|
-
"name": "Add user authentication",
|
|
211
|
-
"projectPath": "/Users/dev/my-app",
|
|
212
|
-
"steps": [
|
|
213
|
-
"Create auth service in src/services/auth.ts with login(), logout(), getCurrentUser() — follow the pattern in src/services/user.ts for error handling and return types",
|
|
214
|
-
"Add AuthContext provider in src/contexts/AuthContext.tsx wrapping the app — follow existing ThemeContext pattern",
|
|
215
|
-
"Create useAuth hook in src/hooks/useAuth.ts exposing auth state and actions",
|
|
216
|
-
"Add ProtectedRoute wrapper component in src/components/ProtectedRoute.tsx",
|
|
217
|
-
"Write unit tests in src/services/__tests__/auth.test.ts — follow test patterns in src/services/__tests__/user.test.ts",
|
|
218
|
-
"Run pnpm typecheck && pnpm lint && pnpm test — all pass"
|
|
219
|
-
],
|
|
220
|
-
"verificationCriteria": [
|
|
221
|
-
"TypeScript compiles with no errors",
|
|
222
|
-
"All existing tests pass plus new auth tests",
|
|
223
|
-
"ProtectedRoute redirects unauthenticated users to /login",
|
|
224
|
-
"useAuth hook exposes isAuthenticated, user, login, and logout"
|
|
225
|
-
]
|
|
226
|
-
}
|
|
227
|
-
```
|
|
228
|
-
|
|
229
173
|
Use actual file paths discovered during exploration. Reference the repository instruction files for verification
|
|
230
174
|
commands.
|
|
231
175
|
|
|
@@ -234,6 +178,10 @@ commands.
|
|
|
234
178
|
Start with an action verb (Add, Create, Update, Fix, Refactor, Remove, Migrate). Include the feature/concept, not files.
|
|
235
179
|
Keep under 60 characters. Avoid vague verbs (Improve, Enhance, Handle).
|
|
236
180
|
|
|
181
|
+
See `<examples>` below for concrete good/bad pairs.
|
|
182
|
+
|
|
183
|
+
{{PLAN_COMMON_EXAMPLES}}
|
|
184
|
+
|
|
237
185
|
## Delegation to Available Tooling
|
|
238
186
|
|
|
239
187
|
The "Project Tooling" section below (when present) lists subagents, skills, and MCP servers detected in the target
|
|
@@ -72,7 +72,7 @@ before the plan is finalized.
|
|
|
72
72
|
**Steps:**
|
|
73
73
|
1. Create src/utils/csvExport.ts with column formatters for date, number, and string types
|
|
74
74
|
2. Add unit tests in src/utils/__tests__/csvExport.test.ts covering empty data, special characters, and large datasets
|
|
75
|
-
3. Run
|
|
75
|
+
3. Run the project's check/test/build gate — all pass
|
|
76
76
|
```
|
|
77
77
|
|
|
78
78
|
2. **Show the dependency graph** — Make it obvious which tasks run in parallel vs sequentially, and why each dependency
|
|
@@ -123,10 +123,14 @@ The sprint contains:
|
|
|
123
123
|
- **Existing Tasks**: Tasks from a previous planning run (your output replaces all existing tasks)
|
|
124
124
|
- **Projects**: Each ticket belongs to a project which may have multiple repository paths
|
|
125
125
|
|
|
126
|
+
<context>
|
|
127
|
+
|
|
126
128
|
{{CONTEXT}}
|
|
127
129
|
|
|
128
130
|
{{COMMON}}
|
|
129
131
|
|
|
132
|
+
</context>
|
|
133
|
+
|
|
130
134
|
### Repository Assignment
|
|
131
135
|
|
|
132
136
|
Repositories have been pre-selected by the user. Only create tasks targeting these repositories — the harness executes
|
|
@@ -166,7 +170,7 @@ Use this exact JSON Schema:
|
|
|
166
170
|
"Update ExportController.getExport() in src/controllers/export.ts to parse and validate date range params",
|
|
167
171
|
"Add date range filtering to ExportRepository.findRecords() in src/repositories/export.ts",
|
|
168
172
|
"Write tests in src/controllers/__tests__/export.test.ts for: no dates, valid range, invalid range, start > end",
|
|
169
|
-
"
|
|
173
|
+
"{{CHECK_GATE_EXAMPLE}}"
|
|
170
174
|
],
|
|
171
175
|
"verificationCriteria": [
|
|
172
176
|
"TypeScript compiles with no errors",
|
|
@@ -19,19 +19,26 @@ something entirely new (create a file, add a feature, tweak a script), do exactl
|
|
|
19
19
|
|
|
20
20
|
## User Feedback — Implement this
|
|
21
21
|
|
|
22
|
+
<task-specification>
|
|
23
|
+
|
|
22
24
|
{{FEEDBACK}}
|
|
23
25
|
|
|
26
|
+
</task-specification>
|
|
27
|
+
|
|
24
28
|
## Protocol
|
|
25
29
|
|
|
26
30
|
1. **Parse the feedback as an instruction** — Identify the concrete change(s) requested. If it says "create X", create
|
|
27
31
|
X. If it says "change Y", change Y. Do not ask for clarification unless the instruction is genuinely contradictory.
|
|
28
32
|
2. **Implement the change** — Create or edit the files required to satisfy the feedback. Make the smallest change that
|
|
29
33
|
fully carries out the instruction.
|
|
30
|
-
3. **Run verification** — If the project has a check script (
|
|
31
|
-
it passes. If no check script is configured, skip this step.
|
|
34
|
+
3. **Run verification** — If the project has a check script (test, typecheck, lint, or build command), run it and
|
|
35
|
+
confirm it passes. If no check script is configured, skip this step.
|
|
32
36
|
4. **Output verification results** — Wrap any verification output in `<task-verified>...</task-verified>`. If you
|
|
33
37
|
skipped step 3, emit `<task-verified>no check script configured; change applied</task-verified>`.
|
|
34
|
-
5. **
|
|
38
|
+
5. **Commit your work** — Stage the modified files and create a git commit with a descriptive message summarising the
|
|
39
|
+
feedback you implemented. The harness refuses to mark the task done with a dirty working tree.
|
|
40
|
+
6. **Signal completion** — Output `<task-complete>` once the change is applied, verification (if any) passed, and the
|
|
41
|
+
commit has landed.
|
|
35
42
|
|
|
36
43
|
Only signal `<task-blocked>reason</task-blocked>` if the feedback is literally impossible to carry out (e.g., asks
|
|
37
44
|
you to edit a file in a repository you don't have access to). Ambiguity is **not** a blocker — make a reasonable
|
|
@@ -42,6 +49,8 @@ interpretation and proceed.
|
|
|
42
49
|
- **The feedback is the authoritative instruction** — implement it even if it seems unrelated to the completed tasks.
|
|
43
50
|
- **Do the smallest change that fully satisfies the feedback** — no speculative refactors, no adjacent cleanup.
|
|
44
51
|
- **Make the edits — don't just describe them** — the harness does not apply edits for you; you must write the files.
|
|
52
|
+
- **Must commit** — Create a git commit before signaling completion. Uncommitted changes leave the sprint branch dirty
|
|
53
|
+
and block sprint close.
|
|
45
54
|
|
|
46
55
|
</constraints>
|
|
47
56
|
|
|
@@ -21,6 +21,11 @@ These verification criteria are the pre-agreed definition of "done" — your pri
|
|
|
21
21
|
|
|
22
22
|
## Review Protocol
|
|
23
23
|
|
|
24
|
+
**You are a reviewer — do not edit files.** If you believe a fix is needed, emit `<evaluation-failed>` with a concrete
|
|
25
|
+
critique; the harness will resume the generator to apply the fix. Do not run `git stash`, do not edit tests, do not
|
|
26
|
+
create commits. Your tools are read-only: `git status`, `git log`, `git diff`, file reads, and running existing check
|
|
27
|
+
scripts. Any write operation is a protocol violation.
|
|
28
|
+
|
|
24
29
|
You are working in this project directory:
|
|
25
30
|
|
|
26
31
|
```
|
|
@@ -37,7 +42,8 @@ Run deterministic checks first — these are cheap, fast, and authoritative.
|
|
|
37
42
|
|
|
38
43
|
1. **Run the check script** (if provided above) — this is the same gate the harness uses post-task. If it fails, the
|
|
39
44
|
implementation fails regardless of how good the code looks. Record the output.
|
|
40
|
-
2. **Run `git status`** —
|
|
45
|
+
2. **Run `git status`** — the tree MUST be clean. Uncommitted changes from the generator are a Completeness failure;
|
|
46
|
+
uncommitted changes from you are a protocol violation.
|
|
41
47
|
3. **Run `git log --oneline -10`** — identify which commits belong to this task
|
|
42
48
|
|
|
43
49
|
Computational results are ground truth. If the check script fails, stop early — the implementation does not pass.
|
|
@@ -52,10 +58,16 @@ Now apply semantic judgment to what the computational checks cannot catch:
|
|
|
52
58
|
2. **Read the changed files carefully** — understand the full implementation, not just the diff.
|
|
53
59
|
3. **Read surrounding code** — check that the implementation follows existing patterns and conventions.
|
|
54
60
|
4. **Augment the Project Tooling section above** — the section lists detected subagents, skills, and MCP servers.
|
|
55
|
-
Additionally skim
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
61
|
+
Additionally skim repository config for the test/verification stack and any conventions the section didn't surface.
|
|
62
|
+
Note which application type this is (backend API / CLI / frontend SPA / fullstack / library) — it determines which
|
|
63
|
+
verification methods apply.
|
|
64
|
+
|
|
65
|
+
<examples>
|
|
66
|
+
Representative files to scan when present — not an exhaustive list, adapt to the ecosystem:
|
|
67
|
+
`package.json`, `pyproject.toml`, `Cargo.toml`, `go.mod`, `playwright.config.*`, `cypress.config.*`,
|
|
68
|
+
`vitest.config.*`, `.storybook/`, `CLAUDE.md`, `AGENTS.md`, `.github/copilot-instructions.md`.
|
|
69
|
+
</examples>
|
|
70
|
+
|
|
59
71
|
5. **Run extended verification when the detected tooling makes it cheap and deterministic:**
|
|
60
72
|
- **Frontend/UI tasks** — if Playwright or Cypress is configured, run a targeted e2e test or use a browser MCP to
|
|
61
73
|
verify the changed UI renders correctly (console errors, layout, interactive behaviour).
|
|
@@ -72,14 +84,15 @@ Evaluate the implementation across the dimensions below. Each dimension is pass/
|
|
|
72
84
|
dimension fails, the overall evaluation fails. The first four are the floor — every task is graded on them. The
|
|
73
85
|
planner may have flagged additional task-specific dimensions; when present, they are graded on top of the floor.
|
|
74
86
|
|
|
75
|
-
|
|
87
|
+
<dimension name="Correctness" floor="true">
|
|
76
88
|
Does the implementation do what the specification says? Check for:
|
|
77
89
|
|
|
78
90
|
- Logical errors, off-by-one, race conditions, type issues
|
|
79
91
|
- Behavior matches each verification criterion (grade each one explicitly)
|
|
80
92
|
- Edge cases handled where specified
|
|
93
|
+
</dimension>
|
|
81
94
|
|
|
82
|
-
|
|
95
|
+
<dimension name="Completeness" floor="true">
|
|
83
96
|
Is the full specification implemented? Check for:
|
|
84
97
|
|
|
85
98
|
- Every verification criterion is satisfied (not just most)
|
|
@@ -87,25 +100,29 @@ Is the full specification implemented? Check for:
|
|
|
87
100
|
- No TODO/FIXME/HACK markers left behind that indicate unfinished work
|
|
88
101
|
- No uncommitted changes that look like incomplete work (WIP diffs, stashed edits) — committing is expected unless the
|
|
89
102
|
task's contract says otherwise
|
|
103
|
+
</dimension>
|
|
90
104
|
|
|
91
|
-
|
|
105
|
+
<dimension name="Safety" floor="true">
|
|
92
106
|
Are there security or reliability issues? Check for:
|
|
93
107
|
|
|
94
108
|
- Injection vulnerabilities (SQL, command, XSS)
|
|
95
109
|
- Validation gaps on external input
|
|
96
110
|
- Exposed secrets, hardcoded credentials
|
|
97
111
|
- Unsafe error handling that leaks internals
|
|
112
|
+
</dimension>
|
|
98
113
|
|
|
99
|
-
|
|
114
|
+
<dimension name="Consistency" floor="true">
|
|
100
115
|
Does the implementation fit the codebase? Check for:
|
|
101
116
|
|
|
102
117
|
- Follows existing patterns and conventions (naming, structure, error handling)
|
|
103
118
|
- Uses existing utilities instead of reinventing them
|
|
104
119
|
- No unnecessary changes outside the task scope — spec drift
|
|
105
120
|
- Test patterns match the project's existing test style
|
|
121
|
+
</dimension>
|
|
106
122
|
{{EXTRA_DIMENSIONS_SECTION}}
|
|
107
|
-
|
|
108
|
-
|
|
123
|
+
|
|
124
|
+
Evaluate only what was asked vs what was delivered — suggesting improvements beyond the task scope creates noise that
|
|
125
|
+
distracts from the actual pass/fail decision.
|
|
109
126
|
|
|
110
127
|
### Pass Bar
|
|
111
128
|
|
|
@@ -159,6 +176,8 @@ Each issue must reference which dimension it violates.]
|
|
|
159
176
|
|
|
160
177
|
### Calibration Examples
|
|
161
178
|
|
|
179
|
+
<examples>
|
|
180
|
+
|
|
162
181
|
**Example of a correct PASS:**
|
|
163
182
|
|
|
164
183
|
> Task: "Add date validation to export endpoint"
|
|
@@ -187,6 +206,8 @@ Each issue must reference which dimension it violates.]
|
|
|
187
206
|
> 2. [Safety] `src/repositories/users.ts:23` — `WHERE name LIKE '%${query}%'` is SQL injection. Use parameterized
|
|
188
207
|
> query: `WHERE name LIKE $1` with `%${query}%` as parameter.
|
|
189
208
|
|
|
209
|
+
</examples>
|
|
210
|
+
|
|
190
211
|
Be direct and specific — point to files, lines, and concrete problems.
|
|
191
212
|
|
|
192
213
|
{{SIGNALS}}
|
|
@@ -15,16 +15,17 @@ When finished, emit a signal from the `<signals>` block below.
|
|
|
15
15
|
- **Respect task boundaries** — complete exactly the declared steps for this one task, then stop. Other agents may be
|
|
16
16
|
working on neighboring tasks in parallel; skipping steps, improvising, or editing files outside the declared set
|
|
17
17
|
causes merge conflicts with their work.
|
|
18
|
-
- **Prefer fixing the code over the test** — a failing test usually indicates a bug in the implementation. Update
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
- **Prefer fixing the code over the test** — a failing test usually indicates a bug in the implementation. Update
|
|
19
|
+
tests only when the declared steps intentionally change the asserted behaviour (e.g. a contract change, a regression
|
|
20
|
+
fix). If the right move is genuinely ambiguous, signal `<task-blocked>` so a human can decide — do not silently
|
|
21
|
+
weaken a test to make a failure go away.
|
|
22
22
|
- **Verify before completing** — the harness runs a post-task check gate; unverified work will be caught and rejected.
|
|
23
23
|
- **Append progress, never overwrite** — append each progress entry at the end of the progress file. Overwriting
|
|
24
24
|
erases context that downstream tasks depend on.
|
|
25
25
|
- **Leave {{CONTEXT_FILE}} and task definitions alone** — the context file is cleaned up by the harness (committing it
|
|
26
26
|
pollutes the repo); the task name, description, steps, and other task files are immutable.
|
|
27
|
-
|
|
27
|
+
|
|
28
|
+
{{COMMIT_CONSTRAINT}}
|
|
28
29
|
|
|
29
30
|
</constraints>
|
|
30
31
|
|
|
@@ -93,7 +94,8 @@ Complete these steps IN ORDER:
|
|
|
93
94
|
1. **Confirm all steps done** — Every task step has been completed
|
|
94
95
|
2. **Run ALL verification commands** — Execute every verification command (see Check Script section in the context file
|
|
95
96
|
or project instructions). Fix any failures before proceeding. The harness runs the check script as a post-task
|
|
96
|
-
gate — your task is not marked done unless it passes.
|
|
97
|
+
gate — your task is not marked done unless it passes.
|
|
98
|
+
{{COMMIT_STEP}}
|
|
97
99
|
3. **Update progress file** — Append to {{PROGRESS_FILE}} using this format:
|
|
98
100
|
|
|
99
101
|
```markdown
|
|
@@ -142,17 +144,15 @@ Complete these steps IN ORDER:
|
|
|
142
144
|
- The WHERE clause builder in src/repositories/base.ts can be extended for future filters
|
|
143
145
|
```
|
|
144
146
|
|
|
145
|
-
4. **Output verification results
|
|
147
|
+
4. **Output verification results** — report the actual commands that were run and their real output; the entries below are placeholders:
|
|
146
148
|
|
|
147
149
|
<!-- prettier-ignore -->
|
|
148
150
|
```
|
|
149
151
|
<task-verified>
|
|
150
|
-
$
|
|
151
|
-
|
|
152
|
-
$
|
|
153
|
-
|
|
154
|
-
$ pnpm test
|
|
155
|
-
47 tests passed
|
|
152
|
+
$ <check-command-1>
|
|
153
|
+
<output>
|
|
154
|
+
$ <check-command-2>
|
|
155
|
+
<output>
|
|
156
156
|
</task-verified>
|
|
157
157
|
```
|
|
158
158
|
|
|
@@ -223,10 +223,14 @@ The `ref` field should match either:
|
|
|
223
223
|
- The ticket's internal ID
|
|
224
224
|
- The exact ticket title
|
|
225
225
|
|
|
226
|
+
<task-specification>
|
|
227
|
+
|
|
226
228
|
## Ticket to Refine
|
|
227
229
|
|
|
228
230
|
{{TICKET}}
|
|
229
231
|
|
|
232
|
+
</task-specification>
|
|
233
|
+
|
|
230
234
|
{{ISSUE_CONTEXT}}
|
|
231
235
|
|
|
232
236
|
---
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
<validation-checklist>
|
|
2
|
+
|
|
1
3
|
## Pre-Output Validation
|
|
2
4
|
|
|
3
5
|
Before writing the JSON output, verify EVERY item:
|
|
@@ -12,3 +14,5 @@ Before writing the JSON output, verify EVERY item:
|
|
|
12
14
|
8. **`projectPath` assigned** — every task uses a path from the available repositories
|
|
13
15
|
9. **Verification criteria** — every task has 2-4 `verificationCriteria` that are testable and unambiguous
|
|
14
16
|
10. **Raw JSON output** — the output is valid JSON matching the schema exactly; the harness parses the output directly as JSON, so emit it without markdown fences, commentary, or surrounding prose
|
|
17
|
+
|
|
18
|
+
</validation-checklist>
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralphctl",
|
|
3
|
-
"version": "0.4.
|
|
3
|
+
"version": "0.4.2",
|
|
4
4
|
"description": "Agent harness for long-running AI coding tasks — orchestrates Claude Code & GitHub Copilot across repositories",
|
|
5
5
|
"homepage": "https://github.com/lukas-grigis/ralphctl",
|
|
6
6
|
"type": "module",
|