@workermill/agent 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
File without changes
package/dist/api.d.ts CHANGED
File without changes
package/dist/api.js CHANGED
File without changes
package/dist/cli.d.ts CHANGED
File without changes
package/dist/cli.js CHANGED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
package/dist/config.d.ts CHANGED
File without changes
package/dist/config.js CHANGED
File without changes
package/dist/index.d.ts CHANGED
File without changes
package/dist/index.js CHANGED
File without changes
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Plan Validator for Remote Agent
3
+ *
4
+ * Validates execution plans locally before posting to the cloud API.
5
+ * Implements the same guardrails as the server-side planning pipeline:
6
+ * 1. File cap: max 5 targetFiles per story (prevents scope explosion)
7
+ * 2. Critic validation: LLM scores the plan, rejects below threshold
8
+ *
9
+ * This ensures remote agent plans get the same quality gates as cloud plans,
10
+ * even though the planning prompt runs locally via Claude CLI.
11
+ */
12
+ export interface PlannedStory {
13
+ id: string;
14
+ title: string;
15
+ description: string;
16
+ persona: string;
17
+ priority: number;
18
+ estimatedEffort: "small" | "medium" | "large";
19
+ dependencies: string[];
20
+ acceptanceCriteria: string[];
21
+ targetFiles?: string[];
22
+ scope?: string;
23
+ }
24
+ export interface ExecutionPlan {
25
+ summary: string;
26
+ stories: PlannedStory[];
27
+ risks: string[];
28
+ assumptions: string[];
29
+ }
30
+ export interface CriticResult {
31
+ approved: boolean;
32
+ score: number;
33
+ risks: string[];
34
+ suggestions?: string[];
35
+ storyFeedback?: Array<{
36
+ storyId: string;
37
+ feedback: string;
38
+ suggestedChanges?: string[];
39
+ }>;
40
+ }
41
+ declare const AUTO_APPROVAL_THRESHOLD = 85;
42
+ /**
43
+ * Parse execution plan JSON from raw Claude CLI output.
44
+ * Mirrors server-side parseExecutionPlan() in planning-agent-local.ts.
45
+ */
46
+ export declare function parseExecutionPlan(output: string): ExecutionPlan;
47
+ /**
48
+ * Apply file cap to all stories. Truncates targetFiles > MAX_TARGET_FILES.
49
+ * Returns details about truncated stories for logging.
50
+ */
51
+ export declare function applyFileCap(plan: ExecutionPlan): {
52
+ truncatedCount: number;
53
+ details: string[];
54
+ };
55
+ /**
56
+ * Re-serialize plan as a JSON code block for posting to the API.
57
+ * The server-side parseExecutionPlan() expects ```json ... ``` blocks.
58
+ */
59
+ export declare function serializePlan(plan: ExecutionPlan): string;
60
+ /**
61
+ * Build the critic prompt with PRD and plan substituted.
62
+ */
63
+ export declare function buildCriticPrompt(prd: string, plan: ExecutionPlan): string;
64
+ /**
65
+ * Parse critic JSON response from raw Claude CLI output.
66
+ */
67
+ export declare function parseCriticResponse(text: string): CriticResult;
68
+ /**
69
+ * Run the critic via Claude CLI (lightweight — no tools, just reasoning).
70
+ * Returns the raw text output.
71
+ */
72
+ export declare function runCriticCli(claudePath: string, model: string, prompt: string, env: Record<string, string | undefined>): Promise<string>;
73
+ /**
74
+ * Format critic feedback for appending to the planner prompt on re-run.
75
+ */
76
+ export declare function formatCriticFeedback(critic: CriticResult): string;
77
+ /**
78
+ * Run critic validation on a parsed plan.
79
+ * Returns the critic result, or null if critic fails (non-blocking).
80
+ */
81
+ export declare function runCriticValidation(claudePath: string, model: string, prd: string, plan: ExecutionPlan, env: Record<string, string | undefined>, taskLabel: string): Promise<CriticResult | null>;
82
+ export { AUTO_APPROVAL_THRESHOLD };
@@ -0,0 +1,268 @@
1
+ /**
2
+ * Plan Validator for Remote Agent
3
+ *
4
+ * Validates execution plans locally before posting to the cloud API.
5
+ * Implements the same guardrails as the server-side planning pipeline:
6
+ * 1. File cap: max 5 targetFiles per story (prevents scope explosion)
7
+ * 2. Critic validation: LLM scores the plan, rejects below threshold
8
+ *
9
+ * This ensures remote agent plans get the same quality gates as cloud plans,
10
+ * even though the planning prompt runs locally via Claude CLI.
11
+ */
12
+ import { spawn } from "child_process";
13
+ import chalk from "chalk";
14
+ // ============================================================================
15
+ // CONSTANTS
16
+ // ============================================================================
17
+ const MAX_TARGET_FILES = 5;
18
+ const AUTO_APPROVAL_THRESHOLD = 85;
19
+ // ============================================================================
20
+ // PLAN PARSING
21
+ // ============================================================================
22
/**
 * Parse execution plan JSON from raw Claude CLI output.
 * Mirrors server-side parseExecutionPlan() in planning-agent-local.ts, but
 * is more forgiving about where the JSON sits in the output.
 *
 * Candidates are tried in this order and the first one that parses to an
 * object carrying a `stories` array wins:
 *   1. the first ```json fenced block (shortest match),
 *   2. the span from the first ```json fence to the LAST ``` in the output
 *      (covers plans whose own strings contain a ``` sequence),
 *   3. the widest `{ ... "stories" ... }` span in the raw text.
 *
 * @param output Raw text produced by the planner.
 * @returns The parsed plan object; `stories` is guaranteed to be an array.
 * @throws Error if no JSON candidate exists, or if every candidate either
 *         fails to parse or lacks a `stories` array (the last failure is
 *         rethrown).
 */
export function parseExecutionPlan(output) {
    const candidates = [];
    const firstFence = output.match(/```json\s*([\s\S]*?)\s*```/);
    if (firstFence) {
        candidates.push(firstFence[1]);
    }
    // Greedy variant: a ``` inside a JSON string would cut the lazy match short.
    const widestFence = output.match(/```json\s*([\s\S]*)```/);
    if (widestFence) {
        candidates.push(widestFence[1].trim());
    }
    const rawJson = output.match(/\{[\s\S]*"stories"[\s\S]*\}/);
    if (rawJson) {
        candidates.push(rawJson[0]);
    }
    if (candidates.length === 0) {
        throw new Error("Could not find JSON execution plan in output");
    }
    let lastError = null;
    for (const candidate of candidates) {
        let parsed;
        try {
            parsed = JSON.parse(candidate);
        }
        catch (error) {
            lastError = error;
            continue;
        }
        // Reject shapes that would crash later consumers iterating plan.stories.
        if (parsed !== null && typeof parsed === "object" && Array.isArray(parsed.stories)) {
            return parsed;
        }
        lastError = new Error('Execution plan JSON is missing a "stories" array');
    }
    throw lastError;
}
37
+ // ============================================================================
38
+ // FILE CAP
39
+ // ============================================================================
40
+ /**
41
+ * Apply file cap to all stories. Truncates targetFiles > MAX_TARGET_FILES.
42
+ * Returns details about truncated stories for logging.
43
+ */
44
+ export function applyFileCap(plan) {
45
+ let truncatedCount = 0;
46
+ const details = [];
47
+ for (const story of plan.stories) {
48
+ if (!story.targetFiles || !Array.isArray(story.targetFiles)) {
49
+ story.targetFiles = [];
50
+ }
51
+ else if (story.targetFiles.length > MAX_TARGET_FILES) {
52
+ const dropped = story.targetFiles.slice(MAX_TARGET_FILES);
53
+ details.push(`${story.id}: ${story.targetFiles.length} files → ${MAX_TARGET_FILES} (dropped: ${dropped.join(", ")})`);
54
+ story.targetFiles = story.targetFiles.slice(0, MAX_TARGET_FILES);
55
+ truncatedCount++;
56
+ }
57
+ }
58
+ return { truncatedCount, details };
59
+ }
60
+ // ============================================================================
61
+ // PLAN SERIALIZATION
62
+ // ============================================================================
63
/**
 * Re-serialize plan as a JSON code block for posting to the API.
 * The server-side parseExecutionPlan() expects ```json ... ``` blocks.
 *
 * Backticks that occur inside the plan's strings are written as the JSON
 * escape \u0060. The decoded value is unchanged, but the payload can no
 * longer contain a ``` sequence that would close the fence early for a
 * parser using a shortest-match fence regex.
 *
 * @param plan Plan object to serialize.
 * @returns The pretty-printed (2-space) JSON wrapped in a ```json fence.
 */
export function serializePlan(plan) {
    // In JSON.stringify output a backtick can only appear inside a string,
    // so a blanket replacement with the \u0060 escape is lossless.
    const json = String(JSON.stringify(plan, null, 2)).replace(/`/g, "\\u0060");
    return "```json\n" + json + "\n```";
}
70
+ // ============================================================================
71
+ // CRITIC
72
+ // ============================================================================
73
+ /**
74
+ * Critic prompt — identical to server-side critic-agent.ts CRITIC_PROMPT.
75
+ */
76
+ const CRITIC_PROMPT = `You are a Senior Architect reviewing an execution plan. Your job is to ensure the plan is appropriately sized for the task.
77
+
78
+ Review this execution plan against the PRD:
79
+
80
+ ## PRD (Product Requirements Document)
81
+ {{PRD}}
82
+
83
+ ## PROPOSED EXECUTION PLAN
84
+ {{PLAN}}
85
+
86
+ ## Review Guidelines
87
+
88
+ **IMPORTANT: Match plan size to task complexity**
89
+
90
+ - Simple tasks (typos, config changes, single-file fixes) = 1 step is CORRECT
91
+ - Medium tasks (2-4 files, small features) = 2-3 steps is appropriate
92
+ - Complex tasks (new systems, security) = 3-5 steps is appropriate
93
+
94
+ **Do NOT penalize:**
95
+ - Single-step plans for genuinely simple tasks
96
+ - Using one persona when only one skill is needed
97
+
98
+ **DO check for:**
99
+ 1. **Missing Requirements** - Does the plan cover what the PRD asks for?
100
+ 2. **Vague Instructions** - Will the worker know what to do?
101
+ 3. **Security Issues** - Only for tasks involving auth, user data, or external input
102
+ 4. **Unrealistic Scope** - Any step targeting >3 files MUST score below 85 (auto-rejection threshold). Each step should modify at most 3 files. If a step needs more, split it into multiple steps first.
103
+ 5. **Missing Operational Steps** - If the PRD requires deployment, provisioning, migrations, or running commands, does the plan include operational steps? Writing code is not the same as deploying it.
104
+ 6. **Overlapping File Scope** - If two or more steps share the same targetFiles, this causes parallel merge conflicts. Steps MUST NOT overlap on targetFiles. Deduct 10 points per shared file across steps.
105
+
106
+ ## Scoring Guide
107
+
108
+ - **90-100**: Plan matches task complexity, requirements covered
109
+ - **75-89**: Minor gaps but fundamentally sound
110
+ - **50-74**: Significant issues or wrong-sized for the task
111
+ - **0-49**: Fundamentally flawed
112
+
113
+ ## Output Format
114
+
115
+ Respond with ONLY a JSON object (no markdown, no explanation):
116
+ {"approved": boolean, "score": number, "risks": ["risk1", "risk2"], "suggestions": ["suggestion1", "suggestion2"], "storyFeedback": [{"storyId": "step-0", "feedback": "specific feedback", "suggestedChanges": ["change1"]}]}
117
+
118
+ Rules:
119
+ - approved = true if score >= 85 AND plan is right-sized for task
120
+ - risks = specific issues (empty array if none)
121
+ - suggestions = actionable improvements (empty array if none)
122
+ - storyFeedback = per-step feedback (optional, only for steps that need changes)`;
123
/**
 * Substitute the {{PRD}} and {{PLAN}} placeholders of a prompt template.
 *
 * The substitution runs in a single pass with a replacer function, so:
 *  - `$&`, `$1`, `$$` and similar sequences in the inserted text are kept
 *    verbatim instead of being interpreted as replacement patterns, and
 *  - placeholder-looking text inside the PRD (e.g. a literal "{{PLAN}}")
 *    is never re-scanned and substituted.
 */
function fillCriticTemplate(template, prd, planJson) {
    const values = { PRD: prd, PLAN: planJson };
    return template.replace(/\{\{(PRD|PLAN)\}\}/g, (_match, key) => String(values[key]));
}
/**
 * Build the critic prompt with PRD and plan substituted.
 *
 * @param prd Requirements text inserted at {{PRD}}.
 * @param plan Plan object; inserted at {{PLAN}} as pretty-printed JSON.
 * @returns CRITIC_PROMPT with both placeholders filled in.
 */
export function buildCriticPrompt(prd, plan) {
    const planJson = JSON.stringify(plan, null, 2);
    return fillCriticTemplate(CRITIC_PROMPT, prd, planJson);
}
130
/**
 * Parse critic JSON response from raw Claude CLI output.
 *
 * Extraction: if the text contains a fenced block that holds a `{`, its
 * content is used; then everything from the first `{` to the last `}` is
 * parsed, so reasoning text before OR after the JSON object is tolerated.
 *
 * Normalization of the parsed object:
 *  - `approved` is true only for a literal boolean `true` (a string such
 *    as "false" no longer counts as approval),
 *  - `score` must be a finite number (numeric strings are accepted); it is
 *    rounded and clamped to 0..100,
 *  - `risks` becomes a string array (empty when absent or malformed),
 *  - `suggestions` is a string array, or undefined when not an array,
 *  - `storyFeedback` keeps only object entries, or is undefined when not
 *    an array.
 *
 * @param text Raw critic output.
 * @returns The normalized critic result.
 * @throws Error when no JSON object is present, the JSON is invalid
 *         (SyntaxError), or the score is missing / not numeric.
 */
export function parseCriticResponse(text) {
    let jsonText = text.trim();
    // Handle markdown code blocks (only when the block can hold the object)
    const fenced = jsonText.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced && fenced[1].includes("{")) {
        jsonText = fenced[1].trim();
    }
    // Isolate the JSON object from any surrounding reasoning text
    const start = jsonText.indexOf("{");
    const end = jsonText.lastIndexOf("}");
    if (start === -1 || end < start) {
        throw new Error("Critic response does not contain a JSON object");
    }
    const result = JSON.parse(jsonText.slice(start, end + 1));
    const rawScore = typeof result.score === "string"
        ? Number.parseFloat(result.score)
        : result.score;
    if (typeof rawScore !== "number" || !Number.isFinite(rawScore)) {
        // Without this check the score would silently become NaN.
        throw new Error("Critic response is missing a numeric score");
    }
    const toStringList = (value) => Array.isArray(value)
        ? value.filter((item) => typeof item === "string")
        : undefined;
    return {
        approved: result.approved === true,
        score: Math.max(0, Math.min(100, Math.round(rawScore))),
        risks: toStringList(result.risks) ?? [],
        suggestions: toStringList(result.suggestions),
        storyFeedback: Array.isArray(result.storyFeedback)
            ? result.storyFeedback.filter((entry) => entry !== null && typeof entry === "object")
            : undefined,
    };
}
157
/**
 * Run the critic via Claude CLI (lightweight — no tools, just reasoning).
 *
 * Spawns `claudePath --print --model <model> --permission-mode
 * bypassPermissions`, feeds `prompt` through stdin and collects stdout.
 *
 * @param claudePath Executable to spawn.
 * @param model Value passed after `--model`.
 * @param prompt Text written to the child's stdin.
 * @param env Environment for the child process.
 * @returns Promise resolving to the child's complete stdout when it ends
 *          with exit code 0.
 * Rejects when the process cannot be spawned, ends with a non-zero code or
 * a signal (message carries up to 300 chars of stderr), or is still running
 * after 3 minutes (it is then sent SIGTERM).
 */
export function runCriticCli(claudePath, model, prompt, env) {
    return new Promise((resolve, reject) => {
        const proc = spawn(claudePath, [
            "--print",
            "--model",
            model,
            "--permission-mode",
            "bypassPermissions",
        ], {
            env,
            stdio: ["pipe", "pipe", "pipe"],
        });
        let stdout = "";
        let stderr = "";
        // Decode at the stream level so a multi-byte UTF-8 character split
        // across two chunks is not corrupted.
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        const timeout = setTimeout(() => {
            proc.kill("SIGTERM");
            reject(new Error("Critic CLI timed out after 3 minutes"));
        }, 180_000);
        // 'close' (not 'exit') fires only after the stdio streams have ended,
        // so stdout is complete by the time we resolve.
        proc.on("close", (code, signal) => {
            clearTimeout(timeout);
            if (code === 0) {
                resolve(stdout);
            }
            else {
                // code is null when the child was terminated by a signal.
                reject(new Error(`Critic CLI failed (exit ${code ?? signal}): ${stderr.substring(0, 300)}`));
            }
        });
        proc.on("error", (err) => {
            clearTimeout(timeout);
            reject(err);
        });
        // A child that fails to start or exits early makes the stdin write
        // fail (e.g. EPIPE). Without a listener that would be an unhandled
        // 'error' event; the failure itself is reported via 'error'/'close'.
        proc.stdin.on("error", () => { });
        proc.stdin.write(prompt);
        proc.stdin.end();
    });
}
202
/**
 * Render a rejected critic result as a markdown section that is appended
 * to the planner prompt for the next attempt.
 *
 * Layout: heading, score line, then optional "Risks Identified",
 * "Required Changes" and "Per-Story Feedback" sections, followed by the
 * closing instructions. Lines are joined with "\n".
 */
export function formatCriticFeedback(critic) {
    const { risks, suggestions, storyFeedback } = critic;
    const lines = [];
    // Emits a heading, one "- item" bullet per entry, and a blank separator.
    const addBulletSection = (heading, items) => {
        lines.push(heading);
        for (const item of items) {
            lines.push(`- ${item}`);
        }
        lines.push("");
    };
    lines.push("");
    lines.push("## CRITIC FEEDBACK — Your previous plan was REJECTED");
    lines.push("");
    lines.push(`Score: ${critic.score}/100 (need >= ${AUTO_APPROVAL_THRESHOLD} to pass)`);
    lines.push("");
    if (risks.length > 0) {
        addBulletSection("### Risks Identified:", risks);
    }
    if (suggestions && suggestions.length > 0) {
        addBulletSection("### Required Changes:", suggestions);
    }
    if (storyFeedback && storyFeedback.length > 0) {
        lines.push("### Per-Story Feedback:");
        for (const entry of storyFeedback) {
            lines.push(`- **${entry.storyId}**: ${entry.feedback}`);
            if (!entry.suggestedChanges) {
                continue;
            }
            for (const change of entry.suggestedChanges) {
                lines.push(` - ${change}`);
            }
        }
        lines.push("");
    }
    lines.push("**You MUST address ALL feedback above.** Each story must target at most 5 files.");
    lines.push("Stories MUST NOT overlap on targetFiles. Generate a revised plan.");
    return lines.join("\n");
}
242
/** Current local time, dimmed with chalk, used as a console log prefix. */
function ts() {
    const clock = new Date().toLocaleTimeString();
    return chalk.dim(clock);
}
246
/**
 * Score a parsed plan with the critic.
 *
 * Builds the critic prompt, runs the critic CLI, parses its reply and logs
 * the score against AUTO_APPROVAL_THRESHOLD. Any failure while running or
 * parsing is logged and turned into a `null` result (non-blocking).
 *
 * @returns The parsed critic result, or null if the critic failed.
 */
export async function runCriticValidation(claudePath, model, prd, plan, env, taskLabel) {
    const promptText = buildCriticPrompt(prd, plan);
    console.log(`${ts()} ${taskLabel} ${chalk.dim("Running critic validation...")}`);
    try {
        const criticOutput = await runCriticCli(claudePath, model, promptText, env);
        const verdict = parseCriticResponse(criticOutput);
        let statusIcon;
        if (verdict.score >= AUTO_APPROVAL_THRESHOLD) {
            statusIcon = chalk.green("✓");
        }
        else {
            statusIcon = chalk.red("✗");
        }
        console.log(`${ts()} ${taskLabel} ${statusIcon} Critic score: ${verdict.score}/100 (threshold: ${AUTO_APPROVAL_THRESHOLD})`);
        return verdict;
    }
    catch (error) {
        let errMsg;
        if (error instanceof Error) {
            errMsg = error.message;
        }
        else {
            errMsg = String(error);
        }
        console.error(`${ts()} ${taskLabel} ${chalk.yellow("⚠")} Critic failed: ${errMsg.substring(0, 100)}`);
        return null;
    }
}
268
+ export { AUTO_APPROVAL_THRESHOLD };
package/dist/planner.d.ts CHANGED
@@ -3,7 +3,13 @@
3
3
  *
4
4
  * Fetches the planning prompt from the cloud API, runs it through
5
5
  * Claude CLI locally (using the customer's Claude Max subscription),
6
- * and posts the raw output back for server-side validation.
6
+ * validates with a Planner-Critic loop, and posts the approved plan
7
+ * back for server-side processing.
8
+ *
9
+ * Guardrails (matching server-side planning pipeline):
10
+ * 1. File cap: max 5 targetFiles per story (prevents scope explosion)
11
+ * 2. Critic validation: LLM scores the plan, rejects below 85/100
12
+ * 3. Max 3 Planner-Critic iterations before failure
7
13
  *
8
14
  * Logs are streamed to the cloud dashboard in real-time so the user
9
15
  * sees the same planning progress as cloud mode.
@@ -12,8 +18,18 @@ import { type AgentConfig } from "./config.js";
12
18
  export interface PlanningTask {
13
19
  id: string;
14
20
  summary: string;
21
+ description: string | null;
15
22
  }
16
23
  /**
17
- * Run planning for a task: fetch prompt, execute Claude CLI, post result.
24
+ * Run planning for a task with Planner-Critic validation loop.
25
+ *
26
+ * Flow:
27
+ * 1. Fetch planning prompt from cloud API
28
+ * 2. Run Claude CLI to generate plan
29
+ * 3. Parse plan, apply file cap (max 5 files per story)
30
+ * 4. Run critic validation via Claude CLI
31
+ * 5. If critic approves (score >= 85): post validated plan to API
32
+ * 6. If critic rejects: re-run planner with feedback (up to MAX_ITERATIONS)
33
+ * 7. After MAX_ITERATIONS without approval: fail the task
18
34
  */
19
35
  export declare function planTask(task: PlanningTask, config: AgentConfig): Promise<boolean>;
package/dist/planner.js CHANGED
@@ -3,7 +3,13 @@
3
3
  *
4
4
  * Fetches the planning prompt from the cloud API, runs it through
5
5
  * Claude CLI locally (using the customer's Claude Max subscription),
6
- * and posts the raw output back for server-side validation.
6
+ * validates with a Planner-Critic loop, and posts the approved plan
7
+ * back for server-side processing.
8
+ *
9
+ * Guardrails (matching server-side planning pipeline):
10
+ * 1. File cap: max 5 targetFiles per story (prevents scope explosion)
11
+ * 2. Critic validation: LLM scores the plan, rejects below 85/100
12
+ * 3. Max 3 Planner-Critic iterations before failure
7
13
  *
8
14
  * Logs are streamed to the cloud dashboard in real-time so the user
9
15
  * sees the same planning progress as cloud mode.
@@ -12,6 +18,9 @@ import chalk from "chalk";
12
18
  import { spawn } from "child_process";
13
19
  import { findClaudePath } from "./config.js";
14
20
  import { api } from "./api.js";
21
+ import { parseExecutionPlan, applyFileCap, serializePlan, runCriticValidation, formatCriticFeedback, AUTO_APPROVAL_THRESHOLD, } from "./plan-validator.js";
22
+ /** Max Planner-Critic iterations before giving up */
23
+ const MAX_ITERATIONS = 3;
15
24
  /** Timestamp prefix */
16
25
  function ts() {
17
26
  return chalk.dim(new Date().toLocaleTimeString());
@@ -51,14 +60,22 @@ async function postProgress(taskId, phase, elapsedSeconds, detail, charsGenerate
51
60
  // Fire and forget
52
61
  }
53
62
  }
63
+ /** Consistent prefix matching local workermill dashboard format */
64
+ const PREFIX = "[🗺️ planning_agent 🤖]";
65
/**
 * Render a duration given in seconds as "<m>m <s>s", or just "<s>s" when it
 * is shorter than a minute (e.g. 28 -> "28s", 85 -> "1m 25s").
 */
function formatElapsed(seconds) {
    const wholeMinutes = Math.floor(seconds / 60);
    const remainder = seconds % 60;
    if (wholeMinutes > 0) {
        return `${wholeMinutes}m ${remainder}s`;
    }
    return `${remainder}s`;
}
54
71
  function phaseLabel(phase, elapsed) {
55
72
  switch (phase) {
56
- case "initializing": return "Starting planning agent...";
57
- case "reading_repo": return "Reading repository structure...";
58
- case "analyzing": return "Analyzing requirements...";
59
- case "generating_plan": return `Generating execution plan... (${elapsed}s)`;
60
- case "validating": return "Validating plan...";
61
- case "complete": return "Planning complete";
73
+ case "initializing": return `${PREFIX} Starting planning agent...`;
74
+ case "reading_repo": return `${PREFIX} Reading repository structure...`;
75
+ case "analyzing": return `${PREFIX} Analyzing requirements...`;
76
+ case "generating_plan": return `${PREFIX} Planning in progress — analyzing requirements and decomposing into steps (${formatElapsed(elapsed)} elapsed)`;
77
+ case "validating": return `${PREFIX} Validating plan...`;
78
+ case "complete": return `${PREFIX} Planning complete`;
62
79
  }
63
80
  }
64
81
  /**
@@ -119,7 +136,7 @@ function runClaudeCli(claudePath, model, prompt, env, taskId, startTime) {
119
136
  // Periodic progress during generation
120
137
  if (currentPhase === "generating_plan" && elapsed - lastProgressLogAt >= 30) {
121
138
  lastProgressLogAt = elapsed;
122
- const msg = `Generating execution plan... (${elapsed}s, ${charsReceived} chars, ${toolCallCount} tool calls)`;
139
+ const msg = `${PREFIX} Planning in progress analyzing requirements and decomposing into steps (${formatElapsed(elapsed)} elapsed)`;
123
140
  postLog(taskId, msg);
124
141
  console.log(`${ts()} ${taskLabel} ${chalk.dim(msg)}`);
125
142
  }
@@ -212,57 +229,186 @@ function runClaudeCli(claudePath, model, prompt, env, taskId, startTime) {
212
229
  });
213
230
  }
214
231
  /**
215
- * Run planning for a task: fetch prompt, execute Claude CLI, post result.
232
+ * Run planning for a task with Planner-Critic validation loop.
233
+ *
234
+ * Flow:
235
+ * 1. Fetch planning prompt from cloud API
236
+ * 2. Run Claude CLI to generate plan
237
+ * 3. Parse plan, apply file cap (max 5 files per story)
238
+ * 4. Run critic validation via Claude CLI
239
+ * 5. If critic approves (score >= 85): post validated plan to API
240
+ * 6. If critic rejects: re-run planner with feedback (up to MAX_ITERATIONS)
241
+ * 7. After MAX_ITERATIONS without approval: fail the task
216
242
  */
217
243
  export async function planTask(task, config) {
218
244
  const taskLabel = chalk.cyan(task.id.slice(0, 8));
219
245
  console.log(`${ts()} ${taskLabel} Fetching planning prompt...`);
220
- await postLog(task.id, "Fetching planning prompt from cloud API...");
246
+ await postLog(task.id, `${PREFIX} Fetching planning prompt from cloud API...`);
221
247
  // 1. Fetch the assembled planning prompt from the cloud API
222
248
  const promptResponse = await api.get("/api/agent/planning-prompt", {
223
249
  params: { taskId: task.id },
224
250
  });
225
- const { prompt, model } = promptResponse.data;
251
+ const { prompt: basePrompt, model } = promptResponse.data;
226
252
  const cliModel = model || "sonnet";
227
- console.log(`${ts()} ${taskLabel} Running Claude CLI ${chalk.dim(`(model: ${chalk.yellow(cliModel)})`)}`);
228
- await postLog(task.id, `Starting planning agent (model: ${cliModel})...`);
229
- // 2. Run Claude CLI asynchronously with progress logging
230
253
  const claudePath = process.env.CLAUDE_CLI_PATH || findClaudePath() || "claude";
231
254
  const cleanEnv = { ...process.env };
232
255
  delete cleanEnv.CLAUDE_CODE_OAUTH_TOKEN;
233
256
  const startTime = Date.now();
234
- let rawOutput;
257
+ // PRD for critic validation: use task description, fall back to summary
258
+ const prd = task.description || task.summary;
259
+ // 2. Planner-Critic iteration loop
260
+ let currentPrompt = basePrompt;
261
+ let bestPlan = null;
262
+ let bestScore = 0;
263
+ for (let iteration = 1; iteration <= MAX_ITERATIONS; iteration++) {
264
+ const iterLabel = MAX_ITERATIONS > 1 ? ` (attempt ${iteration}/${MAX_ITERATIONS})` : "";
265
+ if (iteration > 1) {
266
+ console.log(`${ts()} ${taskLabel} Running Claude CLI${iterLabel} ${chalk.dim(`(model: ${chalk.yellow(cliModel)})`)}`);
267
+ await postLog(task.id, `${PREFIX} Re-planning${iterLabel} using anthropic/${cliModel}`);
268
+ }
269
+ else {
270
+ console.log(`${ts()} ${taskLabel} Running Claude CLI ${chalk.dim(`(model: ${chalk.yellow(cliModel)})`)}`);
271
+ await postLog(task.id, `${PREFIX} Starting planning agent using anthropic/${cliModel}`);
272
+ }
273
+ // 2a. Run Claude CLI to generate plan
274
+ let rawOutput;
275
+ try {
276
+ rawOutput = await runClaudeCli(claudePath, cliModel, currentPrompt, cleanEnv, task.id, startTime);
277
+ }
278
+ catch (error) {
279
+ const elapsed = Math.round((Date.now() - startTime) / 1000);
280
+ const errMsg = error instanceof Error ? error.message : String(error);
281
+ console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} Failed after ${elapsed}s: ${errMsg.substring(0, 100)}`);
282
+ await postLog(task.id, `${PREFIX} Planning failed after ${formatElapsed(elapsed)}: ${errMsg.substring(0, 200)}`, "error", "error");
283
+ return false;
284
+ }
285
+ const elapsed = Math.round((Date.now() - startTime) / 1000);
286
+ console.log(`${ts()} ${taskLabel} ${chalk.green("✓")} Claude CLI done ${chalk.dim(`(${elapsed}s, ${rawOutput.length} chars)`)}`);
287
+ // 2b. Parse plan from raw output
288
+ let plan;
289
+ try {
290
+ plan = parseExecutionPlan(rawOutput);
291
+ }
292
+ catch (error) {
293
+ const errMsg = error instanceof Error ? error.message : String(error);
294
+ console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} Plan parse failed: ${errMsg.substring(0, 100)}`);
295
+ await postLog(task.id, `${PREFIX} Failed to parse execution plan from Claude output: ${errMsg.substring(0, 200)}`, "error", "error");
296
+ // If we can't parse the plan, post raw output and let server-side try
297
+ return await postRawPlan(task.id, rawOutput, config.agentId, taskLabel, elapsed);
298
+ }
299
+ // 2c. Apply file cap (max 5 files per story)
300
+ const { truncatedCount, details } = applyFileCap(plan);
301
+ if (truncatedCount > 0) {
302
+ const msg = `${PREFIX} File cap applied: ${truncatedCount} stories truncated to max 5 targetFiles`;
303
+ console.log(`${ts()} ${taskLabel} ${chalk.yellow("⚠")} ${msg}`);
304
+ await postLog(task.id, msg);
305
+ for (const detail of details) {
306
+ console.log(`${ts()} ${taskLabel} ${chalk.dim(detail)}`);
307
+ }
308
+ }
309
+ console.log(`${ts()} ${taskLabel} Plan: ${chalk.bold(plan.stories.length)} stories`);
310
+ await postLog(task.id, `${PREFIX} Plan generated: ${plan.stories.length} stories (${formatElapsed(elapsed)}). Running critic validation...`);
311
+ // 2d. Run critic validation
312
+ const criticResult = await runCriticValidation(claudePath, cliModel, prd, plan, cleanEnv, taskLabel);
313
+ // Track best plan across iterations
314
+ if (criticResult && criticResult.score > bestScore) {
315
+ bestPlan = plan;
316
+ bestScore = criticResult.score;
317
+ }
318
+ else if (!criticResult && !bestPlan) {
319
+ // Critic failed entirely — use this plan as fallback
320
+ bestPlan = plan;
321
+ }
322
+ // 2e. Check critic result
323
+ if (!criticResult) {
324
+ // Critic failed (timeout, parse error, etc.) — post plan without critic gate
325
+ const msg = `${PREFIX} Critic validation failed — posting plan without critic score`;
326
+ console.log(`${ts()} ${taskLabel} ${chalk.yellow("⚠")} ${msg}`);
327
+ await postLog(task.id, msg);
328
+ return await postValidatedPlan(task.id, plan, config.agentId, taskLabel, elapsed);
329
+ }
330
+ if (criticResult.approved || criticResult.score >= AUTO_APPROVAL_THRESHOLD) {
331
+ // Approved! Post the file-capped plan
332
+ const msg = `${PREFIX} Critic approved (score: ${criticResult.score}/100)`;
333
+ await postLog(task.id, msg);
334
+ return await postValidatedPlan(task.id, plan, config.agentId, taskLabel, elapsed);
335
+ }
336
+ // 2f. Rejected — append critic feedback for next iteration
337
+ if (iteration < MAX_ITERATIONS) {
338
+ const feedback = formatCriticFeedback(criticResult);
339
+ currentPrompt = basePrompt + "\n\n" + feedback;
340
+ const msg = `${PREFIX} Critic rejected (score: ${criticResult.score}/100, threshold: ${AUTO_APPROVAL_THRESHOLD}). Re-planning with feedback...`;
341
+ console.log(`${ts()} ${taskLabel} ${chalk.yellow("⚠")} ${msg}`);
342
+ await postLog(task.id, msg);
343
+ if (criticResult.risks.length > 0) {
344
+ await postLog(task.id, `${PREFIX} Critic risks: ${criticResult.risks.join("; ")}`);
345
+ }
346
+ }
347
+ else {
348
+ // Final iteration — rejected
349
+ const msg = `${PREFIX} Critic rejected after ${MAX_ITERATIONS} iterations (best score: ${bestScore}/100, threshold: ${AUTO_APPROVAL_THRESHOLD})`;
350
+ console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} ${msg}`);
351
+ await postLog(task.id, msg, "error", "error");
352
+ if (criticResult.risks.length > 0) {
353
+ await postLog(task.id, `${PREFIX} Final risks: ${criticResult.risks.join("; ")}`, "error", "error");
354
+ }
355
+ if (criticResult.suggestions && criticResult.suggestions.length > 0) {
356
+ await postLog(task.id, `${PREFIX} Suggestions: ${criticResult.suggestions.join("; ")}`, "error", "error");
357
+ }
358
+ }
359
+ }
360
+ // All iterations exhausted — fail
361
+ return false;
362
+ }
363
+ /**
364
+ * Post a validated (file-capped) plan to the cloud API.
365
+ * Re-serializes the plan as a JSON code block since the server-side
366
+ * parseExecutionPlan() expects that format.
367
+ */
368
+ async function postValidatedPlan(taskId, plan, agentId, taskLabel, elapsed) {
369
+ const serialized = serializePlan(plan);
235
370
  try {
236
- rawOutput = await runClaudeCli(claudePath, cliModel, prompt, cleanEnv, task.id, startTime);
371
+ const result = await api.post("/api/agent/plan-result", {
372
+ taskId,
373
+ rawOutput: serialized,
374
+ agentId,
375
+ });
376
+ const storyCount = result.data.storyCount;
377
+ console.log(`${ts()} ${taskLabel} ${chalk.green("✓")} Plan validated: ${chalk.bold(storyCount)} stories → ${chalk.green("queued")}`);
378
+ await postLog(taskId, `${PREFIX} Plan validated: ${storyCount} stories. Task queued for execution.`);
379
+ await postProgress(taskId, "complete", elapsed, "Planning complete", 0, 0);
380
+ return true;
237
381
  }
238
382
  catch (error) {
239
- const elapsed = Math.round((Date.now() - startTime) / 1000);
240
- const errMsg = error instanceof Error ? error.message : String(error);
241
- console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} Failed after ${elapsed}s: ${errMsg.substring(0, 100)}`);
242
- await postLog(task.id, `Planning agent failed after ${elapsed}s: ${errMsg.substring(0, 200)}`, "error", "error");
383
+ const err = error;
384
+ const detail = err.response?.data?.detail || String(error);
385
+ console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} Server validation failed: ${detail.substring(0, 100)}`);
386
+ await postLog(taskId, `${PREFIX} Server-side plan validation failed: ${detail.substring(0, 200)}`, "error", "error");
243
387
  return false;
244
388
  }
245
- const elapsed = Math.round((Date.now() - startTime) / 1000);
246
- console.log(`${ts()} ${taskLabel} ${chalk.green("✓")} Claude CLI done ${chalk.dim(`(${elapsed}s, ${rawOutput.length} chars)`)}`);
247
- await postLog(task.id, `Planning complete (${elapsed}s, ${rawOutput.length} chars). Validating plan...`);
248
- // 3. Post raw output back to cloud API for validation
389
+ }
390
+ /**
391
+ * Post raw (unparsed) plan output to the cloud API as a fallback.
392
+ * Used when local plan parsing fails, so that the server can try to parse it instead.
393
+ */
394
+ async function postRawPlan(taskId, rawOutput, agentId, taskLabel, elapsed) {
249
395
  try {
250
396
  const result = await api.post("/api/agent/plan-result", {
251
- taskId: task.id,
397
+ taskId,
252
398
  rawOutput,
253
- agentId: config.agentId,
399
+ agentId,
254
400
  });
255
401
  const storyCount = result.data.storyCount;
256
- console.log(`${ts()} ${taskLabel} ${chalk.green("✓")} Plan validated: ${chalk.bold(storyCount)} stories → ${chalk.green("queued")}`);
257
- await postLog(task.id, `Plan validated: ${storyCount} stories. Task queued for execution.`);
258
- await postProgress(task.id, "complete", elapsed, "Planning complete", 0, 0);
402
+ console.log(`${ts()} ${taskLabel} ${chalk.green("✓")} Plan validated (server-side): ${chalk.bold(storyCount)} stories → ${chalk.green("queued")}`);
403
+ await postLog(taskId, `${PREFIX} Plan validated: ${storyCount} stories. Task queued for execution.`);
404
+ await postProgress(taskId, "complete", elapsed, "Planning complete", 0, 0);
259
405
  return true;
260
406
  }
261
407
  catch (error) {
262
408
  const err = error;
263
409
  const detail = err.response?.data?.detail || String(error);
264
410
  console.error(`${ts()} ${taskLabel} ${chalk.red("✗")} Validation failed: ${detail.substring(0, 100)}`);
265
- await postLog(task.id, `Plan validation failed: ${detail.substring(0, 200)}`, "error", "error");
411
+ await postLog(taskId, `${PREFIX} Plan validation failed: ${detail.substring(0, 200)}`, "error", "error");
266
412
  return false;
267
413
  }
268
414
  }
package/dist/poller.d.ts CHANGED
File without changes
package/dist/poller.js CHANGED
File without changes
package/dist/spawner.d.ts CHANGED
File without changes
package/dist/spawner.js CHANGED
File without changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workermill/agent",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "description": "WorkerMill Remote Agent - Run AI workers locally with your Claude Max subscription",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",