@polpo-ai/core 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +21 -0
  2. package/dist/assessment-orchestrator.d.ts +124 -0
  3. package/dist/assessment-orchestrator.d.ts.map +1 -0
  4. package/dist/assessment-orchestrator.js +790 -0
  5. package/dist/assessment-orchestrator.js.map +1 -0
  6. package/dist/assessment-prompts.d.ts +45 -0
  7. package/dist/assessment-prompts.d.ts.map +1 -0
  8. package/dist/assessment-prompts.js +205 -0
  9. package/dist/assessment-prompts.js.map +1 -0
  10. package/dist/assessment-schemas.d.ts +102 -0
  11. package/dist/assessment-schemas.d.ts.map +1 -0
  12. package/dist/assessment-schemas.js +112 -0
  13. package/dist/assessment-schemas.js.map +1 -0
  14. package/dist/assessment-scoring.d.ts +26 -0
  15. package/dist/assessment-scoring.d.ts.map +1 -0
  16. package/dist/assessment-scoring.js +129 -0
  17. package/dist/assessment-scoring.js.map +1 -0
  18. package/dist/assessor.d.ts +36 -0
  19. package/dist/assessor.d.ts.map +1 -0
  20. package/dist/assessor.js +209 -0
  21. package/dist/assessor.js.map +1 -0
  22. package/dist/index.d.ts +12 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +15 -0
  25. package/dist/index.js.map +1 -1
  26. package/dist/mission-executor.d.ts +253 -0
  27. package/dist/mission-executor.d.ts.map +1 -0
  28. package/dist/mission-executor.js +1053 -0
  29. package/dist/mission-executor.js.map +1 -0
  30. package/dist/orchestrator-context.d.ts +11 -5
  31. package/dist/orchestrator-context.d.ts.map +1 -1
  32. package/dist/orchestrator-engine.d.ts +500 -0
  33. package/dist/orchestrator-engine.d.ts.map +1 -0
  34. package/dist/orchestrator-engine.js +454 -0
  35. package/dist/orchestrator-engine.js.map +1 -0
  36. package/dist/question-detector.d.ts +31 -0
  37. package/dist/question-detector.d.ts.map +1 -0
  38. package/dist/question-detector.js +65 -0
  39. package/dist/question-detector.js.map +1 -0
  40. package/dist/retry.d.ts +22 -0
  41. package/dist/retry.d.ts.map +1 -0
  42. package/dist/retry.js +47 -0
  43. package/dist/retry.js.map +1 -0
  44. package/dist/spawner.d.ts +43 -0
  45. package/dist/spawner.d.ts.map +1 -0
  46. package/dist/spawner.js +2 -0
  47. package/dist/spawner.js.map +1 -0
  48. package/dist/task-runner.d.ts +43 -0
  49. package/dist/task-runner.d.ts.map +1 -0
  50. package/dist/task-runner.js +487 -0
  51. package/dist/task-runner.js.map +1 -0
  52. package/package.json +50 -8
@@ -0,0 +1,790 @@
1
+ /**
2
+ * AssessmentOrchestrator — handles the full assessment pipeline:
3
+ * result collection → question detection → assessment → auto-correction → judge → fix/retry/fail.
4
+ *
5
+ * Pure core version — ZERO Node.js imports.
6
+ * All runtime-specific behavior is injected via OrchestratorContext ports
7
+ * and the optional AssessmentPorts interface.
8
+ */
9
+ import { setAssessment } from "./types.js";
10
+ import { buildFixPrompt, buildRetryPrompt, buildSideEffectFixPrompt, buildSideEffectRetryPrompt, buildJudgePrompt } from "./assessment-prompts.js";
11
+ import { looksLikeQuestion, classifyAsQuestion } from "./question-detector.js";
12
+ /**
13
+ * Handles the full assessment pipeline: result collection → question detection →
14
+ * assessment → auto-correction → judge → fix/retry/fail.
15
+ */
16
+ export class AssessmentOrchestrator {
17
+ ctx;
18
+ ports;
19
+ constructor(ctx, ports) {
20
+ this.ctx = ctx;
21
+ this.ports = ports ?? {};
22
+ }
23
+ /**
24
+ * Build a rich ReviewContext from all available data sources:
25
+ * - RunStore activity (files, tool counts)
26
+ * - TaskResult (stdout, stderr, exit code, duration)
27
+ * - JSONL transcript log (execution timeline)
28
+ * - Task outcomes
29
+ */
30
+ async buildReviewContext(taskId, task, result) {
31
+ const run = await this.ctx.runStore.getRunByTaskId(taskId);
32
+ const activity = run?.activity;
33
+ const outcomes = run?.outcomes ?? task.outcomes;
34
+ // Build execution summary from JSONL transcript log (via ctx ports)
35
+ let executionSummary;
36
+ let toolsSummary;
37
+ try {
38
+ if (this.ctx.findLogForTask && this.ctx.buildExecutionSummary) {
39
+ const logPath = this.ctx.findLogForTask(this.ctx.polpoDir, taskId, run?.id);
40
+ if (logPath) {
41
+ const summaryResult = this.ctx.buildExecutionSummary(logPath);
42
+ executionSummary = summaryResult.summary;
43
+ toolsSummary = summaryResult.toolsSummary;
44
+ }
45
+ }
46
+ }
47
+ catch { /* best effort — don't fail assessment if log parsing fails */ }
48
+ return {
49
+ taskTitle: task.title,
50
+ taskDescription: task.originalDescription ?? task.description,
51
+ agentOutput: result.stdout || undefined,
52
+ agentStderr: result.stderr || undefined,
53
+ exitCode: result.exitCode,
54
+ duration: result.duration,
55
+ filesCreated: activity?.filesCreated,
56
+ filesEdited: activity?.filesEdited,
57
+ toolCalls: activity?.toolCalls,
58
+ toolsSummary: toolsSummary || undefined,
59
+ executionSummary,
60
+ outcomes: outcomes?.length ? outcomes : undefined,
61
+ };
62
+ }
63
+ /**
64
+ * Attempt to transition a task to "done", but first run the before:task:complete
65
+ * hook so approval gates (and any other hooks) can block it.
66
+ * Returns true if the task transitioned to done, false if a hook blocked it.
67
+ */
68
+ async transitionToDone(taskId, task, result) {
69
+ const hookResult = await this.ctx.hooks.runBefore("task:complete", {
70
+ taskId, task, result,
71
+ });
72
+ if (hookResult.cancelled) {
73
+ this.ctx.emitter.emit("log", {
74
+ level: "info",
75
+ message: `[${taskId}] Completion blocked by hook: ${hookResult.cancelReason ?? "no reason"}`,
76
+ });
77
+ return false;
78
+ }
79
+ this.ctx.emitter.emit("task:transition", {
80
+ taskId,
81
+ from: task.status,
82
+ to: "done",
83
+ task: { ...task, status: "done" },
84
+ });
85
+ await this.ctx.registry.transition(taskId, "done");
86
+ await this.ctx.registry.updateTask(taskId, { phase: undefined });
87
+ // Fire after:task:complete (async, fire-and-forget)
88
+ this.ctx.hooks.runAfter("task:complete", { taskId, task, result }).catch(() => { });
89
+ return true;
90
+ }
91
+ /** Resolve effective confidence: explicit field, or default by type. */
92
+ getConfidence(exp) {
93
+ if (exp.confidence)
94
+ return exp.confidence;
95
+ return exp.type === "file_exists" ? "estimated" : "firm";
96
+ }
97
+ /** Check if any failed checks correspond to estimated expectations. */
98
+ hasEstimatedFailures(task, assessment) {
99
+ const failedTypes = new Set(assessment.checks.filter(c => !c.passed).map(c => c.type));
100
+ return task.expectations.some(e => failedTypes.has(e.type) && this.getConfidence(e) === "estimated");
101
+ }
102
+ async handleResult(taskId, result) {
103
+ const task = await this.ctx.registry.getTask(taskId);
104
+ if (!task)
105
+ return;
106
+ // Skip if already terminal
107
+ if (task.status === "done" || task.status === "failed")
108
+ return;
109
+ this.ctx.emitter.emit("agent:finished", {
110
+ taskId,
111
+ agentName: task.assignTo,
112
+ exitCode: result.exitCode,
113
+ duration: result.duration,
114
+ sessionId: task.sessionId,
115
+ });
116
+ // Ensure we're in review state
117
+ if (task.status === "in_progress") {
118
+ await this.ctx.registry.transition(taskId, "review");
119
+ await this.ctx.registry.updateTask(taskId, { phase: "review" });
120
+ }
121
+ // Question detection: intercept before assessment
122
+ const maxQRounds = this.ctx.config.settings.maxQuestionRounds ?? 2;
123
+ const questionRounds = task.questionRounds ?? 0;
124
+ if (result.exitCode === 0 && questionRounds < maxQRounds) {
125
+ // Get activity from RunStore for richer heuristic
126
+ const run = await this.ctx.runStore.getRunByTaskId(taskId);
127
+ const activity = run?.activity;
128
+ if (looksLikeQuestion(result, activity)) {
129
+ this.handlePossibleQuestion(taskId, task, result);
130
+ return;
131
+ }
132
+ }
133
+ this.proceedToAssessment(taskId, task, result);
134
+ }
135
+ /**
136
+ * LLM-classify a potential question, then either resolve+rerun or proceed to assessment.
137
+ */
138
+ handlePossibleQuestion(taskId, task, result) {
139
+ if (!this.ctx.queryLLM) {
140
+ // No LLM available — skip classification, proceed to assessment
141
+ this.proceedToAssessment(taskId, task, result);
142
+ return;
143
+ }
144
+ const queryLLM = this.ctx.queryLLM;
145
+ const classify = this.ports.classifyAsQuestion
146
+ ? (stdout, model) => this.ports.classifyAsQuestion(stdout, model)
147
+ : (stdout, model) => classifyAsQuestion(stdout, queryLLM, model);
148
+ classify(result.stdout, this.ctx.config.settings.orchestratorModel).then(classification => {
149
+ if (classification.isQuestion) {
150
+ this.resolveAndRerun(taskId, task, result, classification.question);
151
+ }
152
+ else {
153
+ this.proceedToAssessment(taskId, task, result);
154
+ }
155
+ }).catch(() => {
156
+ // Classification failed → proceed normally
157
+ this.proceedToAssessment(taskId, task, result);
158
+ });
159
+ }
160
+ /**
161
+ * Auto-answer an agent's question and re-run the task (no retry burn).
162
+ */
163
+ resolveAndRerun(taskId, task, result, question) {
164
+ this.ctx.emitter.emit("task:question", { taskId, question });
165
+ // Use the generateAnswer port if provided, otherwise build the answer inline via queryLLM
166
+ const answerPromise = this.ports.generateAnswer
167
+ ? this.ports.generateAnswer(task, question, this.ctx.config.settings.orchestratorModel)
168
+ : this.generateAnswerInline(task, question);
169
+ answerPromise.then(async (answer) => {
170
+ this.ctx.emitter.emit("task:answered", { taskId, question, answer });
171
+ const current = await this.ctx.registry.getTask(taskId);
172
+ if (!current)
173
+ return;
174
+ // Save original description before first Q&A
175
+ if (!current.originalDescription) {
176
+ await this.ctx.registry.updateTask(taskId, { originalDescription: current.description });
177
+ }
178
+ // Clear old outcomes before re-run — the agent will produce fresh ones.
179
+ await this.ctx.registry.updateTask(taskId, { outcomes: [] });
180
+ // Append Q&A to description and re-run (no retry burn)
181
+ const qaBlock = `\n\n[Polpo Clarification]\nQ: ${question}\nA: ${answer}`;
182
+ await this.ctx.registry.unsafeSetStatus(taskId, "pending", "Q&A re-run — no retry burn");
183
+ await this.ctx.registry.updateTask(taskId, {
184
+ phase: "execution",
185
+ description: current.description + qaBlock,
186
+ questionRounds: (current.questionRounds ?? 0) + 1,
187
+ });
188
+ }).catch(() => {
189
+ // Answer generation failed → proceed to assessment normally
190
+ this.proceedToAssessment(taskId, task, result);
191
+ });
192
+ }
193
+ /**
194
+ * Inline answer generation using ctx.memoryStore + ctx.registry + ctx.queryLLM.
195
+ * Equivalent to the shell's generateAnswer() but without Node.js dependencies.
196
+ */
197
+ async generateAnswerInline(task, question) {
198
+ if (!this.ctx.queryLLM) {
199
+ throw new Error("queryLLM port not available");
200
+ }
201
+ const memory = (await this.ctx.memoryStore?.get()) ?? "";
202
+ const state = await this.ctx.registry.getState();
203
+ // Sibling tasks in the same plan group for additional context
204
+ const siblings = task.group
205
+ ? state.tasks.filter(t => t.group === task.group && t.id !== task.id)
206
+ : [];
207
+ const parts = [
208
+ `You are Polpo, an AI agent orchestration framework. An agent working on a task has asked a question instead of completing the work.`,
209
+ `Your job is to answer the question concisely so the agent can proceed autonomously.`,
210
+ ``,
211
+ `## Task`,
212
+ `Title: ${task.title}`,
213
+ `Description: ${task.originalDescription || task.description}`,
214
+ ];
215
+ if (memory) {
216
+ parts.push(``, `## Shared Memory`, memory);
217
+ }
218
+ if (siblings.length > 0) {
219
+ parts.push(``, `## Related tasks in the same plan`);
220
+ for (const s of siblings) {
221
+ parts.push(`- [${s.status}] ${s.title}`);
222
+ if (s.result?.stdout && s.status === "done") {
223
+ parts.push(` Result: ${s.result.stdout.slice(0, 200)}`);
224
+ }
225
+ }
226
+ }
227
+ parts.push(``, `## Agent's Question`, question, ``, `Answer the question directly and concisely. Provide specific, actionable information.`, `If you're unsure, give your best guidance based on available context.`, `Do NOT ask follow-up questions. Just answer.`);
228
+ const prompt = parts.join("\n");
229
+ return (await this.ctx.queryLLM(prompt, this.ctx.config.settings.orchestratorModel)).text;
230
+ }
231
+ /**
232
+ * Run assessment with retry when all LLM reviewers fail.
233
+ * Retries up to maxAssessmentRetries times before returning the failed result.
234
+ */
235
+ async runAssessmentWithRetry(task, cwd, progressCb, context, checkProgressCb) {
236
+ const maxRetries = this.ctx.config.settings.maxAssessmentRetries ?? 1;
237
+ const reasoning = this.ctx.config.settings.reasoning;
238
+ for (let attempt = 0; attempt <= maxRetries; attempt++) {
239
+ const assessment = await this.ctx.assessFn(task, cwd, progressCb, context, reasoning, checkProgressCb);
240
+ // Check if failure is due to all evaluators failing (not low scores)
241
+ const allEvalsFailed = !assessment.passed && assessment.checks.some(c => c.type === "llm_review" && !c.passed && c.message.includes("all evaluators failed"));
242
+ if (!allEvalsFailed || attempt === maxRetries) {
243
+ return assessment;
244
+ }
245
+ this.ctx.emitter.emit("log", {
246
+ level: "warn",
247
+ message: `[${task.id}] All reviewers failed, retrying assessment (${attempt + 1}/${maxRetries})`,
248
+ });
249
+ }
250
+ // Unreachable — satisfies TypeScript
251
+ return this.ctx.assessFn(task, cwd, progressCb, context, reasoning, checkProgressCb);
252
+ }
253
+ /**
254
+ * Standard assessment flow: run expectations/metrics, then mark done/failed/fix/retry.
255
+ */
256
+ async proceedToAssessment(taskId, task, result) {
257
+ if (task.expectations.length > 0 || task.metrics.length > 0) {
258
+ // Run before:assessment:run hook (async — assessment is already async)
259
+ this.ctx.hooks.runBefore("assessment:run", { taskId, task }).then(async (hookResult) => {
260
+ if (hookResult.cancelled) {
261
+ this.ctx.emitter.emit("log", {
262
+ level: "info",
263
+ message: `[${taskId}] Assessment blocked by hook: ${hookResult.cancelReason ?? "no reason"}`,
264
+ });
265
+ // Skip assessment — mark done with result as-is
266
+ await this.ctx.registry.updateTask(taskId, { result });
267
+ if (result.exitCode === 0) {
268
+ this.transitionToDone(taskId, task, result).catch(() => { });
269
+ }
270
+ else {
271
+ await this.retryOrFail(taskId, task, result);
272
+ }
273
+ return;
274
+ }
275
+ this.runAssessmentFlow(taskId, task, result);
276
+ }).catch(() => {
277
+ this.runAssessmentFlow(taskId, task, result);
278
+ });
279
+ }
280
+ else {
281
+ await this.ctx.registry.updateTask(taskId, { result });
282
+ if (result.exitCode === 0) {
283
+ this.transitionToDone(taskId, task, result).catch(() => { });
284
+ }
285
+ else {
286
+ await this.retryOrFail(taskId, task, result);
287
+ }
288
+ }
289
+ }
290
+ /**
291
+ * Core assessment flow — extracted to allow hook interception in proceedToAssessment.
292
+ */
293
+ async runAssessmentFlow(taskId, task, result) {
294
+ this.ctx.emitter.emit("assessment:started", { taskId });
295
+ const progressCb = (msg) => this.ctx.emitter.emit("assessment:progress", { taskId, message: msg });
296
+ const checkProgressCb = (ev) => {
297
+ if (ev.phase === "started") {
298
+ this.ctx.emitter.emit("assessment:check:started", { taskId, index: ev.index, total: ev.total, type: ev.type, label: ev.label });
299
+ }
300
+ else {
301
+ this.ctx.emitter.emit("assessment:check:complete", { taskId, index: ev.index, total: ev.total, type: ev.type, label: ev.label, passed: ev.passed ?? false, message: ev.message });
302
+ }
303
+ };
304
+ // Build rich review context from RunStore, JSONL transcript, and outcomes
305
+ const reviewContext = await this.buildReviewContext(taskId, task, result);
306
+ this.runAssessmentWithRetry(task, this.ctx.agentWorkDir, progressCb, reviewContext, checkProgressCb).then(async (assessment) => {
307
+ setAssessment(result, assessment, "initial");
308
+ await this.ctx.registry.updateTask(taskId, { result });
309
+ if (assessment.passed && result.exitCode === 0) {
310
+ this.ctx.emitter.emit("assessment:complete", {
311
+ taskId,
312
+ passed: true,
313
+ scores: assessment.scores,
314
+ globalScore: assessment.globalScore,
315
+ message: task.title,
316
+ });
317
+ this.transitionToDone(taskId, task, result).catch(() => { });
318
+ }
319
+ else if (assessment.passed && result.exitCode !== 0) {
320
+ // Checks passed but agent failed (killed, crashed, non-zero exit).
321
+ // Override assessment to failed — the agent didn't complete successfully.
322
+ assessment.passed = false;
323
+ const exitMsg = `Agent exited with code ${result.exitCode}`;
324
+ assessment.checks.push({
325
+ type: "test",
326
+ passed: false,
327
+ message: exitMsg,
328
+ details: result.stderr || undefined,
329
+ });
330
+ await this.ctx.registry.updateTask(taskId, { result });
331
+ this.ctx.emitter.emit("assessment:complete", {
332
+ taskId,
333
+ passed: false,
334
+ scores: assessment.scores,
335
+ globalScore: assessment.globalScore,
336
+ message: exitMsg,
337
+ });
338
+ await this.retryOrFail(taskId, task, result);
339
+ }
340
+ else {
341
+ const reasons = [
342
+ ...assessment.checks.filter(c => !c.passed).map(c => `${c.type}: ${c.message}`),
343
+ ...assessment.metrics.filter(m => !m.passed).map(m => `${m.name}: ${m.value} < ${m.threshold}`),
344
+ ];
345
+ this.ctx.emitter.emit("assessment:complete", {
346
+ taskId,
347
+ passed: false,
348
+ scores: assessment.scores,
349
+ globalScore: assessment.globalScore,
350
+ message: reasons.join(", "),
351
+ });
352
+ // Execution OK but review failed → check if estimated expectations can be corrected
353
+ if (result.exitCode === 0) {
354
+ const autoCorrect = this.ctx.config.settings.autoCorrectExpectations !== false;
355
+ const hasEstimatedFailures = this.hasEstimatedFailures(task, assessment);
356
+ if (autoCorrect && hasEstimatedFailures) {
357
+ this.tryAutoCorrectExpectations(taskId, task, result, assessment).then(async (corrected) => {
358
+ if (corrected)
359
+ return;
360
+ const judged = await this.judgeExpectations(taskId, task, result, assessment);
361
+ if (!judged)
362
+ await this.fixOrRetry(taskId, task, result);
363
+ }).catch(async () => {
364
+ await this.fixOrRetry(taskId, task, result);
365
+ });
366
+ }
367
+ else {
368
+ await this.fixOrRetry(taskId, task, result);
369
+ }
370
+ }
371
+ else {
372
+ await this.retryOrFail(taskId, task, result);
373
+ }
374
+ }
375
+ }).catch(async (err) => {
376
+ this.ctx.emitter.emit("log", { level: "error", message: `[${taskId}] Assessment error: ${err.message}` });
377
+ await this.ctx.registry.updateTask(taskId, { result });
378
+ await this.retryOrFail(taskId, task, result);
379
+ });
380
+ }
381
+ /**
382
+ * Auto-correct expectations when assessment fails due to wrong paths.
383
+ * If the only failures are file_exists checks with incorrect paths, search
384
+ * for the actual files using agent activity + filesystem, update expectations,
385
+ * and re-assess. Returns true if auto-correction succeeded (task is done).
386
+ */
387
+ async tryAutoCorrectExpectations(taskId, task, result, assessment) {
388
+ const failedChecks = assessment.checks.filter(c => !c.passed);
389
+ const failedMetrics = assessment.metrics.filter(m => !m.passed);
390
+ if (failedMetrics.length > 0)
391
+ return false;
392
+ if (failedChecks.length === 0)
393
+ return false;
394
+ // Only correct estimated file_exists expectations; firm ones are never touched
395
+ const nonCorrectableFailures = failedChecks.filter(c => {
396
+ if (c.type !== "file_exists")
397
+ return true;
398
+ const exp = task.expectations.find(e => e.type === c.type);
399
+ return exp ? this.getConfidence(exp) === "firm" : true;
400
+ });
401
+ if (nonCorrectableFailures.length > 0)
402
+ return false;
403
+ // File system ports required for auto-correction
404
+ const { fileExists, baseName, joinPath } = this.ports;
405
+ if (!fileExists || !baseName)
406
+ return false;
407
+ // Gather agent's actual file list from activity
408
+ const run = await this.ctx.runStore.getRunByTaskId(taskId);
409
+ const activity = run?.activity;
410
+ const agentFiles = [
411
+ ...(activity?.filesCreated ?? []),
412
+ ...(activity?.filesEdited ?? []),
413
+ ];
414
+ // For each file_exists expectation that failed, try to find the actual path
415
+ const corrections = new Map(); // expectation index → corrected paths
416
+ let allCorrected = true;
417
+ for (let i = 0; i < task.expectations.length; i++) {
418
+ const exp = task.expectations[i];
419
+ if (exp.type !== "file_exists" || !exp.paths)
420
+ continue;
421
+ // Check if this expectation's check failed
422
+ const check = assessment.checks.find(c => c.type === "file_exists" && !c.passed);
423
+ if (!check)
424
+ continue;
425
+ const correctedPaths = [];
426
+ for (const expectedPath of exp.paths) {
427
+ if (fileExists(expectedPath)) {
428
+ correctedPaths.push(expectedPath);
429
+ continue;
430
+ }
431
+ // Try to find by basename in agent's created/edited files
432
+ const name = baseName(expectedPath);
433
+ const match = agentFiles.find(f => baseName(f) === name);
434
+ if (match && fileExists(match)) {
435
+ correctedPaths.push(match);
436
+ continue;
437
+ }
438
+ // Try to find by basename in workDir (shallow search in common locations)
439
+ const found = this.findFileByName(name);
440
+ if (found) {
441
+ correctedPaths.push(found);
442
+ continue;
443
+ }
444
+ // Can't find this file — can't auto-correct
445
+ allCorrected = false;
446
+ break;
447
+ }
448
+ if (!allCorrected)
449
+ break;
450
+ if (correctedPaths.length > 0) {
451
+ corrections.set(i, correctedPaths);
452
+ }
453
+ }
454
+ if (!allCorrected || corrections.size === 0)
455
+ return false;
456
+ // Apply corrections
457
+ const newExpectations = [...task.expectations];
458
+ for (const [idx, paths] of corrections) {
459
+ newExpectations[idx] = { ...newExpectations[idx], paths };
460
+ }
461
+ await this.ctx.registry.updateTask(taskId, { expectations: newExpectations });
462
+ this.ctx.emitter.emit("assessment:corrected", { taskId, corrections: corrections.size });
463
+ // Re-assess with corrected expectations
464
+ const current = await this.ctx.registry.getTask(taskId);
465
+ if (!current)
466
+ return false;
467
+ try {
468
+ const progressCb = (msg) => this.ctx.emitter.emit("assessment:progress", { taskId, message: msg });
469
+ const reCtx = await this.buildReviewContext(taskId, task, result);
470
+ const newAssessment = await this.ctx.assessFn(current, this.ctx.agentWorkDir, progressCb, reCtx, this.ctx.config.settings.reasoning);
471
+ setAssessment(result, newAssessment, "auto-correct");
472
+ await this.ctx.registry.updateTask(taskId, { result });
473
+ if (newAssessment.passed) {
474
+ this.ctx.emitter.emit("assessment:complete", {
475
+ taskId,
476
+ passed: true,
477
+ scores: newAssessment.scores,
478
+ globalScore: newAssessment.globalScore,
479
+ message: `${task.title} (paths auto-corrected)`,
480
+ });
481
+ return this.transitionToDone(taskId, task, result);
482
+ }
483
+ }
484
+ catch { /* re-assessment failed */
485
+ }
486
+ return false;
487
+ }
488
+ /** Search for a file by name in common project locations.
489
+ * Searches agentWorkDir first (where the agent actually created files),
490
+ * then falls back to workDir (the project root) when they differ. */
491
+ findFileByName(name) {
492
+ const { joinPath } = this.ports;
493
+ if (!joinPath)
494
+ return null;
495
+ const searchDirs = [
496
+ this.ctx.agentWorkDir,
497
+ joinPath(this.ctx.agentWorkDir, "src"),
498
+ ];
499
+ // When agentWorkDir differs from workDir (settings.workDir is set),
500
+ // also search the project root as a fallback.
501
+ if (this.ctx.agentWorkDir !== this.ctx.workDir) {
502
+ searchDirs.push(this.ctx.workDir, joinPath(this.ctx.workDir, "src"));
503
+ }
504
+ for (const dir of searchDirs) {
505
+ const found = this.searchDir(dir, name, 4);
506
+ if (found)
507
+ return found;
508
+ }
509
+ return null;
510
+ }
511
+ /**
512
+ * LLM judge: analyze failed estimated expectations vs agent output and decide
513
+ * whether they are wrong (correct them) or the agent's work is wrong (fix phase).
514
+ * Only operates on estimated expectations — firm ones are never touched.
515
+ * Returns true if expectations were corrected and re-assessment passed.
516
+ */
517
+ async judgeExpectations(taskId, task, result, assessment) {
518
+ // Only judge estimated expectations
519
+ const failedChecks = assessment.checks.filter(c => {
520
+ if (c.passed)
521
+ return false;
522
+ const exp = task.expectations.find(e => e.type === c.type);
523
+ return exp ? this.getConfidence(exp) === "estimated" : false;
524
+ });
525
+ if (failedChecks.length === 0)
526
+ return false;
527
+ // Don't judge if score is very low — that's clearly bad work
528
+ if (assessment.globalScore !== undefined && assessment.globalScore < 2.5)
529
+ return false;
530
+ // queryLLM port required for judge
531
+ if (!this.ctx.queryLLM)
532
+ return false;
533
+ // Gather context
534
+ const run = await this.ctx.runStore.getRunByTaskId(taskId);
535
+ const activity = run?.activity;
536
+ const prompt = buildJudgePrompt(task, result, assessment, failedChecks, activity);
537
+ let response;
538
+ try {
539
+ response = (await this.ctx.queryLLM(prompt, this.ctx.config.settings.orchestratorModel)).text;
540
+ }
541
+ catch { /* LLM query failed */
542
+ return false;
543
+ }
544
+ // Parse LLM verdict
545
+ let verdict;
546
+ try {
547
+ const cleaned = response.replace(/```json?\n?/g, "").replace(/```/g, "").trim();
548
+ verdict = JSON.parse(cleaned);
549
+ if (!verdict.corrections || !Array.isArray(verdict.corrections))
550
+ return false;
551
+ }
552
+ catch { /* malformed JSON response */
553
+ return false;
554
+ }
555
+ // Apply corrections only if LLM found at least one fixable expectation
556
+ const fixable = verdict.corrections.filter((c) => c.verdict === "expectation_wrong" && c.fix);
557
+ if (fixable.length === 0)
558
+ return false;
559
+ const newExpectations = [...task.expectations];
560
+ let correctionCount = 0;
561
+ for (const fix of fixable) {
562
+ const idx = task.expectations.findIndex(e => e.type === fix.type);
563
+ if (idx < 0 || !fix.fix)
564
+ continue;
565
+ // Double-check: never correct firm expectations even if LLM suggests it
566
+ const exp = newExpectations[idx];
567
+ if (this.getConfidence(exp) === "firm")
568
+ continue;
569
+ const f = fix.fix;
570
+ if (fix.type === "file_exists" && f.paths) {
571
+ newExpectations[idx] = { ...exp, paths: f.paths };
572
+ correctionCount++;
573
+ }
574
+ else if ((fix.type === "test" || fix.type === "script") && f.command) {
575
+ newExpectations[idx] = { ...exp, command: f.command };
576
+ correctionCount++;
577
+ }
578
+ else if (fix.type === "llm_review" && f.threshold !== undefined) {
579
+ newExpectations[idx] = { ...exp, threshold: f.threshold };
580
+ correctionCount++;
581
+ }
582
+ }
583
+ if (correctionCount === 0)
584
+ return false;
585
+ await this.ctx.registry.updateTask(taskId, { expectations: newExpectations });
586
+ this.ctx.emitter.emit("assessment:corrected", { taskId, corrections: correctionCount });
587
+ // Re-assess with corrected expectations
588
+ const current = await this.ctx.registry.getTask(taskId);
589
+ if (!current)
590
+ return false;
591
+ try {
592
+ const progressCb = (msg) => this.ctx.emitter.emit("assessment:progress", { taskId, message: msg });
593
+ const judgeCtx = await this.buildReviewContext(taskId, task, result);
594
+ const newAssessment = await this.ctx.assessFn(current, this.ctx.agentWorkDir, progressCb, judgeCtx, this.ctx.config.settings.reasoning);
595
+ setAssessment(result, newAssessment, "judge");
596
+ await this.ctx.registry.updateTask(taskId, { result });
597
+ if (newAssessment.passed) {
598
+ this.ctx.emitter.emit("assessment:complete", {
599
+ taskId,
600
+ passed: true,
601
+ scores: newAssessment.scores,
602
+ globalScore: newAssessment.globalScore,
603
+ message: `${task.title} (expectations corrected)`,
604
+ });
605
+ return this.transitionToDone(taskId, task, result);
606
+ }
607
+ }
608
+ catch { /* re-assessment failed */
609
+ }
610
+ return false;
611
+ }
612
+ /** Recursive directory search (bounded depth). Uses injected file system ports. */
613
+ searchDir(dir, name, maxDepth) {
614
+ if (maxDepth <= 0)
615
+ return null;
616
+ const { readDir, joinPath } = this.ports;
617
+ if (!readDir || !joinPath)
618
+ return null;
619
+ try {
620
+ const entries = readDir(dir);
621
+ for (const entry of entries) {
622
+ if (entry.name === "node_modules" || entry.name === ".git")
623
+ continue;
624
+ const fullPath = joinPath(dir, entry.name);
625
+ if (entry.isFile && entry.name === name)
626
+ return fullPath;
627
+ if (entry.isDirectory) {
628
+ const found = this.searchDir(fullPath, name, maxDepth - 1);
629
+ if (found)
630
+ return found;
631
+ }
632
+ }
633
+ }
634
+ catch { /* permission error or missing dir */ }
635
+ return null;
636
+ }
637
+ /**
638
+ * Fix phase: when execution succeeded but review failed, try a targeted fix
639
+ * without burning a full retry. After maxFixAttempts, fall back to full retry.
640
+ */
641
+ async fixOrRetry(taskId, _task, result) {
642
+ const current = await this.ctx.registry.getTask(taskId);
643
+ if (!current)
644
+ return;
645
+ // Side-effects guard: block automatic fix/retry for tasks with irreversible actions.
646
+ // The task transitions to awaiting_approval so a human can decide whether to re-execute.
647
+ // We still save the fix prompt so the agent gets feedback if the human approves.
648
+ if (current.sideEffects) {
649
+ const reason = "Task has sideEffects — automatic fix/retry blocked. Awaiting human approval.";
650
+ this.ctx.emitter.emit("task:retry:blocked", { taskId, reason });
651
+ this.ctx.emitter.emit("log", { level: "warn", message: `[${taskId}] ${reason}` });
652
+ // Preserve original description and prepare fix prompt for when approval comes
653
+ if (!current.originalDescription) {
654
+ await this.ctx.registry.updateTask(taskId, { originalDescription: current.description });
655
+ }
656
+ await this.ctx.registry.updateTask(taskId, {
657
+ description: buildSideEffectFixPrompt(current, result),
658
+ phase: "fix",
659
+ });
660
+ await this.ctx.registry.transition(taskId, "awaiting_approval");
661
+ return;
662
+ }
663
+ const maxFix = this.ctx.config.settings.maxFixAttempts ?? 2;
664
+ const fixAttempts = (current.fixAttempts ?? 0) + 1;
665
+ if (fixAttempts <= maxFix) {
666
+ // Save original description before first fix/retry
667
+ if (!current.originalDescription) {
668
+ await this.ctx.registry.updateTask(taskId, { originalDescription: current.description });
669
+ }
670
+ this.ctx.emitter.emit("task:fix", { taskId, attempt: fixAttempts, maxFix });
671
+ // Clear old outcomes — the agent will produce fresh ones on re-execution.
672
+ await this.ctx.registry.updateTask(taskId, { outcomes: [] });
673
+ // unsafeSetStatus bypasses retry increment (fix attempts are NOT real failures)
674
+ await this.ctx.registry.unsafeSetStatus(taskId, "pending", "fix phase — no retry burn");
675
+ await this.ctx.registry.updateTask(taskId, {
676
+ phase: "fix",
677
+ fixAttempts,
678
+ description: buildFixPrompt(current, result),
679
+ });
680
+ }
681
+ else {
682
+ // Fix attempts exhausted → full retry (burns 1 retry)
683
+ this.ctx.emitter.emit("log", { level: "warn", message: `[${taskId}] Fix attempts exhausted (${maxFix}), falling back to full retry` });
684
+ await this.ctx.registry.updateTask(taskId, {
685
+ phase: "execution",
686
+ fixAttempts: 0,
687
+ });
688
+ await this.retryOrFail(taskId, _task, result);
689
+ }
690
+ }
691
+ /** @internal — exposed for test access via Orchestrator facade */
692
+ async retryOrFail(taskId, _task, result) {
693
+ const current = await this.ctx.registry.getTask(taskId);
694
+ if (!current)
695
+ return;
696
+ // Side-effects guard: block automatic retry for tasks with irreversible actions.
697
+ if (current.sideEffects) {
698
+ const reason = "Task has sideEffects — automatic retry blocked. Awaiting human approval.";
699
+ this.ctx.emitter.emit("task:retry:blocked", { taskId, reason });
700
+ this.ctx.emitter.emit("log", { level: "warn", message: `[${taskId}] ${reason}` });
701
+ // Preserve original description and prepare retry prompt for when approval comes
702
+ if (!current.originalDescription) {
703
+ await this.ctx.registry.updateTask(taskId, { originalDescription: current.description });
704
+ }
705
+ await this.ctx.registry.updateTask(taskId, {
706
+ description: buildSideEffectRetryPrompt(current, result),
707
+ phase: "execution",
708
+ });
709
+ await this.ctx.registry.transition(taskId, "awaiting_approval");
710
+ return;
711
+ }
712
+ // Don't retry tasks from cancelled missions — resolve via missionId (direct FK) first
713
+ if (current.group) {
714
+ const mission = current.missionId
715
+ ? await this.ctx.registry.getMission?.(current.missionId)
716
+ : await this.ctx.registry.getMissionByName?.(current.group);
717
+ if (mission && mission.status === "cancelled") {
718
+ this.ctx.emitter.emit("log", { level: "debug", message: `[${taskId}] Skipping retry — mission cancelled` });
719
+ await this.ctx.registry.transition(taskId, "failed");
720
+ return;
721
+ }
722
+ }
723
+ if (current.retries < current.maxRetries) {
724
+ const policy = current.retryPolicy ?? this.ctx.config.settings.defaultRetryPolicy;
725
+ const nextAttempt = current.retries + 1;
726
+ // Save original description before first retry
727
+ if (!current.originalDescription) {
728
+ await this.ctx.registry.updateTask(taskId, { originalDescription: current.description });
729
+ }
730
+ // Check if we should escalate to a different agent
731
+ // fallbackAgent resolution: explicit policy > agent.reportsTo (org chart)
732
+ let assignTo = current.assignTo;
733
+ const currentAgent = await this.ctx.agentStore.getAgent(current.assignTo);
734
+ const effectiveFallback = policy?.fallbackAgent ?? currentAgent?.reportsTo;
735
+ if (policy?.escalateAfter !== undefined && nextAttempt >= policy.escalateAfter) {
736
+ if (effectiveFallback) {
737
+ const fallback = await this.ctx.agentStore.getAgent(effectiveFallback);
738
+ if (fallback) {
739
+ assignTo = effectiveFallback;
740
+ this.ctx.emitter.emit("log", { level: "info", message: `[${taskId}] Escalating to ${assignTo} (attempt ${nextAttempt})` });
741
+ }
742
+ }
743
+ }
744
+ this.ctx.emitter.emit("task:retry", { taskId, attempt: nextAttempt, maxRetries: current.maxRetries });
745
+ // Clear old outcomes — the agent will produce fresh ones on re-execution.
746
+ // Without this, outcomes accumulate across retries and all get re-sent via notifications.
747
+ await this.ctx.registry.updateTask(taskId, { outcomes: [] });
748
+ await this.ctx.registry.transition(taskId, "failed");
749
+ await this.ctx.registry.transition(taskId, "pending");
750
+ await this.ctx.registry.updateTask(taskId, {
751
+ description: buildRetryPrompt(current, result),
752
+ assignTo,
753
+ phase: "execution",
754
+ fixAttempts: 0,
755
+ });
756
+ }
757
+ else {
758
+ this.ctx.emitter.emit("task:maxRetries", { taskId });
759
+ // Run before:task:fail hook — escalation manager can intercept here
760
+ this.ctx.hooks.runBefore("task:fail", {
761
+ taskId,
762
+ task: current,
763
+ result,
764
+ reason: "maxRetries",
765
+ }).then(async (hookResult) => {
766
+ if (hookResult.cancelled) {
767
+ this.ctx.emitter.emit("log", {
768
+ level: "info",
769
+ message: `[${taskId}] Final failure intercepted by hook: ${hookResult.cancelReason ?? "escalation"}`,
770
+ });
771
+ return; // Escalation manager (or other hook) is handling this
772
+ }
773
+ await this.ctx.registry.transition(taskId, "failed");
774
+ await this.ctx.registry.updateTask(taskId, { phase: undefined });
775
+ // Fire after:task:fail
776
+ this.ctx.hooks.runAfter("task:fail", {
777
+ taskId,
778
+ task: current,
779
+ result,
780
+ reason: "maxRetries",
781
+ }).catch(() => { });
782
+ }).catch(async () => {
783
+ // Hook failed — fail the task normally
784
+ await this.ctx.registry.transition(taskId, "failed");
785
+ await this.ctx.registry.updateTask(taskId, { phase: undefined });
786
+ });
787
+ }
788
+ }
789
+ }
790
+ //# sourceMappingURL=assessment-orchestrator.js.map