@visorcraft/idlehands 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. package/dist/agent/context-budget.js +103 -0
  2. package/dist/agent/context-budget.js.map +1 -0
  3. package/dist/agent/tool-loop-detection.js +91 -20
  4. package/dist/agent/tool-loop-detection.js.map +1 -1
  5. package/dist/agent.js +55 -11
  6. package/dist/agent.js.map +1 -1
  7. package/dist/anton/controller.js +512 -186
  8. package/dist/anton/controller.js.map +1 -1
  9. package/dist/anton/preflight.js +52 -24
  10. package/dist/anton/preflight.js.map +1 -1
  11. package/dist/anton/session.js +6 -0
  12. package/dist/anton/session.js.map +1 -1
  13. package/dist/bot/anton-run.js +16 -5
  14. package/dist/bot/anton-run.js.map +1 -1
  15. package/dist/bot/discord-commands.js +25 -0
  16. package/dist/bot/discord-commands.js.map +1 -1
  17. package/dist/bot/discord.js +28 -0
  18. package/dist/bot/discord.js.map +1 -1
  19. package/dist/bot/format.js +0 -5
  20. package/dist/bot/format.js.map +1 -1
  21. package/dist/bot/telegram-commands.js +21 -0
  22. package/dist/bot/telegram-commands.js.map +1 -1
  23. package/dist/bot/telegram.js +3 -1
  24. package/dist/bot/telegram.js.map +1 -1
  25. package/dist/bot/upgrade-command.js +398 -0
  26. package/dist/bot/upgrade-command.js.map +1 -0
  27. package/dist/bot/ux/shared-formatter.js +43 -0
  28. package/dist/bot/ux/shared-formatter.js.map +1 -0
  29. package/dist/cli/commands/upgrade.js +27 -0
  30. package/dist/cli/commands/upgrade.js.map +1 -0
  31. package/dist/history.js +418 -0
  32. package/dist/history.js.map +1 -1
  33. package/dist/index.js +2 -0
  34. package/dist/index.js.map +1 -1
  35. package/dist/tui/command-handler.js +2 -0
  36. package/dist/tui/command-handler.js.map +1 -1
  37. package/dist/vault.js +133 -0
  38. package/dist/vault.js.map +1 -1
  39. package/package.json +1 -1
@@ -4,17 +4,182 @@
4
4
  * Coordinates all components: parser, prompt, verifier, lock, git, session.
5
5
  * Structured as a deterministic orchestration flow for autonomous task execution.
6
6
  */
7
+ import * as fs from 'fs';
8
+ import * as path from 'path';
7
9
  import { isToolLoopBreak, AUTO_CONTINUE_PROMPT } from '../bot/auto-continue.js';
8
10
  import { ensureCleanWorkingTree, getWorkingDiff, commitAll, restoreTrackedChanges, cleanUntracked, createBranch, getUntrackedFiles, removeUntrackedFiles, } from '../git.js';
9
11
  import { estimateTokens } from '../utils.js';
10
12
  import { acquireAntonLock, releaseAntonLock, touchAntonLock } from './lock.js';
11
13
  import { parseTaskFile, findRunnablePendingTasks, markTaskChecked, insertSubTasks, autoCompleteAncestors, } from './parser.js';
12
- import { ensureAgentsTasksDir, makeUniqueTaskPlanFilename, buildDiscoveryPrompt, parseDiscoveryResult, buildRequirementsReviewPrompt, parseRequirementsReviewResult, ensurePlanFileExistsOrBootstrap, } from './preflight.js';
14
+ import { ensureAgentsTasksDir, makeUniqueTaskPlanFilename, buildDiscoveryPrompt, parseDiscoveryResult, buildRequirementsReviewPrompt, parseRequirementsReviewResult, ensurePlanFileExistsOrBootstrap, FORCE_DISCOVERY_DECISION_PROMPT, FORCE_REVIEW_DECISION_PROMPT, } from './preflight.js';
13
15
  import { buildAntonPrompt, parseAntonResult, classifyTaskComplexity } from './prompt.js';
14
16
  import { formatDryRunPlan } from './reporter.js';
15
17
  import { classifyInfraError, ensureAntonRuntimeReady } from './runtime-ready.js';
16
18
  import { buildSessionConfig, buildPreflightConfig, buildDecomposeConfig, buildVerifyConfig, defaultCreateSession, } from './session.js';
17
19
  import { captureLintBaseline, detectVerificationCommands, runVerification } from './verifier.js';
20
+ // ─────────────────────────────────────────────────────────────────────────────
21
+ // L2 Retry Enhancement Helpers
22
+ // ─────────────────────────────────────────────────────────────────────────────
23
+ /**
24
+ * Extract file paths mentioned in an L2 failure reason.
25
+ * Looks for patterns like: app/Models/Channel.php, src/foo/bar.ts, etc.
26
+ */
27
+ function extractFilePathsFromL2Reason(reason) {
28
+ const patterns = [
29
+ // PHP/Laravel style: app/Models/Channel.php, app/Http/Controllers/Foo.php
30
+ /\b(app\/[\w\/]+\.php)\b/gi,
31
+ // General file paths with extensions
32
+ /\b((?:src|lib|tests?)\/[\w\/.-]+\.\w+)\b/gi,
33
+ // Model names that can be mapped to files: "Channel model" -> app/Models/Channel.php
34
+ /\b(\w+)\s+model\b/gi,
35
+ ];
36
+ const found = new Set();
37
+ for (const pattern of patterns) {
38
+ const matches = reason.matchAll(pattern);
39
+ for (const match of matches) {
40
+ const p = match[1];
41
+ // If it's a model name reference like "Channel model", convert to path
42
+ if (/model$/i.test(match[0]) && !/\.php$/i.test(p)) {
43
+ found.add(`app/Models/${p}.php`);
44
+ }
45
+ else {
46
+ found.add(p);
47
+ }
48
+ }
49
+ }
50
+ return [...found];
51
+ }
52
+ /**
53
+ * Detect if L2 reason indicates a "missing implementation" pattern.
54
+ * Returns true if the model wrote tests but forgot the actual implementation.
55
+ */
56
+ function isL2MissingImplementation(reason) {
57
+ const missingPatterns = [
58
+ /missing\s+(?:from|in)\s+/i,
59
+ /no\s+(?:corresponding|evidence|actual)/i,
60
+ /relationship\s+(?:method\s+)?is\s+missing/i,
61
+ /but\s+(?:the|there['']?s?\s+no)/i,
62
+ /tests?\s+(?:expect|added|written).*but/i,
63
+ /should\s+be\s+(?:hasMany|hasOne|belongsTo|morphMany)/i,
64
+ ];
65
+ return missingPatterns.some((p) => p.test(reason));
66
+ }
67
+ function isRecoverablePreflightDiscoveryError(errMsg) {
68
+ return (/preflight-json-missing-object|preflight-discovery-invalid-status|preflight-discovery-invalid-filename|preflight-discovery-filename/i.test(errMsg) || /identical call repeated|breaking loop|tool\s+edit_range/i.test(errMsg));
69
+ }
70
+ function isRecoverablePreflightReviewError(errMsg) {
71
+ return /preflight-json-missing-object|preflight-review-invalid-status|preflight-review-invalid-filename|preflight-review-filename/i.test(errMsg);
72
+ }
73
+ /**
74
+ * Try to read a file's contents for injection into retry context.
75
+ * Returns null if file doesn't exist or is too large.
76
+ */
77
+ function readFileForL2Injection(projectDir, filePath) {
78
+ const MAX_FILE_SIZE = 15000; // ~15KB, reasonable for injection
79
+ try {
80
+ const fullPath = path.resolve(projectDir, filePath);
81
+ if (!fs.existsSync(fullPath))
82
+ return null;
83
+ const stat = fs.statSync(fullPath);
84
+ if (stat.size > MAX_FILE_SIZE)
85
+ return null;
86
+ return fs.readFileSync(fullPath, 'utf8');
87
+ }
88
+ catch {
89
+ return null;
90
+ }
91
+ }
92
+ /**
93
+ * Build enhanced retry context when L2 fails due to missing implementation.
94
+ * - On first L2 failure: Add strong guidance about which files to modify
95
+ * - On 2+ L2 failures: Inject the actual file contents so model can see what's missing
96
+ */
97
+ function buildL2EnhancedRetryContext(l2Reason, l2FailCount, projectDir, taskText) {
98
+ const parts = [];
99
+ const filePaths = extractFilePathsFromL2Reason(l2Reason);
100
+ const isMissingImpl = isL2MissingImplementation(l2Reason);
101
+ if (!isMissingImpl || filePaths.length === 0) {
102
+ // Not a "missing implementation" pattern, no enhancement needed
103
+ return '';
104
+ }
105
+ parts.push('');
106
+ parts.push('═══════════════════════════════════════════════════════════════════════');
107
+ parts.push('⚠️ CRITICAL: AI REVIEW FAILED — MISSING IMPLEMENTATION DETECTED');
108
+ parts.push('═══════════════════════════════════════════════════════════════════════');
109
+ parts.push('');
110
+ parts.push(`The AI review found that you wrote tests but FORGOT THE ACTUAL IMPLEMENTATION.`);
111
+ parts.push(`Task: "${taskText}"`);
112
+ parts.push('');
113
+ parts.push('YOU MUST MODIFY THESE FILES:');
114
+ for (const fp of filePaths) {
115
+ parts.push(` → ${fp}`);
116
+ }
117
+ parts.push('');
118
+ // After 2+ identical L2 failures, inject file contents
119
+ if (l2FailCount >= 2) {
120
+ parts.push('Since you have failed this verification multiple times, here are the current');
121
+ parts.push('contents of the files you need to modify:');
122
+ parts.push('');
123
+ for (const fp of filePaths) {
124
+ const contents = readFileForL2Injection(projectDir, fp);
125
+ if (contents !== null) {
126
+ parts.push(`┌─── ${fp} ───`);
127
+ parts.push(contents);
128
+ parts.push(`└─── end of ${fp} ───`);
129
+ parts.push('');
130
+ }
131
+ else {
132
+ parts.push(`[Could not read ${fp} — file may not exist or is too large]`);
133
+ parts.push('');
134
+ }
135
+ }
136
+ }
137
+ parts.push('INSTRUCTIONS:');
138
+ parts.push('1. READ the files listed above (they are your existing code)');
139
+ parts.push('2. ADD the missing method/relationship to the model file');
140
+ parts.push('3. Do NOT just modify tests — the MODEL/SOURCE file must change');
141
+ parts.push('4. The L2 review expects to see your implementation in the diff');
142
+ parts.push('');
143
+ return parts.join('\n');
144
+ }
145
+ const ANTON_RESULT_SYSTEM_CONTRACT = `[Anton output contract]
146
+ Every final implementation/decompose answer MUST contain exactly one structured block:
147
+ <anton-result>
148
+ status: done|failed|blocked|decompose
149
+ reason: <optional>
150
+ subtasks:
151
+ - <only when status=decompose>
152
+ </anton-result>
153
+ Do not omit this block.`;
154
+ const STRUCTURED_RESULT_RECOVERY_PROMPT = `Your previous reply did not include a valid <anton-result> block.
155
+ Do NOT call tools.
156
+ Return ONLY this block shape and nothing else:
157
+ <anton-result>
158
+ status: done|failed|blocked|decompose
159
+ reason: <optional>
160
+ subtasks:
161
+ - <only when status=decompose>
162
+ </anton-result>`;
163
+ function isStructuredResultParseFailure(reason) {
164
+ if (!reason)
165
+ return false;
166
+ return (reason === 'Agent did not emit structured result' ||
167
+ reason === 'No status line found in result block' ||
168
+ reason.startsWith('Unknown status:'));
169
+ }
170
+ function injectAntonResultContract(session) {
171
+ try {
172
+ const current = String(session.getSystemPrompt?.() ?? '').trim();
173
+ if (!current)
174
+ return;
175
+ if (current.includes('<anton-result>') || current.includes('[Anton output contract]'))
176
+ return;
177
+ session.setSystemPrompt(`${current}\n\n${ANTON_RESULT_SYSTEM_CONTRACT}`);
178
+ }
179
+ catch {
180
+ // best effort
181
+ }
182
+ }
18
183
  export async function runAnton(opts) {
19
184
  const { config, idlehandsConfig, progress, abortSignal, apiKey, vault, lens } = opts;
20
185
  const createSessionFn = opts.createSession || defaultCreateSession;
@@ -31,6 +196,7 @@ export async function runAnton(opts) {
31
196
  const taskRetryCount = new Map();
32
197
  const lastFailureReason = new Map();
33
198
  const consecutiveIdenticalCount = new Map();
199
+ const l2FailCount = new Map(); // Track consecutive L2 failures per task
34
200
  let lockHeartbeatTimer = null;
35
201
  // SIGINT handler
36
202
  const handleAbort = () => {
@@ -131,8 +297,15 @@ export async function runAnton(opts) {
131
297
  parts.push('- Test command failed');
132
298
  if (v.l1_lint === false)
133
299
  parts.push('- Lint command failed');
134
- if (v.l2_ai === false && v.l2_reason)
300
+ if (v.l2_ai === false && v.l2_reason) {
135
301
  parts.push(`- AI review: ${v.l2_reason}`);
302
+ // Enhanced L2 retry context: stronger guidance + file injection on repeated failures
303
+ const currentL2Count = l2FailCount.get(currentTask.key) || 0;
304
+ const l2Enhancement = buildL2EnhancedRetryContext(v.l2_reason, currentL2Count, config.projectDir, currentTask.text);
305
+ if (l2Enhancement) {
306
+ parts.push(l2Enhancement);
307
+ }
308
+ }
136
309
  // Include error output (filtered to errors only, no warnings) so the
137
310
  // agent can see and fix the exact issues.
138
311
  if (v.commandOutput) {
@@ -241,171 +414,117 @@ export async function runAnton(opts) {
241
414
  let discoveryOk = false;
242
415
  await ensureAgentsTasksDir(config.projectDir);
243
416
  const plannedFilePath = taskPlanByTaskKey.get(currentTask.key) ?? makeUniqueTaskPlanFilename(config.projectDir);
244
- let discoveryIterationCap = Math.max(1, Math.floor(config.preflightSessionMaxIterations ?? 500));
245
- // Stage 1: discovery (retry discovery only).
246
- for (let discoveryTry = 0; discoveryTry <= preflightMaxRetries; discoveryTry++) {
247
- const stageStart = Date.now();
248
- const discoveryTimeoutSec = config.preflightDiscoveryTimeoutSec ?? config.taskTimeoutSec;
249
- const discoveryTimeoutMs = discoveryTimeoutSec * 1000;
250
- let discoverySession;
251
- try {
252
- progress.onStage?.('🔎 Discovery: checking if already done...');
253
- discoverySession = await createSessionFn(buildPreflightConfig(idlehandsConfig, config, discoveryTimeoutSec, discoveryIterationCap), apiKey);
254
- const discoveryPrompt = buildDiscoveryPrompt({
255
- task: currentTask,
256
- taskFilePath: config.taskFile,
257
- projectDir: config.projectDir,
258
- planFilePath: plannedFilePath,
259
- });
260
- const discoveryRes = await Promise.race([
261
- discoverySession.ask(discoveryPrompt),
262
- new Promise((_, reject) => setTimeout(() => {
263
- try {
264
- discoverySession?.cancel();
265
- }
266
- catch {
267
- // best effort
268
- }
269
- reject(new Error('preflight-discovery-timeout'));
270
- }, discoveryTimeoutMs)),
271
- ]);
272
- const discoveryTokens = discoverySession.usage.prompt + discoverySession.usage.completion;
273
- totalTokens += discoveryTokens;
274
- const discovery = parseDiscoveryResult(discoveryRes.text, config.projectDir);
275
- preflightRecords.push({
276
- taskKey: currentTask.key,
277
- stage: 'discovery',
278
- durationMs: Date.now() - stageStart,
279
- tokensUsed: discoveryTokens,
280
- status: discovery.status,
281
- filename: discovery.filename || undefined,
282
- });
283
- if (discovery.status === 'complete') {
284
- await markTaskChecked(config.taskFile, currentTask.key);
285
- await autoCompleteAncestors(config.taskFile, currentTask.key);
286
- autoCompleted += 1;
287
- progress.onStage?.(`✅ Discovery confirmed already complete: ${currentTask.text}`);
288
- preflightMarkedComplete = true;
289
- discoveryOk = true;
290
- break;
291
- }
292
- const discoveryPlanState = await ensurePlanFileExistsOrBootstrap({
293
- absPath: discovery.filename,
294
- task: currentTask,
295
- source: 'discovery',
296
- });
297
- if (discoveryPlanState === 'bootstrapped') {
298
- progress.onStage?.(`⚠️ Discovery returned a filename but did not write it. Created fallback plan file: ${discovery.filename}`);
299
- }
300
- taskPlanByTaskKey.set(currentTask.key, discovery.filename);
301
- progress.onStage?.(`📝 Discovery plan file: ${discovery.filename}`);
302
- discoveryOk = true;
303
- break;
304
- }
305
- catch (error) {
306
- const errMsg = error instanceof Error ? error.message : String(error);
307
- const timeout = /timeout/i.test(errMsg);
308
- preflightRecords.push({
309
- taskKey: currentTask.key,
310
- stage: 'discovery',
311
- durationMs: Date.now() - stageStart,
312
- tokensUsed: 0,
313
- status: timeout ? 'timeout' : 'error',
314
- error: errMsg,
315
- });
316
- if (discoveryTry < preflightMaxRetries) {
317
- const short = errMsg.length > 180 ? `${errMsg.slice(0, 177)}...` : errMsg;
318
- if (/max iterations exceeded/i.test(errMsg)) {
319
- const nextCap = Math.min(Math.max(discoveryIterationCap * 2, discoveryIterationCap + 2), 1000);
320
- if (nextCap > discoveryIterationCap) {
321
- progress.onStage?.(`⚠️ Discovery hit max iterations (${discoveryIterationCap}). Increasing preflight cap to ${nextCap} and retrying...`);
322
- discoveryIterationCap = nextCap;
323
- continue;
324
- }
325
- }
326
- progress.onStage?.(`⚠️ Discovery failed (${discoveryTry + 1}/${preflightTotalTries}): ${short}. Retrying discovery...`);
327
- continue;
328
- }
329
- const preflightAttempt = {
330
- taskKey: currentTask.key,
331
- taskText: currentTask.text,
332
- attempt: attemptNumber,
333
- durationMs: Date.now() - stageStart,
334
- tokensUsed: 0,
335
- status: timeout ? 'timeout' : 'error',
336
- verification: undefined,
337
- error: `preflight-error(discovery): ${errMsg}`,
338
- commitHash: undefined,
339
- };
340
- attempts.push(preflightAttempt);
341
- taskRetryCount.set(currentTask.key, retries + 1);
342
- if (!config.skipOnFail)
343
- break mainLoop;
344
- }
345
- finally {
417
+ // Default to 50 iterations for discovery (was 500 - way too high for a simple JSON check)
418
+ let discoveryIterationCap = Math.max(1, Math.floor(config.preflightSessionMaxIterations ?? 50));
419
+ let discoveryRetryHint;
420
+ // Shared preflight session - reused between discovery and review stages to avoid
421
+ // session creation overhead. Created lazily, closed on error (for fresh retry state)
422
+ // or at end of preflight block.
423
+ let preflightSession;
424
+ const closePreflightSession = async () => {
425
+ if (preflightSession) {
346
426
  try {
347
- await discoverySession?.close();
427
+ await preflightSession.close();
348
428
  }
349
429
  catch {
350
430
  // best effort
351
431
  }
432
+ preflightSession = undefined;
352
433
  }
353
- }
354
- // Discovery already marked complete -> next task.
355
- if (preflightMarkedComplete) {
356
- continue;
357
- }
358
- if (!discoveryOk) {
359
- continue;
360
- }
361
- // Stage 2: requirements review (retry review only; keep same plan file).
362
- if (config.preflightRequirementsReview) {
363
- const reviewPlanFile = taskPlanByTaskKey.get(currentTask.key) ?? plannedFilePath;
364
- let reviewOk = false;
365
- let reviewIterationCap = Math.max(1, Math.floor(config.preflightSessionMaxIterations ?? 500));
366
- for (let reviewTry = 0; reviewTry <= preflightMaxRetries; reviewTry++) {
434
+ };
435
+ try {
436
+ // Stage 1: discovery (retry discovery only).
437
+ for (let discoveryTry = 0; discoveryTry <= preflightMaxRetries; discoveryTry++) {
367
438
  const stageStart = Date.now();
368
- const reviewTimeoutSec = config.preflightReviewTimeoutSec ?? config.taskTimeoutSec;
369
- const reviewTimeoutMs = reviewTimeoutSec * 1000;
370
- let reviewSession;
439
+ const discoveryTimeoutSec = config.preflightDiscoveryTimeoutSec ?? config.taskTimeoutSec;
440
+ const discoveryTimeoutMs = discoveryTimeoutSec * 1000;
371
441
  try {
372
- progress.onStage?.('🧪 Requirements review: refining plan...');
373
- reviewSession = await createSessionFn(buildPreflightConfig(idlehandsConfig, config, reviewTimeoutSec, reviewIterationCap), apiKey);
374
- const reviewPrompt = buildRequirementsReviewPrompt(reviewPlanFile);
375
- const reviewRes = await Promise.race([
376
- reviewSession.ask(reviewPrompt),
377
- new Promise((_, reject) => setTimeout(() => {
442
+ progress.onStage?.('🔎 Discovery: checking if already done...');
443
+ // Create session if not already open (first try or after error closed it)
444
+ if (!preflightSession) {
445
+ preflightSession = await createSessionFn(buildPreflightConfig(idlehandsConfig, config, discoveryTimeoutSec, discoveryIterationCap), apiKey);
446
+ }
447
+ const discoveryPrompt = buildDiscoveryPrompt({
448
+ task: currentTask,
449
+ taskFilePath: config.taskFile,
450
+ projectDir: config.projectDir,
451
+ planFilePath: plannedFilePath,
452
+ retryHint: discoveryRetryHint,
453
+ });
454
+ let discoveryTimeoutHandle;
455
+ const discoveryRes = await Promise.race([
456
+ preflightSession.ask(discoveryPrompt).finally(() => clearTimeout(discoveryTimeoutHandle)),
457
+ new Promise((_, reject) => {
458
+ discoveryTimeoutHandle = setTimeout(() => {
459
+ try {
460
+ preflightSession?.cancel();
461
+ }
462
+ catch {
463
+ // best effort
464
+ }
465
+ reject(new Error('preflight-discovery-timeout'));
466
+ }, discoveryTimeoutMs);
467
+ }),
468
+ ]);
469
+ let discoveryTokens = preflightSession.usage.prompt + preflightSession.usage.completion;
470
+ totalTokens += discoveryTokens;
471
+ // Try to parse discovery result; if invalid JSON, attempt force-decision prompt
472
+ let discovery;
473
+ try {
474
+ discovery = parseDiscoveryResult(discoveryRes.text, config.projectDir);
475
+ }
476
+ catch (parseError) {
477
+ const parseErrMsg = parseError instanceof Error ? parseError.message : String(parseError);
478
+ // Only try force-decision for JSON/format errors, not file path errors
479
+ if (/preflight-json-missing-object|preflight-discovery-invalid/i.test(parseErrMsg)) {
480
+ progress.onStage?.('⚠️ Discovery output invalid, requesting forced decision...');
378
481
  try {
379
- reviewSession?.cancel();
482
+ const forceRes = await preflightSession.ask(FORCE_DISCOVERY_DECISION_PROMPT);
483
+ const forceTokens = preflightSession.usage.prompt + preflightSession.usage.completion - discoveryTokens;
484
+ discoveryTokens += forceTokens;
485
+ totalTokens += forceTokens;
486
+ discovery = parseDiscoveryResult(forceRes.text, config.projectDir);
487
+ progress.onStage?.('✅ Forced decision succeeded');
380
488
  }
381
- catch {
382
- // best effort
489
+ catch (forceError) {
490
+ // Force-decision also failed, throw original error
491
+ throw parseError;
383
492
  }
384
- reject(new Error('preflight-review-timeout'));
385
- }, reviewTimeoutMs)),
386
- ]);
387
- const reviewTokens = reviewSession.usage.prompt + reviewSession.usage.completion;
388
- totalTokens += reviewTokens;
389
- const review = parseRequirementsReviewResult(reviewRes.text, config.projectDir);
390
- const reviewPlanState = await ensurePlanFileExistsOrBootstrap({
391
- absPath: review.filename,
392
- task: currentTask,
393
- source: 'requirements-review',
394
- });
395
- if (reviewPlanState === 'bootstrapped') {
396
- progress.onStage?.(`⚠️ Requirements review returned a filename but did not write it. Created fallback plan file: ${review.filename}`);
493
+ }
494
+ else {
495
+ throw parseError;
496
+ }
397
497
  }
398
498
  preflightRecords.push({
399
499
  taskKey: currentTask.key,
400
- stage: 'requirements-review',
500
+ stage: 'discovery',
401
501
  durationMs: Date.now() - stageStart,
402
- tokensUsed: reviewTokens,
403
- status: 'ready',
404
- filename: review.filename,
502
+ tokensUsed: discoveryTokens,
503
+ status: discovery.status,
504
+ filename: discovery.filename || undefined,
505
+ });
506
+ if (discovery.status === 'complete') {
507
+ await markTaskChecked(config.taskFile, currentTask.key);
508
+ await autoCompleteAncestors(config.taskFile, currentTask.key);
509
+ autoCompleted += 1;
510
+ progress.onStage?.(`✅ Discovery confirmed already complete: ${currentTask.text}`);
511
+ preflightMarkedComplete = true;
512
+ discoveryOk = true;
513
+ // No review needed - close session now
514
+ await closePreflightSession();
515
+ break;
516
+ }
517
+ const discoveryPlanState = await ensurePlanFileExistsOrBootstrap({
518
+ absPath: discovery.filename,
519
+ task: currentTask,
520
+ source: 'discovery',
405
521
  });
406
- taskPlanByTaskKey.set(currentTask.key, review.filename);
407
- progress.onStage?.(`✅ Requirements review ready: ${review.filename}`);
408
- reviewOk = true;
522
+ if (discoveryPlanState === 'bootstrapped') {
523
+ progress.onStage?.(`⚠️ Discovery returned a filename but did not write it. Created fallback plan file: ${discovery.filename}`);
524
+ }
525
+ taskPlanByTaskKey.set(currentTask.key, discovery.filename);
526
+ progress.onStage?.(`📝 Discovery plan file: ${discovery.filename}`);
527
+ discoveryOk = true;
409
528
  break;
410
529
  }
411
530
  catch (error) {
@@ -413,53 +532,227 @@ export async function runAnton(opts) {
413
532
  const timeout = /timeout/i.test(errMsg);
414
533
  preflightRecords.push({
415
534
  taskKey: currentTask.key,
416
- stage: 'requirements-review',
535
+ stage: 'discovery',
417
536
  durationMs: Date.now() - stageStart,
418
537
  tokensUsed: 0,
419
538
  status: timeout ? 'timeout' : 'error',
420
539
  error: errMsg,
421
540
  });
422
- if (reviewTry < preflightMaxRetries) {
423
- const short = errMsg.length > 180 ? `${errMsg.slice(0, 177)}...` : errMsg;
541
+ const short = errMsg.length > 180 ? `${errMsg.slice(0, 177)}...` : errMsg;
542
+ discoveryRetryHint = `Previous discovery attempt failed: ${short}. Do not edit source files. Only update ${plannedFilePath} and return strict JSON.`;
543
+ // If discovery returns malformed/non-JSON output (or loops on source edits),
544
+ // degrade immediately to fallback plan instead of burning retries.
545
+ if (isRecoverablePreflightDiscoveryError(errMsg)) {
546
+ const fallbackState = await ensurePlanFileExistsOrBootstrap({
547
+ absPath: plannedFilePath,
548
+ task: currentTask,
549
+ source: 'discovery',
550
+ });
551
+ if (fallbackState === 'bootstrapped') {
552
+ progress.onStage?.(`⚠️ Discovery returned invalid output (${short}). Bootstrapped fallback plan and continuing: ${plannedFilePath}`);
553
+ }
554
+ else {
555
+ progress.onStage?.(`⚠️ Discovery returned invalid output (${short}). Reusing existing plan and continuing: ${plannedFilePath}`);
556
+ }
557
+ taskPlanByTaskKey.set(currentTask.key, plannedFilePath);
558
+ discoveryOk = true;
559
+ break;
560
+ }
561
+ if (discoveryTry < preflightMaxRetries) {
562
+ // Close session on error so retry gets fresh state
563
+ await closePreflightSession();
424
564
  if (/max iterations exceeded/i.test(errMsg)) {
425
- const nextCap = Math.min(Math.max(reviewIterationCap * 2, reviewIterationCap + 2), 1000);
426
- if (nextCap > reviewIterationCap) {
427
- progress.onStage?.(`⚠️ Requirements review hit max iterations (${reviewIterationCap}). Increasing preflight cap to ${nextCap} and retrying...`);
428
- reviewIterationCap = nextCap;
565
+ const nextCap = Math.min(Math.max(discoveryIterationCap * 2, discoveryIterationCap + 2), 1000);
566
+ if (nextCap > discoveryIterationCap) {
567
+ progress.onStage?.(`⚠️ Discovery hit max iterations (${discoveryIterationCap}). Increasing preflight cap to ${nextCap} and retrying...`);
568
+ discoveryIterationCap = nextCap;
429
569
  continue;
430
570
  }
431
571
  }
432
- progress.onStage?.(`⚠️ Requirements review failed (${reviewTry + 1}/${preflightTotalTries}): ${short}. Retrying review with existing plan file...`);
572
+ progress.onStage?.(`⚠️ Discovery failed (${discoveryTry + 1}/${preflightTotalTries}): ${short}. Retrying discovery...`);
433
573
  continue;
434
574
  }
435
- const preflightAttempt = {
436
- taskKey: currentTask.key,
437
- taskText: currentTask.text,
438
- attempt: attemptNumber,
439
- durationMs: Date.now() - stageStart,
440
- tokensUsed: 0,
441
- status: timeout ? 'timeout' : 'error',
442
- verification: undefined,
443
- error: `preflight-error(requirements-review): ${errMsg}`,
444
- commitHash: undefined,
445
- };
446
- attempts.push(preflightAttempt);
447
- taskRetryCount.set(currentTask.key, retries + 1);
448
- if (!config.skipOnFail)
449
- break mainLoop;
450
- }
451
- finally {
452
- try {
453
- await reviewSession?.close();
575
+ // Final discovery failure: degrade gracefully by bootstrapping a fallback plan file
576
+ // so Anton can still proceed to implementation/review instead of hard-failing task 1.
577
+ const fallbackState = await ensurePlanFileExistsOrBootstrap({
578
+ absPath: plannedFilePath,
579
+ task: currentTask,
580
+ source: 'discovery',
581
+ });
582
+ if (fallbackState === 'bootstrapped') {
583
+ progress.onStage?.(`⚠️ Discovery failed after ${preflightTotalTries} tries (${short}). Bootstrapped fallback plan and continuing: ${plannedFilePath}`);
454
584
  }
455
- catch {
456
- // best effort
585
+ else {
586
+ progress.onStage?.(`⚠️ Discovery failed after ${preflightTotalTries} tries (${short}). Reusing existing plan and continuing: ${plannedFilePath}`);
457
587
  }
588
+ taskPlanByTaskKey.set(currentTask.key, plannedFilePath);
589
+ discoveryOk = true;
590
+ break;
458
591
  }
592
+ // Note: session stays open for reuse in review stage (closed at end of preflight block)
593
+ }
594
+ // Discovery already marked complete -> next task.
595
+ if (preflightMarkedComplete) {
596
+ continue;
459
597
  }
460
- if (!reviewOk) {
598
+ if (!discoveryOk) {
461
599
  continue;
462
600
  }
601
+ // Stage 2: requirements review (retry review only; keep same plan file).
602
+ // NOTE: Discovery prompt now includes review instructions, producing a "reviewed" plan.
603
+ // Separate review stage is skipped by default to save an LLM round-trip.
604
+ // Set preflightRequirementsReview=true AND preflightSeparateReview=true to force separate review.
605
+ const skipSeparateReview = !config.preflightSeparateReview;
606
+ if (config.preflightRequirementsReview && !skipSeparateReview) {
607
+ const reviewPlanFile = taskPlanByTaskKey.get(currentTask.key) ?? plannedFilePath;
608
+ let reviewOk = false;
609
+ // Default to 30 iterations for review (simpler than discovery, just refining existing plan)
610
+ let reviewIterationCap = Math.max(1, Math.floor(config.preflightSessionMaxIterations ?? 30));
611
+ for (let reviewTry = 0; reviewTry <= preflightMaxRetries; reviewTry++) {
612
+ const stageStart = Date.now();
613
+ const reviewTimeoutSec = config.preflightReviewTimeoutSec ?? config.taskTimeoutSec;
614
+ const reviewTimeoutMs = reviewTimeoutSec * 1000;
615
+ try {
616
+ progress.onStage?.('🧪 Requirements review: refining plan...');
617
+ // Reuse preflight session from discovery, or create new one if needed (e.g., after error)
618
+ if (!preflightSession) {
619
+ preflightSession = await createSessionFn(buildPreflightConfig(idlehandsConfig, config, reviewTimeoutSec, reviewIterationCap), apiKey);
620
+ }
621
+ const reviewPrompt = buildRequirementsReviewPrompt(reviewPlanFile);
622
+ let reviewTimeoutHandle;
623
+ const reviewRes = await Promise.race([
624
+ preflightSession.ask(reviewPrompt).finally(() => clearTimeout(reviewTimeoutHandle)),
625
+ new Promise((_, reject) => {
626
+ reviewTimeoutHandle = setTimeout(() => {
627
+ try {
628
+ preflightSession?.cancel();
629
+ }
630
+ catch {
631
+ // best effort
632
+ }
633
+ reject(new Error('preflight-review-timeout'));
634
+ }, reviewTimeoutMs);
635
+ }),
636
+ ]);
637
+ let reviewTokens = preflightSession.usage.prompt + preflightSession.usage.completion;
638
+ totalTokens += reviewTokens;
639
+ // Try to parse review result; if invalid JSON, attempt force-decision prompt
640
+ let review;
641
+ try {
642
+ review = parseRequirementsReviewResult(reviewRes.text, config.projectDir);
643
+ }
644
+ catch (parseError) {
645
+ const parseErrMsg = parseError instanceof Error ? parseError.message : String(parseError);
646
+ // Only try force-decision for JSON/format errors
647
+ if (/preflight-json-missing-object|preflight-review-invalid/i.test(parseErrMsg)) {
648
+ progress.onStage?.('⚠️ Review output invalid, requesting forced decision...');
649
+ try {
650
+ const forceRes = await preflightSession.ask(FORCE_REVIEW_DECISION_PROMPT);
651
+ const forceTokens = preflightSession.usage.prompt + preflightSession.usage.completion - reviewTokens;
652
+ reviewTokens += forceTokens;
653
+ totalTokens += forceTokens;
654
+ review = parseRequirementsReviewResult(forceRes.text, config.projectDir);
655
+ progress.onStage?.('✅ Forced decision succeeded');
656
+ }
657
+ catch (forceError) {
658
+ // Force-decision also failed, throw original error
659
+ throw parseError;
660
+ }
661
+ }
662
+ else {
663
+ throw parseError;
664
+ }
665
+ }
666
+ const reviewPlanState = await ensurePlanFileExistsOrBootstrap({
667
+ absPath: review.filename,
668
+ task: currentTask,
669
+ source: 'requirements-review',
670
+ });
671
+ if (reviewPlanState === 'bootstrapped') {
672
+ progress.onStage?.(`⚠️ Requirements review returned a filename but did not write it. Created fallback plan file: ${review.filename}`);
673
+ }
674
+ preflightRecords.push({
675
+ taskKey: currentTask.key,
676
+ stage: 'requirements-review',
677
+ durationMs: Date.now() - stageStart,
678
+ tokensUsed: reviewTokens,
679
+ status: 'ready',
680
+ filename: review.filename,
681
+ });
682
+ taskPlanByTaskKey.set(currentTask.key, review.filename);
683
+ progress.onStage?.(`✅ Requirements review ready: ${review.filename}`);
684
+ reviewOk = true;
685
+ break;
686
+ }
687
+ catch (error) {
688
+ const errMsg = error instanceof Error ? error.message : String(error);
689
+ const timeout = /timeout/i.test(errMsg);
690
+ preflightRecords.push({
691
+ taskKey: currentTask.key,
692
+ stage: 'requirements-review',
693
+ durationMs: Date.now() - stageStart,
694
+ tokensUsed: 0,
695
+ status: timeout ? 'timeout' : 'error',
696
+ error: errMsg,
697
+ });
698
+ const short = errMsg.length > 180 ? `${errMsg.slice(0, 177)}...` : errMsg;
699
+ // If review returns malformed/non-JSON output, keep moving with existing plan.
700
+ if (isRecoverablePreflightReviewError(errMsg)) {
701
+ const fallbackState = await ensurePlanFileExistsOrBootstrap({
702
+ absPath: reviewPlanFile,
703
+ task: currentTask,
704
+ source: 'requirements-review',
705
+ });
706
+ if (fallbackState === 'bootstrapped') {
707
+ progress.onStage?.(`⚠️ Requirements review returned invalid output (${short}). Bootstrapped fallback plan and continuing: ${reviewPlanFile}`);
708
+ }
709
+ else {
710
+ progress.onStage?.(`⚠️ Requirements review returned invalid output (${short}). Reusing existing plan and continuing: ${reviewPlanFile}`);
711
+ }
712
+ taskPlanByTaskKey.set(currentTask.key, reviewPlanFile);
713
+ reviewOk = true;
714
+ break;
715
+ }
716
+ if (reviewTry < preflightMaxRetries) {
717
+ // Close session on error so retry gets fresh state
718
+ await closePreflightSession();
719
+ if (/max iterations exceeded/i.test(errMsg)) {
720
+ const nextCap = Math.min(Math.max(reviewIterationCap * 2, reviewIterationCap + 2), 1000);
721
+ if (nextCap > reviewIterationCap) {
722
+ progress.onStage?.(`⚠️ Requirements review hit max iterations (${reviewIterationCap}). Increasing preflight cap to ${nextCap} and retrying...`);
723
+ reviewIterationCap = nextCap;
724
+ continue;
725
+ }
726
+ }
727
+ progress.onStage?.(`⚠️ Requirements review failed (${reviewTry + 1}/${preflightTotalTries}): ${short}. Retrying review with existing plan file...`);
728
+ continue;
729
+ }
730
+ const preflightAttempt = {
731
+ taskKey: currentTask.key,
732
+ taskText: currentTask.text,
733
+ attempt: attemptNumber,
734
+ durationMs: Date.now() - stageStart,
735
+ tokensUsed: 0,
736
+ status: timeout ? 'timeout' : 'error',
737
+ verification: undefined,
738
+ error: `preflight-error(requirements-review): ${errMsg}`,
739
+ commitHash: undefined,
740
+ };
741
+ attempts.push(preflightAttempt);
742
+ taskRetryCount.set(currentTask.key, retries + 1);
743
+ if (!config.skipOnFail)
744
+ break mainLoop;
745
+ }
746
+ // Note: session stays open, will be closed at end of preflight block
747
+ }
748
+ if (!reviewOk) {
749
+ continue;
750
+ }
751
+ }
752
+ }
753
+ finally {
754
+ // Always close preflight session at end of preflight block
755
+ await closePreflightSession();
463
756
  }
464
757
  }
465
758
  progress.onStage?.('🛠️ Implementation: executing vetted plan...');
@@ -476,6 +769,7 @@ export async function runAnton(opts) {
476
769
  : buildSessionConfig(idlehandsConfig, config);
477
770
  console.error(`[anton:debug] task="${currentTask.text}" depth=${currentTask.depth} complexity=${taskComplexity} isComplexDecompose=${isComplexDecompose} no_tools=${!!sessionConfig.no_tools} max_iterations=${sessionConfig.max_iterations}`);
478
771
  session = await createSessionFn(sessionConfig, apiKey);
772
+ injectAntonResultContract(session);
479
773
  // Set up timeout + stop propagation for the currently running attempt.
480
774
  // /anton stop flips abortSignal.aborted; we poll that and cancel session.ask immediately
481
775
  // instead of waiting for the task attempt to naturally finish.
@@ -650,18 +944,41 @@ export async function runAnton(opts) {
650
944
  }
651
945
  const taskEndMs = Date.now();
652
946
  const durationMs = taskEndMs - taskStartMs;
653
- const tokensUsed = session.usage.prompt + session.usage.completion;
947
+ let tokensUsed = session.usage.prompt + session.usage.completion;
948
+ // Parse structured result (with one-shot recovery for format-only failures).
949
+ let agentResult = parseAntonResult(result.text);
950
+ if (agentResult.status === 'blocked' &&
951
+ isStructuredResultParseFailure(agentResult.reason) &&
952
+ !abortSignal.aborted &&
953
+ !controller.signal.aborted) {
954
+ try {
955
+ progress.onStage?.('⚠️ Agent omitted structured result. Requesting format-only recovery...');
956
+ const repaired = await session.ask(STRUCTURED_RESULT_RECOVERY_PROMPT);
957
+ iterationsUsed += repaired.turns;
958
+ agentResult = parseAntonResult(repaired.text);
959
+ tokensUsed = session.usage.prompt + session.usage.completion;
960
+ }
961
+ catch (repairErr) {
962
+ console.error(`[anton:result-recovery] failed: ${repairErr}`);
963
+ }
964
+ }
965
+ // If result is still parse-broken, treat as failed (retriable) instead of blocked (terminal).
966
+ if (agentResult.status === 'blocked' && isStructuredResultParseFailure(agentResult.reason)) {
967
+ agentResult = {
968
+ status: 'failed',
969
+ reason: `structured-result-parse-failure: ${agentResult.reason}`,
970
+ subtasks: [],
971
+ };
972
+ }
654
973
  // Per-attempt token cost guardrail (not just prompt size).
655
974
  if (tokensUsed > config.maxPromptTokensPerAttempt) {
656
975
  throw new Error(`attempt-token-budget-exceeded: used=${tokensUsed} max=${config.maxPromptTokensPerAttempt}`);
657
976
  }
658
- // Parse structured result
659
- const agentResult = parseAntonResult(result.text);
660
977
  console.error(`[anton:result] task="${currentTask.text.slice(0, 50)}" status=${agentResult.status} reason=${agentResult.reason ?? 'none'} subtasks=${agentResult.subtasks.length} tokens=${tokensUsed} duration=${Math.round(durationMs / 1000)}s`);
661
978
  if (isComplexDecompose) {
662
979
  console.error(`[anton:debug] decompose result: status=${agentResult.status} subtasks=${agentResult.subtasks.length} reason=${agentResult.reason ?? 'none'}`);
663
- if (agentResult.status === 'blocked' &&
664
- agentResult.reason === 'Agent did not emit structured result') {
980
+ if (agentResult.status === 'failed' &&
981
+ (agentResult.reason ?? '').startsWith('structured-result-parse-failure')) {
665
982
  console.error(`[anton:debug] decompose raw output (first 500 chars): ${(result.text ?? '').slice(0, 500)}`);
666
983
  }
667
984
  }
@@ -874,6 +1191,15 @@ export async function runAnton(opts) {
874
1191
  consecutiveIdenticalCount.set(currentTask.key, 1);
875
1192
  }
876
1193
  lastFailureReason.set(currentTask.key, currentReason);
1194
+ // Track L2-specific failures for enhanced retry context
1195
+ if (attempt.verification?.l2_ai === false) {
1196
+ l2FailCount.set(currentTask.key, (l2FailCount.get(currentTask.key) || 0) + 1);
1197
+ console.error(`[anton:l2-fail] task="${currentTask.text.slice(0, 40)}" l2_fail_count=${l2FailCount.get(currentTask.key)}`);
1198
+ }
1199
+ }
1200
+ else {
1201
+ // Task passed — reset L2 fail count
1202
+ l2FailCount.delete(currentTask.key);
877
1203
  }
878
1204
  // Report task end
879
1205
  progress.onTaskEnd(currentTask, attempt, currentProgress);