input-kanban 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,10 @@ const runner = defaultRunner;
17
17
  const VALID_SANDBOXES = new Set(['read-only', 'workspace-write', 'danger-full-access']);
18
18
  const MISSING_RUNNER_GRACE_MS = 10000;
19
19
  const MAX_DERIVED_LABEL_DISPLAY_WIDTH = 40;
20
/** File name of the advisory lock file created inside a run directory. */
const RUN_STATE_LOCK_NAME = 'run_state.lock';

/** Default minimum age (ms) a lock file must reach before it can be considered stale. */
const RUN_STATE_LOCK_STALE_MS = 30 * 1000;

/** Default time (ms) a caller waits for the run-state lock before giving up. */
const RUN_STATE_LOCK_TIMEOUT_MS = 30 * 1000;

/**
 * Resolves (with no value) after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms);
  });
}
20
24
 
21
25
  function normalizeSandbox(value, fallback = 'workspace-write') {
22
26
  const sandbox = String(value || '').trim();
@@ -26,6 +30,65 @@ function normalizeSandbox(value, fallback = 'workspace-write') {
26
30
 
27
31
  function statePath(runDir) { return path.join(runDir, 'run_state.json'); }
28
32
  function planPath(runDir) { return path.join(runDir, 'plan.json'); }
33
/**
 * Builds the path of the run-state lock file for a run directory.
 *
 * @param {string} runDir - Directory that holds the run's files.
 * @returns {string} `runDir` joined with the lock file name.
 */
function lockPath(runDir) {
  const lockFile = path.join(runDir, RUN_STATE_LOCK_NAME);
  return lockFile;
}
34
+
35
/**
 * Decides whether an existing lock file may be taken over.
 *
 * A lock counts as stale when its modification time cannot be parsed, or when
 * it is at least `staleMs` old AND the pid recorded in it is missing, invalid,
 * or no longer alive.
 *
 * @param {string} lockFile - Path of the lock file to inspect.
 * @param {number} [staleMs=RUN_STATE_LOCK_STALE_MS] - Minimum lock age in ms
 *   before the recorded pid is even consulted.
 * @returns {Promise<boolean>} `true` when the lock can be removed.
 */
async function isStaleRunLock(lockFile, staleMs = RUN_STATE_LOCK_STALE_MS) {
  const info = await fileInfo(lockFile);
  // The lock disappeared between the failed open and this check: nothing to
  // break, the caller just tries to create it again.
  if (!info.exists) return false;
  const modifiedAt = Date.parse(info.mtime || '');
  if (!Number.isFinite(modifiedAt)) return true;
  // A young lock is never stolen, whatever its content says.
  if (Date.now() - modifiedAt < staleMs) return false;
  const lockData = await readJson(lockFile, null);
  const pid = Number(lockData?.pid);
  if (!Number.isFinite(pid) || pid <= 0) return true;
  return !isPidAlive(pid);
}

/**
 * Acquires the per-run state lock by exclusively creating the lock file
 * (`'wx'`) in the run directory and writing `{ runId, pid, createdAt }` to it.
 *
 * While another holder owns the lock the call polls with a growing back-off
 * (50 ms, x1.5 per round, capped at 1000 ms). A lock that `isStaleRunLock`
 * reports as stale is removed and acquisition is retried immediately.
 *
 * @param {string} runId - Run whose state is being protected.
 * @param {object} [options]
 * @param {number} [options.timeoutMs=RUN_STATE_LOCK_TIMEOUT_MS] - Maximum time
 *   to wait for a busy lock.
 * @param {number} [options.staleMs=RUN_STATE_LOCK_STALE_MS] - Lock age after
 *   which a lock whose owner is gone may be taken over.
 * @returns {Promise<() => Promise<void>>} Idempotent release function that
 *   closes the handle and removes the lock file.
 * @throws {Error} `run state lock busy: <runId>` once `timeoutMs` has elapsed;
 *   any non-`EEXIST` error from creating or writing the lock file.
 */
export async function acquireRunStateLock(runId, { timeoutMs = RUN_STATE_LOCK_TIMEOUT_MS, staleMs = RUN_STATE_LOCK_STALE_MS } = {}) {
  const runDir = pathForRun(runId);
  await ensureDir(runDir);
  const lockFile = lockPath(runDir);
  const startedAt = Date.now();
  let waitMs = 50;
  while (true) {
    let handle;
    try {
      handle = await fsp.open(lockFile, 'wx');
    } catch (error) {
      if (error?.code !== 'EEXIST') throw error;
      // Staleness is a property of the lock (its age and its owner), not of
      // how long this caller has been waiting, so a dead holder's lock is
      // reclaimed as soon as it qualifies.
      if (await isStaleRunLock(lockFile, staleMs)) {
        const removed = await fsp.unlink(lockFile).then(() => true, unlinkError => unlinkError?.code === 'ENOENT');
        if (removed) continue;
        // Could not remove the stale lock: fall through to the normal
        // timeout/back-off path instead of spinning on open().
      }
      const remainingMs = timeoutMs - (Date.now() - startedAt);
      if (remainingMs <= 0) throw new Error(`run state lock busy: ${runId}`);
      // Never sleep past the deadline.
      await sleep(Math.min(waitMs, remainingMs));
      waitMs = Math.min(waitMs * 1.5, 1000);
      continue;
    }
    try {
      await handle.writeFile(JSON.stringify({ runId, pid: process.pid, createdAt: nowIso() }, null, 2));
      // Flushing is best-effort; a failed sync does not invalidate the lock.
      await handle.sync().catch(() => {});
    } catch (error) {
      // Do not leave a half-written lock behind.
      await handle.close().catch(() => {});
      await fsp.unlink(lockFile).catch(() => {});
      throw error;
    }
    let released = false;
    return async () => {
      if (released) return;
      released = true;
      try { await handle.close(); } catch {}
      await fsp.unlink(lockFile).catch(() => {});
    };
  }
}
83
+
84
/**
 * Runs `fn` while holding the run-state lock for `runId`.
 *
 * The lock is released whether `fn` resolves or rejects.
 *
 * @param {string} runId - Run to lock.
 * @param {() => any} fn - Work to perform under the lock; its result is awaited.
 * @param {object} [options] - Passed through to `acquireRunStateLock`.
 * @returns {Promise<any>} Whatever `fn` resolves to.
 */
async function withRunStateLock(runId, fn, options = {}) {
  const unlock = await acquireRunStateLock(runId, options);
  let outcome;
  try {
    outcome = await fn();
  } finally {
    await unlock();
  }
  return outcome;
}
29
92
 
30
93
  function shouldMarkRunnerUnknown(target) {
31
94
  const missingSince = Date.parse(target.missingRunnerAt || '');
@@ -238,38 +301,42 @@ Plan: ${path.join(pathForRun(state.runId), 'plan.json')}
238
301
  }
239
302
 
240
303
  export async function startPlanner(runId) {
241
- const state = await loadRun(runId);
242
- if (!state) throw new Error(`run not found: ${runId}`);
243
- if (state.archived) throw new Error('archived run cannot be planned');
244
- if (state.status === 'stopped') throw new Error('stopped run cannot be planned; create a new run after modifications');
245
- if (state.planner.status === 'running') throw new Error('planner already running');
246
- if (hasStartedExecution(state)) throw new Error('planner retry is allowed only before any worker/judge starts');
247
- const runDir = pathForRun(runId);
248
- const previousPlanner = state.planner;
249
- if (previousPlanner?.status && previousPlanner.status !== 'pending') await rotatePlannerAttempt(state, runDir);
250
- state.batches = [];
251
- state.tasks = [];
252
- state.judge = { status: 'pending' };
253
- const outDir = roleDir(runDir, 'planner');
254
- await ensureDir(outDir);
255
- await fsp.rm(planPath(runDir), { force: true });
256
- const taskText = await fsp.readFile(path.join(runDir, 'task.md'), 'utf8');
257
- const prompt = defaultPlannerPrompt(state, taskText);
258
- const child = await runner.startCodexTask({ runId: state.runId, taskId: 'planner', batchId: 'planner', runStatePath: statePath(runDir), prompt, sandbox: 'read-only', cwd: state.repo, outDir });
259
- state.status = 'planning';
260
- state.planner = { status: 'running', pid: child.pid, startedAt: nowIso(), dir: outDir, attempt: (state.plannerAttempts?.length || 0) + 1 };
261
- await saveRun(state);
262
- child.onExit(async code => {
263
- const s = await loadRun(runId); if (!s || s.status === 'stopped') return;
264
- s.planner.exitCode = code; s.planner.endedAt = nowIso(); s.planner.status = code === 0 ? 'completed' : 'failed';
265
- const planResult = await materializePlan(s);
266
- if (s.planner.status !== 'completed') s.status = 'plan_failed';
267
- else if (planResult.ok) s.status = 'planned';
268
- else if (planResult.empty) s.status = 'plan_empty';
269
- else s.status = 'plan_failed';
270
- await saveRun(s);
304
+ return await withRunStateLock(runId, async () => {
305
+ const state = await loadRun(runId);
306
+ if (!state) throw new Error(`run not found: ${runId}`);
307
+ if (state.archived) throw new Error('archived run cannot be planned');
308
+ if (state.status === 'stopped') throw new Error('stopped run cannot be planned; create a new run after modifications');
309
+ if (state.planner.status === 'running') throw new Error('planner already running');
310
+ if (hasStartedExecution(state)) throw new Error('planner retry is allowed only before any worker/judge starts');
311
+ const runDir = pathForRun(runId);
312
+ const previousPlanner = state.planner;
313
+ if (previousPlanner?.status && previousPlanner.status !== 'pending') await rotatePlannerAttempt(state, runDir);
314
+ state.batches = [];
315
+ state.tasks = [];
316
+ state.judge = { status: 'pending' };
317
+ const outDir = roleDir(runDir, 'planner');
318
+ await ensureDir(outDir);
319
+ await fsp.rm(planPath(runDir), { force: true });
320
+ const taskText = await fsp.readFile(path.join(runDir, 'task.md'), 'utf8');
321
+ const prompt = defaultPlannerPrompt(state, taskText);
322
+ const child = await runner.startCodexTask({ runId: state.runId, taskId: 'planner', batchId: 'planner', runStatePath: statePath(runDir), prompt, sandbox: 'read-only', cwd: state.repo, outDir });
323
+ state.status = 'planning';
324
+ state.planner = { status: 'running', pid: child.pid, startedAt: nowIso(), dir: outDir, attempt: (state.plannerAttempts?.length || 0) + 1 };
325
+ await saveRun(state);
326
+ child.onExit(async code => {
327
+ await withRunStateLock(runId, async () => {
328
+ const s = await loadRun(runId); if (!s || s.status === 'stopped') return;
329
+ s.planner.exitCode = code; s.planner.endedAt = nowIso(); s.planner.status = code === 0 ? 'completed' : 'failed';
330
+ const planResult = await materializePlan(s);
331
+ if (s.planner.status !== 'completed') s.status = 'plan_failed';
332
+ else if (planResult.ok) s.status = 'planned';
333
+ else if (planResult.empty) s.status = 'plan_empty';
334
+ else s.status = 'plan_failed';
335
+ await saveRun(s);
336
+ });
337
+ });
338
+ return state;
271
339
  });
272
- return state;
273
340
  }
274
341
 
275
342
  function normalizeExpectedArtifacts(value, runId, taskId) {
@@ -322,6 +389,43 @@ async function rotatePlannerAttempt(state, runDir) {
322
389
  }];
323
390
  }
324
391
 
392
/**
 * Archives a task's current worker directory (if any) and resets the task so
 * it can be scheduled again.
 *
 * The worker directory is moved to
 * `<runDir>/worker_attempts/<taskId>/attempt-NN`, replacing any directory
 * already at that path. A history entry is appended to `task.retryHistory`,
 * `task.retryCount` is incremented, per-attempt fields are removed and the
 * status becomes `'pending'`.
 *
 * The task is reset even when there is no worker directory to archive.
 * Otherwise a task that failed before producing output would stay failed with
 * an unchanged `retryCount` although the caller treats it as retried.
 *
 * @param {object} state - Run state; only `state.runId` is read here.
 * @param {object} task - Task record, mutated in place.
 * @returns {Promise<string|null>} Archive directory, or `null` when there was
 *   nothing on disk to archive.
 */
async function rotateWorkerAttempt(state, task) {
  const runDir = pathForRun(state.runId);
  const workerDir = roleDir(runDir, 'worker', task.id);
  const attempt = Number(task.retryCount || 0) + 1;

  let archivedDir = null;
  if (fs.existsSync(workerDir)) {
    const attemptsDir = path.join(runDir, 'worker_attempts', task.id);
    await ensureDir(attemptsDir);
    archivedDir = path.join(attemptsDir, `attempt-${String(attempt).padStart(2, '0')}`);
    // rename() needs a free destination; drop leftovers of an earlier archive.
    await fsp.rm(archivedDir, { recursive: true, force: true });
    await fsp.rename(workerDir, archivedDir);
  }

  // Snapshot the finished attempt before its fields are cleared below.
  task.retryHistory = [...(task.retryHistory || []), {
    attempt,
    status: task.status,
    exitCode: task.exitCode ?? null,
    startedAt: task.startedAt,
    endedAt: task.endedAt,
    archivedDir,
    archivedAt: nowIso(),
    reason: task.retryReason || null
  }];
  task.retryCount = attempt;
  task.retryReason = null;

  const perAttemptFields = [
    'pid',
    'exitCode',
    'startedAt',
    'endedAt',
    'stoppedAt',
    'missingRunnerAt',
    'manualCompletion',
    'originalStatus',
    'originalExitCode',
    'error',
    'tmux'
  ];
  for (const field of perAttemptFields) delete task[field];

  task.status = 'pending';
  return archivedDir;
}
428
+
325
429
  function normalizePlan(plan, defaultMaxParallel, defaultSandbox = 'workspace-write', runId = '') {
326
430
  if (Array.isArray(plan.batches)) {
327
431
  const batches = plan.batches.map((b, bi) => {
@@ -378,18 +482,20 @@ async function materializePlan(state) {
378
482
  }
379
483
 
380
484
  export async function dispatchRun(runId) {
381
- const state = await loadRun(runId);
382
- if (!state) throw new Error(`run not found: ${runId}`);
383
- if (state.archived) throw new Error('archived run cannot be dispatched');
384
- if (state.status === 'stopped') throw new Error('stopped run cannot be dispatched; create a new run after modifications');
385
- if (!state.tasks?.length) throw new Error('no tasks in plan');
386
- if (state.status === 'batch_blocked') throw new Error('current batch is blocked by failed/unknown tasks');
387
- if (allBatchesCompleted(state)) throw new Error('all batches completed; run final judge next');
388
- state.status = 'running';
389
- await scheduleMoreWorkers(state);
390
- recomputeRunStatus(state);
391
- await saveRun(state);
392
- return state;
485
+ return await withRunStateLock(runId, async () => {
486
+ const state = await loadRun(runId);
487
+ if (!state) throw new Error(`run not found: ${runId}`);
488
+ if (state.archived) throw new Error('archived run cannot be dispatched');
489
+ if (state.status === 'stopped') throw new Error('stopped run cannot be dispatched; create a new run after modifications');
490
+ if (!state.tasks?.length) throw new Error('no tasks in plan');
491
+ if (state.status === 'batch_blocked') throw new Error('current batch is blocked by failed/unknown tasks');
492
+ if (allBatchesCompleted(state)) throw new Error('all batches completed; run final judge next');
493
+ state.status = 'running';
494
+ await scheduleMoreWorkers(state);
495
+ recomputeRunStatus(state);
496
+ await saveRun(state);
497
+ return state;
498
+ });
393
499
  }
394
500
 
395
501
  function artifactPathForState(state, rel) {
@@ -428,139 +534,177 @@ ${task.prompt}${workerArtifactInstructions(state, task)}${upstreamArtifactInstru
428
534
  }
429
535
 
430
536
  export async function stopRun(runId, { reason = 'stopped by user' } = {}) {
431
- const state = await loadRun(runId);
432
- if (!state) throw new Error(`run not found: ${runId}`);
433
- const stoppedAt = nowIso();
434
- await runner.stopRun(runId);
435
- const stoppedPids = new Set();
436
- const stopTargetPid = target => {
437
- const pid = Number(target?.pid);
438
- if (Number.isFinite(pid) && pid > 0 && !stoppedPids.has(pid)) {
439
- stoppedPids.add(pid);
440
- stopPid(pid);
537
+ return await withRunStateLock(runId, async () => {
538
+ const state = await loadRun(runId);
539
+ if (!state) throw new Error(`run not found: ${runId}`);
540
+ const stoppedAt = nowIso();
541
+ await runner.stopRun(runId);
542
+ const stoppedPids = new Set();
543
+ const stopTargetPid = target => {
544
+ const pid = Number(target?.pid);
545
+ if (Number.isFinite(pid) && pid > 0 && !stoppedPids.has(pid)) {
546
+ stoppedPids.add(pid);
547
+ stopPid(pid);
548
+ }
549
+ };
550
+ for (const roleState of [state.planner, state.judge]) {
551
+ if (roleState?.status === 'running') {
552
+ stopTargetPid(roleState);
553
+ Object.assign(roleState, { status: 'stopped', stoppedAt, endedAt: stoppedAt });
554
+ }
441
555
  }
442
- };
443
- for (const roleState of [state.planner, state.judge]) {
444
- if (roleState?.status === 'running') {
445
- stopTargetPid(roleState);
446
- Object.assign(roleState, { status: 'stopped', stoppedAt, endedAt: stoppedAt });
556
+ for (const task of state.tasks || []) {
557
+ if (task.status === 'running') {
558
+ stopTargetPid(task);
559
+ Object.assign(task, { status: 'stopped', stoppedAt, endedAt: stoppedAt });
560
+ }
447
561
  }
448
- }
449
- for (const task of state.tasks || []) {
450
- if (task.status === 'running') {
451
- stopTargetPid(task);
452
- Object.assign(task, { status: 'stopped', stoppedAt, endedAt: stoppedAt });
562
+ for (const batch of state.batches || []) {
563
+ for (const task of batch.tasks || []) if (task.status === 'running') stopTargetPid(task);
453
564
  }
454
- }
455
- for (const batch of state.batches || []) {
456
- for (const task of batch.tasks || []) if (task.status === 'running') stopTargetPid(task);
457
- }
458
- for (const batch of state.batches || []) {
459
- if ((batch.tasks || []).some(t => t.status === 'stopped')) batch.status = 'stopped';
460
- }
461
- state.status = 'stopped';
462
- state.stopInfo = { reason, stoppedAt };
463
- await saveRun(state);
464
- return state;
565
+ for (const batch of state.batches || []) {
566
+ if ((batch.tasks || []).some(t => t.status === 'stopped')) batch.status = 'stopped';
567
+ }
568
+ state.status = 'stopped';
569
+ state.stopInfo = { reason, stoppedAt };
570
+ await saveRun(state);
571
+ return state;
572
+ });
465
573
  }
466
574
 
467
575
  export async function archiveRun(runId, { reason = 'archived by user' } = {}) {
468
- const state = await loadRun(runId);
469
- if (!state) throw new Error(`run not found: ${runId}`);
470
- if ((state.tasks || []).some(t => t.status === 'running') || state.planner?.status === 'running' || state.judge?.status === 'running') {
471
- throw new Error('cannot archive a run while tasks are running; stop it first');
472
- }
473
- state.archived = true;
474
- state.archivedAt = nowIso();
475
- state.archiveInfo = { reason, archivedAt: state.archivedAt };
476
- await saveRun(state);
477
- return state;
576
+ return await withRunStateLock(runId, async () => {
577
+ const state = await loadRun(runId);
578
+ if (!state) throw new Error(`run not found: ${runId}`);
579
+ if ((state.tasks || []).some(t => t.status === 'running') || state.planner?.status === 'running' || state.judge?.status === 'running') {
580
+ throw new Error('cannot archive a run while tasks are running; stop it first');
581
+ }
582
+ state.archived = true;
583
+ state.archivedAt = nowIso();
584
+ state.archiveInfo = { reason, archivedAt: state.archivedAt };
585
+ await saveRun(state);
586
+ return state;
587
+ });
478
588
  }
479
589
 
480
590
  export async function renameRun(runId, { label = '' } = {}) {
481
- const state = await loadRun(runId);
482
- if (!state) throw new Error(`run not found: ${runId}`);
483
- const nextLabel = String(label || '').trim();
484
- if (!nextLabel) throw userInputError('run label cannot be empty');
485
- state.label = nextLabel;
486
- state.renamedAt = nowIso();
487
- await saveRun(state);
488
- return state;
591
+ return await withRunStateLock(runId, async () => {
592
+ const state = await loadRun(runId);
593
+ if (!state) throw new Error(`run not found: ${runId}`);
594
+ const nextLabel = String(label || '').trim();
595
+ if (!nextLabel) throw userInputError('run label cannot be empty');
596
+ state.label = nextLabel;
597
+ state.renamedAt = nowIso();
598
+ await saveRun(state);
599
+ return state;
600
+ });
601
+ }
602
+
603
/**
 * Re-queues failed/unknown worker tasks of a run while holding the run-state
 * lock, then persists the run.
 *
 * With `taskId` set, only that task is retried and it must currently be
 * `failed` or `unknown`. Without it, every `failed`/`unknown` task of the
 * current blocked batch is selected; when `auto` is true the selection is
 * further limited to tasks accepted by `canAutoRetryTask`.
 *
 * @param {string} runId - Run to operate on.
 * @param {object} [options]
 * @param {string} [options.taskId=''] - Single task to retry; blank selects the blocked batch.
 * @param {string} [options.reason='manual retry'] - Forwarded to `retryTasksInState`.
 * @param {number} [options.maxRetries=1] - Retry limit used for automatic retries.
 * @param {boolean} [options.auto=false] - Whether this is an automatic retry.
 * @returns {Promise<object>} Shallow copy of the saved state plus `retriedTaskIds`.
 * @throws {Error} When the run is missing, archived or stopped, the task is
 *   unknown or not retryable, no blocked batch exists, or nothing was retried.
 */
export async function retryRun(runId, { taskId = '', reason = 'manual retry', maxRetries = 1, auto = false } = {}) {
  const isRetryable = item => ['failed', 'unknown'].includes(item.status);

  return await withRunStateLock(runId, async () => {
    const state = await loadRun(runId);
    if (!state) throw new Error(`run not found: ${runId}`);
    if (state.archived) throw new Error('archived run cannot be retried');
    if (state.status === 'stopped') throw new Error('stopped run cannot be retried');

    const wantedId = String(taskId || '').trim();
    let taskIds;
    if (wantedId) {
      const match = (state.tasks || []).find(item => item.id === wantedId);
      if (!match) throw new Error(`task not found: ${wantedId}`);
      if (!isRetryable(match)) throw new Error(`task is not retryable: ${wantedId}`);
      taskIds = [match.id];
    } else {
      const blocked = currentBlockedBatch(state);
      if (!blocked) throw new Error('no blocked batch to retry');
      taskIds = [];
      for (const item of blocked.tasks || []) {
        if (!isRetryable(item)) continue;
        if (auto && !canAutoRetryTask(item, maxRetries)) continue;
        taskIds.push(item.id);
      }
      if (!taskIds.length) throw new Error('no retryable tasks in blocked batch');
    }

    const { retried } = await retryTasksInState(state, taskIds, { auto, maxRetries, reason });
    if (!retried.length) throw new Error('no tasks were retried');
    await saveRun(state);
    return { ...state, retriedTaskIds: retried };
  });
}
490
628
 
491
629
  export async function markTaskCompleted(runId, taskId, { reason = 'manual success confirmed by user', resultText = '' } = {}) {
492
- const state = await loadRun(runId);
493
- if (!state) throw new Error(`run not found: ${runId}`);
494
- const task = (state.tasks || []).find(t => t.id === taskId);
495
- if (!task) throw new Error(`task not found: ${taskId}`);
496
- if (task.status === 'running') throw new Error('cannot mark a running task completed');
497
- const runDir = pathForRun(runId);
498
- const outDir = roleDir(runDir, 'worker', task.id);
499
- await ensureDir(outDir);
500
- if (task.status !== 'completed') {
501
- const manualResult = String(resultText || '').trim();
502
- if (manualResult) await fsp.writeFile(path.join(outDir, 'manual_result.md'), manualResult);
503
- const override = {
504
- type: 'manual_task_completed',
505
- runId,
506
- taskId,
507
- originalStatus: task.originalStatus || task.status,
508
- originalExitCode: task.originalExitCode ?? task.exitCode ?? null,
509
- previousStatus: task.status,
510
- previousExitCode: task.exitCode ?? null,
511
- reason,
512
- hasManualResult: !!manualResult,
513
- manualResultFile: manualResult ? 'manual_result.md' : null,
514
- manualResultPreview: manualResult ? manualResult.slice(0, 500) : '',
515
- markedAt: nowIso()
516
- };
517
- await writeJsonAtomic(path.join(outDir, 'manual_completion.json'), override);
518
- Object.assign(task, {
519
- status: 'completed',
520
- originalStatus: override.originalStatus,
521
- originalExitCode: override.originalExitCode,
522
- manualCompletion: override,
523
- completedAt: override.markedAt
524
- });
525
- const batch = (state.batches || []).find(b => b.id === task.batchId);
526
- if (batch) {
527
- const batchTask = batch.tasks.find(t => t.id === task.id);
528
- if (batchTask && batchTask !== task) Object.assign(batchTask, task);
630
+ return await withRunStateLock(runId, async () => {
631
+ const state = await loadRun(runId);
632
+ if (!state) throw new Error(`run not found: ${runId}`);
633
+ const task = (state.tasks || []).find(t => t.id === taskId);
634
+ if (!task) throw new Error(`task not found: ${taskId}`);
635
+ if (task.status === 'running') throw new Error('cannot mark a running task completed');
636
+ const runDir = pathForRun(runId);
637
+ const outDir = roleDir(runDir, 'worker', task.id);
638
+ await ensureDir(outDir);
639
+ if (task.status !== 'completed') {
640
+ const manualResult = String(resultText || '').trim();
641
+ if (manualResult) await fsp.writeFile(path.join(outDir, 'manual_result.md'), manualResult);
642
+ const override = {
643
+ type: 'manual_task_completed',
644
+ runId,
645
+ taskId,
646
+ originalStatus: task.originalStatus || task.status,
647
+ originalExitCode: task.originalExitCode ?? task.exitCode ?? null,
648
+ previousStatus: task.status,
649
+ previousExitCode: task.exitCode ?? null,
650
+ reason,
651
+ hasManualResult: !!manualResult,
652
+ manualResultFile: manualResult ? 'manual_result.md' : null,
653
+ manualResultPreview: manualResult ? manualResult.slice(0, 500) : '',
654
+ markedAt: nowIso()
655
+ };
656
+ await writeJsonAtomic(path.join(outDir, 'manual_completion.json'), override);
657
+ Object.assign(task, {
658
+ status: 'completed',
659
+ originalStatus: override.originalStatus,
660
+ originalExitCode: override.originalExitCode,
661
+ manualCompletion: override,
662
+ completedAt: override.markedAt
663
+ });
664
+ const batch = (state.batches || []).find(b => b.id === task.batchId);
665
+ if (batch) {
666
+ const batchTask = batch.tasks.find(t => t.id === task.id);
667
+ if (batchTask && batchTask !== task) Object.assign(batchTask, task);
668
+ }
529
669
  }
530
- }
531
- recomputeRunStatus(state);
532
- if (hasPendingRunnableBatch(state)) state.status = 'running';
533
- await scheduleMoreWorkers(state);
534
- recomputeRunStatus(state);
535
- await saveRun(state);
536
- return state;
670
+ recomputeRunStatus(state);
671
+ if (hasPendingRunnableBatch(state)) state.status = 'running';
672
+ await scheduleMoreWorkers(state);
673
+ recomputeRunStatus(state);
674
+ await saveRun(state);
675
+ return state;
676
+ });
537
677
  }
538
678
 
539
679
  export async function startJudge(runId) {
540
- const state = await loadRun(runId);
541
- if (!state) throw new Error(`run not found: ${runId}`);
542
- recomputeRunStatus(state);
543
- if (!allBatchesCompleted(state) && state.tasks?.length) throw new Error('final judge is allowed only after all batches completed');
544
- const outDir = roleDir(pathForRun(runId), 'judge');
545
- await ensureDir(outDir);
546
- const judgeInputPath = path.join(outDir, 'judge_input.json');
547
- const judgeInput = await buildJudgeInput(state);
548
- await writeJsonAtomic(judgeInputPath, judgeInput);
549
- const prompt = defaultJudgePrompt(state, judgeInputPath);
550
- const child = await runner.startCodexTask({ runId: state.runId, taskId: 'judge', batchId: 'judge', runStatePath: statePath(pathForRun(runId)), prompt, sandbox: 'read-only', cwd: state.repo, outDir });
551
- state.judge = { status: 'running', pid: child.pid, startedAt: nowIso(), dir: outDir };
552
- state.status = 'judging';
553
- await saveRun(state);
554
- child.onExit(async code => {
555
- const s = await loadRun(runId); if (!s || s.status === 'stopped') return;
556
- s.judge.exitCode = code; s.judge.endedAt = nowIso(); s.judge.status = code === 0 ? 'completed' : 'failed';
557
- const text = await readTextMaybe(path.join(outDir, 'last_message.md'), 1000000);
558
- const verdict = extractFirstJsonObject(text);
559
- if (verdict) { s.judge.verdict = verdict; await writeJsonAtomic(path.join(outDir, 'verdict.json'), verdict); }
560
- s.status = s.judge.status === 'completed' ? 'judged' : 'judge_failed';
561
- await saveRun(s);
680
+ return await withRunStateLock(runId, async () => {
681
+ const state = await loadRun(runId);
682
+ if (!state) throw new Error(`run not found: ${runId}`);
683
+ recomputeRunStatus(state);
684
+ if (!allBatchesCompleted(state) && state.tasks?.length) throw new Error('final judge is allowed only after all batches completed');
685
+ const outDir = roleDir(pathForRun(runId), 'judge');
686
+ await ensureDir(outDir);
687
+ const judgeInputPath = path.join(outDir, 'judge_input.json');
688
+ const judgeInput = await buildJudgeInput(state);
689
+ await writeJsonAtomic(judgeInputPath, judgeInput);
690
+ const prompt = defaultJudgePrompt(state, judgeInputPath);
691
+ const child = await runner.startCodexTask({ runId: state.runId, taskId: 'judge', batchId: 'judge', runStatePath: statePath(pathForRun(runId)), prompt, sandbox: 'read-only', cwd: state.repo, outDir });
692
+ state.judge = { status: 'running', pid: child.pid, startedAt: nowIso(), dir: outDir };
693
+ state.status = 'judging';
694
+ await saveRun(state);
695
+ child.onExit(async code => {
696
+ await withRunStateLock(runId, async () => {
697
+ const s = await loadRun(runId); if (!s || s.status === 'stopped') return;
698
+ s.judge.exitCode = code; s.judge.endedAt = nowIso(); s.judge.status = code === 0 ? 'completed' : 'failed';
699
+ const text = await readTextMaybe(path.join(outDir, 'last_message.md'), 1000000);
700
+ const verdict = extractFirstJsonObject(text);
701
+ if (verdict) { s.judge.verdict = verdict; await writeJsonAtomic(path.join(outDir, 'verdict.json'), verdict); }
702
+ s.status = s.judge.status === 'completed' ? 'judged' : 'judge_failed';
703
+ await saveRun(s);
704
+ });
705
+ });
706
+ return state;
562
707
  });
563
- return state;
564
708
  }
565
709
 
566
710
  export async function refreshRun(runId, appClient = null) {
@@ -568,21 +712,23 @@ export async function refreshRun(runId, appClient = null) {
568
712
  }
569
713
 
570
714
  async function loadAndRefreshRun(runId, appClient = null, { light = false } = {}) {
571
- const state = await loadRun(runId);
572
- if (!state) return null;
573
- state.runner = state.runner || RUNNER;
574
- await refreshRole(state, state.planner, roleDir(pathForRun(runId), 'planner'));
575
- await recoverCompletedPlanner(state);
576
- for (const task of state.tasks || []) await refreshTask(state, task);
577
- await refreshRole(state, state.judge, roleDir(pathForRun(runId), 'judge'));
578
- await recoverCompletedJudge(state);
579
- aggregateRunTmuxMetadata(state);
580
- recomputeRunStatus(state);
581
- await scheduleMoreWorkers(state);
582
- recomputeRunStatus(state);
583
- if (appClient && !light) await enrichFromAppServer(state, appClient).catch(e => { state.appServerError = e.message; });
584
- await saveRun(state);
585
- return state;
715
+ return await withRunStateLock(runId, async () => {
716
+ const state = await loadRun(runId);
717
+ if (!state) return null;
718
+ state.runner = state.runner || RUNNER;
719
+ await refreshRole(state, state.planner, roleDir(pathForRun(runId), 'planner'));
720
+ await recoverCompletedPlanner(state);
721
+ for (const task of state.tasks || []) await refreshTask(state, task);
722
+ await refreshRole(state, state.judge, roleDir(pathForRun(runId), 'judge'));
723
+ await recoverCompletedJudge(state);
724
+ aggregateRunTmuxMetadata(state);
725
+ recomputeRunStatus(state);
726
+ await scheduleMoreWorkers(state);
727
+ recomputeRunStatus(state);
728
+ if (appClient && !light) await enrichFromAppServer(state, appClient).catch(e => { state.appServerError = e.message; });
729
+ await saveRun(state);
730
+ return state;
731
+ });
586
732
  }
587
733
 
588
734
  async function recoverCompletedPlanner(state) {
@@ -744,6 +890,43 @@ function currentBatch(state) {
744
890
  return (state.batches || []).find(b => b.status !== 'completed');
745
891
  }
746
892
 
893
/**
 * Finds the first batch that is blocked on failed work.
 *
 * A batch qualifies when its status is `'failed'` or `'blocked'`, or when it
 * is `'running'` and at least one of its tasks is `'failed'` or `'unknown'`.
 *
 * @param {object} state - Run state; normalised by `ensureBatchShape` first.
 * @returns {object|undefined} The matching batch, or `undefined` when none matches.
 */
function currentBlockedBatch(state) {
  ensureBatchShape(state);
  const brokenStatuses = ['failed', 'unknown'];
  for (const batch of state.batches || []) {
    if (batch.status === 'failed' || batch.status === 'blocked') return batch;
    const hasBrokenTask = () => (batch.tasks || []).some(t => brokenStatuses.includes(t.status));
    if (batch.status === 'running' && hasBrokenTask()) return batch;
  }
  return undefined;
}
897
+
898
/**
 * Tells whether a task is eligible for an automatic retry.
 *
 * The task must exist, be `failed` or `unknown`, and have used fewer retries
 * than the limit.
 *
 * @param {object|null|undefined} task - Task record (`status`, optional `retryCount`).
 * @param {number} [maxRetries=1] - Retry limit. An explicit `0` disables
 *   automatic retries; `null`, `''` and non-numeric values fall back to 1.
 * @returns {boolean}
 */
function canAutoRetryTask(task, maxRetries = 1) {
  if (!task) return false;
  if (!['failed', 'unknown'].includes(task.status)) return false;

  // `maxRetries || 1` would turn an explicit 0 into 1, and a NaN limit would
  // make every comparison false (unlimited retries); normalise explicitly.
  const requested = maxRetries === null || maxRetries === '' ? Number.NaN : Number(maxRetries);
  const limit = Number.isFinite(requested) ? requested : 1;

  const counted = Number(task.retryCount || 0);
  const used = Number.isFinite(counted) ? counted : 0;
  return used < limit;
}
904
+
905
/**
 * Resets the selected failed/unknown tasks of a run to `pending`, archives
 * their previous worker attempt and lets the scheduler pick them up again.
 *
 * The state is mutated in place; persisting it is left to the caller.
 *
 * @param {object} state - Run state to mutate.
 * @param {string[]|null} [taskIds=null] - Task ids to retry (compared after
 *   `safeIdPart`); `null` considers every task.
 * @param {object} [options]
 * @param {boolean} [options.auto=false] - Only retry tasks accepted by `canAutoRetryTask`.
 * @param {number} [options.maxRetries=1] - Retry limit used when `auto` is true.
 * @param {string} [options.reason='retry'] - Stored on each task as `retryReason` before rotation.
 * @returns {Promise<{retried: string[], state: object}>} Ids of the tasks that were reset.
 * @throws {Error} `task still has a live process: <id>` when a selected task
 *   still has a live runner process; nothing is modified in that case.
 */
async function retryTasksInState(state, taskIds = null, { auto = false, maxRetries = 1, reason = 'retry' } = {}) {
  ensureBatchShape(state);
  const selectedTaskIds = taskIds ? new Set(taskIds.map(id => safeIdPart(id))) : null;
  const tasksToRetry = (state.tasks || []).filter(task => {
    if (selectedTaskIds && !selectedTaskIds.has(task.id)) return false;
    if (!['failed', 'unknown'].includes(task.status)) return false;
    if (auto && !canAutoRetryTask(task, maxRetries)) return false;
    return true;
  });
  if (!tasksToRetry.length) return { retried: [], state };

  // Validate everything up front: rotating moves directories on disk, so a
  // rejection halfway through would leave earlier tasks archived while the
  // caller aborts without saving the matching state.
  for (const task of tasksToRetry) {
    if (hasLiveRunnerProcess(state, task.id, task)) throw new Error(`task still has a live process: ${task.id}`);
  }

  for (const task of tasksToRetry) {
    const batch = (state.batches || []).find(item => item.id === task.batchId);
    task.retryReason = reason;
    await rotateWorkerAttempt(state, task);
    // Batches may hold their own copy of the task record; keep it in sync.
    const batchTask = batch?.tasks?.find(item => item.id === task.id);
    if (batchTask && batchTask !== task) Object.assign(batchTask, task);
  }

  recomputeRunStatus(state);
  if (hasPendingRunnableBatch(state)) state.status = 'running';
  await scheduleMoreWorkers(state);
  recomputeRunStatus(state);
  return { retried: tasksToRetry.map(task => task.id), state };
}
929
+
747
930
  async function scheduleMoreWorkers(state) {
748
931
  if (state.status !== 'running') return;
749
932
  const batch = currentBatch(state);
@@ -904,7 +1087,7 @@ function runDurationEndOfState(s) {
904
1087
  return times.length ? new Date(Math.max(...times)).toISOString() : s.updatedAt;
905
1088
  }
906
1089
 
907
- function summaryOfRun(s) {
1090
+ export function summaryOfRun(s) {
908
1091
  const tasks = s.tasks || [];
909
1092
  return { runId: s.runId, label: s.label, repo: s.repo, status: s.status, runner: s.runner || RUNNER, workerSandbox: s.workerSandbox || 'workspace-write', archived: !!s.archived, createdAt: s.createdAt, updatedAt: s.updatedAt, durationEnd: runDurationEndOfState(s), total: tasks.length, completed: tasks.filter(t => t.status === 'completed').length, failed: tasks.filter(t => ['failed','unknown'].includes(t.status)).length, running: tasks.filter(t => t.status === 'running').length, batches: (s.batches || []).map(b => ({ id: b.id, name: b.name, status: b.status, total: b.tasks?.length || 0, completed: (b.tasks || []).filter(t => t.status === 'completed').length })) };
910
1093
  }