@dotsetlabs/dotclaw 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118):
  1. package/.env.example +9 -10
  2. package/README.md +8 -4
  3. package/config-examples/runtime.json +34 -8
  4. package/config-examples/tool-policy.json +12 -2
  5. package/container/agent-runner/package-lock.json +2 -2
  6. package/container/agent-runner/package.json +1 -1
  7. package/container/agent-runner/src/agent-config.ts +19 -3
  8. package/container/agent-runner/src/container-protocol.ts +11 -0
  9. package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
  10. package/container/agent-runner/src/index.ts +603 -165
  11. package/container/agent-runner/src/openrouter-input.ts +159 -0
  12. package/container/agent-runner/src/system-prompt.ts +13 -3
  13. package/container/agent-runner/src/tool-loop-policy.ts +741 -0
  14. package/container/agent-runner/src/tools.ts +211 -8
  15. package/dist/agent-context.d.ts +1 -0
  16. package/dist/agent-context.d.ts.map +1 -1
  17. package/dist/agent-context.js +21 -9
  18. package/dist/agent-context.js.map +1 -1
  19. package/dist/agent-execution.d.ts +2 -0
  20. package/dist/agent-execution.d.ts.map +1 -1
  21. package/dist/agent-execution.js +164 -15
  22. package/dist/agent-execution.js.map +1 -1
  23. package/dist/agent-semaphore.d.ts +24 -1
  24. package/dist/agent-semaphore.d.ts.map +1 -1
  25. package/dist/agent-semaphore.js +109 -20
  26. package/dist/agent-semaphore.js.map +1 -1
  27. package/dist/cli.js +3 -11
  28. package/dist/cli.js.map +1 -1
  29. package/dist/config.d.ts +2 -0
  30. package/dist/config.d.ts.map +1 -1
  31. package/dist/config.js +2 -0
  32. package/dist/config.js.map +1 -1
  33. package/dist/container-protocol.d.ts +22 -0
  34. package/dist/container-protocol.d.ts.map +1 -1
  35. package/dist/container-protocol.js.map +1 -1
  36. package/dist/container-runner.d.ts +7 -0
  37. package/dist/container-runner.d.ts.map +1 -1
  38. package/dist/container-runner.js +417 -143
  39. package/dist/container-runner.js.map +1 -1
  40. package/dist/db.d.ts.map +1 -1
  41. package/dist/db.js +46 -12
  42. package/dist/db.js.map +1 -1
  43. package/dist/error-messages.d.ts.map +1 -1
  44. package/dist/error-messages.js +18 -4
  45. package/dist/error-messages.js.map +1 -1
  46. package/dist/failover-policy.d.ts +41 -0
  47. package/dist/failover-policy.d.ts.map +1 -0
  48. package/dist/failover-policy.js +261 -0
  49. package/dist/failover-policy.js.map +1 -0
  50. package/dist/index.js +1 -0
  51. package/dist/index.js.map +1 -1
  52. package/dist/ipc-dispatcher.d.ts.map +1 -1
  53. package/dist/ipc-dispatcher.js +27 -43
  54. package/dist/ipc-dispatcher.js.map +1 -1
  55. package/dist/mcp-config.d.ts +22 -0
  56. package/dist/mcp-config.d.ts.map +1 -0
  57. package/dist/mcp-config.js +94 -0
  58. package/dist/mcp-config.js.map +1 -0
  59. package/dist/memory-backend.d.ts +27 -0
  60. package/dist/memory-backend.d.ts.map +1 -0
  61. package/dist/memory-backend.js +112 -0
  62. package/dist/memory-backend.js.map +1 -0
  63. package/dist/memory-recall.d.ts.map +1 -1
  64. package/dist/memory-recall.js +135 -22
  65. package/dist/memory-recall.js.map +1 -1
  66. package/dist/memory-store.d.ts +1 -0
  67. package/dist/memory-store.d.ts.map +1 -1
  68. package/dist/memory-store.js +55 -7
  69. package/dist/memory-store.js.map +1 -1
  70. package/dist/message-pipeline.d.ts +24 -0
  71. package/dist/message-pipeline.d.ts.map +1 -1
  72. package/dist/message-pipeline.js +131 -27
  73. package/dist/message-pipeline.js.map +1 -1
  74. package/dist/metrics.d.ts +1 -0
  75. package/dist/metrics.d.ts.map +1 -1
  76. package/dist/metrics.js +9 -0
  77. package/dist/metrics.js.map +1 -1
  78. package/dist/providers/discord/discord-provider.d.ts.map +1 -1
  79. package/dist/providers/discord/discord-provider.js +72 -4
  80. package/dist/providers/discord/discord-provider.js.map +1 -1
  81. package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
  82. package/dist/providers/telegram/telegram-provider.js +65 -3
  83. package/dist/providers/telegram/telegram-provider.js.map +1 -1
  84. package/dist/recall-policy.d.ts +12 -0
  85. package/dist/recall-policy.d.ts.map +1 -0
  86. package/dist/recall-policy.js +89 -0
  87. package/dist/recall-policy.js.map +1 -0
  88. package/dist/runtime-config.d.ts +33 -0
  89. package/dist/runtime-config.d.ts.map +1 -1
  90. package/dist/runtime-config.js +109 -9
  91. package/dist/runtime-config.js.map +1 -1
  92. package/dist/streaming.d.ts.map +1 -1
  93. package/dist/streaming.js +125 -33
  94. package/dist/streaming.js.map +1 -1
  95. package/dist/task-scheduler.d.ts.map +1 -1
  96. package/dist/task-scheduler.js +4 -2
  97. package/dist/task-scheduler.js.map +1 -1
  98. package/dist/tool-policy.d.ts.map +1 -1
  99. package/dist/tool-policy.js +26 -4
  100. package/dist/tool-policy.js.map +1 -1
  101. package/dist/trace-writer.d.ts +12 -0
  102. package/dist/trace-writer.d.ts.map +1 -1
  103. package/dist/trace-writer.js.map +1 -1
  104. package/dist/turn-hygiene.d.ts +14 -0
  105. package/dist/turn-hygiene.d.ts.map +1 -0
  106. package/dist/turn-hygiene.js +214 -0
  107. package/dist/turn-hygiene.js.map +1 -0
  108. package/dist/webhook.d.ts.map +1 -1
  109. package/dist/webhook.js +1 -0
  110. package/dist/webhook.js.map +1 -1
  111. package/package.json +15 -1
  112. package/scripts/benchmark-baseline.js +365 -0
  113. package/scripts/benchmark-harness.js +1413 -0
  114. package/scripts/benchmark-scenarios.js +301 -0
  115. package/scripts/canary-suite.js +123 -0
  116. package/scripts/generate-controlled-traces.js +230 -0
  117. package/scripts/release-slo-check.js +214 -0
  118. package/scripts/run-live-canary.js +339 -0
@@ -360,8 +360,50 @@ function isContainerRunning(name) {
360
360
  }
361
361
  }
362
362
  const daemonConfig = runtime.host.container.daemon;
363
- function readDaemonStatus(groupFolder) {
364
- const statusPath = path.join(DATA_DIR, 'ipc', groupFolder, 'daemon_status.json');
363
+ const DAEMON_BOOTSTRAP_GRACE_MS = Math.max(daemonConfig.gracePeriodMs, daemonConfig.heartbeatMaxAgeMs);
364
+ const daemonBootstrapUntil = new Map();
365
+ function markDaemonBootstrapping(groupFolder) {
366
+ daemonBootstrapUntil.set(groupFolder, Date.now() + DAEMON_BOOTSTRAP_GRACE_MS);
367
+ }
368
+ function isDaemonBootstrapping(groupFolder, nowMs = Date.now()) {
369
+ const until = daemonBootstrapUntil.get(groupFolder);
370
+ if (!until)
371
+ return false;
372
+ if (until <= nowMs) {
373
+ daemonBootstrapUntil.delete(groupFolder);
374
+ return false;
375
+ }
376
+ return true;
377
+ }
378
+ function consumeMemoryExtractionError(groupFolder) {
379
+ const statusPath = path.join(DATA_DIR, 'ipc', groupFolder, 'memory_extraction_error.json');
380
+ try {
381
+ if (!fs.existsSync(statusPath))
382
+ return undefined;
383
+ const raw = fs.readFileSync(statusPath, 'utf-8').trim();
384
+ try {
385
+ const parsed = JSON.parse(raw);
386
+ if (typeof parsed.error === 'string' && parsed.error.trim()) {
387
+ return parsed.error.trim();
388
+ }
389
+ }
390
+ catch {
391
+ if (raw)
392
+ return raw;
393
+ }
394
+ }
395
+ catch {
396
+ return undefined;
397
+ }
398
+ finally {
399
+ try {
400
+ fs.unlinkSync(statusPath);
401
+ }
402
+ catch { /* ignore */ }
403
+ }
404
+ return undefined;
405
+ }
406
+ function readDaemonStatusFromPath(statusPath) {
365
407
  try {
366
408
  if (!fs.existsSync(statusPath))
367
409
  return null;
@@ -372,6 +414,10 @@ function readDaemonStatus(groupFolder) {
372
414
  return null;
373
415
  }
374
416
  }
417
+ function readDaemonStatus(groupFolder) {
418
+ const statusPath = path.join(DATA_DIR, 'ipc', groupFolder, 'daemon_status.json');
419
+ return readDaemonStatusFromPath(statusPath);
420
+ }
375
421
  /**
376
422
  * 3-state health check: healthy / busy / dead
377
423
  *
@@ -444,8 +490,7 @@ export function gracefulRestartDaemonContainer(group, isMain) {
444
490
  }
445
491
  }
446
492
  // Start new container
447
- const mounts = buildVolumeMounts(group, isMain);
448
- ensureDaemonContainer(mounts, group.folder);
493
+ ensureDaemonContainer(group, isMain);
449
494
  logger.info({ groupFolder: group.folder }, 'Daemon container restarted (graceful)');
450
495
  }
451
496
  /**
@@ -459,8 +504,7 @@ export function restartDaemonContainer(group, isMain) {
459
504
  catch {
460
505
  // Ignore if container doesn't exist
461
506
  }
462
- const mounts = buildVolumeMounts(group, isMain);
463
- ensureDaemonContainer(mounts, group.folder);
507
+ ensureDaemonContainer(group, isMain);
464
508
  logger.info({ groupFolder: group.folder }, 'Daemon container restarted (force)');
465
509
  }
466
510
  // Track daemon health check state
@@ -478,6 +522,7 @@ export function suppressHealthChecks(durationMs) {
478
522
  export function resetUnhealthyDaemons() {
479
523
  unhealthyDaemons.clear();
480
524
  healthCheckRestartTimestamps.clear();
525
+ daemonBootstrapUntil.clear();
481
526
  }
482
527
  /**
483
528
  * Perform health check on all daemon containers and restart if needed.
@@ -497,6 +542,7 @@ export function performDaemonHealthChecks(getRegisteredGroups, mainGroupFolder)
497
542
  // Skip if container isn't running (may be intentionally stopped)
498
543
  if (!isContainerRunning(containerName)) {
499
544
  unhealthyDaemons.delete(group.folder);
545
+ daemonBootstrapUntil.delete(group.folder);
500
546
  continue;
501
547
  }
502
548
  const health = checkDaemonHealth(group.folder);
@@ -579,28 +625,58 @@ export function stopDaemonHealthCheckLoop() {
579
625
  healthCheckInterval = null;
580
626
  }
581
627
  }
582
- function ensureDaemonContainer(mounts, groupFolder) {
628
+ function ensureDaemonContainer(group, isMain) {
629
+ sanitizeGroupFolder(group.folder);
630
+ const groupFolder = group.folder;
583
631
  const containerName = getDaemonContainerName(groupFolder);
584
- if (isContainerRunning(containerName))
632
+ const health = checkDaemonHealth(groupFolder);
633
+ if (health.state === 'healthy' || health.state === 'busy') {
585
634
  return;
586
- try {
587
- execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
588
635
  }
589
- catch {
590
- // ignore if container doesn't exist
636
+ const running = isContainerRunning(containerName);
637
+ if (running && isDaemonBootstrapping(groupFolder)) {
638
+ return;
639
+ }
640
+ const mounts = buildVolumeMounts(group, isMain);
641
+ if (running) {
642
+ logger.warn({
643
+ groupFolder,
644
+ ageMs: health.ageMs,
645
+ daemonState: health.daemonState
646
+ }, 'Daemon container running but unhealthy; restarting before request');
647
+ try {
648
+ execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
649
+ }
650
+ catch {
651
+ // ignore cleanup failure and attempt fresh start
652
+ }
653
+ }
654
+ else {
655
+ daemonBootstrapUntil.delete(groupFolder);
656
+ try {
657
+ execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
658
+ }
659
+ catch {
660
+ // ignore if container doesn't exist
661
+ }
591
662
  }
592
663
  const args = buildDaemonArgs(mounts, containerName, groupFolder);
593
664
  const result = spawnSync('docker', args, { stdio: 'ignore' });
594
665
  if (result.status !== 0) {
666
+ // Concurrent starts can lose the name race while the daemon is actually up.
667
+ if (isContainerRunning(containerName)) {
668
+ markDaemonBootstrapping(groupFolder);
669
+ return;
670
+ }
595
671
  logger.error({ groupFolder, status: result.status }, 'Failed to start daemon container');
596
672
  throw new Error(`Failed to start daemon container for ${groupFolder}`);
597
673
  }
674
+ markDaemonBootstrapping(groupFolder);
598
675
  }
599
676
  export function warmGroupContainer(group, isMain) {
600
677
  if (CONTAINER_MODE !== 'daemon')
601
678
  return;
602
- const mounts = buildVolumeMounts(group, isMain);
603
- ensureDaemonContainer(mounts, group.folder);
679
+ ensureDaemonContainer(group, isMain);
604
680
  }
605
681
  function writeAgentRequest(groupFolder, payload) {
606
682
  const id = generateId('agent');
@@ -611,64 +687,206 @@ function writeAgentRequest(groupFolder, payload) {
611
687
  const requestPath = path.join(requestsDir, `${id}.json`);
612
688
  const responsePath = path.join(responsesDir, `${id}.json`);
613
689
  const tempPath = `${requestPath}.tmp`;
614
- fs.writeFileSync(tempPath, JSON.stringify({ id, input: payload }, null, 2));
690
+ fs.writeFileSync(tempPath, JSON.stringify({ id, input: payload }));
615
691
  fs.renameSync(tempPath, requestPath);
616
692
  return { id, requestPath, responsePath };
617
693
  }
618
- async function waitForAgentResponse(responsePath, timeoutMs, abortSignal) {
619
- const start = Date.now();
620
- while (Date.now() - start < timeoutMs) {
621
- if (abortSignal?.aborted) {
622
- throw new Error('Agent run preempted');
694
+ class FileChangeSignal {
695
+ abortSignal;
696
+ watcher = null;
697
+ pending = false;
698
+ waiter = null;
699
+ abortHandler;
700
+ constructor(watchDir, abortSignal) {
701
+ this.abortSignal = abortSignal;
702
+ try {
703
+ this.watcher = fs.watch(watchDir, { persistent: false }, () => this.notify());
704
+ this.watcher.on('error', () => {
705
+ if (this.watcher) {
706
+ try {
707
+ this.watcher.close();
708
+ }
709
+ catch { /* ignore */ }
710
+ this.watcher = null;
711
+ }
712
+ this.notify();
713
+ });
623
714
  }
624
- if (fs.existsSync(responsePath)) {
625
- let raw;
626
- try {
627
- raw = fs.readFileSync(responsePath, 'utf-8');
628
- }
629
- catch (readErr) {
630
- const code = readErr?.code;
631
- if (code === 'ENOENT') {
632
- // File disappeared between existsSync and readFileSync; continue polling
633
- await new Promise(resolve => setTimeout(resolve, CONTAINER_DAEMON_POLL_MS));
634
- continue;
715
+ catch {
716
+ this.watcher = null;
717
+ }
718
+ if (this.abortSignal) {
719
+ this.abortHandler = () => this.notify();
720
+ this.abortSignal.addEventListener('abort', this.abortHandler);
721
+ }
722
+ else {
723
+ this.abortHandler = null;
724
+ }
725
+ }
726
+ notify() {
727
+ if (this.waiter) {
728
+ const wake = this.waiter;
729
+ this.waiter = null;
730
+ wake();
731
+ return;
732
+ }
733
+ this.pending = true;
734
+ }
735
+ async wait(timeoutMs) {
736
+ if (this.pending) {
737
+ this.pending = false;
738
+ return;
739
+ }
740
+ await new Promise((resolve) => {
741
+ let settled = false;
742
+ const finish = () => {
743
+ if (settled)
744
+ return;
745
+ settled = true;
746
+ if (this.waiter === finish)
747
+ this.waiter = null;
748
+ clearTimeout(timer);
749
+ resolve();
750
+ };
751
+ const timer = setTimeout(finish, Math.max(10, timeoutMs));
752
+ this.waiter = finish;
753
+ });
754
+ }
755
+ close() {
756
+ if (this.abortSignal && this.abortHandler) {
757
+ this.abortSignal.removeEventListener('abort', this.abortHandler);
758
+ }
759
+ if (this.watcher) {
760
+ this.watcher.close();
761
+ this.watcher = null;
762
+ }
763
+ this.notify();
764
+ }
765
+ }
766
+ function isContainerOutputPayload(value) {
767
+ if (!value || typeof value !== 'object')
768
+ return false;
769
+ const status = value.status;
770
+ return status === 'success' || status === 'error';
771
+ }
772
+ export async function waitForAgentResponse(responsePath, timeoutMs, abortSignal, options) {
773
+ const RESPONSE_PARSE_MAX_RETRIES = 8;
774
+ const pollMs = Math.max(25, CONTAINER_DAEMON_POLL_MS);
775
+ const parseRetryMs = Math.max(20, Math.floor(pollMs / 2));
776
+ const maxExtensionMs = Math.max(0, Math.floor(options?.maxExtensionMs
777
+ ?? Math.min(120_000, Math.max(30_000, Math.floor(timeoutMs * 0.5)))));
778
+ const start = Date.now();
779
+ let deadline = start + timeoutMs;
780
+ let extendedMs = 0;
781
+ const waitSignal = new FileChangeSignal(path.dirname(responsePath), abortSignal);
782
+ let parseFailures = 0;
783
+ let lastParseError = '';
784
+ try {
785
+ for (;;) {
786
+ while (Date.now() < deadline) {
787
+ if (abortSignal?.aborted) {
788
+ throw new Error('Agent run preempted');
635
789
  }
636
- throw readErr;
637
- }
638
- try {
639
- const parsed = JSON.parse(raw);
640
- fs.unlinkSync(responsePath);
641
- return parsed;
642
- }
643
- catch (parseErr) {
644
- // Partial read during atomic rename can produce invalid JSON — retry up to 3 times
645
- let retryParsed = null;
646
- for (let parseRetry = 0; parseRetry < 3; parseRetry++) {
647
- await new Promise(resolve => setTimeout(resolve, CONTAINER_DAEMON_POLL_MS));
790
+ if (fs.existsSync(responsePath)) {
791
+ let raw;
648
792
  try {
649
- const retryRaw = fs.readFileSync(responsePath, 'utf-8');
650
- retryParsed = JSON.parse(retryRaw);
651
- break;
793
+ raw = fs.readFileSync(responsePath, 'utf-8');
652
794
  }
653
- catch {
654
- // continue retrying
795
+ catch (readErr) {
796
+ const code = readErr?.code;
797
+ if (code === 'ENOENT') {
798
+ await waitSignal.wait(pollMs);
799
+ continue;
800
+ }
801
+ throw readErr;
655
802
  }
656
- }
657
- if (retryParsed) {
658
803
  try {
659
- fs.unlinkSync(responsePath);
804
+ const parsed = JSON.parse(raw);
805
+ if (!isContainerOutputPayload(parsed)) {
806
+ throw new Error('Missing required "status" field');
807
+ }
808
+ try {
809
+ fs.unlinkSync(responsePath);
810
+ }
811
+ catch { /* ignore */ }
812
+ return parsed;
813
+ }
814
+ catch (parseErr) {
815
+ parseFailures += 1;
816
+ lastParseError = parseErr instanceof Error ? parseErr.message : String(parseErr);
817
+ if (parseFailures >= RESPONSE_PARSE_MAX_RETRIES) {
818
+ try {
819
+ fs.unlinkSync(responsePath);
820
+ }
821
+ catch { /* ignore */ }
822
+ throw new Error(`Failed to parse daemon response after ${parseFailures} attempts: ${lastParseError}`);
823
+ }
824
+ await waitSignal.wait(parseRetryMs);
825
+ if (fs.existsSync(responsePath)) {
826
+ let stat;
827
+ try {
828
+ stat = fs.statSync(responsePath);
829
+ }
830
+ catch (statErr) {
831
+ const code = statErr?.code;
832
+ if (code === 'ENOENT')
833
+ continue;
834
+ throw statErr;
835
+ }
836
+ if (Date.now() - stat.mtimeMs > 5_000) {
837
+ try {
838
+ fs.unlinkSync(responsePath);
839
+ }
840
+ catch { /* ignore */ }
841
+ throw new Error(`Stale daemon response file: ${lastParseError}`);
842
+ }
843
+ }
844
+ continue;
660
845
  }
661
- catch { /* ignore */ }
662
- return retryParsed;
663
846
  }
664
- try {
665
- fs.unlinkSync(responsePath);
847
+ await waitSignal.wait(pollMs);
848
+ }
849
+ // Soft timeout extension: if daemon is actively processing this request with fresh heartbeat,
850
+ // grant bounded extra time to avoid false timeout/restart thrash on long turns.
851
+ const groupFolder = options?.groupFolder;
852
+ const requestId = options?.requestId;
853
+ const daemonStatusPath = options?.daemonStatusPath;
854
+ if ((groupFolder || daemonStatusPath) && maxExtensionMs > 0 && extendedMs < maxExtensionMs) {
855
+ const status = daemonStatusPath
856
+ ? readDaemonStatusFromPath(daemonStatusPath)
857
+ : readDaemonStatus(groupFolder);
858
+ if (status && status.state === 'processing') {
859
+ if (!requestId || !status.request_id || status.request_id === requestId) {
860
+ const now = Date.now();
861
+ const heartbeatAgeMs = Number.isFinite(status.ts) ? now - status.ts : Number.POSITIVE_INFINITY;
862
+ const freshnessBudgetMs = Math.max(daemonConfig.heartbeatMaxAgeMs * 2, 15_000);
863
+ if (heartbeatAgeMs <= freshnessBudgetMs) {
864
+ const remainingExtensionMs = maxExtensionMs - extendedMs;
865
+ if (remainingExtensionMs > 0) {
866
+ const stepExtensionMs = Math.min(remainingExtensionMs, Math.max(15_000, Math.floor(timeoutMs * 0.25)));
867
+ extendedMs += stepExtensionMs;
868
+ deadline = now + stepExtensionMs;
869
+ logger.warn({
870
+ groupFolder,
871
+ requestId: requestId || status.request_id || null,
872
+ heartbeatAgeMs,
873
+ extendedMs,
874
+ maxExtensionMs
875
+ }, 'Extending daemon response wait for active processing request');
876
+ continue;
877
+ }
878
+ }
879
+ }
666
880
  }
667
- catch { /* ignore */ }
668
- throw new Error(`Failed to parse daemon response: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
669
881
  }
882
+ break;
670
883
  }
671
- await new Promise(resolve => setTimeout(resolve, CONTAINER_DAEMON_POLL_MS));
884
+ }
885
+ finally {
886
+ waitSignal.close();
887
+ }
888
+ if (lastParseError) {
889
+ throw new Error(`Daemon response timeout after ${timeoutMs}ms (last parse error: ${lastParseError})`);
672
890
  }
673
891
  throw new Error(`Daemon response timeout after ${timeoutMs}ms`);
674
892
  }
@@ -687,6 +905,20 @@ function removeContainerById(containerId, reason) {
687
905
  logger.warn({ containerId, reason }, 'Removing container');
688
906
  spawn('docker', ['rm', '-f', containerId], { stdio: 'ignore' });
689
907
  }
908
+ export function shouldRetryDaemonRequestError(errorMessage) {
909
+ if (!errorMessage)
910
+ return false;
911
+ const lower = errorMessage.toLowerCase();
912
+ if (/preempted|aborted|interrupted|cancelled|canceled/.test(lower)) {
913
+ return false;
914
+ }
915
+ return /daemon response timeout|failed to parse daemon response|stale daemon response file/.test(lower);
916
+ }
917
+ function isDaemonTimeoutError(errorMessage) {
918
+ if (!errorMessage)
919
+ return false;
920
+ return /daemon response timeout/i.test(errorMessage);
921
+ }
690
922
  export async function runContainerAgent(group, input, options) {
691
923
  sanitizeGroupFolder(group.folder);
692
924
  if (CONTAINER_MODE === 'daemon') {
@@ -823,31 +1055,35 @@ export async function runContainerAgent(group, input, options) {
823
1055
  }
824
1056
  cleanupCid();
825
1057
  const duration = Date.now() - startTime;
826
- const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
827
- const logFile = path.join(logsDir, `container-${timestamp}.log`);
828
1058
  const isVerbose = runtime.host.logLevel === 'debug' || runtime.host.logLevel === 'trace';
829
- const logLines = [
830
- `=== Container Run Log ===`,
831
- `Timestamp: ${new Date().toISOString()}`,
832
- `Group: ${group.name}`,
833
- `IsMain: ${input.isMain}`,
834
- `Duration: ${duration}ms`,
835
- `Exit Code: ${code}`,
836
- `Stdout Truncated: ${stdoutTruncated}`,
837
- `Stderr Truncated: ${stderrTruncated}`,
838
- ``
839
- ];
840
- if (isVerbose) {
841
- logLines.push(`=== Input ===`, JSON.stringify(input, null, 2), ``, `=== Container Args ===`, containerArgs.join(' '), ``, `=== Mounts ===`, mounts.map(m => `${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``, `=== Stderr${stderrTruncated ? ' (TRUNCATED)' : ''} ===`, stderr, ``, `=== Stdout${stdoutTruncated ? ' (TRUNCATED)' : ''} ===`, stdout);
842
- }
843
- else {
844
- logLines.push(`=== Input Summary ===`, `Prompt length: ${input.prompt.length} chars`, `Session ID: ${input.sessionId || 'new'}`, ``, `=== Mounts ===`, mounts.map(m => `${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``);
845
- if (code !== 0) {
846
- logLines.push(`=== Stderr (last 500 chars) ===`, stderr.slice(-500), ``);
1059
+ const shouldWriteLog = isVerbose || code !== 0 || stdoutTruncated || stderrTruncated;
1060
+ let logFile;
1061
+ if (shouldWriteLog) {
1062
+ const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
1063
+ logFile = path.join(logsDir, `container-${timestamp}.log`);
1064
+ const logLines = [
1065
+ `=== Container Run Log ===`,
1066
+ `Timestamp: ${new Date().toISOString()}`,
1067
+ `Group: ${group.name}`,
1068
+ `IsMain: ${input.isMain}`,
1069
+ `Duration: ${duration}ms`,
1070
+ `Exit Code: ${code}`,
1071
+ `Stdout Truncated: ${stdoutTruncated}`,
1072
+ `Stderr Truncated: ${stderrTruncated}`,
1073
+ ``
1074
+ ];
1075
+ if (isVerbose) {
1076
+ logLines.push(`=== Input ===`, JSON.stringify(input, null, 2), ``, `=== Container Args ===`, containerArgs.join(' '), ``, `=== Mounts ===`, mounts.map(m => `${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``, `=== Stderr${stderrTruncated ? ' (TRUNCATED)' : ''} ===`, stderr, ``, `=== Stdout${stdoutTruncated ? ' (TRUNCATED)' : ''} ===`, stdout);
847
1077
  }
1078
+ else {
1079
+ logLines.push(`=== Input Summary ===`, `Prompt length: ${input.prompt.length} chars`, `Session ID: ${input.sessionId || 'new'}`, ``, `=== Mounts ===`, mounts.map(m => `${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``);
1080
+ if (code !== 0) {
1081
+ logLines.push(`=== Stderr (last 500 chars) ===`, stderr.slice(-500), ``);
1082
+ }
1083
+ }
1084
+ fs.writeFileSync(logFile, logLines.join('\n'));
1085
+ logger.debug({ logFile, verbose: isVerbose }, 'Container log written');
848
1086
  }
849
- fs.writeFileSync(logFile, logLines.join('\n'));
850
- logger.debug({ logFile, verbose: isVerbose }, 'Container log written');
851
1087
  if (code !== 0) {
852
1088
  logger.error({
853
1089
  group: group.name,
@@ -928,81 +1164,118 @@ async function runContainerAgentDaemon(group, input, options) {
928
1164
  const startTime = Date.now();
929
1165
  const groupDir = path.join(GROUPS_DIR, group.folder);
930
1166
  fs.mkdirSync(groupDir, { recursive: true });
931
- const mounts = buildVolumeMounts(group, input.isMain);
932
- ensureDaemonContainer(mounts, group.folder);
933
- const { id: requestId, responsePath, requestPath } = writeAgentRequest(group.folder, input);
934
- const requestsDir = path.join(DATA_DIR, 'ipc', group.folder, 'agent_requests');
935
1167
  const timeoutMs = options?.timeoutMs || group.containerConfig?.timeout || CONTAINER_TIMEOUT;
936
1168
  const abortSignal = options?.abortSignal;
937
- const abortHandler = () => {
938
- logger.warn({ group: group.name }, 'Daemon run preempted');
939
- // Write cancel sentinel so daemon can detect the abort
940
- const cancelPath = path.join(requestsDir, `${requestId}.cancel`);
941
- try {
942
- fs.writeFileSync(cancelPath, '');
943
- }
944
- catch { /* ignore */ }
945
- try {
946
- if (fs.existsSync(requestPath))
947
- fs.unlinkSync(requestPath);
948
- }
949
- catch {
950
- // ignore cleanup failure
1169
+ const requestsDir = path.join(DATA_DIR, 'ipc', group.folder, 'agent_requests');
1170
+ const maxAttempts = 2;
1171
+ for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
1172
+ ensureDaemonContainer(group, input.isMain);
1173
+ const { id: requestId, responsePath, requestPath } = writeAgentRequest(group.folder, input);
1174
+ let aborted = false;
1175
+ const abortHandler = () => {
1176
+ aborted = true;
1177
+ logger.warn({ group: group.name }, 'Daemon run preempted');
1178
+ const cancelPath = path.join(requestsDir, `${requestId}.cancel`);
1179
+ try {
1180
+ fs.writeFileSync(cancelPath, '');
1181
+ }
1182
+ catch { /* ignore */ }
1183
+ try {
1184
+ if (fs.existsSync(requestPath))
1185
+ fs.unlinkSync(requestPath);
1186
+ }
1187
+ catch {
1188
+ // ignore cleanup failure
1189
+ }
1190
+ try {
1191
+ if (fs.existsSync(responsePath))
1192
+ fs.unlinkSync(responsePath);
1193
+ }
1194
+ catch {
1195
+ // ignore cleanup failure
1196
+ }
1197
+ };
1198
+ if (abortSignal) {
1199
+ if (abortSignal.aborted) {
1200
+ abortHandler();
1201
+ return {
1202
+ status: 'error',
1203
+ result: null,
1204
+ error: 'Daemon run preempted'
1205
+ };
1206
+ }
1207
+ abortSignal.addEventListener('abort', abortHandler, { once: true });
951
1208
  }
952
1209
  try {
953
- if (fs.existsSync(responsePath))
954
- fs.unlinkSync(responsePath);
955
- }
956
- catch {
957
- // ignore cleanup failure
1210
+ const output = await waitForAgentResponse(responsePath, timeoutMs, abortSignal, {
1211
+ groupFolder: group.folder,
1212
+ requestId
1213
+ });
1214
+ const memoryExtractionError = consumeMemoryExtractionError(group.folder);
1215
+ return {
1216
+ ...output,
1217
+ memory_extraction_error: output.memory_extraction_error || memoryExtractionError,
1218
+ latency_ms: output.latency_ms ?? (Date.now() - startTime)
1219
+ };
958
1220
  }
959
- };
960
- if (abortSignal) {
961
- if (abortSignal.aborted) {
962
- abortHandler();
1221
+ catch (err) {
1222
+ const errorMessage = err instanceof Error ? err.message : String(err);
1223
+ logger.error({ group: group.name, error: errorMessage, attempt }, 'Daemon agent error');
1224
+ try {
1225
+ if (fs.existsSync(requestPath))
1226
+ fs.unlinkSync(requestPath);
1227
+ }
1228
+ catch {
1229
+ // ignore cleanup failure
1230
+ }
1231
+ try {
1232
+ if (fs.existsSync(responsePath))
1233
+ fs.unlinkSync(responsePath);
1234
+ }
1235
+ catch {
1236
+ // ignore cleanup failure
1237
+ }
1238
+ if (!aborted && !abortSignal?.aborted && attempt < maxAttempts && shouldRetryDaemonRequestError(errorMessage)) {
1239
+ logger.warn({ group: group.name, attempt, error: errorMessage }, 'Retrying daemon request after restart');
1240
+ try {
1241
+ gracefulRestartDaemonContainer(group, input.isMain);
1242
+ // Timeout errors usually indicate a stuck/slow daemon turn.
1243
+ // Avoid spending another full request timeout on the same model;
1244
+ // restart once and return the timeout so host-level failover can act.
1245
+ if (isDaemonTimeoutError(errorMessage)) {
1246
+ return {
1247
+ status: 'error',
1248
+ result: null,
1249
+ error: errorMessage
1250
+ };
1251
+ }
1252
+ continue;
1253
+ }
1254
+ catch (restartErr) {
1255
+ logger.error({
1256
+ group: group.name,
1257
+ attempt,
1258
+ error: restartErr instanceof Error ? restartErr.message : String(restartErr)
1259
+ }, 'Daemon restart failed during retry recovery');
1260
+ }
1261
+ }
963
1262
  return {
964
1263
  status: 'error',
965
1264
  result: null,
966
- error: 'Daemon run preempted'
1265
+ error: errorMessage
967
1266
  };
968
1267
  }
969
- abortSignal.addEventListener('abort', abortHandler, { once: true });
970
- }
971
- try {
972
- const output = await waitForAgentResponse(responsePath, timeoutMs, abortSignal);
973
- return {
974
- ...output,
975
- latency_ms: output.latency_ms ?? (Date.now() - startTime)
976
- };
977
- }
978
- catch (err) {
979
- const errorMessage = err instanceof Error ? err.message : String(err);
980
- logger.error({ group: group.name, error: errorMessage }, 'Daemon agent error');
981
- try {
982
- if (fs.existsSync(requestPath))
983
- fs.unlinkSync(requestPath);
984
- }
985
- catch {
986
- // ignore cleanup failure
987
- }
988
- try {
989
- if (fs.existsSync(responsePath))
990
- fs.unlinkSync(responsePath);
991
- }
992
- catch {
993
- // ignore cleanup failure
994
- }
995
- return {
996
- status: 'error',
997
- result: null,
998
- error: errorMessage
999
- };
1000
- }
1001
- finally {
1002
- if (abortSignal) {
1003
- abortSignal.removeEventListener('abort', abortHandler);
1268
+ finally {
1269
+ if (abortSignal) {
1270
+ abortSignal.removeEventListener('abort', abortHandler);
1271
+ }
1004
1272
  }
1005
1273
  }
1274
+ return {
1275
+ status: 'error',
1276
+ result: null,
1277
+ error: 'Daemon request failed'
1278
+ };
1006
1279
  }
1007
1280
  /**
1008
1281
  * Stop all Docker containers belonging to this instance.
@@ -1010,6 +1283,7 @@ async function runContainerAgentDaemon(group, input, options) {
1010
1283
  */
1011
1284
  export function cleanupInstanceContainers() {
1012
1285
  try {
1286
+ daemonBootstrapUntil.clear();
1013
1287
  let filterArgs;
1014
1288
  if (CONTAINER_INSTANCE_ID) {
1015
1289
  filterArgs = `--filter "label=dotclaw.instance=${CONTAINER_INSTANCE_ID}"`;