@dotsetlabs/dotclaw 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -10
- package/README.md +8 -4
- package/config-examples/runtime.json +34 -8
- package/config-examples/tool-policy.json +12 -2
- package/container/agent-runner/package-lock.json +2 -2
- package/container/agent-runner/package.json +1 -1
- package/container/agent-runner/src/agent-config.ts +19 -3
- package/container/agent-runner/src/container-protocol.ts +11 -0
- package/container/agent-runner/src/context-overflow-recovery.ts +39 -0
- package/container/agent-runner/src/index.ts +603 -165
- package/container/agent-runner/src/openrouter-input.ts +159 -0
- package/container/agent-runner/src/system-prompt.ts +13 -3
- package/container/agent-runner/src/tool-loop-policy.ts +741 -0
- package/container/agent-runner/src/tools.ts +211 -8
- package/dist/agent-context.d.ts +1 -0
- package/dist/agent-context.d.ts.map +1 -1
- package/dist/agent-context.js +21 -9
- package/dist/agent-context.js.map +1 -1
- package/dist/agent-execution.d.ts +2 -0
- package/dist/agent-execution.d.ts.map +1 -1
- package/dist/agent-execution.js +164 -15
- package/dist/agent-execution.js.map +1 -1
- package/dist/agent-semaphore.d.ts +24 -1
- package/dist/agent-semaphore.d.ts.map +1 -1
- package/dist/agent-semaphore.js +109 -20
- package/dist/agent-semaphore.js.map +1 -1
- package/dist/cli.js +3 -11
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/container-protocol.d.ts +22 -0
- package/dist/container-protocol.d.ts.map +1 -1
- package/dist/container-protocol.js.map +1 -1
- package/dist/container-runner.d.ts +7 -0
- package/dist/container-runner.d.ts.map +1 -1
- package/dist/container-runner.js +417 -143
- package/dist/container-runner.js.map +1 -1
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +46 -12
- package/dist/db.js.map +1 -1
- package/dist/error-messages.d.ts.map +1 -1
- package/dist/error-messages.js +18 -4
- package/dist/error-messages.js.map +1 -1
- package/dist/failover-policy.d.ts +41 -0
- package/dist/failover-policy.d.ts.map +1 -0
- package/dist/failover-policy.js +261 -0
- package/dist/failover-policy.js.map +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/ipc-dispatcher.d.ts.map +1 -1
- package/dist/ipc-dispatcher.js +27 -43
- package/dist/ipc-dispatcher.js.map +1 -1
- package/dist/mcp-config.d.ts +22 -0
- package/dist/mcp-config.d.ts.map +1 -0
- package/dist/mcp-config.js +94 -0
- package/dist/mcp-config.js.map +1 -0
- package/dist/memory-backend.d.ts +27 -0
- package/dist/memory-backend.d.ts.map +1 -0
- package/dist/memory-backend.js +112 -0
- package/dist/memory-backend.js.map +1 -0
- package/dist/memory-recall.d.ts.map +1 -1
- package/dist/memory-recall.js +135 -22
- package/dist/memory-recall.js.map +1 -1
- package/dist/memory-store.d.ts +1 -0
- package/dist/memory-store.d.ts.map +1 -1
- package/dist/memory-store.js +55 -7
- package/dist/memory-store.js.map +1 -1
- package/dist/message-pipeline.d.ts +24 -0
- package/dist/message-pipeline.d.ts.map +1 -1
- package/dist/message-pipeline.js +131 -27
- package/dist/message-pipeline.js.map +1 -1
- package/dist/metrics.d.ts +1 -0
- package/dist/metrics.d.ts.map +1 -1
- package/dist/metrics.js +9 -0
- package/dist/metrics.js.map +1 -1
- package/dist/providers/discord/discord-provider.d.ts.map +1 -1
- package/dist/providers/discord/discord-provider.js +72 -4
- package/dist/providers/discord/discord-provider.js.map +1 -1
- package/dist/providers/telegram/telegram-provider.d.ts.map +1 -1
- package/dist/providers/telegram/telegram-provider.js +65 -3
- package/dist/providers/telegram/telegram-provider.js.map +1 -1
- package/dist/recall-policy.d.ts +12 -0
- package/dist/recall-policy.d.ts.map +1 -0
- package/dist/recall-policy.js +89 -0
- package/dist/recall-policy.js.map +1 -0
- package/dist/runtime-config.d.ts +33 -0
- package/dist/runtime-config.d.ts.map +1 -1
- package/dist/runtime-config.js +109 -9
- package/dist/runtime-config.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +125 -33
- package/dist/streaming.js.map +1 -1
- package/dist/task-scheduler.d.ts.map +1 -1
- package/dist/task-scheduler.js +4 -2
- package/dist/task-scheduler.js.map +1 -1
- package/dist/tool-policy.d.ts.map +1 -1
- package/dist/tool-policy.js +26 -4
- package/dist/tool-policy.js.map +1 -1
- package/dist/trace-writer.d.ts +12 -0
- package/dist/trace-writer.d.ts.map +1 -1
- package/dist/trace-writer.js.map +1 -1
- package/dist/turn-hygiene.d.ts +14 -0
- package/dist/turn-hygiene.d.ts.map +1 -0
- package/dist/turn-hygiene.js +214 -0
- package/dist/turn-hygiene.js.map +1 -0
- package/dist/webhook.d.ts.map +1 -1
- package/dist/webhook.js +1 -0
- package/dist/webhook.js.map +1 -1
- package/package.json +15 -1
- package/scripts/benchmark-baseline.js +365 -0
- package/scripts/benchmark-harness.js +1413 -0
- package/scripts/benchmark-scenarios.js +301 -0
- package/scripts/canary-suite.js +123 -0
- package/scripts/generate-controlled-traces.js +230 -0
- package/scripts/release-slo-check.js +214 -0
- package/scripts/run-live-canary.js +339 -0
package/dist/container-runner.js
CHANGED
|
@@ -360,8 +360,50 @@ function isContainerRunning(name) {
|
|
|
360
360
|
}
|
|
361
361
|
}
|
|
362
362
|
const daemonConfig = runtime.host.container.daemon;
|
|
363
|
-
|
|
364
|
-
|
|
363
|
+
const DAEMON_BOOTSTRAP_GRACE_MS = Math.max(daemonConfig.gracePeriodMs, daemonConfig.heartbeatMaxAgeMs);
|
|
364
|
+
const daemonBootstrapUntil = new Map();
|
|
365
|
+
function markDaemonBootstrapping(groupFolder) {
|
|
366
|
+
daemonBootstrapUntil.set(groupFolder, Date.now() + DAEMON_BOOTSTRAP_GRACE_MS);
|
|
367
|
+
}
|
|
368
|
+
function isDaemonBootstrapping(groupFolder, nowMs = Date.now()) {
|
|
369
|
+
const until = daemonBootstrapUntil.get(groupFolder);
|
|
370
|
+
if (!until)
|
|
371
|
+
return false;
|
|
372
|
+
if (until <= nowMs) {
|
|
373
|
+
daemonBootstrapUntil.delete(groupFolder);
|
|
374
|
+
return false;
|
|
375
|
+
}
|
|
376
|
+
return true;
|
|
377
|
+
}
|
|
378
|
+
function consumeMemoryExtractionError(groupFolder) {
|
|
379
|
+
const statusPath = path.join(DATA_DIR, 'ipc', groupFolder, 'memory_extraction_error.json');
|
|
380
|
+
try {
|
|
381
|
+
if (!fs.existsSync(statusPath))
|
|
382
|
+
return undefined;
|
|
383
|
+
const raw = fs.readFileSync(statusPath, 'utf-8').trim();
|
|
384
|
+
try {
|
|
385
|
+
const parsed = JSON.parse(raw);
|
|
386
|
+
if (typeof parsed.error === 'string' && parsed.error.trim()) {
|
|
387
|
+
return parsed.error.trim();
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
catch {
|
|
391
|
+
if (raw)
|
|
392
|
+
return raw;
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
catch {
|
|
396
|
+
return undefined;
|
|
397
|
+
}
|
|
398
|
+
finally {
|
|
399
|
+
try {
|
|
400
|
+
fs.unlinkSync(statusPath);
|
|
401
|
+
}
|
|
402
|
+
catch { /* ignore */ }
|
|
403
|
+
}
|
|
404
|
+
return undefined;
|
|
405
|
+
}
|
|
406
|
+
function readDaemonStatusFromPath(statusPath) {
|
|
365
407
|
try {
|
|
366
408
|
if (!fs.existsSync(statusPath))
|
|
367
409
|
return null;
|
|
@@ -372,6 +414,10 @@ function readDaemonStatus(groupFolder) {
|
|
|
372
414
|
return null;
|
|
373
415
|
}
|
|
374
416
|
}
|
|
417
|
+
function readDaemonStatus(groupFolder) {
|
|
418
|
+
const statusPath = path.join(DATA_DIR, 'ipc', groupFolder, 'daemon_status.json');
|
|
419
|
+
return readDaemonStatusFromPath(statusPath);
|
|
420
|
+
}
|
|
375
421
|
/**
|
|
376
422
|
* 3-state health check: healthy / busy / dead
|
|
377
423
|
*
|
|
@@ -444,8 +490,7 @@ export function gracefulRestartDaemonContainer(group, isMain) {
|
|
|
444
490
|
}
|
|
445
491
|
}
|
|
446
492
|
// Start new container
|
|
447
|
-
|
|
448
|
-
ensureDaemonContainer(mounts, group.folder);
|
|
493
|
+
ensureDaemonContainer(group, isMain);
|
|
449
494
|
logger.info({ groupFolder: group.folder }, 'Daemon container restarted (graceful)');
|
|
450
495
|
}
|
|
451
496
|
/**
|
|
@@ -459,8 +504,7 @@ export function restartDaemonContainer(group, isMain) {
|
|
|
459
504
|
catch {
|
|
460
505
|
// Ignore if container doesn't exist
|
|
461
506
|
}
|
|
462
|
-
|
|
463
|
-
ensureDaemonContainer(mounts, group.folder);
|
|
507
|
+
ensureDaemonContainer(group, isMain);
|
|
464
508
|
logger.info({ groupFolder: group.folder }, 'Daemon container restarted (force)');
|
|
465
509
|
}
|
|
466
510
|
// Track daemon health check state
|
|
@@ -478,6 +522,7 @@ export function suppressHealthChecks(durationMs) {
|
|
|
478
522
|
export function resetUnhealthyDaemons() {
|
|
479
523
|
unhealthyDaemons.clear();
|
|
480
524
|
healthCheckRestartTimestamps.clear();
|
|
525
|
+
daemonBootstrapUntil.clear();
|
|
481
526
|
}
|
|
482
527
|
/**
|
|
483
528
|
* Perform health check on all daemon containers and restart if needed.
|
|
@@ -497,6 +542,7 @@ export function performDaemonHealthChecks(getRegisteredGroups, mainGroupFolder)
|
|
|
497
542
|
// Skip if container isn't running (may be intentionally stopped)
|
|
498
543
|
if (!isContainerRunning(containerName)) {
|
|
499
544
|
unhealthyDaemons.delete(group.folder);
|
|
545
|
+
daemonBootstrapUntil.delete(group.folder);
|
|
500
546
|
continue;
|
|
501
547
|
}
|
|
502
548
|
const health = checkDaemonHealth(group.folder);
|
|
@@ -579,28 +625,58 @@ export function stopDaemonHealthCheckLoop() {
|
|
|
579
625
|
healthCheckInterval = null;
|
|
580
626
|
}
|
|
581
627
|
}
|
|
582
|
-
function ensureDaemonContainer(
|
|
628
|
+
function ensureDaemonContainer(group, isMain) {
|
|
629
|
+
sanitizeGroupFolder(group.folder);
|
|
630
|
+
const groupFolder = group.folder;
|
|
583
631
|
const containerName = getDaemonContainerName(groupFolder);
|
|
584
|
-
|
|
632
|
+
const health = checkDaemonHealth(groupFolder);
|
|
633
|
+
if (health.state === 'healthy' || health.state === 'busy') {
|
|
585
634
|
return;
|
|
586
|
-
try {
|
|
587
|
-
execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
|
|
588
635
|
}
|
|
589
|
-
|
|
590
|
-
|
|
636
|
+
const running = isContainerRunning(containerName);
|
|
637
|
+
if (running && isDaemonBootstrapping(groupFolder)) {
|
|
638
|
+
return;
|
|
639
|
+
}
|
|
640
|
+
const mounts = buildVolumeMounts(group, isMain);
|
|
641
|
+
if (running) {
|
|
642
|
+
logger.warn({
|
|
643
|
+
groupFolder,
|
|
644
|
+
ageMs: health.ageMs,
|
|
645
|
+
daemonState: health.daemonState
|
|
646
|
+
}, 'Daemon container running but unhealthy; restarting before request');
|
|
647
|
+
try {
|
|
648
|
+
execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
|
|
649
|
+
}
|
|
650
|
+
catch {
|
|
651
|
+
// ignore cleanup failure and attempt fresh start
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
else {
|
|
655
|
+
daemonBootstrapUntil.delete(groupFolder);
|
|
656
|
+
try {
|
|
657
|
+
execSync(`docker rm -f ${containerName}`, { stdio: 'ignore', timeout: 15_000 });
|
|
658
|
+
}
|
|
659
|
+
catch {
|
|
660
|
+
// ignore if container doesn't exist
|
|
661
|
+
}
|
|
591
662
|
}
|
|
592
663
|
const args = buildDaemonArgs(mounts, containerName, groupFolder);
|
|
593
664
|
const result = spawnSync('docker', args, { stdio: 'ignore' });
|
|
594
665
|
if (result.status !== 0) {
|
|
666
|
+
// Concurrent starts can lose the name race while the daemon is actually up.
|
|
667
|
+
if (isContainerRunning(containerName)) {
|
|
668
|
+
markDaemonBootstrapping(groupFolder);
|
|
669
|
+
return;
|
|
670
|
+
}
|
|
595
671
|
logger.error({ groupFolder, status: result.status }, 'Failed to start daemon container');
|
|
596
672
|
throw new Error(`Failed to start daemon container for ${groupFolder}`);
|
|
597
673
|
}
|
|
674
|
+
markDaemonBootstrapping(groupFolder);
|
|
598
675
|
}
|
|
599
676
|
export function warmGroupContainer(group, isMain) {
|
|
600
677
|
if (CONTAINER_MODE !== 'daemon')
|
|
601
678
|
return;
|
|
602
|
-
|
|
603
|
-
ensureDaemonContainer(mounts, group.folder);
|
|
679
|
+
ensureDaemonContainer(group, isMain);
|
|
604
680
|
}
|
|
605
681
|
function writeAgentRequest(groupFolder, payload) {
|
|
606
682
|
const id = generateId('agent');
|
|
@@ -611,64 +687,206 @@ function writeAgentRequest(groupFolder, payload) {
|
|
|
611
687
|
const requestPath = path.join(requestsDir, `${id}.json`);
|
|
612
688
|
const responsePath = path.join(responsesDir, `${id}.json`);
|
|
613
689
|
const tempPath = `${requestPath}.tmp`;
|
|
614
|
-
fs.writeFileSync(tempPath, JSON.stringify({ id, input: payload }
|
|
690
|
+
fs.writeFileSync(tempPath, JSON.stringify({ id, input: payload }));
|
|
615
691
|
fs.renameSync(tempPath, requestPath);
|
|
616
692
|
return { id, requestPath, responsePath };
|
|
617
693
|
}
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
694
|
+
class FileChangeSignal {
|
|
695
|
+
abortSignal;
|
|
696
|
+
watcher = null;
|
|
697
|
+
pending = false;
|
|
698
|
+
waiter = null;
|
|
699
|
+
abortHandler;
|
|
700
|
+
constructor(watchDir, abortSignal) {
|
|
701
|
+
this.abortSignal = abortSignal;
|
|
702
|
+
try {
|
|
703
|
+
this.watcher = fs.watch(watchDir, { persistent: false }, () => this.notify());
|
|
704
|
+
this.watcher.on('error', () => {
|
|
705
|
+
if (this.watcher) {
|
|
706
|
+
try {
|
|
707
|
+
this.watcher.close();
|
|
708
|
+
}
|
|
709
|
+
catch { /* ignore */ }
|
|
710
|
+
this.watcher = null;
|
|
711
|
+
}
|
|
712
|
+
this.notify();
|
|
713
|
+
});
|
|
623
714
|
}
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
715
|
+
catch {
|
|
716
|
+
this.watcher = null;
|
|
717
|
+
}
|
|
718
|
+
if (this.abortSignal) {
|
|
719
|
+
this.abortHandler = () => this.notify();
|
|
720
|
+
this.abortSignal.addEventListener('abort', this.abortHandler);
|
|
721
|
+
}
|
|
722
|
+
else {
|
|
723
|
+
this.abortHandler = null;
|
|
724
|
+
}
|
|
725
|
+
}
|
|
726
|
+
notify() {
|
|
727
|
+
if (this.waiter) {
|
|
728
|
+
const wake = this.waiter;
|
|
729
|
+
this.waiter = null;
|
|
730
|
+
wake();
|
|
731
|
+
return;
|
|
732
|
+
}
|
|
733
|
+
this.pending = true;
|
|
734
|
+
}
|
|
735
|
+
async wait(timeoutMs) {
|
|
736
|
+
if (this.pending) {
|
|
737
|
+
this.pending = false;
|
|
738
|
+
return;
|
|
739
|
+
}
|
|
740
|
+
await new Promise((resolve) => {
|
|
741
|
+
let settled = false;
|
|
742
|
+
const finish = () => {
|
|
743
|
+
if (settled)
|
|
744
|
+
return;
|
|
745
|
+
settled = true;
|
|
746
|
+
if (this.waiter === finish)
|
|
747
|
+
this.waiter = null;
|
|
748
|
+
clearTimeout(timer);
|
|
749
|
+
resolve();
|
|
750
|
+
};
|
|
751
|
+
const timer = setTimeout(finish, Math.max(10, timeoutMs));
|
|
752
|
+
this.waiter = finish;
|
|
753
|
+
});
|
|
754
|
+
}
|
|
755
|
+
close() {
|
|
756
|
+
if (this.abortSignal && this.abortHandler) {
|
|
757
|
+
this.abortSignal.removeEventListener('abort', this.abortHandler);
|
|
758
|
+
}
|
|
759
|
+
if (this.watcher) {
|
|
760
|
+
this.watcher.close();
|
|
761
|
+
this.watcher = null;
|
|
762
|
+
}
|
|
763
|
+
this.notify();
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
function isContainerOutputPayload(value) {
|
|
767
|
+
if (!value || typeof value !== 'object')
|
|
768
|
+
return false;
|
|
769
|
+
const status = value.status;
|
|
770
|
+
return status === 'success' || status === 'error';
|
|
771
|
+
}
|
|
772
|
+
export async function waitForAgentResponse(responsePath, timeoutMs, abortSignal, options) {
|
|
773
|
+
const RESPONSE_PARSE_MAX_RETRIES = 8;
|
|
774
|
+
const pollMs = Math.max(25, CONTAINER_DAEMON_POLL_MS);
|
|
775
|
+
const parseRetryMs = Math.max(20, Math.floor(pollMs / 2));
|
|
776
|
+
const maxExtensionMs = Math.max(0, Math.floor(options?.maxExtensionMs
|
|
777
|
+
?? Math.min(120_000, Math.max(30_000, Math.floor(timeoutMs * 0.5)))));
|
|
778
|
+
const start = Date.now();
|
|
779
|
+
let deadline = start + timeoutMs;
|
|
780
|
+
let extendedMs = 0;
|
|
781
|
+
const waitSignal = new FileChangeSignal(path.dirname(responsePath), abortSignal);
|
|
782
|
+
let parseFailures = 0;
|
|
783
|
+
let lastParseError = '';
|
|
784
|
+
try {
|
|
785
|
+
for (;;) {
|
|
786
|
+
while (Date.now() < deadline) {
|
|
787
|
+
if (abortSignal?.aborted) {
|
|
788
|
+
throw new Error('Agent run preempted');
|
|
635
789
|
}
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
try {
|
|
639
|
-
const parsed = JSON.parse(raw);
|
|
640
|
-
fs.unlinkSync(responsePath);
|
|
641
|
-
return parsed;
|
|
642
|
-
}
|
|
643
|
-
catch (parseErr) {
|
|
644
|
-
// Partial read during atomic rename can produce invalid JSON — retry up to 3 times
|
|
645
|
-
let retryParsed = null;
|
|
646
|
-
for (let parseRetry = 0; parseRetry < 3; parseRetry++) {
|
|
647
|
-
await new Promise(resolve => setTimeout(resolve, CONTAINER_DAEMON_POLL_MS));
|
|
790
|
+
if (fs.existsSync(responsePath)) {
|
|
791
|
+
let raw;
|
|
648
792
|
try {
|
|
649
|
-
|
|
650
|
-
retryParsed = JSON.parse(retryRaw);
|
|
651
|
-
break;
|
|
793
|
+
raw = fs.readFileSync(responsePath, 'utf-8');
|
|
652
794
|
}
|
|
653
|
-
catch {
|
|
654
|
-
|
|
795
|
+
catch (readErr) {
|
|
796
|
+
const code = readErr?.code;
|
|
797
|
+
if (code === 'ENOENT') {
|
|
798
|
+
await waitSignal.wait(pollMs);
|
|
799
|
+
continue;
|
|
800
|
+
}
|
|
801
|
+
throw readErr;
|
|
655
802
|
}
|
|
656
|
-
}
|
|
657
|
-
if (retryParsed) {
|
|
658
803
|
try {
|
|
659
|
-
|
|
804
|
+
const parsed = JSON.parse(raw);
|
|
805
|
+
if (!isContainerOutputPayload(parsed)) {
|
|
806
|
+
throw new Error('Missing required "status" field');
|
|
807
|
+
}
|
|
808
|
+
try {
|
|
809
|
+
fs.unlinkSync(responsePath);
|
|
810
|
+
}
|
|
811
|
+
catch { /* ignore */ }
|
|
812
|
+
return parsed;
|
|
813
|
+
}
|
|
814
|
+
catch (parseErr) {
|
|
815
|
+
parseFailures += 1;
|
|
816
|
+
lastParseError = parseErr instanceof Error ? parseErr.message : String(parseErr);
|
|
817
|
+
if (parseFailures >= RESPONSE_PARSE_MAX_RETRIES) {
|
|
818
|
+
try {
|
|
819
|
+
fs.unlinkSync(responsePath);
|
|
820
|
+
}
|
|
821
|
+
catch { /* ignore */ }
|
|
822
|
+
throw new Error(`Failed to parse daemon response after ${parseFailures} attempts: ${lastParseError}`);
|
|
823
|
+
}
|
|
824
|
+
await waitSignal.wait(parseRetryMs);
|
|
825
|
+
if (fs.existsSync(responsePath)) {
|
|
826
|
+
let stat;
|
|
827
|
+
try {
|
|
828
|
+
stat = fs.statSync(responsePath);
|
|
829
|
+
}
|
|
830
|
+
catch (statErr) {
|
|
831
|
+
const code = statErr?.code;
|
|
832
|
+
if (code === 'ENOENT')
|
|
833
|
+
continue;
|
|
834
|
+
throw statErr;
|
|
835
|
+
}
|
|
836
|
+
if (Date.now() - stat.mtimeMs > 5_000) {
|
|
837
|
+
try {
|
|
838
|
+
fs.unlinkSync(responsePath);
|
|
839
|
+
}
|
|
840
|
+
catch { /* ignore */ }
|
|
841
|
+
throw new Error(`Stale daemon response file: ${lastParseError}`);
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
continue;
|
|
660
845
|
}
|
|
661
|
-
catch { /* ignore */ }
|
|
662
|
-
return retryParsed;
|
|
663
846
|
}
|
|
664
|
-
|
|
665
|
-
|
|
847
|
+
await waitSignal.wait(pollMs);
|
|
848
|
+
}
|
|
849
|
+
// Soft timeout extension: if daemon is actively processing this request with fresh heartbeat,
|
|
850
|
+
// grant bounded extra time to avoid false timeout/restart thrash on long turns.
|
|
851
|
+
const groupFolder = options?.groupFolder;
|
|
852
|
+
const requestId = options?.requestId;
|
|
853
|
+
const daemonStatusPath = options?.daemonStatusPath;
|
|
854
|
+
if ((groupFolder || daemonStatusPath) && maxExtensionMs > 0 && extendedMs < maxExtensionMs) {
|
|
855
|
+
const status = daemonStatusPath
|
|
856
|
+
? readDaemonStatusFromPath(daemonStatusPath)
|
|
857
|
+
: readDaemonStatus(groupFolder);
|
|
858
|
+
if (status && status.state === 'processing') {
|
|
859
|
+
if (!requestId || !status.request_id || status.request_id === requestId) {
|
|
860
|
+
const now = Date.now();
|
|
861
|
+
const heartbeatAgeMs = Number.isFinite(status.ts) ? now - status.ts : Number.POSITIVE_INFINITY;
|
|
862
|
+
const freshnessBudgetMs = Math.max(daemonConfig.heartbeatMaxAgeMs * 2, 15_000);
|
|
863
|
+
if (heartbeatAgeMs <= freshnessBudgetMs) {
|
|
864
|
+
const remainingExtensionMs = maxExtensionMs - extendedMs;
|
|
865
|
+
if (remainingExtensionMs > 0) {
|
|
866
|
+
const stepExtensionMs = Math.min(remainingExtensionMs, Math.max(15_000, Math.floor(timeoutMs * 0.25)));
|
|
867
|
+
extendedMs += stepExtensionMs;
|
|
868
|
+
deadline = now + stepExtensionMs;
|
|
869
|
+
logger.warn({
|
|
870
|
+
groupFolder,
|
|
871
|
+
requestId: requestId || status.request_id || null,
|
|
872
|
+
heartbeatAgeMs,
|
|
873
|
+
extendedMs,
|
|
874
|
+
maxExtensionMs
|
|
875
|
+
}, 'Extending daemon response wait for active processing request');
|
|
876
|
+
continue;
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
}
|
|
666
880
|
}
|
|
667
|
-
catch { /* ignore */ }
|
|
668
|
-
throw new Error(`Failed to parse daemon response: ${parseErr instanceof Error ? parseErr.message : String(parseErr)}`);
|
|
669
881
|
}
|
|
882
|
+
break;
|
|
670
883
|
}
|
|
671
|
-
|
|
884
|
+
}
|
|
885
|
+
finally {
|
|
886
|
+
waitSignal.close();
|
|
887
|
+
}
|
|
888
|
+
if (lastParseError) {
|
|
889
|
+
throw new Error(`Daemon response timeout after ${timeoutMs}ms (last parse error: ${lastParseError})`);
|
|
672
890
|
}
|
|
673
891
|
throw new Error(`Daemon response timeout after ${timeoutMs}ms`);
|
|
674
892
|
}
|
|
@@ -687,6 +905,20 @@ function removeContainerById(containerId, reason) {
|
|
|
687
905
|
logger.warn({ containerId, reason }, 'Removing container');
|
|
688
906
|
spawn('docker', ['rm', '-f', containerId], { stdio: 'ignore' });
|
|
689
907
|
}
|
|
908
|
+
export function shouldRetryDaemonRequestError(errorMessage) {
|
|
909
|
+
if (!errorMessage)
|
|
910
|
+
return false;
|
|
911
|
+
const lower = errorMessage.toLowerCase();
|
|
912
|
+
if (/preempted|aborted|interrupted|cancelled|canceled/.test(lower)) {
|
|
913
|
+
return false;
|
|
914
|
+
}
|
|
915
|
+
return /daemon response timeout|failed to parse daemon response|stale daemon response file/.test(lower);
|
|
916
|
+
}
|
|
917
|
+
function isDaemonTimeoutError(errorMessage) {
|
|
918
|
+
if (!errorMessage)
|
|
919
|
+
return false;
|
|
920
|
+
return /daemon response timeout/i.test(errorMessage);
|
|
921
|
+
}
|
|
690
922
|
export async function runContainerAgent(group, input, options) {
|
|
691
923
|
sanitizeGroupFolder(group.folder);
|
|
692
924
|
if (CONTAINER_MODE === 'daemon') {
|
|
@@ -823,31 +1055,35 @@ export async function runContainerAgent(group, input, options) {
|
|
|
823
1055
|
}
|
|
824
1056
|
cleanupCid();
|
|
825
1057
|
const duration = Date.now() - startTime;
|
|
826
|
-
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
827
|
-
const logFile = path.join(logsDir, `container-${timestamp}.log`);
|
|
828
1058
|
const isVerbose = runtime.host.logLevel === 'debug' || runtime.host.logLevel === 'trace';
|
|
829
|
-
const
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
`
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
if (
|
|
846
|
-
logLines.push(`===
|
|
1059
|
+
const shouldWriteLog = isVerbose || code !== 0 || stdoutTruncated || stderrTruncated;
|
|
1060
|
+
let logFile;
|
|
1061
|
+
if (shouldWriteLog) {
|
|
1062
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
1063
|
+
logFile = path.join(logsDir, `container-${timestamp}.log`);
|
|
1064
|
+
const logLines = [
|
|
1065
|
+
`=== Container Run Log ===`,
|
|
1066
|
+
`Timestamp: ${new Date().toISOString()}`,
|
|
1067
|
+
`Group: ${group.name}`,
|
|
1068
|
+
`IsMain: ${input.isMain}`,
|
|
1069
|
+
`Duration: ${duration}ms`,
|
|
1070
|
+
`Exit Code: ${code}`,
|
|
1071
|
+
`Stdout Truncated: ${stdoutTruncated}`,
|
|
1072
|
+
`Stderr Truncated: ${stderrTruncated}`,
|
|
1073
|
+
``
|
|
1074
|
+
];
|
|
1075
|
+
if (isVerbose) {
|
|
1076
|
+
logLines.push(`=== Input ===`, JSON.stringify(input, null, 2), ``, `=== Container Args ===`, containerArgs.join(' '), ``, `=== Mounts ===`, mounts.map(m => `${m.hostPath} -> ${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``, `=== Stderr${stderrTruncated ? ' (TRUNCATED)' : ''} ===`, stderr, ``, `=== Stdout${stdoutTruncated ? ' (TRUNCATED)' : ''} ===`, stdout);
|
|
847
1077
|
}
|
|
1078
|
+
else {
|
|
1079
|
+
logLines.push(`=== Input Summary ===`, `Prompt length: ${input.prompt.length} chars`, `Session ID: ${input.sessionId || 'new'}`, ``, `=== Mounts ===`, mounts.map(m => `${m.containerPath}${m.readonly ? ' (ro)' : ''}`).join('\n'), ``);
|
|
1080
|
+
if (code !== 0) {
|
|
1081
|
+
logLines.push(`=== Stderr (last 500 chars) ===`, stderr.slice(-500), ``);
|
|
1082
|
+
}
|
|
1083
|
+
}
|
|
1084
|
+
fs.writeFileSync(logFile, logLines.join('\n'));
|
|
1085
|
+
logger.debug({ logFile, verbose: isVerbose }, 'Container log written');
|
|
848
1086
|
}
|
|
849
|
-
fs.writeFileSync(logFile, logLines.join('\n'));
|
|
850
|
-
logger.debug({ logFile, verbose: isVerbose }, 'Container log written');
|
|
851
1087
|
if (code !== 0) {
|
|
852
1088
|
logger.error({
|
|
853
1089
|
group: group.name,
|
|
@@ -928,81 +1164,118 @@ async function runContainerAgentDaemon(group, input, options) {
|
|
|
928
1164
|
const startTime = Date.now();
|
|
929
1165
|
const groupDir = path.join(GROUPS_DIR, group.folder);
|
|
930
1166
|
fs.mkdirSync(groupDir, { recursive: true });
|
|
931
|
-
const mounts = buildVolumeMounts(group, input.isMain);
|
|
932
|
-
ensureDaemonContainer(mounts, group.folder);
|
|
933
|
-
const { id: requestId, responsePath, requestPath } = writeAgentRequest(group.folder, input);
|
|
934
|
-
const requestsDir = path.join(DATA_DIR, 'ipc', group.folder, 'agent_requests');
|
|
935
1167
|
const timeoutMs = options?.timeoutMs || group.containerConfig?.timeout || CONTAINER_TIMEOUT;
|
|
936
1168
|
const abortSignal = options?.abortSignal;
|
|
937
|
-
const
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
1169
|
+
const requestsDir = path.join(DATA_DIR, 'ipc', group.folder, 'agent_requests');
|
|
1170
|
+
const maxAttempts = 2;
|
|
1171
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
|
|
1172
|
+
ensureDaemonContainer(group, input.isMain);
|
|
1173
|
+
const { id: requestId, responsePath, requestPath } = writeAgentRequest(group.folder, input);
|
|
1174
|
+
let aborted = false;
|
|
1175
|
+
const abortHandler = () => {
|
|
1176
|
+
aborted = true;
|
|
1177
|
+
logger.warn({ group: group.name }, 'Daemon run preempted');
|
|
1178
|
+
const cancelPath = path.join(requestsDir, `${requestId}.cancel`);
|
|
1179
|
+
try {
|
|
1180
|
+
fs.writeFileSync(cancelPath, '');
|
|
1181
|
+
}
|
|
1182
|
+
catch { /* ignore */ }
|
|
1183
|
+
try {
|
|
1184
|
+
if (fs.existsSync(requestPath))
|
|
1185
|
+
fs.unlinkSync(requestPath);
|
|
1186
|
+
}
|
|
1187
|
+
catch {
|
|
1188
|
+
// ignore cleanup failure
|
|
1189
|
+
}
|
|
1190
|
+
try {
|
|
1191
|
+
if (fs.existsSync(responsePath))
|
|
1192
|
+
fs.unlinkSync(responsePath);
|
|
1193
|
+
}
|
|
1194
|
+
catch {
|
|
1195
|
+
// ignore cleanup failure
|
|
1196
|
+
}
|
|
1197
|
+
};
|
|
1198
|
+
if (abortSignal) {
|
|
1199
|
+
if (abortSignal.aborted) {
|
|
1200
|
+
abortHandler();
|
|
1201
|
+
return {
|
|
1202
|
+
status: 'error',
|
|
1203
|
+
result: null,
|
|
1204
|
+
error: 'Daemon run preempted'
|
|
1205
|
+
};
|
|
1206
|
+
}
|
|
1207
|
+
abortSignal.addEventListener('abort', abortHandler, { once: true });
|
|
951
1208
|
}
|
|
952
1209
|
try {
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
1210
|
+
const output = await waitForAgentResponse(responsePath, timeoutMs, abortSignal, {
|
|
1211
|
+
groupFolder: group.folder,
|
|
1212
|
+
requestId
|
|
1213
|
+
});
|
|
1214
|
+
const memoryExtractionError = consumeMemoryExtractionError(group.folder);
|
|
1215
|
+
return {
|
|
1216
|
+
...output,
|
|
1217
|
+
memory_extraction_error: output.memory_extraction_error || memoryExtractionError,
|
|
1218
|
+
latency_ms: output.latency_ms ?? (Date.now() - startTime)
|
|
1219
|
+
};
|
|
958
1220
|
}
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
1221
|
+
catch (err) {
|
|
1222
|
+
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
1223
|
+
logger.error({ group: group.name, error: errorMessage, attempt }, 'Daemon agent error');
|
|
1224
|
+
try {
|
|
1225
|
+
if (fs.existsSync(requestPath))
|
|
1226
|
+
fs.unlinkSync(requestPath);
|
|
1227
|
+
}
|
|
1228
|
+
catch {
|
|
1229
|
+
// ignore cleanup failure
|
|
1230
|
+
}
|
|
1231
|
+
try {
|
|
1232
|
+
if (fs.existsSync(responsePath))
|
|
1233
|
+
fs.unlinkSync(responsePath);
|
|
1234
|
+
}
|
|
1235
|
+
catch {
|
|
1236
|
+
// ignore cleanup failure
|
|
1237
|
+
}
|
|
1238
|
+
if (!aborted && !abortSignal?.aborted && attempt < maxAttempts && shouldRetryDaemonRequestError(errorMessage)) {
|
|
1239
|
+
logger.warn({ group: group.name, attempt, error: errorMessage }, 'Retrying daemon request after restart');
|
|
1240
|
+
try {
|
|
1241
|
+
gracefulRestartDaemonContainer(group, input.isMain);
|
|
1242
|
+
// Timeout errors usually indicate a stuck/slow daemon turn.
|
|
1243
|
+
// Avoid spending another full request timeout on the same model;
|
|
1244
|
+
// restart once and return the timeout so host-level failover can act.
|
|
1245
|
+
if (isDaemonTimeoutError(errorMessage)) {
|
|
1246
|
+
return {
|
|
1247
|
+
status: 'error',
|
|
1248
|
+
result: null,
|
|
1249
|
+
error: errorMessage
|
|
1250
|
+
};
|
|
1251
|
+
}
|
|
1252
|
+
continue;
|
|
1253
|
+
}
|
|
1254
|
+
catch (restartErr) {
|
|
1255
|
+
logger.error({
|
|
1256
|
+
group: group.name,
|
|
1257
|
+
attempt,
|
|
1258
|
+
error: restartErr instanceof Error ? restartErr.message : String(restartErr)
|
|
1259
|
+
}, 'Daemon restart failed during retry recovery');
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
963
1262
|
return {
|
|
964
1263
|
status: 'error',
|
|
965
1264
|
result: null,
|
|
966
|
-
error:
|
|
1265
|
+
error: errorMessage
|
|
967
1266
|
};
|
|
968
1267
|
}
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
return {
|
|
974
|
-
...output,
|
|
975
|
-
latency_ms: output.latency_ms ?? (Date.now() - startTime)
|
|
976
|
-
};
|
|
977
|
-
}
|
|
978
|
-
catch (err) {
|
|
979
|
-
const errorMessage = err instanceof Error ? err.message : String(err);
|
|
980
|
-
logger.error({ group: group.name, error: errorMessage }, 'Daemon agent error');
|
|
981
|
-
try {
|
|
982
|
-
if (fs.existsSync(requestPath))
|
|
983
|
-
fs.unlinkSync(requestPath);
|
|
984
|
-
}
|
|
985
|
-
catch {
|
|
986
|
-
// ignore cleanup failure
|
|
987
|
-
}
|
|
988
|
-
try {
|
|
989
|
-
if (fs.existsSync(responsePath))
|
|
990
|
-
fs.unlinkSync(responsePath);
|
|
991
|
-
}
|
|
992
|
-
catch {
|
|
993
|
-
// ignore cleanup failure
|
|
994
|
-
}
|
|
995
|
-
return {
|
|
996
|
-
status: 'error',
|
|
997
|
-
result: null,
|
|
998
|
-
error: errorMessage
|
|
999
|
-
};
|
|
1000
|
-
}
|
|
1001
|
-
finally {
|
|
1002
|
-
if (abortSignal) {
|
|
1003
|
-
abortSignal.removeEventListener('abort', abortHandler);
|
|
1268
|
+
finally {
|
|
1269
|
+
if (abortSignal) {
|
|
1270
|
+
abortSignal.removeEventListener('abort', abortHandler);
|
|
1271
|
+
}
|
|
1004
1272
|
}
|
|
1005
1273
|
}
|
|
1274
|
+
return {
|
|
1275
|
+
status: 'error',
|
|
1276
|
+
result: null,
|
|
1277
|
+
error: 'Daemon request failed'
|
|
1278
|
+
};
|
|
1006
1279
|
}
|
|
1007
1280
|
/**
|
|
1008
1281
|
* Stop all Docker containers belonging to this instance.
|
|
@@ -1010,6 +1283,7 @@ async function runContainerAgentDaemon(group, input, options) {
|
|
|
1010
1283
|
*/
|
|
1011
1284
|
export function cleanupInstanceContainers() {
|
|
1012
1285
|
try {
|
|
1286
|
+
daemonBootstrapUntil.clear();
|
|
1013
1287
|
let filterArgs;
|
|
1014
1288
|
if (CONTAINER_INSTANCE_ID) {
|
|
1015
1289
|
filterArgs = `--filter "label=dotclaw.instance=${CONTAINER_INSTANCE_ID}"`;
|