@botcord/daemon 0.2.74 → 0.2.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cloud-auth.d.ts +47 -0
- package/dist/cloud-auth.js +51 -0
- package/dist/cloud-daemon.d.ts +43 -0
- package/dist/cloud-daemon.js +252 -0
- package/dist/cloud-mode.d.ts +45 -0
- package/dist/cloud-mode.js +55 -0
- package/dist/cloud-settle.d.ts +81 -0
- package/dist/cloud-settle.js +100 -0
- package/dist/daemon-singleton.d.ts +26 -0
- package/dist/daemon-singleton.js +91 -0
- package/dist/daemon.d.ts +1 -1
- package/dist/daemon.js +15 -6
- package/dist/doctor.d.ts +4 -1
- package/dist/doctor.js +15 -4
- package/dist/gateway/channels/botcord.d.ts +1 -1
- package/dist/gateway/channels/botcord.js +48 -5
- package/dist/gateway/dispatcher.d.ts +34 -1
- package/dist/gateway/dispatcher.js +277 -20
- package/dist/gateway/gateway.d.ts +9 -1
- package/dist/gateway/gateway.js +4 -1
- package/dist/gateway/runtime-errors.d.ts +6 -0
- package/dist/gateway/runtime-errors.js +14 -0
- package/dist/gateway/runtimes/claude-code.d.ts +8 -0
- package/dist/gateway/runtimes/claude-code.js +92 -4
- package/dist/gateway/runtimes/deepseek-tui.js +19 -5
- package/dist/gateway/transcript.d.ts +1 -1
- package/dist/gateway/types.d.ts +33 -0
- package/dist/index.js +71 -80
- package/dist/provision.d.ts +2 -0
- package/dist/provision.js +39 -1
- package/dist/status-render.js +17 -0
- package/package.json +2 -2
- package/src/__tests__/cloud-auth.test.ts +42 -0
- package/src/__tests__/cloud-daemon.test.ts +237 -0
- package/src/__tests__/cloud-mode.test.ts +65 -0
- package/src/__tests__/cloud-settle.test.ts +287 -0
- package/src/__tests__/daemon-singleton.test.ts +89 -0
- package/src/__tests__/doctor.test.ts +34 -0
- package/src/__tests__/runtime-discovery.test.ts +90 -0
- package/src/__tests__/status-render.test.ts +34 -0
- package/src/cloud-auth.ts +78 -0
- package/src/cloud-daemon.ts +338 -0
- package/src/cloud-mode.ts +70 -0
- package/src/cloud-settle.ts +182 -0
- package/src/daemon-singleton.ts +122 -0
- package/src/daemon.ts +18 -5
- package/src/doctor.ts +18 -5
- package/src/gateway/__tests__/botcord-channel.test.ts +74 -0
- package/src/gateway/__tests__/claude-code-adapter.test.ts +101 -1
- package/src/gateway/__tests__/deepseek-tui-adapter.test.ts +19 -0
- package/src/gateway/__tests__/dispatcher.test.ts +120 -0
- package/src/gateway/channels/botcord.ts +54 -7
- package/src/gateway/dispatcher.ts +354 -21
- package/src/gateway/gateway.ts +16 -1
- package/src/gateway/runtime-errors.ts +15 -0
- package/src/gateway/runtimes/claude-code.ts +98 -2
- package/src/gateway/runtimes/deepseek-tui.ts +23 -5
- package/src/gateway/transcript.ts +1 -1
- package/src/gateway/types.ts +34 -0
- package/src/index.ts +83 -74
- package/src/provision.ts +45 -1
- package/src/status-render.ts +24 -0
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
|
+
import { looksLikeRuntimeAuthFailure } from "./runtime-errors.js";
|
|
2
3
|
import { resolveRoute } from "./router.js";
|
|
3
4
|
import { sessionKey } from "./session-store.js";
|
|
4
5
|
import { truncateTextField, } from "./transcript.js";
|
|
5
6
|
const DEFAULT_TURN_TIMEOUT_MS = 30 * 60 * 1000;
|
|
7
|
+
const DEFAULT_RUNTIME_AUTH_FAILURE_THRESHOLD = 3;
|
|
8
|
+
const DEFAULT_RUNTIME_AUTH_FAILURE_COOLDOWN_MS = 10 * 60 * 1000;
|
|
6
9
|
/**
|
|
7
10
|
* Owner-chat room prefix. Reply-text gating: only rooms with this prefix get
|
|
8
11
|
* `result.text` forwarded to the channel; in every other room the runtime's
|
|
@@ -101,6 +104,26 @@ function redactSecretString(value) {
|
|
|
101
104
|
.replace(/\b(token=)[^\s"']+/gi, "$1[REDACTED]")
|
|
102
105
|
.replace(/\b(drt_|dit_|gho_)[A-Za-z0-9_-]+/g, "$1[REDACTED]");
|
|
103
106
|
}
|
|
107
|
+
function extractCloudRunBudget(msg) {
|
|
108
|
+
const envelope = msg.raw?.envelope;
|
|
109
|
+
if (envelope?.type !== "cloud_run")
|
|
110
|
+
return undefined;
|
|
111
|
+
const budget = envelope.payload?.cloud_run?.budget;
|
|
112
|
+
if (!budget)
|
|
113
|
+
return undefined;
|
|
114
|
+
const out = {};
|
|
115
|
+
if (typeof budget.max_wall_time_seconds === "number" &&
|
|
116
|
+
Number.isFinite(budget.max_wall_time_seconds) &&
|
|
117
|
+
budget.max_wall_time_seconds > 0) {
|
|
118
|
+
out.maxWallTimeMs = Math.floor(budget.max_wall_time_seconds * 1000);
|
|
119
|
+
}
|
|
120
|
+
if (typeof budget.max_tool_calls === "number" &&
|
|
121
|
+
Number.isFinite(budget.max_tool_calls) &&
|
|
122
|
+
budget.max_tool_calls > 0) {
|
|
123
|
+
out.maxToolCalls = Math.floor(budget.max_tool_calls);
|
|
124
|
+
}
|
|
125
|
+
return out.maxWallTimeMs !== undefined || out.maxToolCalls !== undefined ? out : undefined;
|
|
126
|
+
}
|
|
104
127
|
/**
|
|
105
128
|
* Reason carried on `AbortController.abort()` when a cancel-previous wave
|
|
106
129
|
* is taking over the slot. Distinguishing this from a timeout abort lets
|
|
@@ -137,10 +160,14 @@ export class Dispatcher {
|
|
|
137
160
|
sessionStore;
|
|
138
161
|
log;
|
|
139
162
|
turnTimeoutMs;
|
|
163
|
+
runtimeAuthFailureThreshold;
|
|
164
|
+
runtimeAuthFailureCooldownMs;
|
|
140
165
|
buildSystemContext;
|
|
141
166
|
buildMemoryContext;
|
|
142
167
|
onInbound;
|
|
143
168
|
onOutbound;
|
|
169
|
+
onTurnComplete;
|
|
170
|
+
onRuntimeCircuitBreakerChange;
|
|
144
171
|
composeUserTurn;
|
|
145
172
|
managedRoutes;
|
|
146
173
|
attentionGate;
|
|
@@ -148,6 +175,7 @@ export class Dispatcher {
|
|
|
148
175
|
transcript;
|
|
149
176
|
queues = new Map();
|
|
150
177
|
deferredMultimodal = new Map();
|
|
178
|
+
runtimeAuthFailures = new Map();
|
|
151
179
|
/**
|
|
152
180
|
* Last `/hub/typing` ping timestamp per (accountId, conversationId).
|
|
153
181
|
* Used to debounce cancel-previous bursts so we don't trip Hub's 20/min
|
|
@@ -161,10 +189,16 @@ export class Dispatcher {
|
|
|
161
189
|
this.sessionStore = opts.sessionStore;
|
|
162
190
|
this.log = opts.log;
|
|
163
191
|
this.turnTimeoutMs = opts.turnTimeoutMs ?? DEFAULT_TURN_TIMEOUT_MS;
|
|
192
|
+
this.runtimeAuthFailureThreshold =
|
|
193
|
+
opts.runtimeAuthFailureThreshold ?? DEFAULT_RUNTIME_AUTH_FAILURE_THRESHOLD;
|
|
194
|
+
this.runtimeAuthFailureCooldownMs =
|
|
195
|
+
opts.runtimeAuthFailureCooldownMs ?? DEFAULT_RUNTIME_AUTH_FAILURE_COOLDOWN_MS;
|
|
164
196
|
this.buildSystemContext = opts.buildSystemContext;
|
|
165
197
|
this.buildMemoryContext = opts.buildMemoryContext;
|
|
166
198
|
this.onInbound = opts.onInbound;
|
|
167
199
|
this.onOutbound = opts.onOutbound;
|
|
200
|
+
this.onTurnComplete = opts.onTurnComplete;
|
|
201
|
+
this.onRuntimeCircuitBreakerChange = opts.onRuntimeCircuitBreakerChange;
|
|
168
202
|
this.composeUserTurn = opts.composeUserTurn;
|
|
169
203
|
this.managedRoutes = opts.managedRoutes;
|
|
170
204
|
this.attentionGate = opts.attentionGate;
|
|
@@ -368,6 +402,11 @@ export class Dispatcher {
|
|
|
368
402
|
fallback: "raw_text",
|
|
369
403
|
});
|
|
370
404
|
}
|
|
405
|
+
const openAuthBreaker = this.openRuntimeAuthBreaker(dispatchRoute, dispatchMsg);
|
|
406
|
+
if (openAuthBreaker) {
|
|
407
|
+
await this.skipRuntimeForAuthBreaker(openAuthBreaker, dispatchRoute, dispatchMsg, dispatchChannel, dispatchTurnId);
|
|
408
|
+
return;
|
|
409
|
+
}
|
|
371
410
|
if (mode === "cancel-previous") {
|
|
372
411
|
await this.runCancelPrevious(queueKey, dispatchRoute, text, dispatchMsg, dispatchChannel, dispatchTurnId, mergedFromDeferredTurnIds);
|
|
373
412
|
}
|
|
@@ -384,6 +423,15 @@ export class Dispatcher {
|
|
|
384
423
|
}
|
|
385
424
|
return out;
|
|
386
425
|
}
|
|
426
|
+
runtimeCircuitBreakers() {
|
|
427
|
+
this.pruneExpiredRuntimeAuthBreakers();
|
|
428
|
+
const out = {};
|
|
429
|
+
for (const [key, state] of this.runtimeAuthFailures) {
|
|
430
|
+
if (state.blockedUntil > Date.now())
|
|
431
|
+
out[key] = { ...state };
|
|
432
|
+
}
|
|
433
|
+
return out;
|
|
434
|
+
}
|
|
387
435
|
// ---------------------------------------------------------------------------
|
|
388
436
|
// Internals
|
|
389
437
|
// ---------------------------------------------------------------------------
|
|
@@ -444,6 +492,147 @@ export class Dispatcher {
|
|
|
444
492
|
this.deferredMultimodal.delete(queueKey);
|
|
445
493
|
return list;
|
|
446
494
|
}
|
|
495
|
+
runtimeAuthBreakerKey(route, msg) {
|
|
496
|
+
const thread = msg.conversation.threadId ?? "";
|
|
497
|
+
return `${route.runtime}:${msg.channel}:${msg.accountId}:${msg.conversation.id}:${thread}`;
|
|
498
|
+
}
|
|
499
|
+
openRuntimeAuthBreaker(route, msg) {
|
|
500
|
+
const key = this.runtimeAuthBreakerKey(route, msg);
|
|
501
|
+
const state = this.runtimeAuthFailures.get(key);
|
|
502
|
+
if (!state)
|
|
503
|
+
return null;
|
|
504
|
+
if (state.blockedUntil > 0 && state.blockedUntil <= Date.now()) {
|
|
505
|
+
this.runtimeAuthFailures.delete(key);
|
|
506
|
+
return null;
|
|
507
|
+
}
|
|
508
|
+
return state.blockedUntil > Date.now() ? state : null;
|
|
509
|
+
}
|
|
510
|
+
pruneExpiredRuntimeAuthBreakers() {
|
|
511
|
+
const now = Date.now();
|
|
512
|
+
for (const [key, state] of this.runtimeAuthFailures) {
|
|
513
|
+
if (state.blockedUntil > 0 && state.blockedUntil <= now)
|
|
514
|
+
this.runtimeAuthFailures.delete(key);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
/**
 * Count one more runtime authentication failure for this route/conversation
 * and open the circuit breaker once the configured threshold is reached.
 *
 * The stored state is replaced on every call. `openedAt` is carried over from
 * the previous entry (or set to "now" on the first failure), and
 * `blockedUntil` is `now + cooldown` once the threshold is met, else 0.
 *
 * NOTE(review): nothing visible here ages out sub-threshold counters, so
 * failures spread over a long period still add up — confirm that is intended.
 *
 * @param route Resolved route; `route.runtime` is recorded in the state.
 * @param msg Inbound message identifying channel, account and conversation.
 * @param error Error text stored as `lastError` and included in the logs.
 * @returns The new state when the breaker is open after this failure,
 *   otherwise `null`.
 */
recordRuntimeAuthFailure(route, msg, error) {
    const now = Date.now();
    const key = this.runtimeAuthBreakerKey(route, msg);
    const previous = this.runtimeAuthFailures.get(key);
    const failures = (previous?.failures ?? 0) + 1;
    const thresholdReached = failures >= this.runtimeAuthFailureThreshold;
    const threadId = msg.conversation.threadId ?? null;
    const state = {
        key,
        runtime: route.runtime,
        channel: msg.channel,
        accountId: msg.accountId,
        conversationId: msg.conversation.id,
        threadId,
        failures,
        openedAt: previous?.openedAt ?? now,
        blockedUntil: thresholdReached ? now + this.runtimeAuthFailureCooldownMs : 0,
        lastFailureAt: now,
        lastError: error,
    };
    this.runtimeAuthFailures.set(key, state);
    // Fields shared by both log lines, in the order they are emitted.
    const logFields = {
        key,
        runtime: route.runtime,
        agentId: msg.accountId,
        roomId: msg.conversation.id,
        topicId: threadId,
        failures,
    };
    const breakerOpen = state.blockedUntil > now;
    if (!breakerOpen) {
        this.log.warn("dispatcher: runtime authentication failure recorded", {
            ...logFields,
            threshold: this.runtimeAuthFailureThreshold,
            error,
        });
        return null;
    }
    this.log.error("dispatcher: runtime auth circuit breaker opened", {
        ...logFields,
        blockedUntil: state.blockedUntil,
        error,
    });
    this.notifyRuntimeCircuitBreakerChange();
    return state;
}
|
|
565
|
+
clearRuntimeAuthFailures(route, msg) {
|
|
566
|
+
const key = this.runtimeAuthBreakerKey(route, msg);
|
|
567
|
+
if (!this.runtimeAuthFailures.delete(key))
|
|
568
|
+
return;
|
|
569
|
+
this.log.info("dispatcher: runtime auth circuit breaker cleared", {
|
|
570
|
+
key,
|
|
571
|
+
runtime: route.runtime,
|
|
572
|
+
agentId: msg.accountId,
|
|
573
|
+
roomId: msg.conversation.id,
|
|
574
|
+
topicId: msg.conversation.threadId ?? null,
|
|
575
|
+
});
|
|
576
|
+
this.notifyRuntimeCircuitBreakerChange();
|
|
577
|
+
}
|
|
578
|
+
notifyRuntimeCircuitBreakerChange() {
|
|
579
|
+
try {
|
|
580
|
+
this.onRuntimeCircuitBreakerChange?.();
|
|
581
|
+
}
|
|
582
|
+
catch (err) {
|
|
583
|
+
this.log.warn("dispatcher: onRuntimeCircuitBreakerChange threw", {
|
|
584
|
+
error: err instanceof Error ? err.message : String(err),
|
|
585
|
+
});
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
async skipRuntimeForAuthBreaker(state, route, msg, channel, turnId) {
|
|
589
|
+
const error = `runtime authentication failed repeatedly; dispatch paused until ${new Date(state.blockedUntil).toISOString()}`;
|
|
590
|
+
this.log.warn("dispatcher: runtime auth circuit breaker blocking turn", {
|
|
591
|
+
key: state.key,
|
|
592
|
+
runtime: route.runtime,
|
|
593
|
+
agentId: msg.accountId,
|
|
594
|
+
roomId: msg.conversation.id,
|
|
595
|
+
topicId: msg.conversation.threadId ?? null,
|
|
596
|
+
turnId,
|
|
597
|
+
blockedUntil: state.blockedUntil,
|
|
598
|
+
});
|
|
599
|
+
this.transcript.write({
|
|
600
|
+
ts: nowIso(),
|
|
601
|
+
kind: "turn_error",
|
|
602
|
+
turnId,
|
|
603
|
+
agentId: msg.accountId,
|
|
604
|
+
roomId: msg.conversation.id,
|
|
605
|
+
topicId: msg.conversation.threadId ?? null,
|
|
606
|
+
phase: "runtime",
|
|
607
|
+
error,
|
|
608
|
+
durationMs: 0,
|
|
609
|
+
});
|
|
610
|
+
const canDeliverRuntimeText = isOwnerChatRoom(msg) || !isBotCordChannel(channel);
|
|
611
|
+
const canDeliverRuntimeDiagnostics = canDeliverRuntimeText || isBotCordChannel(channel);
|
|
612
|
+
if (canDeliverRuntimeDiagnostics) {
|
|
613
|
+
const sendResult = await this.sendReply(channel, {
|
|
614
|
+
channel: msg.channel,
|
|
615
|
+
accountId: msg.accountId,
|
|
616
|
+
conversationId: msg.conversation.id,
|
|
617
|
+
threadId: msg.conversation.threadId ?? null,
|
|
618
|
+
type: "error",
|
|
619
|
+
text: `⚠️ Runtime error: ${truncate(error, 500)}`,
|
|
620
|
+
replyTo: this.providerReplyTo(msg),
|
|
621
|
+
traceId: msg.trace?.id ?? null,
|
|
622
|
+
}, turnId);
|
|
623
|
+
this.emitOutbound({
|
|
624
|
+
turnId,
|
|
625
|
+
msg,
|
|
626
|
+
runtime: route.runtime,
|
|
627
|
+
runtimeSessionId: null,
|
|
628
|
+
startedAt: Date.now(),
|
|
629
|
+
finalText: truncateTextField(""),
|
|
630
|
+
deliveryStatus: sendResult.ok ? "delivered" : "send_failed",
|
|
631
|
+
deliveryReason: sendResult.ok ? null : sendResult.error,
|
|
632
|
+
blocks: [],
|
|
633
|
+
});
|
|
634
|
+
}
|
|
635
|
+
}
|
|
447
636
|
async runCancelPrevious(queueKey, route, text, msg, channel, turnId, mergedFromTurnIds = []) {
|
|
448
637
|
const q = this.getQueue(queueKey);
|
|
449
638
|
// Bump the generation on every arrival. Older arrivals still awaiting
|
|
@@ -703,6 +892,7 @@ export class Dispatcher {
|
|
|
703
892
|
turnId,
|
|
704
893
|
controller,
|
|
705
894
|
timedOut: false,
|
|
895
|
+
budgetExceeded: null,
|
|
706
896
|
snapshot,
|
|
707
897
|
done,
|
|
708
898
|
dispatchedAt: startedAt,
|
|
@@ -738,6 +928,9 @@ export class Dispatcher {
|
|
|
738
928
|
...(mergedFromTurnIds.length > 0 ? { mergedFromTurns: mergedFromTurnIds.length } : {}),
|
|
739
929
|
composedPreview: logPreview(text),
|
|
740
930
|
});
|
|
931
|
+
const cloudRunBudget = extractCloudRunBudget(msg);
|
|
932
|
+
const effectiveTurnTimeoutMs = Math.min(this.turnTimeoutMs, cloudRunBudget?.maxWallTimeMs ?? this.turnTimeoutMs);
|
|
933
|
+
let observedToolCalls = 0;
|
|
741
934
|
// Hard-cap turn with a timeout.
|
|
742
935
|
const timer = setTimeout(() => {
|
|
743
936
|
slot.timedOut = true;
|
|
@@ -747,10 +940,10 @@ export class Dispatcher {
|
|
|
747
940
|
topicId: msg.conversation.threadId ?? null,
|
|
748
941
|
turnId,
|
|
749
942
|
queueKey,
|
|
750
|
-
timeoutMs:
|
|
943
|
+
timeoutMs: effectiveTurnTimeoutMs,
|
|
751
944
|
});
|
|
752
945
|
controller.abort();
|
|
753
|
-
},
|
|
946
|
+
}, effectiveTurnTimeoutMs);
|
|
754
947
|
if (typeof timer.unref === "function")
|
|
755
948
|
timer.unref();
|
|
756
949
|
const key = sessionKey({
|
|
@@ -773,6 +966,22 @@ export class Dispatcher {
|
|
|
773
966
|
(streamable || !isBotCordChannel(channel));
|
|
774
967
|
const canStream = streamable && typeof traceId === "string" && typeof channel.streamBlock === "function";
|
|
775
968
|
const recordBlock = (block) => {
|
|
969
|
+
if (block.kind === "tool_use" && cloudRunBudget?.maxToolCalls !== undefined) {
|
|
970
|
+
observedToolCalls += 1;
|
|
971
|
+
if (observedToolCalls > cloudRunBudget.maxToolCalls && !controller.signal.aborted) {
|
|
972
|
+
slot.budgetExceeded = `tool call budget exceeded after ${observedToolCalls} tool call(s)`;
|
|
973
|
+
this.log.warn("dispatcher: cloud_run tool budget exceeded", {
|
|
974
|
+
agentId: msg.accountId,
|
|
975
|
+
roomId: msg.conversation.id,
|
|
976
|
+
topicId: msg.conversation.threadId ?? null,
|
|
977
|
+
turnId,
|
|
978
|
+
queueKey,
|
|
979
|
+
maxToolCalls: cloudRunBudget.maxToolCalls,
|
|
980
|
+
observedToolCalls,
|
|
981
|
+
});
|
|
982
|
+
controller.abort(new Error(slot.budgetExceeded));
|
|
983
|
+
}
|
|
984
|
+
}
|
|
776
985
|
const summary = summarizeStreamBlock(block);
|
|
777
986
|
slot.blocks.push(summary);
|
|
778
987
|
if (this.transcript.enabled) {
|
|
@@ -957,7 +1166,8 @@ export class Dispatcher {
|
|
|
957
1166
|
sendThinkingMarker(event.phase, event.label, "runtime");
|
|
958
1167
|
}
|
|
959
1168
|
: undefined;
|
|
960
|
-
const
|
|
1169
|
+
const shouldObserveBlocks = canStream || this.transcript.enabled || cloudRunBudget?.maxToolCalls !== undefined;
|
|
1170
|
+
const onBlock = shouldObserveBlocks
|
|
961
1171
|
? (block) => {
|
|
962
1172
|
// Always record adapter-emitted blocks for transcript fidelity, even
|
|
963
1173
|
// after abort — the transcript reflects what the runtime emitted,
|
|
@@ -1059,6 +1269,7 @@ export class Dispatcher {
|
|
|
1059
1269
|
const runtime = this.runtimeFactory(route.runtime, route.extraArgs);
|
|
1060
1270
|
let result;
|
|
1061
1271
|
let threw;
|
|
1272
|
+
const turnStartedAt = Date.now();
|
|
1062
1273
|
try {
|
|
1063
1274
|
try {
|
|
1064
1275
|
result = await runtime.run({
|
|
@@ -1081,6 +1292,7 @@ export class Dispatcher {
|
|
|
1081
1292
|
channel: msg.channel,
|
|
1082
1293
|
conversationKind: msg.conversation.kind,
|
|
1083
1294
|
},
|
|
1295
|
+
...(cloudRunBudget ? { budget: cloudRunBudget } : {}),
|
|
1084
1296
|
gateway: route.gateway,
|
|
1085
1297
|
...(route.hermesProfile ? { hermesProfile: route.hermesProfile } : {}),
|
|
1086
1298
|
});
|
|
@@ -1091,6 +1303,26 @@ export class Dispatcher {
|
|
|
1091
1303
|
finally {
|
|
1092
1304
|
clearTimeout(timer);
|
|
1093
1305
|
}
|
|
1306
|
+
// Fire onTurnComplete observer. Cloud daemon hooks this to settle
|
|
1307
|
+
// ``cloud_run`` envelopes against the Hub usage ledger. Errors are
|
|
1308
|
+
// swallowed so settle failures never break the reply path.
|
|
1309
|
+
if (this.onTurnComplete) {
|
|
1310
|
+
const wallTimeMs = Date.now() - turnStartedAt;
|
|
1311
|
+
try {
|
|
1312
|
+
await this.onTurnComplete({
|
|
1313
|
+
message: msg,
|
|
1314
|
+
result,
|
|
1315
|
+
wallTimeMs,
|
|
1316
|
+
...(threw !== undefined ? { error: threw } : {}),
|
|
1317
|
+
});
|
|
1318
|
+
}
|
|
1319
|
+
catch (hookErr) {
|
|
1320
|
+
this.log.warn("dispatcher: onTurnComplete threw — continuing", {
|
|
1321
|
+
error: hookErr instanceof Error ? hookErr.message : String(hookErr),
|
|
1322
|
+
messageId: msg.id,
|
|
1323
|
+
});
|
|
1324
|
+
}
|
|
1325
|
+
}
|
|
1094
1326
|
// Re-check the abort signal AFTER runtime.run resolves but BEFORE any
|
|
1095
1327
|
// side effects (session write, reply send). This closes the race where
|
|
1096
1328
|
// a cancel-previous arrives between runtime.run resolving and the
|
|
@@ -1103,7 +1335,7 @@ export class Dispatcher {
|
|
|
1103
1335
|
// record from `runCancelPrevious` BEFORE aborting, so we MUST NOT also
|
|
1104
1336
|
// emit a `turn_error` here — that would violate the "exactly one
|
|
1105
1337
|
// terminal record per turnId" invariant.
|
|
1106
|
-
if (controller.signal.aborted && !slot.timedOut) {
|
|
1338
|
+
if (controller.signal.aborted && !slot.timedOut && !slot.budgetExceeded) {
|
|
1107
1339
|
return;
|
|
1108
1340
|
}
|
|
1109
1341
|
// Reply gating: BotCord network rooms only accept the runtime's plain
|
|
@@ -1126,7 +1358,9 @@ export class Dispatcher {
|
|
|
1126
1358
|
const isOwnerChat = isOwnerChatRoom(msg);
|
|
1127
1359
|
const canDeliverRuntimeText = isOwnerChat || !isBotCordChannel(channel);
|
|
1128
1360
|
const canDeliverRuntimeDiagnostics = canDeliverRuntimeText || isBotCordChannel(channel);
|
|
1129
|
-
if (slot.timedOut) {
|
|
1361
|
+
if (slot.timedOut || slot.budgetExceeded) {
|
|
1362
|
+
const phase = slot.budgetExceeded ? "budget" : "timeout";
|
|
1363
|
+
const error = slot.budgetExceeded ?? `runtime timeout after ${effectiveTurnTimeoutMs}ms`;
|
|
1130
1364
|
this.transcript.write({
|
|
1131
1365
|
ts: nowIso(),
|
|
1132
1366
|
kind: "turn_error",
|
|
@@ -1134,8 +1368,8 @@ export class Dispatcher {
|
|
|
1134
1368
|
agentId: msg.accountId,
|
|
1135
1369
|
roomId: msg.conversation.id,
|
|
1136
1370
|
topicId: msg.conversation.threadId ?? null,
|
|
1137
|
-
phase
|
|
1138
|
-
error
|
|
1371
|
+
phase,
|
|
1372
|
+
error,
|
|
1139
1373
|
durationMs: Date.now() - slot.dispatchedAt,
|
|
1140
1374
|
});
|
|
1141
1375
|
if (canDeliverRuntimeDiagnostics) {
|
|
@@ -1145,7 +1379,9 @@ export class Dispatcher {
|
|
|
1145
1379
|
conversationId: msg.conversation.id,
|
|
1146
1380
|
threadId: msg.conversation.threadId ?? null,
|
|
1147
1381
|
type: "error",
|
|
1148
|
-
text:
|
|
1382
|
+
text: slot.budgetExceeded
|
|
1383
|
+
? `Cloud run budget exceeded: ${slot.budgetExceeded}`
|
|
1384
|
+
: `Runtime timeout after ${Math.round(effectiveTurnTimeoutMs / 60000)} minute(s); aborted`,
|
|
1149
1385
|
replyTo: this.providerReplyTo(msg),
|
|
1150
1386
|
traceId: msg.trace?.id ?? null,
|
|
1151
1387
|
}, turnId);
|
|
@@ -1157,7 +1393,8 @@ export class Dispatcher {
|
|
|
1157
1393
|
topicId: msg.conversation.threadId ?? null,
|
|
1158
1394
|
turnId,
|
|
1159
1395
|
queueKey,
|
|
1160
|
-
timeoutMs:
|
|
1396
|
+
timeoutMs: effectiveTurnTimeoutMs,
|
|
1397
|
+
budgetExceeded: slot.budgetExceeded,
|
|
1161
1398
|
});
|
|
1162
1399
|
}
|
|
1163
1400
|
return;
|
|
@@ -1209,8 +1446,28 @@ export class Dispatcher {
|
|
|
1209
1446
|
}
|
|
1210
1447
|
if (!result)
|
|
1211
1448
|
return;
|
|
1212
|
-
const
|
|
1213
|
-
const
|
|
1449
|
+
const rawReplyText = (result.text || "").trim();
|
|
1450
|
+
const replyLooksLikeAuthFailure = looksLikeRuntimeAuthFailure(rawReplyText);
|
|
1451
|
+
const replyText = replyLooksLikeAuthFailure ? "" : rawReplyText;
|
|
1452
|
+
const effectiveError = result.error ?? (replyLooksLikeAuthFailure ? rawReplyText : undefined);
|
|
1453
|
+
const authFailureError = effectiveError && looksLikeRuntimeAuthFailure(effectiveError) ? effectiveError : undefined;
|
|
1454
|
+
const finalTextField = truncateTextField(replyLooksLikeAuthFailure ? "" : result.text || "");
|
|
1455
|
+
if (replyLooksLikeAuthFailure) {
|
|
1456
|
+
this.log.error("dispatcher: runtime text looked like authentication failure; treating as error", {
|
|
1457
|
+
agentId: msg.accountId,
|
|
1458
|
+
roomId: msg.conversation.id,
|
|
1459
|
+
topicId: msg.conversation.threadId ?? null,
|
|
1460
|
+
turnId,
|
|
1461
|
+
runtime: route.runtime,
|
|
1462
|
+
error: rawReplyText,
|
|
1463
|
+
});
|
|
1464
|
+
}
|
|
1465
|
+
if (authFailureError) {
|
|
1466
|
+
this.recordRuntimeAuthFailure(route, msg, authFailureError);
|
|
1467
|
+
}
|
|
1468
|
+
else if (!effectiveError) {
|
|
1469
|
+
this.clearRuntimeAuthFailures(route, msg);
|
|
1470
|
+
}
|
|
1214
1471
|
// Persist session before reply so next turn sees the new id even if send fails.
|
|
1215
1472
|
//
|
|
1216
1473
|
// Adapter contract:
|
|
@@ -1220,14 +1477,14 @@ export class Dispatcher {
|
|
|
1220
1477
|
// even when the adapter echoes that id back
|
|
1221
1478
|
// result.newSessionId truthy → upsert the entry
|
|
1222
1479
|
// otherwise → no-op (e.g. codex intentionally never persists)
|
|
1223
|
-
if (sessionId &&
|
|
1480
|
+
if (sessionId && effectiveError && !replyText) {
|
|
1224
1481
|
try {
|
|
1225
1482
|
await this.sessionStore.delete(key);
|
|
1226
1483
|
this.log.info("dispatcher: dropped stale runtime session", {
|
|
1227
1484
|
key,
|
|
1228
1485
|
prevRuntimeSessionId: sessionId,
|
|
1229
1486
|
nextRuntimeSessionId: result.newSessionId || null,
|
|
1230
|
-
error:
|
|
1487
|
+
error: effectiveError,
|
|
1231
1488
|
});
|
|
1232
1489
|
}
|
|
1233
1490
|
catch (err) {
|
|
@@ -1237,7 +1494,7 @@ export class Dispatcher {
|
|
|
1237
1494
|
});
|
|
1238
1495
|
}
|
|
1239
1496
|
}
|
|
1240
|
-
else if (result.newSessionId) {
|
|
1497
|
+
else if (result.newSessionId && !authFailureError) {
|
|
1241
1498
|
const session = {
|
|
1242
1499
|
key,
|
|
1243
1500
|
runtime: route.runtime,
|
|
@@ -1267,13 +1524,13 @@ export class Dispatcher {
|
|
|
1267
1524
|
});
|
|
1268
1525
|
}
|
|
1269
1526
|
}
|
|
1270
|
-
else if (sessionId &&
|
|
1527
|
+
else if (sessionId && effectiveError) {
|
|
1271
1528
|
try {
|
|
1272
1529
|
await this.sessionStore.delete(key);
|
|
1273
1530
|
this.log.info("dispatcher: dropped stale runtime session", {
|
|
1274
1531
|
key,
|
|
1275
1532
|
prevRuntimeSessionId: sessionId,
|
|
1276
|
-
error:
|
|
1533
|
+
error: effectiveError,
|
|
1277
1534
|
});
|
|
1278
1535
|
}
|
|
1279
1536
|
catch (err) {
|
|
@@ -1284,14 +1541,14 @@ export class Dispatcher {
|
|
|
1284
1541
|
}
|
|
1285
1542
|
}
|
|
1286
1543
|
if (!replyText) {
|
|
1287
|
-
if (
|
|
1544
|
+
if (effectiveError) {
|
|
1288
1545
|
this.log.warn("dispatcher: runtime returned error without reply text", {
|
|
1289
1546
|
agentId: msg.accountId,
|
|
1290
1547
|
roomId: msg.conversation.id,
|
|
1291
1548
|
topicId: msg.conversation.threadId ?? null,
|
|
1292
1549
|
turnId,
|
|
1293
1550
|
runtime: route.runtime,
|
|
1294
|
-
error:
|
|
1551
|
+
error: effectiveError,
|
|
1295
1552
|
});
|
|
1296
1553
|
if (canDeliverRuntimeDiagnostics) {
|
|
1297
1554
|
const sendResult = await this.sendReply(channel, {
|
|
@@ -1300,7 +1557,7 @@ export class Dispatcher {
|
|
|
1300
1557
|
conversationId: msg.conversation.id,
|
|
1301
1558
|
threadId: msg.conversation.threadId ?? null,
|
|
1302
1559
|
type: "error",
|
|
1303
|
-
text: `⚠️ Runtime error: ${truncate(
|
|
1560
|
+
text: `⚠️ Runtime error: ${truncate(effectiveError, 500)}`,
|
|
1304
1561
|
replyTo: this.providerReplyTo(msg),
|
|
1305
1562
|
traceId: msg.trace?.id ?? null,
|
|
1306
1563
|
}, turnId);
|
|
@@ -1328,7 +1585,7 @@ export class Dispatcher {
|
|
|
1328
1585
|
costUsd: result.costUsd,
|
|
1329
1586
|
finalText: finalTextField,
|
|
1330
1587
|
deliveryStatus: "empty_text",
|
|
1331
|
-
deliveryReason:
|
|
1588
|
+
deliveryReason: effectiveError ?? null,
|
|
1332
1589
|
blocks: slot.blocks,
|
|
1333
1590
|
});
|
|
1334
1591
|
return;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { type ChannelBackoffOptions } from "./channel-manager.js";
|
|
2
|
-
import { type RuntimeFactory } from "./dispatcher.js";
|
|
2
|
+
import { type DispatcherOptions, type RuntimeFactory } from "./dispatcher.js";
|
|
3
3
|
import { type GatewayLogger } from "./log.js";
|
|
4
4
|
import { type TranscriptWriter } from "./transcript.js";
|
|
5
5
|
import type { ChannelAdapter, GatewayChannelConfig, GatewayConfig, GatewayInboundMessage, GatewayOutboundMessage, GatewayRoute, GatewayRuntimeSnapshot, InboundObserver, MemoryContextBuilder, OutboundObserver, SystemContextBuilder, UserTurnBuilder } from "./types.js";
|
|
@@ -41,6 +41,14 @@ export interface GatewayBootOptions {
|
|
|
41
41
|
* bookkeeping like loop-risk tracking.
|
|
42
42
|
*/
|
|
43
43
|
onOutbound?: OutboundObserver;
|
|
44
|
+
onRuntimeCircuitBreakerChange?: () => void;
|
|
45
|
+
/**
|
|
46
|
+
* Optional observer fired after each runtime turn resolves. Forwarded
|
|
47
|
+
* to the dispatcher verbatim — see {@link Dispatcher} for semantics.
|
|
48
|
+
* Cloud daemon hooks this to settle ``cloud_run`` envelopes against
|
|
49
|
+
* the Hub usage ledger.
|
|
50
|
+
*/
|
|
51
|
+
onTurnComplete?: DispatcherOptions["onTurnComplete"];
|
|
44
52
|
/**
|
|
45
53
|
* Optional attention gate (PR3, design §4.2). Forwarded to the dispatcher
|
|
46
54
|
* verbatim — see {@link Dispatcher} for semantics. Returning `false` skips
|
package/dist/gateway/gateway.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { ChannelManager } from "./channel-manager.js";
|
|
2
|
-
import { Dispatcher } from "./dispatcher.js";
|
|
2
|
+
import { Dispatcher, } from "./dispatcher.js";
|
|
3
3
|
import { consoleLogger } from "./log.js";
|
|
4
4
|
import { createRuntime } from "./runtimes/registry.js";
|
|
5
5
|
import { DEFAULT_SESSION_STORE_MAX_ENTRY_AGE_MS, SessionStore } from "./session-store.js";
|
|
@@ -72,6 +72,8 @@ export class Gateway {
|
|
|
72
72
|
onInbound: opts.onInbound,
|
|
73
73
|
composeUserTurn: opts.composeUserTurn,
|
|
74
74
|
onOutbound: opts.onOutbound,
|
|
75
|
+
onTurnComplete: opts.onTurnComplete,
|
|
76
|
+
onRuntimeCircuitBreakerChange: opts.onRuntimeCircuitBreakerChange,
|
|
75
77
|
managedRoutes: this.managedRoutes,
|
|
76
78
|
attentionGate: opts.attentionGate,
|
|
77
79
|
resolveHubUrl: opts.resolveHubUrl,
|
|
@@ -105,6 +107,7 @@ export class Gateway {
|
|
|
105
107
|
return {
|
|
106
108
|
channels: this.channelManager.status(),
|
|
107
109
|
turns: this.dispatcher.turns(),
|
|
110
|
+
runtimeCircuitBreakers: this.dispatcher.runtimeCircuitBreakers(),
|
|
108
111
|
};
|
|
109
112
|
}
|
|
110
113
|
/**
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runtime CLIs sometimes report authentication failures as ordinary final
|
|
3
|
+
* text. Keep this intentionally narrow so normal model replies about auth do
|
|
4
|
+
* not get reclassified unless they look like a top-level CLI/API failure.
|
|
5
|
+
*/
|
|
6
|
+
export declare function looksLikeRuntimeAuthFailure(text: string): boolean;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * Decide whether a runtime's final text is really an authentication failure
 * that the CLI printed as an ordinary reply.
 *
 * The check is kept deliberately narrow so that normal model replies which
 * merely talk about authentication are not reclassified; only text shaped
 * like a top-level CLI/API failure matches.
 *
 * NOTE(review): the `4\d\d` patterns accept every 4xx status (e.g. 404, 429),
 * not just 401/403 — confirm that breadth is intended.
 *
 * @param text Raw text to inspect; surrounding whitespace is ignored.
 * @returns `true` when the trimmed text matches a known failure signature.
 */
export function looksLikeRuntimeAuthFailure(text) {
    const candidate = text.trim();
    if (candidate.length === 0) {
        return false;
    }
    const signatures = [
        // Leading CLI phrases.
        /^(Failed to authenticate|Authentication failed|Invalid API key|Invalid Anthropic API key)\b/i,
        // Leading "API Error: 4xx".
        /^API Error:\s*4\d\d\b/i,
        // The same API markers anywhere in the text.
        /\b(API Error:\s*4\d\d|Request not allowed|invalid x-api-key)\b/i,
        // Bare HTTP status words at the start.
        /^(Unauthorized|Forbidden)(?:\b|:)/i,
    ];
    return signatures.some((pattern) => pattern.test(candidate));
}
|
|
@@ -1,10 +1,17 @@
|
|
|
1
1
|
import { NdjsonStreamAdapter, type NdjsonEventCtx } from "./ndjson-stream.js";
|
|
2
2
|
import { type ProbeDeps } from "./probe.js";
|
|
3
3
|
import type { RuntimeProbeResult, RuntimeRunOptions } from "../types.js";
|
|
4
|
+
export declare function scrubClaudeCodeAuthEnv(env: NodeJS.ProcessEnv): NodeJS.ProcessEnv;
|
|
4
5
|
/** Resolve the Claude Code CLI path on PATH or the macOS desktop bundle fallback. */
|
|
5
6
|
export declare function resolveClaudeCommand(deps?: ProbeDeps): string | null;
|
|
6
7
|
/** Probe whether the Claude Code CLI is installed and report its version. */
|
|
7
8
|
export declare function probeClaude(deps?: ProbeDeps): RuntimeProbeResult;
|
|
9
|
+
/**
 * Result shape declared as the return type of `probeClaudeAuth`.
 *
 * NOTE(review): field meanings are inferred from names only, since the
 * implementation is not visible in this declaration file. Presumably
 * `checked` says whether a probe was attempted, `ok` says whether it passed,
 * and `message` is a human-readable summary — confirm against the adapter.
 */
export interface ClaudeAuthProbeResult {
|
|
10
|
+
checked: boolean;
|
|
11
|
+
ok: boolean;
|
|
12
|
+
message: string;
|
|
13
|
+
}
|
|
14
|
+
export declare function probeClaudeAuth(deps?: ProbeDeps): ClaudeAuthProbeResult;
|
|
8
15
|
/**
|
|
9
16
|
* Claude Code adapter — spawns `claude -p "<text>" --output-format stream-json`
|
|
10
17
|
* (with `--resume <sid>` when available) and parses the ndjson stream.
|
|
@@ -26,5 +33,6 @@ export declare class ClaudeCodeAdapter extends NdjsonStreamAdapter {
|
|
|
26
33
|
run(opts: RuntimeRunOptions): Promise<import("../types.js").RuntimeRunResult>;
|
|
27
34
|
protected resolveBinary(): string;
|
|
28
35
|
protected buildArgs(opts: RuntimeRunOptions): string[];
|
|
36
|
+
protected spawnEnv(opts: RuntimeRunOptions): NodeJS.ProcessEnv;
|
|
29
37
|
protected handleEvent(raw: unknown, ctx: NdjsonEventCtx): void;
|
|
30
38
|
}
|