@botcord/daemon 0.2.75 → 0.2.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cloud-auth.d.ts +47 -0
- package/dist/cloud-auth.js +51 -0
- package/dist/cloud-daemon.d.ts +43 -0
- package/dist/cloud-daemon.js +252 -0
- package/dist/cloud-mode.d.ts +45 -0
- package/dist/cloud-mode.js +55 -0
- package/dist/cloud-settle.d.ts +81 -0
- package/dist/cloud-settle.js +100 -0
- package/dist/daemon-singleton.d.ts +26 -0
- package/dist/daemon-singleton.js +91 -0
- package/dist/daemon.d.ts +1 -1
- package/dist/daemon.js +15 -6
- package/dist/doctor.d.ts +4 -1
- package/dist/doctor.js +15 -4
- package/dist/gateway/channels/botcord.d.ts +1 -1
- package/dist/gateway/channels/botcord.js +280 -52
- package/dist/gateway/dispatcher.d.ts +34 -1
- package/dist/gateway/dispatcher.js +277 -20
- package/dist/gateway/gateway.d.ts +9 -1
- package/dist/gateway/gateway.js +4 -1
- package/dist/gateway/runtime-errors.d.ts +6 -0
- package/dist/gateway/runtime-errors.js +14 -0
- package/dist/gateway/runtimes/claude-code.d.ts +8 -0
- package/dist/gateway/runtimes/claude-code.js +92 -4
- package/dist/gateway/runtimes/deepseek-tui.js +19 -5
- package/dist/gateway/transcript.d.ts +1 -1
- package/dist/gateway/types.d.ts +33 -0
- package/dist/index.js +71 -80
- package/dist/provision.d.ts +2 -0
- package/dist/provision.js +39 -1
- package/dist/status-render.js +17 -0
- package/package.json +2 -2
- package/src/__tests__/cloud-auth.test.ts +42 -0
- package/src/__tests__/cloud-daemon.test.ts +237 -0
- package/src/__tests__/cloud-mode.test.ts +65 -0
- package/src/__tests__/cloud-settle.test.ts +287 -0
- package/src/__tests__/daemon-singleton.test.ts +89 -0
- package/src/__tests__/doctor.test.ts +34 -0
- package/src/__tests__/runtime-discovery.test.ts +90 -0
- package/src/__tests__/status-render.test.ts +34 -0
- package/src/cloud-auth.ts +78 -0
- package/src/cloud-daemon.ts +338 -0
- package/src/cloud-mode.ts +70 -0
- package/src/cloud-settle.ts +182 -0
- package/src/daemon-singleton.ts +122 -0
- package/src/daemon.ts +18 -5
- package/src/doctor.ts +18 -5
- package/src/gateway/__tests__/botcord-channel.test.ts +98 -0
- package/src/gateway/__tests__/claude-code-adapter.test.ts +101 -1
- package/src/gateway/__tests__/deepseek-tui-adapter.test.ts +19 -0
- package/src/gateway/__tests__/dispatcher.test.ts +120 -0
- package/src/gateway/channels/botcord.ts +299 -43
- package/src/gateway/dispatcher.ts +354 -21
- package/src/gateway/gateway.ts +16 -1
- package/src/gateway/runtime-errors.ts +15 -0
- package/src/gateway/runtimes/claude-code.ts +98 -2
- package/src/gateway/runtimes/deepseek-tui.ts +23 -5
- package/src/gateway/transcript.ts +1 -1
- package/src/gateway/types.ts +34 -0
- package/src/index.ts +83 -74
- package/src/provision.ts +45 -1
- package/src/status-render.ts +24 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
2
|
|
|
3
3
|
import type { GatewayLogger } from "./log.js";
|
|
4
|
+
import { looksLikeRuntimeAuthFailure } from "./runtime-errors.js";
|
|
4
5
|
import { resolveRoute } from "./router.js";
|
|
5
6
|
import { sessionKey, type SessionStore } from "./session-store.js";
|
|
6
7
|
import {
|
|
@@ -22,6 +23,8 @@ import type {
|
|
|
22
23
|
OutboundObserver,
|
|
23
24
|
QueueMode,
|
|
24
25
|
RuntimeAdapter,
|
|
26
|
+
RuntimeRunResult,
|
|
27
|
+
RuntimeCircuitBreakerSnapshot,
|
|
25
28
|
RuntimeStatusEvent,
|
|
26
29
|
StreamBlock,
|
|
27
30
|
SystemContextBuilder,
|
|
@@ -30,6 +33,8 @@ import type {
|
|
|
30
33
|
} from "./types.js";
|
|
31
34
|
|
|
32
35
|
const DEFAULT_TURN_TIMEOUT_MS = 30 * 60 * 1000;
|
|
36
|
+
const DEFAULT_RUNTIME_AUTH_FAILURE_THRESHOLD = 3;
|
|
37
|
+
const DEFAULT_RUNTIME_AUTH_FAILURE_COOLDOWN_MS = 10 * 60 * 1000;
|
|
33
38
|
|
|
34
39
|
/**
|
|
35
40
|
* Owner-chat room prefix. Reply-text gating: only rooms with this prefix get
|
|
@@ -142,6 +147,41 @@ function redactSecretString(value: string): string {
|
|
|
142
147
|
.replace(/\b(drt_|dit_|gho_)[A-Za-z0-9_-]+/g, "$1[REDACTED]");
|
|
143
148
|
}
|
|
144
149
|
|
|
150
|
+
/**
 * Converts one raw budget field into a positive whole-number cap.
 *
 * Returns `undefined` unless `value` is a finite number greater than zero.
 * The scaled value is floored and then clamped to a minimum of 1, so an input
 * that passed the `> 0` validation can never collapse into a cap of `0`
 * (e.g. `0.0004` seconds or `0.5` tool calls).
 */
function positiveIntegerCap(value: unknown, scale: number): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) return undefined;
  return Math.max(1, Math.floor(value * scale));
}

/**
 * Reads the optional run budget carried by a `cloud_run` envelope.
 *
 * Looks at `msg.raw.envelope`; when its `type` is `"cloud_run"` and
 * `payload.cloud_run.budget` is present, the recognised fields are converted:
 *   - `max_wall_time_seconds` (seconds) -> `maxWallTimeMs` (milliseconds)
 *   - `max_tool_calls`                  -> `maxToolCalls`
 *
 * Fields that are not finite numbers greater than zero are ignored.
 *
 * @param msg Inbound gateway message whose `raw` payload may hold the envelope.
 * @returns The caps that were present and valid, or `undefined` when the
 *   message is not a `cloud_run` envelope or carries no usable cap.
 */
function extractCloudRunBudget(msg: GatewayInboundMessage): CloudRunBudgetCaps | undefined {
  // `raw` is untyped transport data, so every level is treated as optional.
  const envelope = (msg.raw as { envelope?: unknown } | undefined)?.envelope as
    | {
        type?: unknown;
        payload?: {
          cloud_run?: {
            budget?: {
              max_wall_time_seconds?: unknown;
              max_tool_calls?: unknown;
            } | null;
          } | null;
        } | null;
      }
    | undefined;
  if (envelope?.type !== "cloud_run") return undefined;

  const budget = envelope.payload?.cloud_run?.budget;
  if (!budget) return undefined;

  const out: CloudRunBudgetCaps = {};

  const maxWallTimeMs = positiveIntegerCap(budget.max_wall_time_seconds, 1000);
  if (maxWallTimeMs !== undefined) out.maxWallTimeMs = maxWallTimeMs;

  const maxToolCalls = positiveIntegerCap(budget.max_tool_calls, 1);
  if (maxToolCalls !== undefined) out.maxToolCalls = maxToolCalls;

  // An envelope with a budget object but no valid cap is reported as "no budget".
  return out.maxWallTimeMs !== undefined || out.maxToolCalls !== undefined ? out : undefined;
}
|
|
184
|
+
|
|
145
185
|
/** Factory signature for building a runtime adapter at turn dispatch time. */
|
|
146
186
|
export type RuntimeFactory = (
|
|
147
187
|
runtimeId: string,
|
|
@@ -156,6 +196,8 @@ export interface DispatcherOptions {
|
|
|
156
196
|
sessionStore: SessionStore;
|
|
157
197
|
log: GatewayLogger;
|
|
158
198
|
turnTimeoutMs?: number;
|
|
199
|
+
runtimeAuthFailureThreshold?: number;
|
|
200
|
+
runtimeAuthFailureCooldownMs?: number;
|
|
159
201
|
/**
|
|
160
202
|
* Live reference to the Gateway's managed-route map. Dispatcher reads
|
|
161
203
|
* `values()` on every `resolveRoute` call so hot-add/remove take effect
|
|
@@ -194,6 +236,24 @@ export interface DispatcherOptions {
|
|
|
194
236
|
* and suppressed so observer failures never break the turn.
|
|
195
237
|
*/
|
|
196
238
|
onOutbound?: OutboundObserver;
|
|
239
|
+
onRuntimeCircuitBreakerChange?: () => void;
|
|
240
|
+
/**
|
|
241
|
+
* Optional observer fired exactly once per turn after ``runtime.run``
|
|
242
|
+
* resolves (or throws / times out). Receives the inbound message, the
|
|
243
|
+
* raw runtime result (may be undefined on throw), the elapsed wall
|
|
244
|
+
* time in milliseconds, and any thrown error. The cloud daemon hooks
|
|
245
|
+
* this to settle ``cloud_run`` envelopes against the Hub's usage
|
|
246
|
+
* ledger; local daemons leave it unset.
|
|
247
|
+
*
|
|
248
|
+
* Errors thrown by the observer are logged and swallowed — settle
|
|
249
|
+
* failures must never break the agent reply path.
|
|
250
|
+
*/
|
|
251
|
+
onTurnComplete?: (event: {
|
|
252
|
+
message: GatewayInboundMessage;
|
|
253
|
+
result?: RuntimeRunResult;
|
|
254
|
+
wallTimeMs: number;
|
|
255
|
+
error?: unknown;
|
|
256
|
+
}) => Promise<void> | void;
|
|
197
257
|
/**
|
|
198
258
|
* Optional attention gate (PR3, design §4.2). Resolved AFTER `onInbound`
|
|
199
259
|
* runs and BEFORE the runtime turn enqueues, so working memory / activity
|
|
@@ -245,6 +305,7 @@ interface TurnSlot {
|
|
|
245
305
|
turnId: string;
|
|
246
306
|
controller: AbortController;
|
|
247
307
|
timedOut: boolean;
|
|
308
|
+
budgetExceeded: string | null;
|
|
248
309
|
snapshot: TurnStatusSnapshot;
|
|
249
310
|
done: Promise<void>;
|
|
250
311
|
dispatchedAt: number;
|
|
@@ -288,10 +349,17 @@ interface QueueState {
|
|
|
288
349
|
serialWorkerActive: boolean;
|
|
289
350
|
}
|
|
290
351
|
|
|
352
|
+
/**
 * Optional per-turn limits derived from a `cloud_run` envelope's budget.
 * Either field may be absent when the envelope did not supply a valid value.
 */
interface CloudRunBudgetCaps {
  /** Wall-clock limit for the turn, expressed in milliseconds. */
  maxWallTimeMs?: number;
  /** Upper bound on the number of tool calls observed during the turn. */
  maxToolCalls?: number;
}
|
|
356
|
+
|
|
291
357
|
interface DeferredMultimodalEntry extends BufferedSerialEntry {
|
|
292
358
|
queuedAt: number;
|
|
293
359
|
}
|
|
294
360
|
|
|
361
|
+
/** Internal bookkeeping entry for the auth circuit breaker; same shape as the public snapshot. */
type RuntimeAuthFailureState = RuntimeCircuitBreakerSnapshot;
|
|
362
|
+
|
|
295
363
|
/**
|
|
296
364
|
* Gateway dispatcher: consumes `GatewayInboundEnvelope` and drives a runtime
|
|
297
365
|
* turn per message, respecting queue mode, trust level, streaming, and
|
|
@@ -309,10 +377,14 @@ export class Dispatcher {
|
|
|
309
377
|
private readonly sessionStore: SessionStore;
|
|
310
378
|
private readonly log: GatewayLogger;
|
|
311
379
|
private readonly turnTimeoutMs: number;
|
|
380
|
+
private readonly runtimeAuthFailureThreshold: number;
|
|
381
|
+
private readonly runtimeAuthFailureCooldownMs: number;
|
|
312
382
|
private readonly buildSystemContext?: SystemContextBuilder;
|
|
313
383
|
private readonly buildMemoryContext?: MemoryContextBuilder;
|
|
314
384
|
private readonly onInbound?: InboundObserver;
|
|
315
385
|
private readonly onOutbound?: OutboundObserver;
|
|
386
|
+
private readonly onTurnComplete?: DispatcherOptions["onTurnComplete"];
|
|
387
|
+
private readonly onRuntimeCircuitBreakerChange?: () => void;
|
|
316
388
|
private readonly composeUserTurn?: UserTurnBuilder;
|
|
317
389
|
private readonly managedRoutes?: Map<string, GatewayRoute>;
|
|
318
390
|
private readonly attentionGate?: (
|
|
@@ -322,6 +394,7 @@ export class Dispatcher {
|
|
|
322
394
|
private readonly transcript: TranscriptWriter;
|
|
323
395
|
private readonly queues: Map<string, QueueState> = new Map();
|
|
324
396
|
private readonly deferredMultimodal: Map<string, DeferredMultimodalEntry[]> = new Map();
|
|
397
|
+
private readonly runtimeAuthFailures: Map<string, RuntimeAuthFailureState> = new Map();
|
|
325
398
|
/**
|
|
326
399
|
* Last `/hub/typing` ping timestamp per (accountId, conversationId).
|
|
327
400
|
* Used to debounce cancel-previous bursts so we don't trip Hub's 20/min
|
|
@@ -336,10 +409,16 @@ export class Dispatcher {
|
|
|
336
409
|
this.sessionStore = opts.sessionStore;
|
|
337
410
|
this.log = opts.log;
|
|
338
411
|
this.turnTimeoutMs = opts.turnTimeoutMs ?? DEFAULT_TURN_TIMEOUT_MS;
|
|
412
|
+
this.runtimeAuthFailureThreshold =
|
|
413
|
+
opts.runtimeAuthFailureThreshold ?? DEFAULT_RUNTIME_AUTH_FAILURE_THRESHOLD;
|
|
414
|
+
this.runtimeAuthFailureCooldownMs =
|
|
415
|
+
opts.runtimeAuthFailureCooldownMs ?? DEFAULT_RUNTIME_AUTH_FAILURE_COOLDOWN_MS;
|
|
339
416
|
this.buildSystemContext = opts.buildSystemContext;
|
|
340
417
|
this.buildMemoryContext = opts.buildMemoryContext;
|
|
341
418
|
this.onInbound = opts.onInbound;
|
|
342
419
|
this.onOutbound = opts.onOutbound;
|
|
420
|
+
this.onTurnComplete = opts.onTurnComplete;
|
|
421
|
+
this.onRuntimeCircuitBreakerChange = opts.onRuntimeCircuitBreakerChange;
|
|
343
422
|
this.composeUserTurn = opts.composeUserTurn;
|
|
344
423
|
this.managedRoutes = opts.managedRoutes;
|
|
345
424
|
this.attentionGate = opts.attentionGate;
|
|
@@ -559,6 +638,18 @@ export class Dispatcher {
|
|
|
559
638
|
});
|
|
560
639
|
}
|
|
561
640
|
|
|
641
|
+
const openAuthBreaker = this.openRuntimeAuthBreaker(dispatchRoute, dispatchMsg);
|
|
642
|
+
if (openAuthBreaker) {
|
|
643
|
+
await this.skipRuntimeForAuthBreaker(
|
|
644
|
+
openAuthBreaker,
|
|
645
|
+
dispatchRoute,
|
|
646
|
+
dispatchMsg,
|
|
647
|
+
dispatchChannel,
|
|
648
|
+
dispatchTurnId,
|
|
649
|
+
);
|
|
650
|
+
return;
|
|
651
|
+
}
|
|
652
|
+
|
|
562
653
|
if (mode === "cancel-previous") {
|
|
563
654
|
await this.runCancelPrevious(
|
|
564
655
|
queueKey,
|
|
@@ -591,6 +682,15 @@ export class Dispatcher {
|
|
|
591
682
|
return out;
|
|
592
683
|
}
|
|
593
684
|
|
|
685
|
+
/**
 * Returns a copy of every auth circuit breaker that is currently blocking
 * dispatch, keyed by breaker key. Expired entries are pruned first, and
 * entries that have recorded failures but are not blocking are left out.
 */
runtimeCircuitBreakers(): Record<string, RuntimeCircuitBreakerSnapshot> {
  this.pruneExpiredRuntimeAuthBreakers();
  const snapshots: Record<string, RuntimeCircuitBreakerSnapshot> = {};
  this.runtimeAuthFailures.forEach((entry, breakerKey) => {
    const stillBlocking = entry.blockedUntil > Date.now();
    if (stillBlocking) {
      // Shallow copy so callers cannot mutate internal state.
      snapshots[breakerKey] = { ...entry };
    }
  });
  return snapshots;
}
|
|
693
|
+
|
|
594
694
|
// ---------------------------------------------------------------------------
|
|
595
695
|
// Internals
|
|
596
696
|
// ---------------------------------------------------------------------------
|
|
@@ -653,6 +753,166 @@ export class Dispatcher {
|
|
|
653
753
|
return list;
|
|
654
754
|
}
|
|
655
755
|
|
|
756
|
+
/**
 * Builds the map key for an auth circuit breaker from the route's runtime and
 * the message's channel, account, conversation id and thread id (empty string
 * when the message has no thread).
 */
private runtimeAuthBreakerKey(route: GatewayRoute, msg: GatewayInboundMessage): string {
  const { id: conversationId, threadId } = msg.conversation;
  return `${route.runtime}:${msg.channel}:${msg.accountId}:${conversationId}:${threadId ?? ""}`;
}
|
|
760
|
+
|
|
761
|
+
/**
 * Looks up the auth circuit breaker for this route/message pair.
 *
 * @returns The breaker state while its cooldown is still running; `null` when
 *   there is no entry, when the entry has not reached the blocking stage, or
 *   when the cooldown has elapsed (in which case the entry is also removed).
 */
private openRuntimeAuthBreaker(
  route: GatewayRoute,
  msg: GatewayInboundMessage,
): RuntimeAuthFailureState | null {
  const breakerKey = this.runtimeAuthBreakerKey(route, msg);
  const entry = this.runtimeAuthFailures.get(breakerKey);
  if (!entry) return null;

  // `blockedUntil === 0` marks an entry that has failures but never opened.
  const cooldownElapsed = entry.blockedUntil > 0 && entry.blockedUntil <= Date.now();
  if (cooldownElapsed) {
    this.runtimeAuthFailures.delete(breakerKey);
    return null;
  }

  if (entry.blockedUntil > Date.now()) return entry;
  return null;
}
|
|
774
|
+
|
|
775
|
+
/**
 * Removes every breaker whose cooldown has already ended. Entries that never
 * opened (`blockedUntil` of 0) are kept.
 */
private pruneExpiredRuntimeAuthBreakers(): void {
  const cutoff = Date.now();
  this.runtimeAuthFailures.forEach((entry, breakerKey) => {
    const opened = entry.blockedUntil > 0;
    if (opened && entry.blockedUntil <= cutoff) {
      this.runtimeAuthFailures.delete(breakerKey);
    }
  });
}
|
|
781
|
+
|
|
782
|
+
/**
 * Counts one runtime authentication failure for this route/message pair and
 * stores the updated state.
 *
 * Once the failure count reaches `runtimeAuthFailureThreshold` the entry's
 * `blockedUntil` is set to now plus `runtimeAuthFailureCooldownMs`; below the
 * threshold it stays 0.
 *
 * @param route Route whose runtime produced the failure.
 * @param msg Inbound message that triggered the turn.
 * @param error Error text kept as `lastError` and written to the log.
 * @returns The new state when the breaker is blocking after this failure
 *   (the change listener is notified), otherwise `null`.
 */
private recordRuntimeAuthFailure(
  route: GatewayRoute,
  msg: GatewayInboundMessage,
  error: string,
): RuntimeAuthFailureState | null {
  const now = Date.now();
  const key = this.runtimeAuthBreakerKey(route, msg);
  const previous = this.runtimeAuthFailures.get(key);
  const failures = (previous?.failures ?? 0) + 1;
  const thresholdReached = failures >= this.runtimeAuthFailureThreshold;
  const threadId = msg.conversation.threadId ?? null;

  const state: RuntimeAuthFailureState = {
    key,
    runtime: route.runtime,
    channel: msg.channel,
    accountId: msg.accountId,
    conversationId: msg.conversation.id,
    threadId,
    failures,
    // Carried over from the previous entry, so it marks the first recorded failure.
    openedAt: previous?.openedAt ?? now,
    blockedUntil: thresholdReached ? now + this.runtimeAuthFailureCooldownMs : 0,
    lastFailureAt: now,
    lastError: error,
  };
  this.runtimeAuthFailures.set(key, state);

  // Fields shared by both log records below.
  const logFields = {
    key,
    runtime: route.runtime,
    agentId: msg.accountId,
    roomId: msg.conversation.id,
    topicId: threadId,
    failures,
  };

  const blocking = state.blockedUntil > now;
  if (!blocking) {
    this.log.warn("dispatcher: runtime authentication failure recorded", {
      ...logFields,
      threshold: this.runtimeAuthFailureThreshold,
      error,
    });
    return null;
  }

  this.log.error("dispatcher: runtime auth circuit breaker opened", {
    ...logFields,
    blockedUntil: state.blockedUntil,
    error,
  });
  this.notifyRuntimeCircuitBreakerChange();
  return state;
}
|
|
835
|
+
|
|
836
|
+
/**
 * Drops any recorded auth failures for this route/message pair. When an entry
 * was actually removed, the event is logged and the change listener notified;
 * otherwise the call is a silent no-op.
 */
private clearRuntimeAuthFailures(route: GatewayRoute, msg: GatewayInboundMessage): void {
  const key = this.runtimeAuthBreakerKey(route, msg);
  const removed = this.runtimeAuthFailures.delete(key);
  if (removed) {
    this.log.info("dispatcher: runtime auth circuit breaker cleared", {
      key,
      runtime: route.runtime,
      agentId: msg.accountId,
      roomId: msg.conversation.id,
      topicId: msg.conversation.threadId ?? null,
    });
    this.notifyRuntimeCircuitBreakerChange();
  }
}
|
|
848
|
+
|
|
849
|
+
/**
 * Invokes the optional `onRuntimeCircuitBreakerChange` listener. Anything the
 * listener throws is logged as a warning and not propagated.
 */
private notifyRuntimeCircuitBreakerChange(): void {
  if (this.onRuntimeCircuitBreakerChange == null) return;
  try {
    this.onRuntimeCircuitBreakerChange();
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    this.log.warn("dispatcher: onRuntimeCircuitBreakerChange threw", { error: message });
  }
}
|
|
858
|
+
|
|
859
|
+
/**
 * Handles an inbound message while the auth circuit breaker is blocking: the
 * runtime is not invoked. Instead the method logs a warning, writes a
 * `turn_error` transcript record (phase `"runtime"`, duration 0) and, when
 * diagnostics may be delivered, sends an error reply and emits the matching
 * outbound record.
 *
 * @param state Blocking breaker state; `blockedUntil` appears in the message.
 * @param route Route the message resolved to.
 * @param msg Inbound message being skipped.
 * @param channel Channel adapter used for the error reply.
 * @param turnId Identifier of the skipped turn.
 */
private async skipRuntimeForAuthBreaker(
  state: RuntimeAuthFailureState,
  route: GatewayRoute,
  msg: GatewayInboundMessage,
  channel: ChannelAdapter,
  turnId: string,
): Promise<void> {
  const resumeAt = new Date(state.blockedUntil).toISOString();
  const error = `runtime authentication failed repeatedly; dispatch paused until ${resumeAt}`;
  const topicId = msg.conversation.threadId ?? null;

  this.log.warn("dispatcher: runtime auth circuit breaker blocking turn", {
    key: state.key,
    runtime: route.runtime,
    agentId: msg.accountId,
    roomId: msg.conversation.id,
    topicId,
    turnId,
    blockedUntil: state.blockedUntil,
  });

  this.transcript.write({
    ts: nowIso(),
    kind: "turn_error",
    turnId,
    agentId: msg.accountId,
    roomId: msg.conversation.id,
    topicId,
    phase: "runtime",
    error,
    durationMs: 0,
  });

  // Same delivery gating expressions as the regular turn path.
  const canDeliverRuntimeText = isOwnerChatRoom(msg) || !isBotCordChannel(channel);
  const canDeliverRuntimeDiagnostics = canDeliverRuntimeText || isBotCordChannel(channel);
  if (!canDeliverRuntimeDiagnostics) return;

  const sendResult = await this.sendReply(
    channel,
    {
      channel: msg.channel,
      accountId: msg.accountId,
      conversationId: msg.conversation.id,
      threadId: msg.conversation.threadId ?? null,
      type: "error",
      text: `⚠️ Runtime error: ${truncate(error, 500)}`,
      replyTo: this.providerReplyTo(msg),
      traceId: msg.trace?.id ?? null,
    },
    turnId,
  );

  this.emitOutbound({
    turnId,
    msg,
    runtime: route.runtime,
    runtimeSessionId: null,
    startedAt: Date.now(),
    finalText: truncateTextField(""),
    deliveryStatus: sendResult.ok ? "delivered" : "send_failed",
    deliveryReason: sendResult.ok ? null : sendResult.error,
    blocks: [],
  });
}
|
|
915
|
+
|
|
656
916
|
private async runCancelPrevious(
|
|
657
917
|
queueKey: string,
|
|
658
918
|
route: GatewayRoute,
|
|
@@ -956,6 +1216,7 @@ export class Dispatcher {
|
|
|
956
1216
|
turnId,
|
|
957
1217
|
controller,
|
|
958
1218
|
timedOut: false,
|
|
1219
|
+
budgetExceeded: null,
|
|
959
1220
|
snapshot,
|
|
960
1221
|
done,
|
|
961
1222
|
dispatchedAt: startedAt,
|
|
@@ -992,6 +1253,13 @@ export class Dispatcher {
|
|
|
992
1253
|
composedPreview: logPreview(text),
|
|
993
1254
|
});
|
|
994
1255
|
|
|
1256
|
+
const cloudRunBudget = extractCloudRunBudget(msg);
|
|
1257
|
+
const effectiveTurnTimeoutMs = Math.min(
|
|
1258
|
+
this.turnTimeoutMs,
|
|
1259
|
+
cloudRunBudget?.maxWallTimeMs ?? this.turnTimeoutMs,
|
|
1260
|
+
);
|
|
1261
|
+
let observedToolCalls = 0;
|
|
1262
|
+
|
|
995
1263
|
// Hard-cap turn with a timeout.
|
|
996
1264
|
const timer = setTimeout(() => {
|
|
997
1265
|
slot.timedOut = true;
|
|
@@ -1001,10 +1269,10 @@ export class Dispatcher {
|
|
|
1001
1269
|
topicId: msg.conversation.threadId ?? null,
|
|
1002
1270
|
turnId,
|
|
1003
1271
|
queueKey,
|
|
1004
|
-
timeoutMs:
|
|
1272
|
+
timeoutMs: effectiveTurnTimeoutMs,
|
|
1005
1273
|
});
|
|
1006
1274
|
controller.abort();
|
|
1007
|
-
},
|
|
1275
|
+
}, effectiveTurnTimeoutMs);
|
|
1008
1276
|
if (typeof timer.unref === "function") timer.unref();
|
|
1009
1277
|
|
|
1010
1278
|
const key = sessionKey({
|
|
@@ -1030,6 +1298,22 @@ export class Dispatcher {
|
|
|
1030
1298
|
const canStream =
|
|
1031
1299
|
streamable && typeof traceId === "string" && typeof channel.streamBlock === "function";
|
|
1032
1300
|
const recordBlock = (block: StreamBlock): void => {
|
|
1301
|
+
if (block.kind === "tool_use" && cloudRunBudget?.maxToolCalls !== undefined) {
|
|
1302
|
+
observedToolCalls += 1;
|
|
1303
|
+
if (observedToolCalls > cloudRunBudget.maxToolCalls && !controller.signal.aborted) {
|
|
1304
|
+
slot.budgetExceeded = `tool call budget exceeded after ${observedToolCalls} tool call(s)`;
|
|
1305
|
+
this.log.warn("dispatcher: cloud_run tool budget exceeded", {
|
|
1306
|
+
agentId: msg.accountId,
|
|
1307
|
+
roomId: msg.conversation.id,
|
|
1308
|
+
topicId: msg.conversation.threadId ?? null,
|
|
1309
|
+
turnId,
|
|
1310
|
+
queueKey,
|
|
1311
|
+
maxToolCalls: cloudRunBudget.maxToolCalls,
|
|
1312
|
+
observedToolCalls,
|
|
1313
|
+
});
|
|
1314
|
+
controller.abort(new Error(slot.budgetExceeded));
|
|
1315
|
+
}
|
|
1316
|
+
}
|
|
1033
1317
|
const summary = summarizeStreamBlock(block);
|
|
1034
1318
|
slot.blocks.push(summary);
|
|
1035
1319
|
if (this.transcript.enabled) {
|
|
@@ -1215,7 +1499,9 @@ export class Dispatcher {
|
|
|
1215
1499
|
}
|
|
1216
1500
|
: undefined;
|
|
1217
1501
|
|
|
1218
|
-
const
|
|
1502
|
+
const shouldObserveBlocks =
|
|
1503
|
+
canStream || this.transcript.enabled || cloudRunBudget?.maxToolCalls !== undefined;
|
|
1504
|
+
const onBlock = shouldObserveBlocks
|
|
1219
1505
|
? (block: StreamBlock) => {
|
|
1220
1506
|
// Always record adapter-emitted blocks for transcript fidelity, even
|
|
1221
1507
|
// after abort — the transcript reflects what the runtime emitted,
|
|
@@ -1316,8 +1602,9 @@ export class Dispatcher {
|
|
|
1316
1602
|
}
|
|
1317
1603
|
|
|
1318
1604
|
const runtime = this.runtimeFactory(route.runtime, route.extraArgs);
|
|
1319
|
-
let result:
|
|
1605
|
+
let result: RuntimeRunResult | undefined;
|
|
1320
1606
|
let threw: unknown;
|
|
1607
|
+
const turnStartedAt = Date.now();
|
|
1321
1608
|
try {
|
|
1322
1609
|
try {
|
|
1323
1610
|
result = await runtime.run({
|
|
@@ -1340,6 +1627,7 @@ export class Dispatcher {
|
|
|
1340
1627
|
channel: msg.channel,
|
|
1341
1628
|
conversationKind: msg.conversation.kind,
|
|
1342
1629
|
},
|
|
1630
|
+
...(cloudRunBudget ? { budget: cloudRunBudget } : {}),
|
|
1343
1631
|
gateway: route.gateway,
|
|
1344
1632
|
...(route.hermesProfile ? { hermesProfile: route.hermesProfile } : {}),
|
|
1345
1633
|
});
|
|
@@ -1349,6 +1637,26 @@ export class Dispatcher {
|
|
|
1349
1637
|
clearTimeout(timer);
|
|
1350
1638
|
}
|
|
1351
1639
|
|
|
1640
|
+
// Fire onTurnComplete observer. Cloud daemon hooks this to settle
|
|
1641
|
+
// ``cloud_run`` envelopes against the Hub usage ledger. Errors are
|
|
1642
|
+
// swallowed so settle failures never break the reply path.
|
|
1643
|
+
if (this.onTurnComplete) {
|
|
1644
|
+
const wallTimeMs = Date.now() - turnStartedAt;
|
|
1645
|
+
try {
|
|
1646
|
+
await this.onTurnComplete({
|
|
1647
|
+
message: msg,
|
|
1648
|
+
result,
|
|
1649
|
+
wallTimeMs,
|
|
1650
|
+
...(threw !== undefined ? { error: threw } : {}),
|
|
1651
|
+
});
|
|
1652
|
+
} catch (hookErr) {
|
|
1653
|
+
this.log.warn("dispatcher: onTurnComplete threw — continuing", {
|
|
1654
|
+
error: hookErr instanceof Error ? hookErr.message : String(hookErr),
|
|
1655
|
+
messageId: msg.id,
|
|
1656
|
+
});
|
|
1657
|
+
}
|
|
1658
|
+
}
|
|
1659
|
+
|
|
1352
1660
|
// Re-check the abort signal AFTER runtime.run resolves but BEFORE any
|
|
1353
1661
|
// side effects (session write, reply send). This closes the race where
|
|
1354
1662
|
// a cancel-previous arrives between runtime.run resolving and the
|
|
@@ -1361,7 +1669,7 @@ export class Dispatcher {
|
|
|
1361
1669
|
// record from `runCancelPrevious` BEFORE aborting, so we MUST NOT also
|
|
1362
1670
|
// emit a `turn_error` here — that would violate the "exactly one
|
|
1363
1671
|
// terminal record per turnId" invariant.
|
|
1364
|
-
if (controller.signal.aborted && !slot.timedOut) {
|
|
1672
|
+
if (controller.signal.aborted && !slot.timedOut && !slot.budgetExceeded) {
|
|
1365
1673
|
return;
|
|
1366
1674
|
}
|
|
1367
1675
|
|
|
@@ -1386,7 +1694,9 @@ export class Dispatcher {
|
|
|
1386
1694
|
const canDeliverRuntimeText = isOwnerChat || !isBotCordChannel(channel);
|
|
1387
1695
|
const canDeliverRuntimeDiagnostics = canDeliverRuntimeText || isBotCordChannel(channel);
|
|
1388
1696
|
|
|
1389
|
-
if (slot.timedOut) {
|
|
1697
|
+
if (slot.timedOut || slot.budgetExceeded) {
|
|
1698
|
+
const phase = slot.budgetExceeded ? "budget" : "timeout";
|
|
1699
|
+
const error = slot.budgetExceeded ?? `runtime timeout after ${effectiveTurnTimeoutMs}ms`;
|
|
1390
1700
|
this.transcript.write({
|
|
1391
1701
|
ts: nowIso(),
|
|
1392
1702
|
kind: "turn_error",
|
|
@@ -1394,8 +1704,8 @@ export class Dispatcher {
|
|
|
1394
1704
|
agentId: msg.accountId,
|
|
1395
1705
|
roomId: msg.conversation.id,
|
|
1396
1706
|
topicId: msg.conversation.threadId ?? null,
|
|
1397
|
-
phase
|
|
1398
|
-
error
|
|
1707
|
+
phase,
|
|
1708
|
+
error,
|
|
1399
1709
|
durationMs: Date.now() - slot.dispatchedAt,
|
|
1400
1710
|
});
|
|
1401
1711
|
if (canDeliverRuntimeDiagnostics) {
|
|
@@ -1405,7 +1715,9 @@ export class Dispatcher {
|
|
|
1405
1715
|
conversationId: msg.conversation.id,
|
|
1406
1716
|
threadId: msg.conversation.threadId ?? null,
|
|
1407
1717
|
type: "error",
|
|
1408
|
-
text:
|
|
1718
|
+
text: slot.budgetExceeded
|
|
1719
|
+
? `Cloud run budget exceeded: ${slot.budgetExceeded}`
|
|
1720
|
+
: `Runtime timeout after ${Math.round(effectiveTurnTimeoutMs / 60000)} minute(s); aborted`,
|
|
1409
1721
|
replyTo: this.providerReplyTo(msg),
|
|
1410
1722
|
traceId: msg.trace?.id ?? null,
|
|
1411
1723
|
}, turnId);
|
|
@@ -1416,7 +1728,8 @@ export class Dispatcher {
|
|
|
1416
1728
|
topicId: msg.conversation.threadId ?? null,
|
|
1417
1729
|
turnId,
|
|
1418
1730
|
queueKey,
|
|
1419
|
-
timeoutMs:
|
|
1731
|
+
timeoutMs: effectiveTurnTimeoutMs,
|
|
1732
|
+
budgetExceeded: slot.budgetExceeded,
|
|
1420
1733
|
});
|
|
1421
1734
|
}
|
|
1422
1735
|
return;
|
|
@@ -1469,8 +1782,28 @@ export class Dispatcher {
|
|
|
1469
1782
|
|
|
1470
1783
|
if (!result) return;
|
|
1471
1784
|
|
|
1472
|
-
const
|
|
1473
|
-
const
|
|
1785
|
+
const rawReplyText = (result.text || "").trim();
|
|
1786
|
+
const replyLooksLikeAuthFailure = looksLikeRuntimeAuthFailure(rawReplyText);
|
|
1787
|
+
const replyText = replyLooksLikeAuthFailure ? "" : rawReplyText;
|
|
1788
|
+
const effectiveError = result.error ?? (replyLooksLikeAuthFailure ? rawReplyText : undefined);
|
|
1789
|
+
const authFailureError =
|
|
1790
|
+
effectiveError && looksLikeRuntimeAuthFailure(effectiveError) ? effectiveError : undefined;
|
|
1791
|
+
const finalTextField = truncateTextField(replyLooksLikeAuthFailure ? "" : result.text || "");
|
|
1792
|
+
if (replyLooksLikeAuthFailure) {
|
|
1793
|
+
this.log.error("dispatcher: runtime text looked like authentication failure; treating as error", {
|
|
1794
|
+
agentId: msg.accountId,
|
|
1795
|
+
roomId: msg.conversation.id,
|
|
1796
|
+
topicId: msg.conversation.threadId ?? null,
|
|
1797
|
+
turnId,
|
|
1798
|
+
runtime: route.runtime,
|
|
1799
|
+
error: rawReplyText,
|
|
1800
|
+
});
|
|
1801
|
+
}
|
|
1802
|
+
if (authFailureError) {
|
|
1803
|
+
this.recordRuntimeAuthFailure(route, msg, authFailureError);
|
|
1804
|
+
} else if (!effectiveError) {
|
|
1805
|
+
this.clearRuntimeAuthFailures(route, msg);
|
|
1806
|
+
}
|
|
1474
1807
|
|
|
1475
1808
|
// Persist session before reply so next turn sees the new id even if send fails.
|
|
1476
1809
|
//
|
|
@@ -1481,14 +1814,14 @@ export class Dispatcher {
|
|
|
1481
1814
|
// even when the adapter echoes that id back
|
|
1482
1815
|
// result.newSessionId truthy → upsert the entry
|
|
1483
1816
|
// otherwise → no-op (e.g. codex intentionally never persists)
|
|
1484
|
-
if (sessionId &&
|
|
1817
|
+
if (sessionId && effectiveError && !replyText) {
|
|
1485
1818
|
try {
|
|
1486
1819
|
await this.sessionStore.delete(key);
|
|
1487
1820
|
this.log.info("dispatcher: dropped stale runtime session", {
|
|
1488
1821
|
key,
|
|
1489
1822
|
prevRuntimeSessionId: sessionId,
|
|
1490
1823
|
nextRuntimeSessionId: result.newSessionId || null,
|
|
1491
|
-
error:
|
|
1824
|
+
error: effectiveError,
|
|
1492
1825
|
});
|
|
1493
1826
|
} catch (err) {
|
|
1494
1827
|
this.log.warn("dispatcher: session-store.delete failed", {
|
|
@@ -1496,7 +1829,7 @@ export class Dispatcher {
|
|
|
1496
1829
|
error: err instanceof Error ? err.message : String(err),
|
|
1497
1830
|
});
|
|
1498
1831
|
}
|
|
1499
|
-
} else if (result.newSessionId) {
|
|
1832
|
+
} else if (result.newSessionId && !authFailureError) {
|
|
1500
1833
|
const session: GatewaySessionEntry = {
|
|
1501
1834
|
key,
|
|
1502
1835
|
runtime: route.runtime,
|
|
@@ -1524,13 +1857,13 @@ export class Dispatcher {
|
|
|
1524
1857
|
error: err instanceof Error ? err.message : String(err),
|
|
1525
1858
|
});
|
|
1526
1859
|
}
|
|
1527
|
-
} else if (sessionId &&
|
|
1860
|
+
} else if (sessionId && effectiveError) {
|
|
1528
1861
|
try {
|
|
1529
1862
|
await this.sessionStore.delete(key);
|
|
1530
1863
|
this.log.info("dispatcher: dropped stale runtime session", {
|
|
1531
1864
|
key,
|
|
1532
1865
|
prevRuntimeSessionId: sessionId,
|
|
1533
|
-
error:
|
|
1866
|
+
error: effectiveError,
|
|
1534
1867
|
});
|
|
1535
1868
|
} catch (err) {
|
|
1536
1869
|
this.log.warn("dispatcher: session-store.delete failed", {
|
|
@@ -1541,14 +1874,14 @@ export class Dispatcher {
|
|
|
1541
1874
|
}
|
|
1542
1875
|
|
|
1543
1876
|
if (!replyText) {
|
|
1544
|
-
if (
|
|
1877
|
+
if (effectiveError) {
|
|
1545
1878
|
this.log.warn("dispatcher: runtime returned error without reply text", {
|
|
1546
1879
|
agentId: msg.accountId,
|
|
1547
1880
|
roomId: msg.conversation.id,
|
|
1548
1881
|
topicId: msg.conversation.threadId ?? null,
|
|
1549
1882
|
turnId,
|
|
1550
1883
|
runtime: route.runtime,
|
|
1551
|
-
error:
|
|
1884
|
+
error: effectiveError,
|
|
1552
1885
|
});
|
|
1553
1886
|
if (canDeliverRuntimeDiagnostics) {
|
|
1554
1887
|
const sendResult = await this.sendReply(channel, {
|
|
@@ -1557,7 +1890,7 @@ export class Dispatcher {
|
|
|
1557
1890
|
conversationId: msg.conversation.id,
|
|
1558
1891
|
threadId: msg.conversation.threadId ?? null,
|
|
1559
1892
|
type: "error",
|
|
1560
|
-
text: `⚠️ Runtime error: ${truncate(
|
|
1893
|
+
text: `⚠️ Runtime error: ${truncate(effectiveError, 500)}`,
|
|
1561
1894
|
replyTo: this.providerReplyTo(msg),
|
|
1562
1895
|
traceId: msg.trace?.id ?? null,
|
|
1563
1896
|
}, turnId);
|
|
@@ -1585,7 +1918,7 @@ export class Dispatcher {
|
|
|
1585
1918
|
costUsd: result.costUsd,
|
|
1586
1919
|
finalText: finalTextField,
|
|
1587
1920
|
deliveryStatus: "empty_text",
|
|
1588
|
-
deliveryReason:
|
|
1921
|
+
deliveryReason: effectiveError ?? null,
|
|
1589
1922
|
blocks: slot.blocks,
|
|
1590
1923
|
});
|
|
1591
1924
|
return;
|
package/src/gateway/gateway.ts
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
import { ChannelManager, type ChannelBackoffOptions } from "./channel-manager.js";
|
|
2
|
-
import {
|
|
2
|
+
import {
|
|
3
|
+
Dispatcher,
|
|
4
|
+
type DispatcherOptions,
|
|
5
|
+
type RuntimeFactory,
|
|
6
|
+
} from "./dispatcher.js";
|
|
3
7
|
import { consoleLogger, type GatewayLogger } from "./log.js";
|
|
4
8
|
import { createRuntime } from "./runtimes/registry.js";
|
|
5
9
|
import { DEFAULT_SESSION_STORE_MAX_ENTRY_AGE_MS, SessionStore } from "./session-store.js";
|
|
@@ -61,6 +65,14 @@ export interface GatewayBootOptions {
|
|
|
61
65
|
* bookkeeping like loop-risk tracking.
|
|
62
66
|
*/
|
|
63
67
|
onOutbound?: OutboundObserver;
|
|
68
|
+
onRuntimeCircuitBreakerChange?: () => void;
|
|
69
|
+
/**
|
|
70
|
+
* Optional observer fired after each runtime turn resolves. Forwarded
|
|
71
|
+
* to the dispatcher verbatim — see {@link Dispatcher} for semantics.
|
|
72
|
+
* Cloud daemon hooks this to settle ``cloud_run`` envelopes against
|
|
73
|
+
* the Hub usage ledger.
|
|
74
|
+
*/
|
|
75
|
+
onTurnComplete?: DispatcherOptions["onTurnComplete"];
|
|
64
76
|
/**
|
|
65
77
|
* Optional attention gate (PR3, design §4.2). Forwarded to the dispatcher
|
|
66
78
|
* verbatim — see {@link Dispatcher} for semantics. Returning `false` skips
|
|
@@ -169,6 +181,8 @@ export class Gateway {
|
|
|
169
181
|
onInbound: opts.onInbound,
|
|
170
182
|
composeUserTurn: opts.composeUserTurn,
|
|
171
183
|
onOutbound: opts.onOutbound,
|
|
184
|
+
onTurnComplete: opts.onTurnComplete,
|
|
185
|
+
onRuntimeCircuitBreakerChange: opts.onRuntimeCircuitBreakerChange,
|
|
172
186
|
managedRoutes: this.managedRoutes,
|
|
173
187
|
attentionGate: opts.attentionGate,
|
|
174
188
|
resolveHubUrl: opts.resolveHubUrl,
|
|
@@ -204,6 +218,7 @@ export class Gateway {
|
|
|
204
218
|
return {
|
|
205
219
|
channels: this.channelManager.status(),
|
|
206
220
|
turns: this.dispatcher.turns(),
|
|
221
|
+
runtimeCircuitBreakers: this.dispatcher.runtimeCircuitBreakers(),
|
|
207
222
|
};
|
|
208
223
|
}
|
|
209
224
|
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Patterns tested against the trimmed runtime text; a match on any of them
 * classifies the text as an authentication failure.
 *
 * NOTE(review): the second pattern is unanchored and accepts any 4xx status
 * (e.g. 404 or 429) anywhere in the text — confirm this breadth is intended
 * for a check that is meant to stay narrow.
 */
const RUNTIME_AUTH_FAILURE_PATTERNS: readonly RegExp[] = [
  // Well-known failure sentences at the very start of the text.
  /^(Failed to authenticate|Authentication failed|Invalid API key|Invalid Anthropic API key)\b/i,
  // API/CLI failure markers anywhere in the text (this also covers a leading "API Error: 4xx").
  /\b(API Error:\s*4\d\d|Request not allowed|invalid x-api-key)\b/i,
  // Bare HTTP-style status words at the start, optionally followed by a colon.
  /^(Unauthorized|Forbidden)(?:\b|:)/i,
];

/**
 * Runtime CLIs sometimes surface an authentication failure as ordinary final
 * text. This heuristic flags such text so callers can treat it as an error
 * rather than a reply.
 *
 * @param text Final text produced by the runtime (or an error string).
 * @returns `true` when the trimmed text matches one of the failure patterns;
 *   `false` for empty or whitespace-only input and for everything else.
 */
export function looksLikeRuntimeAuthFailure(text: string): boolean {
  const candidate = text.trim();
  if (candidate.length === 0) return false;
  return RUNTIME_AUTH_FAILURE_PATTERNS.some((pattern) => pattern.test(candidate));
}
|