switchroom 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/agent-scheduler/index.js +2 -2
- package/dist/auth-broker/index.js +125 -3
- package/dist/cli/drive-write-pretool.mjs +5436 -0
- package/dist/cli/switchroom.js +231 -29
- package/dist/host-control/main.js +2 -2
- package/dist/vault/approvals/kernel-server.js +2 -2
- package/dist/vault/broker/server.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
- package/telegram-plugin/admin-commands/index.ts +2 -0
- package/telegram-plugin/auth-snapshot-format.ts +612 -0
- package/telegram-plugin/auto-fallback-fleet.ts +215 -0
- package/telegram-plugin/auto-fallback.ts +28 -301
- package/telegram-plugin/dist/gateway/gateway.js +4314 -2143
- package/telegram-plugin/fleet-fallback-gate.ts +105 -0
- package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
- package/telegram-plugin/gateway/approval-callback.ts +31 -3
- package/telegram-plugin/gateway/auth-broker-client.ts +2 -0
- package/telegram-plugin/gateway/auth-command.ts +131 -10
- package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
- package/telegram-plugin/gateway/boot-card.ts +1 -1
- package/telegram-plugin/gateway/boot-probes.ts +6 -9
- package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
- package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
- package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
- package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
- package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
- package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
- package/telegram-plugin/gateway/gateway.ts +903 -173
- package/telegram-plugin/gateway/hostd-dispatch.ts +137 -2
- package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
- package/telegram-plugin/gateway/ipc-server.ts +69 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
- package/telegram-plugin/model-unavailable.ts +28 -12
- package/telegram-plugin/silence-poke.ts +153 -1
- package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
- package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
- package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
- package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
- package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
- package/telegram-plugin/tests/boot-probes.test.ts +16 -18
- package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
- package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
- package/telegram-plugin/tests/silence-poke.test.ts +237 -0
- package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
- package/telegram-plugin/turn-flush-safety.ts +55 -1
- package/telegram-plugin/uat/SETUP.md +16 -12
- package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
- package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
- package/telegram-plugin/tests/hostd-dispatch.test.ts +0 -129
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fleet-wide auto-fallback (RFC H — successor to the per-agent
|
|
3
|
+
* `performAutoFallback` in `auto-fallback.ts`).
|
|
4
|
+
*
|
|
5
|
+
* Why this exists alongside the legacy per-agent path:
|
|
6
|
+
*
|
|
7
|
+
* The pre-#XYZ auto-fallback called `fallbackToNextSlot(agentDir)`,
|
|
8
|
+
* which writes the new active slot to ONE agent's local
|
|
9
|
+
* `.claude/credentials.json`. That left the rest of the fleet still
|
|
10
|
+
* pointing at the just-exhausted account — which would then hit the
|
|
11
|
+
* wall on its own next call, surfacing N separate "Model unavailable"
|
|
12
|
+
* cards for the same root cause.
|
|
13
|
+
*
|
|
14
|
+
* Manual `/auth use <label>` already takes the fleet-wide path
|
|
15
|
+
* (broker.setActive → fan-out to all per-agent credential mirrors).
|
|
16
|
+
* Auto-fallback now uses the same path so scope is consistent and
|
|
17
|
+
* one quota event resolves the whole fleet in one swap.
|
|
18
|
+
*
|
|
19
|
+
* What this module does:
|
|
20
|
+
*
|
|
21
|
+
* 1. Probe live quota for every account in parallel via the
|
|
22
|
+
* broker (`client.probeQuota(...)`, #1336) so we pick the best
|
|
23
|
+
* target with current data, not stale broker disk-cache.
|
|
24
|
+
* 2. Skip blocked accounts entirely; pick the lowest-utilization
|
|
25
|
+
* healthy candidate (or, if none, the lowest throttling one).
|
|
26
|
+
* 3. Call `client.setActive(target)` — same broker verb /auth use
|
|
27
|
+
* uses. Broker re-mirrors creds to all agents.
|
|
28
|
+
* 4. Render the causal-shape announcement
|
|
29
|
+
* (`renderFallbackAnnouncement`) with the OLD account's binding
|
|
30
|
+
* window in the headline (5-hour vs 7-day) and the new
|
|
31
|
+
* account's headroom in the body.
|
|
32
|
+
*
|
|
33
|
+
* Pure-data return shape — caller does the actual Telegram send +
|
|
34
|
+
* lockout-record bookkeeping, mirroring the legacy module's contract.
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
import type { QuotaResult, QuotaUtilization } from './quota-check.js';
|
|
38
|
+
import type { ListStateData } from '../src/auth/broker/client.js';
|
|
39
|
+
import {
|
|
40
|
+
renderFallbackAnnouncement,
|
|
41
|
+
classifyHealth,
|
|
42
|
+
buildSnapshotsFromState,
|
|
43
|
+
type AccountSnapshot,
|
|
44
|
+
} from './auth-snapshot-format.js';
|
|
45
|
+
|
|
46
|
+
/**
 * Result of one fleet-wide auto-fallback attempt. Every variant carries
 * a ready-to-send `announcement` string; the `kind` tag tells the
 * caller which of the other fields are present.
 */
export type FleetFallbackOutcome =
  | {
      /** The broker swap was requested and resolved. */
      kind: 'switched';
      /** Label of the account that was active before the swap. */
      oldLabel: string;
      /** Label of the account the fleet was switched to. */
      newLabel: string;
      announcement: string;
      /**
       * Quota of the OLD account at the moment of failure. The caller
       * may persist it as the broker's `quota.json` so the next /auth
       * render shows the freshly-known exhaustion without re-probing.
       */
      oldQuota: QuotaUtilization;
      /** Quota of the newly active account; handy for caller logging. */
      newQuota: QuotaUtilization;
    }
  | {
      /** No account could be selected as a fallback target. */
      kind: 'all-blocked';
      oldLabel: string;
      announcement: string;
      /** Null when no quota reading exists for the old account. */
      oldQuota: QuotaUtilization | null;
    }
  | {
      /** Broker state contained no active account to fall back from. */
      kind: 'no-old-active';
      announcement: string;
    }
  | {
      /** The swap was skipped; no change was made to the fleet. */
      kind: 'no-eligible-target';
      oldLabel: string;
      announcement: string;
      /** Null when no quota reading exists for the old account. */
      oldQuota: QuotaUtilization | null;
    };
|
|
76
|
+
|
|
77
|
+
/**
 * Everything `runFleetAutoFallback` needs from the outside world. All
 * broker interaction is injected so the planner can be exercised
 * without a live broker socket.
 */
export interface FleetFallbackDeps {
  /**
   * Broker state fetched by the caller ahead of time. Passing data
   * rather than a client keeps this module testable without spinning
   * up a UDS.
   */
  state: ListStateData;
  /**
   * Live quota probes, index-aligned with `state.accounts`. Obtain via
   * `client.probeQuota(state.accounts.map(a => a.label))` and map the
   * response back to per-account results (#1336).
   */
  quotas: QuotaResult[];
  /** Invokes the broker's `setActive`; the resolved value is for logging. */
  setActive: (label: string) => Promise<{ active: string; fanned: string[] }>;
  /** Name of the agent whose failure triggered this run (announcement byline). */
  triggerAgent: string;
  /**
   * Operator timezone used for absolute reset times in the
   * announcement. `runFleetAutoFallback` falls back to `'UTC'` when
   * omitted.
   */
  tz?: string;
  /**
   * Clock override. `runFleetAutoFallback` uses the current time when
   * omitted.
   */
  now?: Date;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Decide whether the fleet should leave its active account and, if so,
 * perform the swap through `deps.setActive`.
 *
 * Outcomes, in evaluation order:
 *   - `'no-old-active'`: no snapshot is flagged active, so there is
 *     nothing to fall back from.
 *   - `'no-eligible-target'`: the active account classifies as healthy.
 *     The triggering event is treated as stale (e.g. it arrived after
 *     the quota window rolled over) and no swap happens. This is the
 *     idempotency guard.
 *   - `'all-blocked'`: `pickFallbackTarget` returned no candidate. The
 *     announcement is still rendered, with a null new account.
 *   - `'switched'`: `deps.setActive(target.label)` resolved.
 *
 * @param deps Pre-fetched broker state, quota probes and the swap
 *   callback. `tz` defaults to `'UTC'`, `now` to the current time.
 * @returns A structured outcome the caller can both log and notify on.
 * @throws Whatever `deps.setActive` rejects with; it is deliberately
 *   not caught here so the caller surfaces the failure once.
 */
export async function runFleetAutoFallback(
  deps: FleetFallbackDeps,
): Promise<FleetFallbackOutcome> {
  const now = deps.now ?? new Date();
  const tz = deps.tz ?? 'UTC';

  const accounts = buildSnapshotsFromState(deps.state, deps.quotas);
  const active = accounts.find((candidate) => candidate.isActive);
  if (!active) {
    return {
      kind: 'no-old-active',
      announcement: '<i>Auto-fallback skipped: no active account in broker state.</i>',
    };
  }

  // Stale-event guard: a healthy active account is never swapped out,
  // even though the trigger claimed exhaustion. The event may have been
  // posted before the window rolled over and picked up late.
  if (classifyHealth(active) === 'healthy') {
    const probed = pctSummary(active.quota);
    return {
      kind: 'no-eligible-target',
      oldLabel: active.label,
      oldQuota: active.quota,
      announcement:
        `<i>Auto-fallback skipped: ${active.label} probed healthy ` +
        `(${probed}). Stale event?</i>`,
    };
  }

  // Both remaining outcomes render the same announcement shape; only
  // the "new account" half differs.
  const announce = (
    newLabel: string | null,
    newQuota: QuotaUtilization | null,
  ): string =>
    renderFallbackAnnouncement({
      oldLabel: active.label,
      oldQuota: active.quota,
      newLabel,
      newQuota,
      triggerAgent: deps.triggerAgent,
      tz,
      now,
    });

  const target = pickFallbackTarget(accounts);
  if (!target) {
    // Nothing to switch to. The user is still notified; the formatter
    // receives nulls for the new account.
    return {
      kind: 'all-blocked',
      oldLabel: active.label,
      oldQuota: active.quota,
      announcement: announce(null, null),
    };
  }

  // Perform the broker swap. A rejection propagates to the caller,
  // which owns error surfacing; wrapping it here would double-report.
  await deps.setActive(target.label);

  return {
    kind: 'switched',
    oldLabel: active.label,
    newLabel: target.label,
    // NOTE(review): this assertion looks unsound. Only a 'healthy'
    // classification returns early above, so an active account with no
    // quota reading (presumably classified 'unknown', to be confirmed
    // against classifyHealth) can still reach this branch when a target
    // exists, yielding a null `oldQuota` at runtime.
    oldQuota: active.quota!,
    // pickFallbackTarget only returns snapshots whose quota is non-null.
    newQuota: target.quota!,
    announcement: announce(target.label, target.quota),
  };
}
|
|
176
|
+
|
|
177
|
+
/**
 * Pick the best non-active fallback target. Selection order:
 *   1. Healthy accounts: the one with the lowest 5h utilization (most
 *      runway).
 *   2. If no healthy alternative, throttling accounts: the one with
 *      the lowest binding-window utilization (least worst).
 *   3. Blocked and unknown accounts are skipped entirely, so we never
 *      recommend a switch into a wall or bet on creds we couldn't probe.
 *
 * Accounts without a quota reading are never considered. Ties go to
 * the account that appears first in `snapshots`.
 *
 * @param snapshots Per-account snapshots, including the active one.
 * @returns The chosen snapshot, or null when no eligible target exists.
 */
export function pickFallbackTarget(
  snapshots: AccountSnapshot[],
): AccountSnapshot | null {
  // Defensive: a missing or non-finite utilization would make any
  // comparison NaN, so rank it as worst-case instead of leaving the
  // choice to chance.
  const rank = (pct: number): number =>
    Number.isFinite(pct) ? pct : Number.POSITIVE_INFINITY;

  let bestHealthy: AccountSnapshot | null = null;
  let bestHealthyRank = Number.POSITIVE_INFINITY;
  let bestThrottling: AccountSnapshot | null = null;
  let bestThrottlingRank = Number.POSITIVE_INFINITY;

  // Single pass: track the minimum of each tier rather than sorting
  // whole arrays only to read their first element.
  for (const snap of snapshots) {
    const quota = snap.quota;
    if (snap.isActive || quota == null) continue;

    const health = classifyHealth(snap);
    if (health === 'healthy') {
      const score = rank(quota.fiveHourUtilizationPct);
      // Strict `<` keeps the earliest snapshot on ties.
      if (bestHealthy === null || score < bestHealthyRank) {
        bestHealthy = snap;
        bestHealthyRank = score;
      }
    } else if (health === 'throttling') {
      const score = rank(maxWindow(quota));
      if (bestThrottling === null || score < bestThrottlingRank) {
        bestThrottling = snap;
        bestThrottlingRank = score;
      }
    }
  }

  // Any healthy account beats every throttling one.
  return bestHealthy ?? bestThrottling;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Utilization of the tighter of the two quota windows: the larger of
 * the 5-hour and 7-day percentages.
 */
function maxWindow(q: QuotaUtilization): number {
  const { fiveHourUtilizationPct: fiveHour, sevenDayUtilizationPct: sevenDay } = q;
  return Math.max(fiveHour, sevenDay);
}
|
|
211
|
+
|
|
212
|
+
/**
 * Short "5h% / 7d%" label for a quota reading, each value rounded to
 * the nearest integer. Returns `'no probe'` when there is no reading.
 */
function pctSummary(q: QuotaUtilization | null): string {
  if (!q) {
    return 'no probe';
  }
  const fiveHour = Math.round(q.fiveHourUtilizationPct);
  const sevenDay = Math.round(q.sevenDayUtilizationPct);
  return `${fiveHour}% / ${sevenDay}%`;
}
|
|
@@ -1,252 +1,51 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* Read-only persistence for the legacy per-agent auto-fallback lockout
|
|
3
|
+
* file. The lockout writer + decision logic + plan executor were retired
|
|
4
|
+
* in PR #1329 (fleet-wide auto-fallback path supersedes the per-agent
|
|
5
|
+
* one); this module's only remaining job is to support
|
|
6
|
+
* `isAutoFallbackCooldownActive` in gateway.ts, which reads the existing
|
|
7
|
+
* on-disk lockout to defer pending-restart drains while a recent
|
|
8
|
+
* rotation is still settling.
|
|
5
9
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
* which returns a plan + side-effect descriptor the caller
|
|
13
|
-
* executes (mark exhausted, swap slot, restart agent, notify).
|
|
10
|
+
* Existing on-disk lockouts (written by pre-#1329 gateways) age out via
|
|
11
|
+
* `DEFAULT_FALLBACK_COOLDOWN_MS`; new lockouts are never written. Once
|
|
12
|
+
* every operator has run `switchroom update` post-#1329, the file goes
|
|
13
|
+
* cold and `isAutoFallbackCooldownActive` always returns false. This
|
|
14
|
+
* module + the drain-cap consumer can then be retired together in a
|
|
15
|
+
* follow-up.
|
|
14
16
|
*/
|
|
15
17
|
|
|
16
|
-
import type { QuotaResult, QuotaUtilization } from './quota-check.js';
|
|
17
|
-
import { renderOperatorEvent } from './operator-events.js';
|
|
18
|
-
|
|
19
|
-
/** Threshold over which we treat the active slot as functionally out
|
|
20
|
-
* of quota. 99.5% leaves a tiny head-room for clock skew between the
|
|
21
|
-
* Anthropic rate-limit window and wall clock, matching the dashboard's
|
|
22
|
-
* own rounding behaviour. Tune with care. */
|
|
23
|
-
export const DEFAULT_TRIGGER_UTILIZATION_PCT = 99.5;
|
|
24
|
-
|
|
25
18
|
/** Minimum time between two consecutive fallback attempts for the same
|
|
26
|
-
* slot
|
|
27
|
-
*
|
|
28
|
-
*
|
|
19
|
+
* slot — guard against poll-storm fallback loops. Read-only since
|
|
20
|
+
* PR #1329; only consumed by `isAutoFallbackCooldownActive` to bound
|
|
21
|
+
* the drain-cap defer. */
|
|
29
22
|
export const DEFAULT_FALLBACK_COOLDOWN_MS = 2 * 60_000;
|
|
30
23
|
|
|
31
24
|
export type LockoutRecord = {
|
|
32
|
-
/** Slot name most recently marked exhausted by
|
|
25
|
+
/** Slot name most recently marked exhausted by the legacy writer. */
|
|
33
26
|
lastTransitionedFrom: string | null;
|
|
34
|
-
/** Wall-clock ms timestamp of
|
|
27
|
+
/** Wall-clock ms timestamp of that transition. */
|
|
35
28
|
lastTransitionAt: number;
|
|
36
29
|
};
|
|
37
30
|
|
|
38
|
-
export type FallbackDecision =
|
|
39
|
-
| { action: 'noop'; reason: string }
|
|
40
|
-
| {
|
|
41
|
-
action: 'fallback';
|
|
42
|
-
triggerReason: 'utilization-over-threshold' | '429-response' | 'explicit';
|
|
43
|
-
resetAtMs: number | null;
|
|
44
|
-
utilizationPct: number | null;
|
|
45
|
-
};
|
|
46
|
-
|
|
47
|
-
export type EvaluateArgs = {
|
|
48
|
-
quota: QuotaResult;
|
|
49
|
-
activeSlot: string | null;
|
|
50
|
-
now: number;
|
|
51
|
-
lockout: LockoutRecord;
|
|
52
|
-
thresholdPct?: number;
|
|
53
|
-
cooldownMs?: number;
|
|
54
|
-
/** Set to true when the caller already saw a 429 response body;
|
|
55
|
-
* this short-circuits past utilization-based decisions. */
|
|
56
|
-
saw429?: boolean;
|
|
57
|
-
};
|
|
58
|
-
|
|
59
|
-
/** Pure decision function — takes a quota result + lockout state and
|
|
60
|
-
* returns whether the caller should trigger auto-fallback.
|
|
61
|
-
* No side effects. Throws only on programmer error. */
|
|
62
|
-
export function evaluateFallbackTrigger(args: EvaluateArgs): FallbackDecision {
|
|
63
|
-
const threshold = args.thresholdPct ?? DEFAULT_TRIGGER_UTILIZATION_PCT;
|
|
64
|
-
const cooldown = args.cooldownMs ?? DEFAULT_FALLBACK_COOLDOWN_MS;
|
|
65
|
-
|
|
66
|
-
if (!args.activeSlot) {
|
|
67
|
-
return { action: 'noop', reason: 'no active slot (nothing to fall back from)' };
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
// Cooldown guard: if we already transitioned out of this slot
|
|
71
|
-
// recently, don't flap. The caller can safely re-poll without
|
|
72
|
-
// creating noise.
|
|
73
|
-
if (
|
|
74
|
-
args.lockout.lastTransitionedFrom === args.activeSlot &&
|
|
75
|
-
args.now - args.lockout.lastTransitionAt < cooldown
|
|
76
|
-
) {
|
|
77
|
-
return { action: 'noop', reason: 'recent transition, within cooldown' };
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
if (args.saw429) {
|
|
81
|
-
return {
|
|
82
|
-
action: 'fallback',
|
|
83
|
-
triggerReason: '429-response',
|
|
84
|
-
resetAtMs: extractNearestResetMs(args.quota),
|
|
85
|
-
utilizationPct: extractHighestUtilization(args.quota),
|
|
86
|
-
};
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
if (!args.quota.ok) {
|
|
90
|
-
return { action: 'noop', reason: `quota check failed: ${args.quota.reason}` };
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
const highest = extractHighestUtilization(args.quota);
|
|
94
|
-
if (highest == null) {
|
|
95
|
-
return { action: 'noop', reason: 'no utilization headers' };
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
if (highest >= threshold) {
|
|
99
|
-
return {
|
|
100
|
-
action: 'fallback',
|
|
101
|
-
triggerReason: 'utilization-over-threshold',
|
|
102
|
-
resetAtMs: extractNearestResetMs(args.quota),
|
|
103
|
-
utilizationPct: highest,
|
|
104
|
-
};
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
return { action: 'noop', reason: `utilization ${highest.toFixed(1)}% below ${threshold}%` };
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
function extractHighestUtilization(q: QuotaResult): number | null {
|
|
111
|
-
if (!q.ok) return null;
|
|
112
|
-
const u: QuotaUtilization = q.data;
|
|
113
|
-
const five = u.fiveHourUtilizationPct ?? null;
|
|
114
|
-
const seven = u.sevenDayUtilizationPct ?? null;
|
|
115
|
-
if (five == null && seven == null) return null;
|
|
116
|
-
if (five == null) return seven;
|
|
117
|
-
if (seven == null) return five;
|
|
118
|
-
return Math.max(five, seven);
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
function extractNearestResetMs(q: QuotaResult): number | null {
|
|
122
|
-
if (!q.ok) return null;
|
|
123
|
-
const candidates: number[] = [];
|
|
124
|
-
if (q.data.fiveHourResetAt) candidates.push(q.data.fiveHourResetAt.getTime());
|
|
125
|
-
if (q.data.sevenDayResetAt) candidates.push(q.data.sevenDayResetAt.getTime());
|
|
126
|
-
if (candidates.length === 0) return null;
|
|
127
|
-
return Math.min(...candidates);
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
/** The full plan built by the orchestrator — mirrored by the
|
|
131
|
-
* executor in gateway.ts. Pure data so tests can assert on it. */
|
|
132
|
-
export type FallbackPlan =
|
|
133
|
-
| {
|
|
134
|
-
kind: 'executed';
|
|
135
|
-
previousSlot: string;
|
|
136
|
-
newSlot: string;
|
|
137
|
-
resetAtMs: number | null;
|
|
138
|
-
notificationHtml: string;
|
|
139
|
-
agentName: string;
|
|
140
|
-
/** Carried through from the FallbackDecision so the executor can
|
|
141
|
-
* decide whether to do a hard or graceful restart. Reactive
|
|
142
|
-
* (`429-response`) failover wants a hard restart — the request
|
|
143
|
-
* the user just made already failed, so there's no in-flight
|
|
144
|
-
* turn worth preserving. Preemptive (`utilization-over-threshold`
|
|
145
|
-
* / `explicit`) failover wants a graceful one. See #420. */
|
|
146
|
-
triggerReason: 'utilization-over-threshold' | '429-response' | 'explicit';
|
|
147
|
-
}
|
|
148
|
-
| {
|
|
149
|
-
kind: 'exhausted-all';
|
|
150
|
-
activeSlot: string;
|
|
151
|
-
resetAtMs: number | null;
|
|
152
|
-
notificationHtml: string;
|
|
153
|
-
agentName: string;
|
|
154
|
-
};
|
|
155
|
-
|
|
156
|
-
export type PerformArgs = {
|
|
157
|
-
agentDir: string;
|
|
158
|
-
agentName: string;
|
|
159
|
-
decision: Extract<FallbackDecision, { action: 'fallback' }>;
|
|
160
|
-
deps: {
|
|
161
|
-
/** Current active slot; null means caller has already detached. */
|
|
162
|
-
currentActiveSlot: (agentDir: string) => string | null;
|
|
163
|
-
markSlotQuotaExhausted: (agentDir: string, slot: string, resetAtMs?: number, reason?: string) => void;
|
|
164
|
-
fallbackToNextSlot: (name: string, agentDir: string) => { newActive: string | null; previous: string | null };
|
|
165
|
-
};
|
|
166
|
-
};
|
|
167
|
-
|
|
168
|
-
/** Run the side-effects for a fallback decision and return a plan
|
|
169
|
-
* describing what happened. Caller is responsible for:
|
|
170
|
-
* - Executing the agent restart CLI (via runSwitchroomCommand)
|
|
171
|
-
* - Sending the notification via Telegram
|
|
172
|
-
* - Updating the in-memory lockout record (see `nextLockout`)
|
|
173
|
-
*/
|
|
174
|
-
export function performAutoFallback(args: PerformArgs): FallbackPlan {
|
|
175
|
-
const active = args.deps.currentActiveSlot(args.agentDir);
|
|
176
|
-
if (!active) {
|
|
177
|
-
return {
|
|
178
|
-
kind: 'exhausted-all',
|
|
179
|
-
activeSlot: 'unknown',
|
|
180
|
-
resetAtMs: args.decision.resetAtMs,
|
|
181
|
-
notificationHtml: buildAllExhaustedMessage('unknown', args.agentName, args.decision.resetAtMs),
|
|
182
|
-
agentName: args.agentName,
|
|
183
|
-
};
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
args.deps.markSlotQuotaExhausted(
|
|
187
|
-
args.agentDir,
|
|
188
|
-
active,
|
|
189
|
-
args.decision.resetAtMs ?? undefined,
|
|
190
|
-
args.decision.triggerReason,
|
|
191
|
-
);
|
|
192
|
-
|
|
193
|
-
const { newActive, previous } = args.deps.fallbackToNextSlot(args.agentName, args.agentDir);
|
|
194
|
-
const prev = previous ?? active;
|
|
195
|
-
|
|
196
|
-
if (!newActive || newActive === prev) {
|
|
197
|
-
return {
|
|
198
|
-
kind: 'exhausted-all',
|
|
199
|
-
activeSlot: prev,
|
|
200
|
-
resetAtMs: args.decision.resetAtMs,
|
|
201
|
-
notificationHtml: buildAllExhaustedMessage(prev, args.agentName, args.decision.resetAtMs),
|
|
202
|
-
agentName: args.agentName,
|
|
203
|
-
};
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
return {
|
|
207
|
-
kind: 'executed',
|
|
208
|
-
previousSlot: prev,
|
|
209
|
-
newSlot: newActive,
|
|
210
|
-
resetAtMs: args.decision.resetAtMs,
|
|
211
|
-
notificationHtml: buildSwitchedMessage(prev, newActive, args.agentName, args.decision.resetAtMs),
|
|
212
|
-
agentName: args.agentName,
|
|
213
|
-
triggerReason: args.decision.triggerReason,
|
|
214
|
-
};
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
/** Compute the next lockout record after a successful fallback. */
|
|
218
|
-
export function nextLockout(previousSlot: string, now: number): LockoutRecord {
|
|
219
|
-
return { lastTransitionedFrom: previousSlot, lastTransitionAt: now };
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
export function emptyLockout(): LockoutRecord {
|
|
223
|
-
return { lastTransitionedFrom: null, lastTransitionAt: 0 };
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
/**
|
|
227
|
-
* Disk-persistence helpers for the lockout record. The cooldown guard
|
|
228
|
-
* lives entirely in process memory pre-fix, so a gateway restart inside
|
|
229
|
-
* the cooldown window resets the timer to zero — and a quota-flap on
|
|
230
|
-
* the now-recovering slot can re-trigger fallback the moment the
|
|
231
|
-
* gateway comes back. See #417.
|
|
232
|
-
*
|
|
233
|
-
* Storage path: \`<agentDir>/.claude/auto-fallback-lockout.json\`. We
|
|
234
|
-
* tolerate any read/parse error by returning emptyLockout (the same
|
|
235
|
-
* outcome as a fresh process), since the cooldown is a noise filter,
|
|
236
|
-
* not a security boundary.
|
|
237
|
-
*/
|
|
238
|
-
const LOCKOUT_FILE = "auto-fallback-lockout.json";
|
|
239
|
-
|
|
240
31
|
export interface LockoutPersistOps {
|
|
241
32
|
readFileSync: (path: string, encoding: BufferEncoding) => string;
|
|
33
|
+
// writeFileSync + mkdirSync stay in the interface so the gateway's
|
|
34
|
+
// existing lockoutOps bundle still type-checks. They're never called
|
|
35
|
+
// by this module any more (the writer was retired).
|
|
242
36
|
writeFileSync: (path: string, data: string, opts: { mode?: number }) => void;
|
|
243
37
|
existsSync: (path: string) => boolean;
|
|
244
38
|
mkdirSync: (path: string, opts: { recursive: true }) => void;
|
|
245
39
|
joinPath: (...parts: string[]) => string;
|
|
246
|
-
now?: () => number;
|
|
247
40
|
}
|
|
248
41
|
|
|
249
|
-
|
|
42
|
+
const LOCKOUT_FILE = "auto-fallback-lockout.json";
|
|
43
|
+
|
|
44
|
+
/**
 * Blank lockout record: no slot recorded and a zero timestamp.
 */
function emptyLockout(): LockoutRecord {
  const blank: LockoutRecord = {
    lastTransitionedFrom: null,
    lastTransitionAt: 0,
  };
  return blank;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Location of the lockout file for one agent:
 * `<agentDir>/.claude/<LOCKOUT_FILE>`, assembled with the injected
 * path joiner.
 */
function lockoutPath(agentDir: string, joinPath: LockoutPersistOps['joinPath']): string {
  const segments = [agentDir, '.claude', LOCKOUT_FILE];
  return joinPath(...segments);
}
|
|
252
51
|
|
|
@@ -274,75 +73,3 @@ export function loadLockout(agentDir: string, ops: LockoutPersistOps): LockoutRe
|
|
|
274
73
|
}
|
|
275
74
|
return emptyLockout();
|
|
276
75
|
}
|
|
277
|
-
|
|
278
|
-
export function saveLockout(
|
|
279
|
-
agentDir: string,
|
|
280
|
-
record: LockoutRecord,
|
|
281
|
-
ops: LockoutPersistOps,
|
|
282
|
-
): void {
|
|
283
|
-
const path = lockoutPath(agentDir, ops.joinPath);
|
|
284
|
-
// Best-effort: ensure the .claude directory exists, then write. Any
|
|
285
|
-
// failure is swallowed by the caller's try/catch — losing the lockout
|
|
286
|
-
// file just degrades to in-memory-only behaviour, not a hard failure.
|
|
287
|
-
ops.mkdirSync(ops.joinPath(agentDir, '.claude'), { recursive: true });
|
|
288
|
-
ops.writeFileSync(
|
|
289
|
-
path,
|
|
290
|
-
JSON.stringify(record, null, 2) + '\n',
|
|
291
|
-
{ mode: 0o600 },
|
|
292
|
-
);
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
/**
|
|
296
|
-
* Build the notification HTML for a successful slot switch.
|
|
297
|
-
* Delegates to renderOperatorEvent for quota-exhausted; appends
|
|
298
|
-
* slot-transition detail as structured context.
|
|
299
|
-
*/
|
|
300
|
-
function buildSwitchedMessage(
|
|
301
|
-
prev: string,
|
|
302
|
-
next: string,
|
|
303
|
-
agent: string,
|
|
304
|
-
resetAtMs: number | null,
|
|
305
|
-
): string {
|
|
306
|
-
const reset = resetAtMs ? formatResetAt(resetAtMs) : 'unknown';
|
|
307
|
-
const detail = [
|
|
308
|
-
`Switched from slot ${prev} to ${next}. Restarting agent.`,
|
|
309
|
-
`Reset at: ${reset}.`,
|
|
310
|
-
].join(' ');
|
|
311
|
-
return renderOperatorEvent({
|
|
312
|
-
kind: 'quota-exhausted',
|
|
313
|
-
agent,
|
|
314
|
-
detail,
|
|
315
|
-
suggestedActions: [],
|
|
316
|
-
firstSeenAt: new Date(),
|
|
317
|
-
}).text;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
/**
|
|
321
|
-
* Build the notification HTML when all slots are exhausted.
|
|
322
|
-
* Delegates to renderOperatorEvent for quota-exhausted; appends
|
|
323
|
-
* all-exhausted detail.
|
|
324
|
-
*/
|
|
325
|
-
function buildAllExhaustedMessage(
|
|
326
|
-
active: string,
|
|
327
|
-
agent: string,
|
|
328
|
-
resetAtMs: number | null,
|
|
329
|
-
): string {
|
|
330
|
-
const reset = resetAtMs ? formatResetAt(resetAtMs) : 'unknown';
|
|
331
|
-
const detail = [
|
|
332
|
-
`All account slots exhausted. Active slot: ${active}.`,
|
|
333
|
-
`Earliest reset at: ${reset}.`,
|
|
334
|
-
`Run /auth add ${agent} to attach another subscription.`,
|
|
335
|
-
].join(' ');
|
|
336
|
-
return renderOperatorEvent({
|
|
337
|
-
kind: 'quota-exhausted',
|
|
338
|
-
agent,
|
|
339
|
-
detail,
|
|
340
|
-
suggestedActions: [],
|
|
341
|
-
firstSeenAt: new Date(),
|
|
342
|
-
}).text;
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
function formatResetAt(ms: number): string {
|
|
346
|
-
// ISO with seconds trimmed — Telegram doesn't need millisecond precision.
|
|
347
|
-
return new Date(ms).toISOString().replace(/\.\d{3}Z$/, 'Z');
|
|
348
|
-
}
|