switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -11,12 +11,13 @@
|
|
|
11
11
|
* caller as a thrown error — only as ProbeResult{ status:'fail', ... }.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import { readFileSync, readdirSync, existsSync
|
|
14
|
+
import { readFileSync, readdirSync, existsSync } from 'fs'
|
|
15
15
|
import { join } from 'path'
|
|
16
16
|
import { execFile as execFileCb } from 'child_process'
|
|
17
17
|
import { promisify } from 'util'
|
|
18
18
|
|
|
19
19
|
import { readQuotaCache, writeQuotaCache } from './quota-cache.js'
|
|
20
|
+
import { fetchQuota, formatQuotaLine } from '../quota-check.js'
|
|
20
21
|
|
|
21
22
|
const execFile = promisify(execFileCb)
|
|
22
23
|
|
|
@@ -28,6 +29,13 @@ export interface ProbeResult {
|
|
|
28
29
|
status: ProbeStatus
|
|
29
30
|
label: string
|
|
30
31
|
detail: string
|
|
32
|
+
/** Plain-text remediation hint shown beneath the degraded row in the
|
|
33
|
+
* boot card. Per `reference/principles.md` principle 1, every failure
|
|
34
|
+
* should tell the user what to do next — naming the failure without a
|
|
35
|
+
* next step is the explicit ❌ Bad pattern. Omitted on ok rows (they
|
|
36
|
+
* don't render) and on degraded rows where no actionable hint exists.
|
|
37
|
+
*/
|
|
38
|
+
nextStep?: string
|
|
31
39
|
/** True when a 429 caused the probe to skip the live check. Used by
|
|
32
40
|
* writeQuotaCache to select the short RATE_LIMIT_TTL_MS instead of the
|
|
33
41
|
* default 5-min TTL. Keying off this boolean avoids matching on the
|
|
@@ -111,10 +119,18 @@ const TOKEN_EXPIRING_SOON_DAYS = 7
|
|
|
111
119
|
* Read account info from the agent's .claude.json.
|
|
112
120
|
* agentDir: e.g. /home/user/.switchroom/agents/clerk
|
|
113
121
|
*/
|
|
114
|
-
export async function probeAccount(
|
|
122
|
+
export async function probeAccount(
|
|
123
|
+
agentDir: string,
|
|
124
|
+
opts: { agentName?: string } = {},
|
|
125
|
+
): Promise<ProbeResult> {
|
|
115
126
|
return withTimeout('Account', (async (): Promise<ProbeResult> => {
|
|
116
127
|
const claudeDir = join(agentDir, '.claude')
|
|
117
128
|
const claudeJsonPath = join(claudeDir, '.claude.json')
|
|
129
|
+
// Fall back to the literal placeholder only when no agentName is plumbed
|
|
130
|
+
// through — the renderer's <code> escape will keep that safe in Telegram
|
|
131
|
+
// HTML, but real call sites should always pass the name so users can
|
|
132
|
+
// tap-to-copy a working command.
|
|
133
|
+
const agentRef = opts.agentName ?? '<agent>'
|
|
118
134
|
let cfg: ClaudeJson = {}
|
|
119
135
|
try {
|
|
120
136
|
const raw = readFileSync(claudeJsonPath, 'utf8')
|
|
@@ -125,7 +141,12 @@ export async function probeAccount(agentDir: string): Promise<ProbeResult> {
|
|
|
125
141
|
|
|
126
142
|
const acc = cfg.oauthAccount
|
|
127
143
|
if (!acc?.emailAddress) {
|
|
128
|
-
return {
|
|
144
|
+
return {
|
|
145
|
+
status: 'degraded',
|
|
146
|
+
label: 'Account',
|
|
147
|
+
detail: 'not signed in',
|
|
148
|
+
nextStep: `Run \`switchroom auth login ${agentRef}\` to start the OAuth flow`,
|
|
149
|
+
}
|
|
129
150
|
}
|
|
130
151
|
|
|
131
152
|
const plan = mapPlan(acc.billingType, acc.hasExtraUsageEnabled)
|
|
@@ -154,10 +175,16 @@ export async function probeAccount(agentDir: string): Promise<ProbeResult> {
|
|
|
154
175
|
}
|
|
155
176
|
}
|
|
156
177
|
|
|
178
|
+
const nextStep = status === 'fail'
|
|
179
|
+
? `OAuth token expired — run \`switchroom auth login ${agentRef}\` to re-authenticate`
|
|
180
|
+
: status === 'degraded'
|
|
181
|
+
? `Token expiring soon — run \`switchroom auth login ${agentRef}\` before it lapses`
|
|
182
|
+
: undefined
|
|
157
183
|
return {
|
|
158
184
|
status,
|
|
159
185
|
label: 'Account',
|
|
160
186
|
detail: `${acc.emailAddress} · ${plan}${tokenStr}`,
|
|
187
|
+
...(nextStep ? { nextStep } : {}),
|
|
161
188
|
}
|
|
162
189
|
})())
|
|
163
190
|
}
|
|
@@ -378,10 +405,36 @@ export function uptimeMsForStarttime(
|
|
|
378
405
|
}
|
|
379
406
|
}
|
|
380
407
|
|
|
408
|
+
/**
|
|
409
|
+
* Compute a remediation hint for a non-active agent systemd state. Returns
|
|
410
|
+
* `undefined` when no actionable hint applies. Per `reference/principles.md`
|
|
411
|
+
* principle 1, every degraded/fail row should tell the user what to do next.
|
|
412
|
+
* Hints share a common journalctl shape so they're greppable across
|
|
413
|
+
* agents.
|
|
414
|
+
*/
|
|
415
|
+
function nextStepForAgentState(agentName: string, state: string): string | undefined {
|
|
416
|
+
if (state === 'failed') {
|
|
417
|
+
return `Service failed — inspect with \`journalctl --user -u switchroom-${agentName} -n 100\` then \`switchroom agent restart ${agentName}\``
|
|
418
|
+
}
|
|
419
|
+
if (state === 'inactive') {
|
|
420
|
+
return `Service inactive — start with \`switchroom agent start ${agentName}\` (or \`systemctl --user start switchroom-${agentName}\`)`
|
|
421
|
+
}
|
|
422
|
+
if (state === 'deactivating' || state === 'activating' || state === 'auto-restart') {
|
|
423
|
+
return `Service is in a transient \`${state}\` state — re-check with \`switchroom agent status ${agentName}\` in a few seconds`
|
|
424
|
+
}
|
|
425
|
+
// Unknown state — keep the door open with a generic hint.
|
|
426
|
+
return `Inspect with \`journalctl --user -u switchroom-${agentName} -n 100\``
|
|
427
|
+
}
|
|
428
|
+
|
|
381
429
|
function probeAgentProcessDocker(): ProbeResult {
|
|
382
430
|
const found = findAgentProcessInContainer()
|
|
383
431
|
if (!found) {
|
|
384
|
-
return {
|
|
432
|
+
return {
|
|
433
|
+
status: 'fail',
|
|
434
|
+
label: 'Agent',
|
|
435
|
+
detail: 'claude process not found',
|
|
436
|
+
nextStep: 'No claude process in container — check container logs with `docker logs <container>` and restart with `switchroom agent restart <agent>`',
|
|
437
|
+
}
|
|
385
438
|
}
|
|
386
439
|
const uptimeMs = uptimeMsForStarttime(found.starttime)
|
|
387
440
|
const mb = Math.round(found.rssKb / 1024)
|
|
@@ -570,7 +623,8 @@ export async function probeAgentProcess(
|
|
|
570
623
|
state === 'activating' ||
|
|
571
624
|
state === 'auto-restart'
|
|
572
625
|
const status = isTransient ? 'degraded' : 'fail'
|
|
573
|
-
|
|
626
|
+
const nextStep = nextStepForAgentState(agentName, state)
|
|
627
|
+
return { status, label: 'Agent', detail: `service ${state}`, ...(nextStep ? { nextStep } : {}) }
|
|
574
628
|
}
|
|
575
629
|
|
|
576
630
|
// Still within retry budget — wait and try again.
|
|
@@ -681,7 +735,8 @@ export async function* watchAgentProcess(
|
|
|
681
735
|
state === 'auto-restart' ||
|
|
682
736
|
state === 'inactive'
|
|
683
737
|
const status = isTransient ? 'degraded' : 'fail'
|
|
684
|
-
|
|
738
|
+
const nextStep = nextStepForAgentState(agentName, state)
|
|
739
|
+
return { status, label: 'Agent', detail: `service ${state}`, ...(nextStep ? { nextStep } : {}) }
|
|
685
740
|
}
|
|
686
741
|
|
|
687
742
|
while (true) {
|
|
@@ -758,144 +813,83 @@ export async function probeGateway(info: GatewayRuntimeInfo): Promise<ProbeResul
|
|
|
758
813
|
|
|
759
814
|
// ─── Probe: Quota ─────────────────────────────────────────────────────────────
|
|
760
815
|
|
|
761
|
-
const QUOTA_DEBUG_FILE = 'quota-debug.json'
|
|
762
|
-
|
|
763
816
|
/**
|
|
764
|
-
*
|
|
765
|
-
*
|
|
766
|
-
*
|
|
817
|
+
* Read quota utilization via the Pro/Max plan rate-limit headers on a
|
|
818
|
+
* `/v1/messages` probe — the same mechanism `/usage` and `/status` use.
|
|
819
|
+
*
|
|
820
|
+
* Pre-#1163 this hit Anthropic's `/api/oauth/usage` endpoint, which has
|
|
821
|
+
* deprecated/tightened auth and now returns HTTP 403 even for healthy
|
|
822
|
+
* OAuth tokens. That produced the useless boot-card row "Quota HTTP 403
|
|
823
|
+
* — re-authenticate" while `/status` (using the unified-ratelimit
|
|
824
|
+
* headers path) reported the agent as 🟢. See `quota-check.ts` for the
|
|
825
|
+
* underlying probe and `/v1/messages` header surface.
|
|
767
826
|
*
|
|
768
|
-
* Result is cached
|
|
769
|
-
*
|
|
770
|
-
*
|
|
771
|
-
* that surface as 🟡 "rate limited" in the boot card. See `quota-cache.ts`.
|
|
827
|
+
* Result is cached briefly via `quota-cache.ts` so simultaneous fleet
|
|
828
|
+
* restarts (multiple agents booting at once, each with their own gateway)
|
|
829
|
+
* coalesce on the cache instead of each spending a `/v1/messages` token.
|
|
772
830
|
*
|
|
773
831
|
* Tests can override the cache path via SWITCHROOM_QUOTA_CACHE_PATH.
|
|
774
832
|
*/
|
|
775
833
|
export async function probeQuota(
|
|
776
834
|
claudeConfigDir: string,
|
|
777
|
-
|
|
835
|
+
_agentDir: string,
|
|
778
836
|
fetchImpl: typeof fetch = fetch,
|
|
779
837
|
): Promise<ProbeResult> {
|
|
780
838
|
return withTimeout('Quota', (async (): Promise<ProbeResult> => {
|
|
781
|
-
// Cache hit → return early (avoids the rate-limit cascade)
|
|
782
839
|
const cached = readQuotaCache()
|
|
783
840
|
if (cached) {
|
|
784
841
|
return cached
|
|
785
842
|
}
|
|
786
843
|
|
|
787
|
-
//
|
|
788
|
-
|
|
844
|
+
// The fallback per-agent token path is `accounts/default/.oauth-token`;
|
|
845
|
+
// fetchQuota's own resolver only checks the top-level `.oauth-token`,
|
|
846
|
+
// so prefer that, and if it's missing surface the same degraded row
|
|
847
|
+
// we did before (no live probe — that's a setup issue, not a runtime
|
|
848
|
+
// one).
|
|
849
|
+
let claudeDirForProbe: string | null = null
|
|
789
850
|
for (const candidate of [
|
|
790
|
-
|
|
791
|
-
join(claudeConfigDir, 'accounts', 'default'
|
|
851
|
+
claudeConfigDir,
|
|
852
|
+
join(claudeConfigDir, 'accounts', 'default'),
|
|
792
853
|
]) {
|
|
793
|
-
if (existsSync(candidate)) {
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
if (raw.length > 0) { token = raw; break }
|
|
797
|
-
} catch {}
|
|
854
|
+
if (existsSync(join(candidate, '.oauth-token'))) {
|
|
855
|
+
claudeDirForProbe = candidate
|
|
856
|
+
break
|
|
798
857
|
}
|
|
799
858
|
}
|
|
800
|
-
if (!
|
|
801
|
-
return {
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
let resp: Response
|
|
805
|
-
try {
|
|
806
|
-
const controller = new AbortController()
|
|
807
|
-
const t = setTimeout(() => controller.abort(), 1800)
|
|
808
|
-
resp = await fetchImpl('https://api.anthropic.com/api/oauth/usage', {
|
|
809
|
-
method: 'GET',
|
|
810
|
-
headers: {
|
|
811
|
-
'Authorization': `Bearer ${token}`,
|
|
812
|
-
'Accept': 'application/json',
|
|
813
|
-
'anthropic-version': '2023-06-01',
|
|
814
|
-
'anthropic-beta': 'oauth-2025-04-20',
|
|
815
|
-
'User-Agent': 'switchroom-boot/0.1',
|
|
816
|
-
},
|
|
817
|
-
signal: controller.signal,
|
|
818
|
-
})
|
|
819
|
-
clearTimeout(t)
|
|
820
|
-
} catch (err: unknown) {
|
|
821
|
-
return { status: 'fail', label: 'Quota', detail: `request failed: ${(err as Error).message ?? String(err)}` }
|
|
822
|
-
}
|
|
823
|
-
|
|
824
|
-
if (resp.status === 429) {
|
|
825
|
-
// A 429 from /api/oauth/usage means the endpoint is rate-limiting our
|
|
826
|
-
// probe calls — it does NOT mean the user is out of quota. Conflating
|
|
827
|
-
// the two is the root cause of the false 🟡 "rate limited" alarm
|
|
828
|
-
// reported in #210. Return ok-with-note and cache it for 30 s so
|
|
829
|
-
// simultaneous fleet restarts read the cached result instead of piling
|
|
830
|
-
// up on the same endpoint (see quota-cache.ts: RATE_LIMIT_TTL_MS).
|
|
831
|
-
//
|
|
832
|
-
// We assume 429 from /api/oauth/usage signals endpoint rate-limiting,
|
|
833
|
-
// not quota exhaustion. Anthropic uses 403 / 200-with-flag for the
|
|
834
|
-
// latter today; if that changes, revisit this 🟢 mapping.
|
|
835
|
-
const rateLimitResult: ProbeResult = {
|
|
836
|
-
status: 'ok',
|
|
859
|
+
if (!claudeDirForProbe) {
|
|
860
|
+
return {
|
|
861
|
+
status: 'degraded',
|
|
837
862
|
label: 'Quota',
|
|
838
|
-
detail: '
|
|
839
|
-
|
|
863
|
+
detail: 'no OAuth token',
|
|
864
|
+
nextStep: 'No OAuth token on disk — register a fleet account: `switchroom auth add <label> --from-oauth` then `switchroom auth use <label>` (RFC H)',
|
|
840
865
|
}
|
|
841
|
-
writeQuotaCache(rateLimitResult)
|
|
842
|
-
return rateLimitResult
|
|
843
|
-
}
|
|
844
|
-
if (!resp.ok) {
|
|
845
|
-
return { status: 'degraded', label: 'Quota', detail: `HTTP ${resp.status}` }
|
|
846
|
-
}
|
|
847
|
-
|
|
848
|
-
let body: unknown
|
|
849
|
-
try {
|
|
850
|
-
body = await resp.json()
|
|
851
|
-
} catch {
|
|
852
|
-
return { status: 'degraded', label: 'Quota', detail: 'invalid JSON response' }
|
|
853
866
|
}
|
|
854
867
|
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
} catch {}
|
|
866
|
-
|
|
867
|
-
// Try common field paths — schema not yet locked
|
|
868
|
-
const b = body as Record<string, unknown>
|
|
869
|
-
const sessionQuota =
|
|
870
|
-
(b?.['data'] as Record<string, unknown> | undefined)?.['session_quota'] ??
|
|
871
|
-
b?.['session_quota'] ??
|
|
872
|
-
(b?.['quota'] as Record<string, unknown> | undefined)?.['session'] ??
|
|
873
|
-
(b?.['usage'] as Record<string, unknown> | undefined)?.['session']
|
|
874
|
-
|
|
875
|
-
if (!sessionQuota) {
|
|
868
|
+
const probe = await fetchQuota({
|
|
869
|
+
claudeConfigDir: claudeDirForProbe,
|
|
870
|
+
fetchImpl,
|
|
871
|
+
timeoutMs: 1800,
|
|
872
|
+
})
|
|
873
|
+
if (!probe.ok) {
|
|
874
|
+
// Auth rejection from /v1/messages is a strong signal — the same
|
|
875
|
+
// endpoint claude itself uses. Other errors are surfaced verbatim
|
|
876
|
+
// so operators can see what's wrong.
|
|
877
|
+
const isAuth = /auth rejected|HTTP 401|HTTP 403/i.test(probe.reason)
|
|
876
878
|
return {
|
|
877
879
|
status: 'degraded',
|
|
878
880
|
label: 'Quota',
|
|
879
|
-
detail:
|
|
881
|
+
detail: probe.reason,
|
|
882
|
+
nextStep: isAuth
|
|
883
|
+
? 'Auth rejected by Anthropic — broker auto-refreshes; if persistent, replace the account: `switchroom auth add <label> --from-oauth --replace`'
|
|
884
|
+
: 'Anthropic quota probe failed — re-check after a minute; broker auto-rotates per `auth.fallback_order`',
|
|
880
885
|
}
|
|
881
886
|
}
|
|
882
887
|
|
|
883
|
-
const
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
if (typeof sq['used_pct'] === 'number') parts.push(`${Math.round(sq['used_pct'] as number)}% used`)
|
|
888
|
-
if (typeof sq['resets_in_sec'] === 'number') {
|
|
889
|
-
const sec = sq['resets_in_sec'] as number
|
|
890
|
-
const h = Math.floor(sec / 3600)
|
|
891
|
-
const m = Math.round((sec % 3600) / 60)
|
|
892
|
-
parts.push(`resets in ${h}h ${m}m`)
|
|
888
|
+
const result: ProbeResult = {
|
|
889
|
+
status: 'ok',
|
|
890
|
+
label: 'Quota',
|
|
891
|
+
detail: formatQuotaLine(probe.data),
|
|
893
892
|
}
|
|
894
|
-
|
|
895
|
-
if (parts.length === 0) {
|
|
896
|
-
return { status: 'degraded', label: 'Quota', detail: 'schema unknown — saving raw response' }
|
|
897
|
-
}
|
|
898
|
-
const result: ProbeResult = { status: 'ok', label: 'Quota', detail: parts.join(' · ') }
|
|
899
893
|
writeQuotaCache(result)
|
|
900
894
|
return result
|
|
901
895
|
})())
|
|
@@ -922,7 +916,12 @@ export async function probeHindsight(
|
|
|
922
916
|
}
|
|
923
917
|
|
|
924
918
|
if (!resp || !resp.ok) {
|
|
925
|
-
return {
|
|
919
|
+
return {
|
|
920
|
+
status: 'fail',
|
|
921
|
+
label: 'Hindsight',
|
|
922
|
+
detail: 'unreachable',
|
|
923
|
+
nextStep: 'Hindsight server not responding on 127.0.0.1:18888 — start it with `hindsight serve` or check `systemctl --user status hindsight`',
|
|
924
|
+
}
|
|
926
925
|
}
|
|
927
926
|
|
|
928
927
|
const bankSuffix = bankName ? ` · bank=${bankName}` : ''
|
|
@@ -1082,6 +1081,9 @@ export async function probeScheduler(
|
|
|
1082
1081
|
status: stillSettling ? 'degraded' : 'fail',
|
|
1083
1082
|
label: 'Scheduler',
|
|
1084
1083
|
detail: `sidecar not running (no lockfile)${settlingNote}`,
|
|
1084
|
+
nextStep: stillSettling
|
|
1085
|
+
? 'Scheduler sidecar still starting — re-check in 30s'
|
|
1086
|
+
: 'Scheduler sidecar not running — restart the agent with `switchroom agent restart <agent>` so the supervisor relaunches it',
|
|
1085
1087
|
}
|
|
1086
1088
|
}
|
|
1087
1089
|
let holderPid: number | null = null
|
|
@@ -1090,16 +1092,27 @@ export async function probeScheduler(
|
|
|
1090
1092
|
const parsed = Number.parseInt(raw, 10)
|
|
1091
1093
|
if (Number.isInteger(parsed) && parsed > 0) holderPid = parsed
|
|
1092
1094
|
} catch {
|
|
1093
|
-
return {
|
|
1095
|
+
return {
|
|
1096
|
+
status: 'degraded',
|
|
1097
|
+
label: 'Scheduler',
|
|
1098
|
+
detail: 'lockfile unreadable',
|
|
1099
|
+
nextStep: `Inspect with \`cat ${lockPath}\` — if corrupt, remove it and restart the agent so the supervisor recreates the sidecar`,
|
|
1100
|
+
}
|
|
1094
1101
|
}
|
|
1095
1102
|
if (holderPid == null) {
|
|
1096
|
-
return {
|
|
1103
|
+
return {
|
|
1104
|
+
status: 'degraded',
|
|
1105
|
+
label: 'Scheduler',
|
|
1106
|
+
detail: 'lockfile contents invalid',
|
|
1107
|
+
nextStep: `Inspect with \`cat ${lockPath}\` — if corrupt, remove it and restart the agent so the supervisor recreates the sidecar`,
|
|
1108
|
+
}
|
|
1097
1109
|
}
|
|
1098
1110
|
if (!isAlive(holderPid)) {
|
|
1099
1111
|
return {
|
|
1100
1112
|
status: 'degraded',
|
|
1101
1113
|
label: 'Scheduler',
|
|
1102
1114
|
detail: `lock holder pid ${holderPid} not alive (supervisor restart in progress?)`,
|
|
1115
|
+
nextStep: 'Supervisor should relaunch the sidecar shortly — re-check in 30s; if still stale, restart the agent',
|
|
1103
1116
|
}
|
|
1104
1117
|
}
|
|
1105
1118
|
|
|
@@ -1147,14 +1160,24 @@ async function probeUds(
|
|
|
1147
1160
|
return { status: 'ok', label, detail: 'n/a (non-docker)' }
|
|
1148
1161
|
}
|
|
1149
1162
|
if (!socketPath) {
|
|
1150
|
-
return {
|
|
1163
|
+
return {
|
|
1164
|
+
status: 'fail',
|
|
1165
|
+
label,
|
|
1166
|
+
detail: 'socket path not configured',
|
|
1167
|
+
nextStep: udsNextStep(label, 'unconfigured'),
|
|
1168
|
+
}
|
|
1151
1169
|
}
|
|
1152
1170
|
return withTimeout(label, (async (): Promise<ProbeResult> => {
|
|
1153
1171
|
if (!opts.connectImpl) {
|
|
1154
1172
|
// Cheap pre-check: stat the file. Saves the connect round-trip on
|
|
1155
1173
|
// the common "broker container down → bind mount empty" case.
|
|
1156
1174
|
if (!existsSync(socketPath)) {
|
|
1157
|
-
return {
|
|
1175
|
+
return {
|
|
1176
|
+
status: 'fail',
|
|
1177
|
+
label,
|
|
1178
|
+
detail: `socket missing: ${socketPath}`,
|
|
1179
|
+
nextStep: udsNextStep(label, 'missing'),
|
|
1180
|
+
}
|
|
1158
1181
|
}
|
|
1159
1182
|
}
|
|
1160
1183
|
const connect = opts.connectImpl ?? defaultUdsConnect
|
|
@@ -1164,13 +1187,31 @@ async function probeUds(
|
|
|
1164
1187
|
} catch (err: unknown) {
|
|
1165
1188
|
const code = (err as NodeJS.ErrnoException)?.code
|
|
1166
1189
|
const msg = (err as Error)?.message ?? String(err)
|
|
1167
|
-
if (code === 'ENOENT') return { status: 'fail', label, detail: 'socket missing' }
|
|
1168
|
-
if (code === 'ECONNREFUSED') return { status: 'fail', label, detail: 'connection refused' }
|
|
1169
|
-
return { status: 'fail', label, detail: `connect failed: ${msg}
|
|
1190
|
+
if (code === 'ENOENT') return { status: 'fail', label, detail: 'socket missing', nextStep: udsNextStep(label, 'missing') }
|
|
1191
|
+
if (code === 'ECONNREFUSED') return { status: 'fail', label, detail: 'connection refused', nextStep: udsNextStep(label, 'refused') }
|
|
1192
|
+
return { status: 'fail', label, detail: `connect failed: ${msg}`, nextStep: udsNextStep(label, 'other') }
|
|
1170
1193
|
}
|
|
1171
1194
|
})())
|
|
1172
1195
|
}
|
|
1173
1196
|
|
|
1197
|
+
/**
|
|
1198
|
+
* Remediation hints for the UDS (vault-broker / approval-kernel) probe.
|
|
1199
|
+
* Both services are run by docker-compose alongside agents; recovery is
|
|
1200
|
+
* almost always the same shape ("the service container isn't up"), so we
|
|
1201
|
+
* surface the right `docker compose` target per label.
|
|
1202
|
+
*/
|
|
1203
|
+
function udsNextStep(label: string, kind: 'missing' | 'refused' | 'unconfigured' | 'other'): string {
|
|
1204
|
+
const svc = label.toLowerCase() === 'broker' ? 'vault-broker' : 'approval-kernel'
|
|
1205
|
+
if (kind === 'unconfigured') {
|
|
1206
|
+
return `${label} socket path not set — check the compose mount for the agent container`
|
|
1207
|
+
}
|
|
1208
|
+
if (kind === 'refused') {
|
|
1209
|
+
return `${label} socket present but not accepting connections — restart with \`docker compose restart ${svc}\``
|
|
1210
|
+
}
|
|
1211
|
+
// missing | other: most common case is the daemon container isn't running.
|
|
1212
|
+
return `${label} socket not reachable — bring up the daemon with \`docker compose up -d ${svc}\` (or check \`docker compose ps\`)`
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1174
1215
|
/**
|
|
1175
1216
|
* Default UDS connect — opens a stream, then immediately closes it.
|
|
1176
1217
|
* Resolves on `connect` event, rejects on `error`. 1s connect timeout
|
|
@@ -1236,7 +1277,7 @@ export async function probeKernel(
|
|
|
1236
1277
|
*/
|
|
1237
1278
|
export async function probeSkills(
|
|
1238
1279
|
agentDir: string,
|
|
1239
|
-
opts: { fs?: SkillsFsImpl; maxNamesShown?: number } = {},
|
|
1280
|
+
opts: { fs?: SkillsFsImpl; maxNamesShown?: number; agentName?: string } = {},
|
|
1240
1281
|
): Promise<ProbeResult> {
|
|
1241
1282
|
return withTimeout('Skills', (async (): Promise<ProbeResult> => {
|
|
1242
1283
|
const fs = opts.fs ?? realSkillsFs
|
|
@@ -1282,10 +1323,12 @@ export async function probeSkills(
|
|
|
1282
1323
|
}
|
|
1283
1324
|
const named = dangling.slice(0, max).join(', ')
|
|
1284
1325
|
const more = dangling.length > max ? ` +${dangling.length - max} more` : ''
|
|
1326
|
+
const reconcileTarget = opts.agentName ? ` ${opts.agentName}` : ''
|
|
1285
1327
|
return {
|
|
1286
1328
|
status: 'degraded',
|
|
1287
1329
|
label: 'Skills',
|
|
1288
1330
|
detail: `${dangling.length}/${entries.length} dangling: ${named}${more}`,
|
|
1331
|
+
nextStep: `Run \`switchroom agent reconcile${reconcileTarget}\` to rebuild symlinks, or remove unused entries from switchroom.yaml`,
|
|
1289
1332
|
}
|
|
1290
1333
|
})())
|
|
1291
1334
|
}
|
|
@@ -14,12 +14,44 @@ import type { SessionMarker } from './session-marker.js'
|
|
|
14
14
|
// Re-export so tests can import from a single path
|
|
15
15
|
export type { RestartReason }
|
|
16
16
|
|
|
17
|
+
/**
|
|
18
|
+
* Operator-initiated restart-marker freshness window. Longer than the
|
|
19
|
+
* default `clean-shutdown.json` window (60s) because operator-driven
|
|
20
|
+
* flows — specifically `switchroom update` from the host CLI — stamp
|
|
21
|
+
* the marker BEFORE `docker compose up -d --remove-orphans` runs, and
|
|
22
|
+
* the recreate for a multi-agent fleet can comfortably take longer
|
|
23
|
+
* than 60s to bring every container's gateway back up (9 agents ×
|
|
24
|
+
* docker network/volume setup + gateway boot probes). Without this
|
|
25
|
+
* extended window, my "operator: switchroom update" marker reads
|
|
26
|
+
* stale by the time the late-bootstrapping agent's gateway reads it
|
|
27
|
+
* — `determineRestartReason` falls through to `'crash'` and the
|
|
28
|
+
* boot card renders the planned redeploy as a crash with a noisy
|
|
29
|
+
* `agent-crashed` operator-events broadcast (the very pattern
|
|
30
|
+
* PR #1139 set out to suppress).
|
|
31
|
+
*
|
|
32
|
+
* Five minutes is generous: a 50-agent fleet recreate would still
|
|
33
|
+
* finish well inside it, and we still treat a 5-min-old marker as a
|
|
34
|
+
* crash if the gateway eventually does come up so the longer window
|
|
35
|
+
* isn't a "silent forever" mode. Verified end-to-end against a 9-agent
|
|
36
|
+
* fleet on 2026-05-13: latest-recreated agent's marker age was 97s.
|
|
37
|
+
*
|
|
38
|
+
* Keyed on the reason-text prefix (`operator:`) so user/cli/in-gateway
|
|
39
|
+
* restart paths keep their 60s tight window — those produce a much
|
|
40
|
+
* shorter shutdown-to-boot delta and a 5-min window there would mask
|
|
41
|
+
* a real crash during/after a `/restart`.
|
|
42
|
+
*/
|
|
43
|
+
const OPERATOR_MARKER_MAX_AGE_MS = 5 * 60_000
|
|
44
|
+
|
|
17
45
|
/**
|
|
18
46
|
* Determine why this gateway is starting up.
|
|
19
47
|
*
|
|
20
48
|
* Priority order:
|
|
21
49
|
* 1. restart-pending.json present + fresh (<5 min) → 'planned'
|
|
22
|
-
* 2. clean-shutdown.json present + fresh
|
|
50
|
+
* 2. clean-shutdown.json present + fresh:
|
|
51
|
+
* - default <60s → 'graceful'
|
|
52
|
+
* - reason starts with `operator:` → <5min → 'graceful' (#1141
|
|
53
|
+
* follow-up: fleet recreate can exceed 60s and still be a
|
|
54
|
+
* planned operator update)
|
|
23
55
|
* 3. gateway-session.json present (prior process existed) → 'crash'
|
|
24
56
|
* 4. Otherwise → 'fresh'
|
|
25
57
|
*/
|
|
@@ -30,6 +62,7 @@ export function determineRestartReason(opts: {
|
|
|
30
62
|
now: number
|
|
31
63
|
cleanMaxAgeMs?: number
|
|
32
64
|
markerMaxAgeMs?: number
|
|
65
|
+
operatorMaxAgeMs?: number
|
|
33
66
|
}): RestartReason {
|
|
34
67
|
const {
|
|
35
68
|
marker,
|
|
@@ -38,14 +71,15 @@ export function determineRestartReason(opts: {
|
|
|
38
71
|
now,
|
|
39
72
|
cleanMaxAgeMs = CLEAN_SHUTDOWN_MAX_AGE_MS,
|
|
40
73
|
markerMaxAgeMs = 5 * 60_000,
|
|
74
|
+
operatorMaxAgeMs = OPERATOR_MARKER_MAX_AGE_MS,
|
|
41
75
|
} = opts
|
|
42
76
|
if (marker != null && now - marker.ts < markerMaxAgeMs) return 'planned'
|
|
43
|
-
if (
|
|
44
|
-
cleanMarker
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
77
|
+
if (cleanMarker != null && now - cleanMarker.ts >= 0) {
|
|
78
|
+
const isOperator = typeof cleanMarker.reason === 'string'
|
|
79
|
+
&& cleanMarker.reason.startsWith('operator:')
|
|
80
|
+
const window = isOperator ? operatorMaxAgeMs : cleanMaxAgeMs
|
|
81
|
+
if (now - cleanMarker.ts < window) return 'graceful'
|
|
82
|
+
}
|
|
49
83
|
if (sessionMarker != null) return 'crash'
|
|
50
84
|
return 'fresh'
|
|
51
85
|
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure boot-card version-string composer + helpers.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from gateway.ts so the version-string code path can be
|
|
5
|
+
* exercised by property-based tests without dragging in the gateway's
|
|
6
|
+
* runtime side effects (env loading, bot client init, etc.). Live
|
|
7
|
+
* callers stay in gateway.ts; this file is pure functions only.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
export type BootVersionInputs = {
|
|
11
|
+
version: string
|
|
12
|
+
commitSha: string | null
|
|
13
|
+
commitDate: string | null
|
|
14
|
+
latestPr: number | null
|
|
15
|
+
commitsAheadOfTag: number | null
|
|
16
|
+
claudeCliVersion: string | null
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export function formatRelativeAgo(iso: string | null): string | null {
|
|
20
|
+
if (!iso) return null
|
|
21
|
+
const t = Date.parse(iso)
|
|
22
|
+
if (Number.isNaN(t)) return null
|
|
23
|
+
const diffSec = Math.max(0, Math.floor((Date.now() - t) / 1000))
|
|
24
|
+
if (diffSec < 60) return `${diffSec}s ago`
|
|
25
|
+
if (diffSec < 3600) return `${Math.floor(diffSec / 60)}m ago`
|
|
26
|
+
if (diffSec < 86400) return `${Math.floor(diffSec / 3600)}h ago`
|
|
27
|
+
return `${Math.floor(diffSec / 86400)}d ago`
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Compose the version string shown in the boot-card ack line and the
|
|
32
|
+
* status card's Version row. Two shapes, matching the deleted greeting
|
|
33
|
+
* card's behavior:
|
|
34
|
+
*
|
|
35
|
+
* - on a tag (commits_ahead = 0 or null): "v0.2.0 · #44 · claude 2.1.123 · 2h ago"
|
|
36
|
+
* (omit "#44 ·" when no PR was parsed; omit claude segment if unavailable)
|
|
37
|
+
* - ahead of a tag (commits_ahead > 0): "v0.2.0+3 · db6de9e · claude 2.1.123 · 2m ago"
|
|
38
|
+
* (always show short SHA when ahead, omit PR)
|
|
39
|
+
*
|
|
40
|
+
* Age segment is omitted if no commit date is available (npm consumer).
|
|
41
|
+
*
|
|
42
|
+
* Sanitization: claude --version output is whitespace-collapsed before
|
|
43
|
+
* embedding — a malicious or rogue `claude` on PATH must not be able to
|
|
44
|
+
* smuggle newlines into the ack line. HTML escaping happens at the
|
|
45
|
+
* boot-card boundary (see boot-card.ts: escapeHtml(version)).
|
|
46
|
+
*/
|
|
47
|
+
export function composeBootVersionString(inputs: BootVersionInputs): string {
|
|
48
|
+
const ago = formatRelativeAgo(inputs.commitDate)
|
|
49
|
+
const onTag = inputs.commitsAheadOfTag === 0 || inputs.commitsAheadOfTag === null
|
|
50
|
+
const claudeVerRaw = inputs.claudeCliVersion?.replace(/\s+/g, ' ').trim()
|
|
51
|
+
const claudeVer = claudeVerRaw && claudeVerRaw.length > 0 ? claudeVerRaw : null
|
|
52
|
+
|
|
53
|
+
if (onTag) {
|
|
54
|
+
const parts: string[] = [`v${inputs.version}`]
|
|
55
|
+
if (inputs.latestPr != null) parts.push(`#${inputs.latestPr}`)
|
|
56
|
+
if (claudeVer) parts.push(`claude ${claudeVer}`)
|
|
57
|
+
if (ago) parts.push(ago)
|
|
58
|
+
return parts.join(' · ')
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
const parts: string[] = [`v${inputs.version}+${inputs.commitsAheadOfTag}`]
|
|
62
|
+
if (inputs.commitSha) parts.push(inputs.commitSha)
|
|
63
|
+
if (claudeVer) parts.push(`claude ${claudeVer}`)
|
|
64
|
+
if (ago) parts.push(ago)
|
|
65
|
+
return parts.join(' · ')
|
|
66
|
+
}
|