switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UAT scenario — operator pastes a real-shaped secret into the bot's
|
|
3
|
+
* DM; bot detects, deletes the original, posts a redaction card.
|
|
4
|
+
*
|
|
5
|
+
* Part of: secret-redaction bug class reported 2026-05-12 (Bug A —
|
|
6
|
+
* sometimes the original message isn't actually deleted from chat
|
|
7
|
+
* history despite the bot claiming it was).
|
|
8
|
+
*
|
|
9
|
+
* **Skipped by default.** To unskip:
|
|
10
|
+
*
|
|
11
|
+
* 1. Run the standard UAT preflight (uat/SETUP.md §5-6) so the
|
|
12
|
+
* test-harness agent is live and the driver session is auth'd.
|
|
13
|
+
*
|
|
14
|
+
* 2. Verify the test-harness chat has secret-detect enabled. The
|
|
15
|
+
* agent's switchroom.yaml `access.json` must include the driver
|
|
16
|
+
* in `allowFrom` so the driver's paste is treated as a real
|
|
17
|
+
* operator message (not silently ignored). Existing UAT setup
|
|
18
|
+
* already covers this for the smoke scenario.
|
|
19
|
+
*
|
|
20
|
+
* 3. Confirm a vault passphrase is cached in the test-harness chat
|
|
21
|
+
* so the high-confidence-stored branch fires (not the
|
|
22
|
+
* no-passphrase deferred branch). Easiest: send `/vault unlock`
|
|
23
|
+
* + passphrase as the driver once before running this scenario.
|
|
24
|
+
* Without a cached passphrase the assertion changes — the bot
|
|
25
|
+
* posts the "🔒 caught a secret. tap below to unlock the vault
|
|
26
|
+
* and save it" card instead of "🔒 captured N secrets:". Both
|
|
27
|
+
* paths MUST delete the original; the matcher here is loose
|
|
28
|
+
* enough to accept either.
|
|
29
|
+
*
|
|
30
|
+
* 4. Remove the `describe.skip` below.
|
|
31
|
+
*
|
|
32
|
+
* Why skipped: sends a real-shaped (but synthetic) secret-pattern
|
|
33
|
+
* string into Telegram. The pattern doesn't unlock any actual
|
|
34
|
+
* secret, but committing the scenario unskipped would also commit
|
|
35
|
+
* the test fixture into git history where secretlint pre-commit
|
|
36
|
+
* hooks might flag it. Generated at runtime to dodge the scan.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
import { describe, expect, it } from "vitest";
|
|
40
|
+
import { spinUp } from "../harness.js";
|
|
41
|
+
|
|
42
|
+
describe.skip("uat: secret-redaction deletes the original message (Bug A 2026-05-12)", () => {
|
|
43
|
+
it(
|
|
44
|
+
"paste a real-shaped secret; bot deletes the original from chat history",
|
|
45
|
+
async () => {
|
|
46
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
47
|
+
try {
|
|
48
|
+
// Build a real-shaped (but synthetic) ANTHROPIC_API_KEY value
|
|
49
|
+
// at runtime so the source file doesn't trip Push Protection.
|
|
50
|
+
// Same idiom as telegram-plugin/tests/secret-detect-secretlint.test.ts:1.
|
|
51
|
+
const fakeApiKey =
|
|
52
|
+
`sk-ant-` + "a1b2c3d4".repeat(4) + "_test_synthetic";
|
|
53
|
+
const inboundText = `set ANTHROPIC_API_KEY=${fakeApiKey}`;
|
|
54
|
+
|
|
55
|
+
// Send the secret-bearing message. Capture the messageId we
|
|
56
|
+
// sent so we can later assert it's gone from history.
|
|
57
|
+
const sent = await sc.sendDM(inboundText);
|
|
58
|
+
const sentMessageId = sent.messageId;
|
|
59
|
+
|
|
60
|
+
// The bot should reply with either:
|
|
61
|
+
// - "🔒 captured N secret(s):" (high-confidence stored
|
|
62
|
+
// path, requires cached passphrase)
|
|
63
|
+
// - "🔒 caught a secret. we deleted it from chat. tap
|
|
64
|
+
// below to unlock the vault..." (deferred path)
|
|
65
|
+
// OR the new fail-loud variant (if delete failed):
|
|
66
|
+
// - "⚠️ Could not auto-delete message containing your ..."
|
|
67
|
+
// The contract this test pins is: ONE of the first two
|
|
68
|
+
// success messages appears AND the original message is
|
|
69
|
+
// actually gone from history.
|
|
70
|
+
const reply = await sc.expectMessage(
|
|
71
|
+
/🔒 (captured|caught)/,
|
|
72
|
+
{ from: "bot", timeout: 30_000 },
|
|
73
|
+
);
|
|
74
|
+
expect(reply.text).toMatch(/deleted (it )?from chat|captured/i);
|
|
75
|
+
|
|
76
|
+
// The load-bearing assertion: the original message is
|
|
77
|
+
// unreachable in chat history. driver.getMessage returns
|
|
78
|
+
// null for deleted messages (driver.ts:525-534).
|
|
79
|
+
//
|
|
80
|
+
// Pre-2026-05-12 fix: this would sometimes pass when delete
|
|
81
|
+
// succeeded and silently leave the message behind when it
|
|
82
|
+
// failed (Telegram rate limits, network blip, message was
|
|
83
|
+
// edited mid-delete, etc.) — and the operator would never
|
|
84
|
+
// know.
|
|
85
|
+
//
|
|
86
|
+
// Post-fix: deleteSensitiveMessage either deletes
|
|
87
|
+
// successfully OR posts an in-chat warning "⚠️ Could not
|
|
88
|
+
// auto-delete..." which we'd see as a SECOND bot message.
|
|
89
|
+
// The assertion here is the strict "actually gone" version.
|
|
90
|
+
// chat_id for the driver's view of a DM = the partner's
|
|
91
|
+
// (bot's) user_id.
|
|
92
|
+
const stillThere = await sc.driver.getMessage(sc.botUserId, sentMessageId);
|
|
93
|
+
expect(
|
|
94
|
+
stillThere,
|
|
95
|
+
`original secret-bearing message ${sentMessageId} was NOT deleted — Telegram history still has it`,
|
|
96
|
+
).toBeNull();
|
|
97
|
+
} finally {
|
|
98
|
+
await sc.tearDown();
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
120_000,
|
|
102
|
+
);
|
|
103
|
+
|
|
104
|
+
it(
|
|
105
|
+
"when delete fails (simulated by editing the message just before delete), the bot posts a warning naming the leaked msg_id",
|
|
106
|
+
async () => {
|
|
107
|
+
// This case is harder to repro without a fault-injection
|
|
108
|
+
// hook — Telegram doesn't let us "make deleteMessage fail
|
|
109
|
+
// deterministically" from the driver side. The contract is
|
|
110
|
+
// pinned by the unit test at
|
|
111
|
+
// telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts
|
|
112
|
+
// (deleteSensitiveMessage helper still logs SECURITY: …
|
|
113
|
+
// FAILED + posts an in-chat warning on its catch path).
|
|
114
|
+
// This UAT slot stays skipped pending a fault-injection
|
|
115
|
+
// affordance in the driver — tracked as a TODO on the
|
|
116
|
+
// harness roadmap.
|
|
117
|
+
const _ = await spinUp({ agent: "test-harness" });
|
|
118
|
+
void _;
|
|
119
|
+
expect(true).toBe(true);
|
|
120
|
+
},
|
|
121
|
+
60_000,
|
|
122
|
+
);
|
|
123
|
+
});
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UAT scenario — operator chats casually about secrets/tokens
|
|
3
|
+
* (mentioning the words, not pasting actual credentials); bot
|
|
4
|
+
* MUST NOT redact the operator's question.
|
|
5
|
+
*
|
|
6
|
+
* Part of: secret-redaction bug class reported 2026-05-12 (Bug B —
|
|
7
|
+
* false positive on the word "secret"/"token" or on
|
|
8
|
+
* code-shaped-but-placeholder values like `MY_TOKEN=hello`).
|
|
9
|
+
*
|
|
10
|
+
* **Skipped by default.** Unskip after the standard UAT preflight
|
|
11
|
+
* (uat/SETUP.md §5-6). No host-state mutations.
|
|
12
|
+
*
|
|
13
|
+
* The unit-shape contract is pinned in
|
|
14
|
+
* `telegram-plugin/tests/secret-detect-false-positives.test.ts` —
|
|
15
|
+
* which runs every CI cycle. This UAT scenario adds the
|
|
16
|
+
* end-to-end Telegram round-trip so a future regression in the
|
|
17
|
+
* gateway integration (not the detector) would also surface.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { describe, expect, it } from "vitest";
|
|
21
|
+
import { spinUp } from "../harness.js";
|
|
22
|
+
|
|
23
|
+
const CASUAL_MENTIONS = [
|
|
24
|
+
"what's my fatsecret token?",
|
|
25
|
+
"delete that secret you sent earlier",
|
|
26
|
+
"the FATSECRET_TOKEN env var is missing",
|
|
27
|
+
"set MY_TOKEN=hello and try again",
|
|
28
|
+
"I keep forgetting my password again",
|
|
29
|
+
];
|
|
30
|
+
|
|
31
|
+
describe.skip("uat: secret-redaction does NOT fire on casual mentions (Bug B 2026-05-12)", () => {
|
|
32
|
+
for (const text of CASUAL_MENTIONS) {
|
|
33
|
+
it(
|
|
34
|
+
`does not redact: ${JSON.stringify(text)}`,
|
|
35
|
+
async () => {
|
|
36
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
37
|
+
try {
|
|
38
|
+
const sent = await sc.sendDM(text);
|
|
39
|
+
|
|
40
|
+
// Wait a short period for any (incorrect) redaction reply
|
|
41
|
+
// to arrive. If the bot is going to fire the redaction
|
|
42
|
+
// pipeline, it does so synchronously in handleInbound —
|
|
43
|
+
// well under 10s.
|
|
44
|
+
//
|
|
45
|
+
// The assertion: we should NOT see a "🔒 captured" or
|
|
46
|
+
// "🔒 caught a secret" reply. If we do, the false
|
|
47
|
+
// positive is back.
|
|
48
|
+
//
|
|
49
|
+
// We tolerate the bot's normal Claude reply (which is
|
|
50
|
+
// unrelated content). Pin only the absence of the
|
|
51
|
+
// redaction marker.
|
|
52
|
+
let sawRedaction = false;
|
|
53
|
+
try {
|
|
54
|
+
await sc.expectMessage(/🔒 (captured|caught)/, {
|
|
55
|
+
from: "bot",
|
|
56
|
+
timeout: 10_000,
|
|
57
|
+
});
|
|
58
|
+
sawRedaction = true;
|
|
59
|
+
} catch {
|
|
60
|
+
// Expected: timeout means no redaction fired.
|
|
61
|
+
}
|
|
62
|
+
expect(
|
|
63
|
+
sawRedaction,
|
|
64
|
+
`false-positive redaction fired on casual chat: ${JSON.stringify(text)}`,
|
|
65
|
+
).toBe(false);
|
|
66
|
+
|
|
67
|
+
// The original message must remain visible — the
|
|
68
|
+
// operator asked a real question, and the bot deleting
|
|
69
|
+
// it would be terrible UX.
|
|
70
|
+
// chat_id for the driver's view of a DM = the partner's
|
|
71
|
+
// (bot's) user_id.
|
|
72
|
+
const stillThere = await sc.driver.getMessage(
|
|
73
|
+
sc.botUserId,
|
|
74
|
+
sent.messageId,
|
|
75
|
+
);
|
|
76
|
+
expect(
|
|
77
|
+
stillThere,
|
|
78
|
+
`the bot deleted the operator's question (false positive on '${text}')`,
|
|
79
|
+
).not.toBeNull();
|
|
80
|
+
} finally {
|
|
81
|
+
await sc.tearDown();
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
60_000,
|
|
85
|
+
);
|
|
86
|
+
}
|
|
87
|
+
});
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Silence-poke soft-fire end-to-end scenario.
|
|
3
|
+
*
|
|
4
|
+
* Goal context: cause class CC-3 in `docs/status-ask-cause-classes.md`
|
|
5
|
+
* — the L3 safety net. Unit tests (`silence-poke.test.ts`) cover the
|
|
6
|
+
* state machine: tick semantics, ladder thresholds, success measurement.
|
|
7
|
+
* They DO NOT cover the wire path between `consumeArmedPoke()` (in
|
|
8
|
+
* `silence-poke.ts`) and the model actually receiving the
|
|
9
|
+
* `[silence-poke]` system-reminder block on its next tool result.
|
|
10
|
+
*
|
|
11
|
+
* The wire path lives at `gateway.ts:2740`:
|
|
12
|
+
*
|
|
13
|
+
* onToolCall → executeToolCall(...) → consumeArmedPoke() →
|
|
14
|
+
* append `<system-reminder>[silence-poke] ...</system-reminder>`
|
|
15
|
+
* to the tool-result text.
|
|
16
|
+
*
|
|
17
|
+
* If that integration ever breaks — a refactor swaps `executeToolCall`
|
|
18
|
+
* for a path that doesn't call `consumeArmedPoke`, the result-content
|
|
19
|
+
* shape mutation gets dropped, MCP framing changes — the unit tests
|
|
20
|
+
* still pass but the model never sees the nudge, the chat stays silent
|
|
21
|
+
* past 75s, and `inbound_status_query` ticks. This UAT closes that
|
|
22
|
+
* regression window end-to-end.
|
|
23
|
+
*
|
|
24
|
+
* ## Strategy
|
|
25
|
+
*
|
|
26
|
+
* Force the agent into a stretch of silent tool churn that exceeds the
|
|
27
|
+
* 75s soft threshold without the model emitting any outbound `reply`.
|
|
28
|
+
* The conversational-pacing prompt instructs the model to soft-commit
|
|
29
|
+
* fast turns, so we have to explicitly suppress that:
|
|
30
|
+
*
|
|
31
|
+
* - Prompt instructs three sequential 30s `sleep` Bash calls, NO
|
|
32
|
+
* mid-turn replies, single final reply when done.
|
|
33
|
+
* - Total silent stretch is ~90s + tool overhead, comfortably past
|
|
34
|
+
* the 75s soft threshold.
|
|
35
|
+
* - If the silence-poke wire works: the model sees the
|
|
36
|
+
* `[silence-poke]` system-reminder appended to the result of the
|
|
37
|
+
* first or second sleep, breaks the no-reply rule, sends a brief
|
|
38
|
+
* update. We observe a reply in the [70s, 200s] window.
|
|
39
|
+
* - If the wire is broken: model never receives the nudge, no
|
|
40
|
+
* reply until the third sleep ends at ~90s+, OR the framework
|
|
41
|
+
* fallback at 300s fires. We catch the latter as a separate
|
|
42
|
+
* failure (the framework fallback is the FLOOR, not the goal).
|
|
43
|
+
*
|
|
44
|
+
* ## Tolerances
|
|
45
|
+
*
|
|
46
|
+
* Real-Telegram UAT against a real Claude model has variability:
|
|
47
|
+
*
|
|
48
|
+
* - Model may insert one soft-commit "on it" reply at start; that
|
|
49
|
+
* resets the silence clock. Three 30s sleeps still push the
|
|
50
|
+
* post-commit silence past 75s as long as the commit lands
|
|
51
|
+
* within the first ~10s. We tolerate this.
|
|
52
|
+
* - Model may decline to follow the "no replies" instruction and
|
|
53
|
+
* send updates organically; if the FIRST reply still lands in
|
|
54
|
+
* [70s, 200s], the conversational pacing layer is doing its job
|
|
55
|
+
* and the test passes regardless of whether silence-poke
|
|
56
|
+
* specifically fired.
|
|
57
|
+
* - Window is generous (70-200s) to absorb 5s poll interval,
|
|
58
|
+
* mtcute receive lag, Telegram delivery jitter.
|
|
59
|
+
*
|
|
60
|
+
* ## Failure shapes the assertion catches
|
|
61
|
+
*
|
|
62
|
+
* 1. Wire path broken — first reply lands >200s after sendDM
|
|
63
|
+
* because the framework fallback (300s) is the only thing that
|
|
64
|
+
* eventually breaks the silence.
|
|
65
|
+
* 2. Soft poke armed but not drained — first reply lands at >200s
|
|
66
|
+
* similarly.
|
|
67
|
+
* 3. Model misbehavior — first reply is the FINAL answer (long
|
|
68
|
+
* text after all three sleeps complete at ~90s+); strictly that
|
|
69
|
+
* passes the window check, but the test also asserts the first
|
|
70
|
+
* reply is brief (<400 chars) as a sanity floor on "this is
|
|
71
|
+
* actually a poke response, not the final answer." Skip strict
|
|
72
|
+
* length if the prompt happens to be so simple the final
|
|
73
|
+
* answer IS brief.
|
|
74
|
+
*
|
|
75
|
+
* Requires the same env as `smoke-dm-reply.test.ts` (see
|
|
76
|
+
* `uat/SETUP.md` §6). Long-running: outer budget 4 min.
|
|
77
|
+
*/
|
|
78
|
+
|
|
79
|
+
import { describe, expect, it } from "vitest";
|
|
80
|
+
import { spinUp } from "../harness.js";
|
|
81
|
+
|
|
82
|
+
const SOFT_WINDOW_MIN_MS = 70_000;
|
|
83
|
+
const SOFT_WINDOW_MAX_MS = 200_000;
|
|
84
|
+
|
|
85
|
+
// Explicit instruction shape. Mirrors the `BG_DISPATCH_PROMPT` pattern
|
|
86
|
+
// in `bg-sub-agent-dispatch-dm.test.ts` — pin the tool + the sequence
|
|
87
|
+
// so behaviour is deterministic enough to test the *infra*, not the
|
|
88
|
+
// model's free-form judgement.
|
|
89
|
+
const SILENT_CHURN_PROMPT =
|
|
90
|
+
"I need you to test something. Run THREE separate Bash tool calls " +
|
|
91
|
+
"in sequence: first `sleep 30`, then `sleep 30`, then `sleep 30`. " +
|
|
92
|
+
"Critical: do NOT send any `reply` or `stream_reply` between or " +
|
|
93
|
+
"during the sleeps — no soft commit, no progress updates, no " +
|
|
94
|
+
"narration. Just the three Bash calls back-to-back. Once all three " +
|
|
95
|
+
"complete, send ONE brief final reply saying 'done' so I know " +
|
|
96
|
+
"you're back.";
|
|
97
|
+
|
|
98
|
+
describe("uat: silence-poke soft fires + reaches the model wire", () => {
|
|
99
|
+
it(
|
|
100
|
+
"agent breaks self-imposed silence in [70s, 200s] window via silence-poke",
|
|
101
|
+
async () => {
|
|
102
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
103
|
+
try {
|
|
104
|
+
const sendStart = Date.now();
|
|
105
|
+
await sc.sendDM(SILENT_CHURN_PROMPT);
|
|
106
|
+
|
|
107
|
+
// Wait for the FIRST reply. If silence-poke + the wire path
|
|
108
|
+
// are working, this lands between ~75s and ~110s as the
|
|
109
|
+
// model responds to the [silence-poke] system-reminder
|
|
110
|
+
// appended to the first or second sleep's tool result.
|
|
111
|
+
const firstReply = await sc.expectMessage(/\S/, {
|
|
112
|
+
from: "bot",
|
|
113
|
+
timeout: SOFT_WINDOW_MAX_MS + 20_000,
|
|
114
|
+
});
|
|
115
|
+
const elapsed = Date.now() - sendStart;
|
|
116
|
+
|
|
117
|
+
expect(firstReply.text.length).toBeGreaterThan(0);
|
|
118
|
+
|
|
119
|
+
// Primary window assertion.
|
|
120
|
+
expect(
|
|
121
|
+
elapsed,
|
|
122
|
+
`first bot reply lands at ${elapsed}ms (target window ` +
|
|
123
|
+
`[${SOFT_WINDOW_MIN_MS}, ${SOFT_WINDOW_MAX_MS}]). ` +
|
|
124
|
+
`Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
|
|
125
|
+
).toBeGreaterThanOrEqual(SOFT_WINDOW_MIN_MS);
|
|
126
|
+
expect(
|
|
127
|
+
elapsed,
|
|
128
|
+
`first bot reply lands at ${elapsed}ms — above ${SOFT_WINDOW_MAX_MS}ms ` +
|
|
129
|
+
`ceiling. Either silence-poke wire is broken (poke armed but ` +
|
|
130
|
+
`not drained at gateway.ts:onToolCall) or the framework ` +
|
|
131
|
+
`fallback at 300s was the first thing to break silence. ` +
|
|
132
|
+
`Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
|
|
133
|
+
).toBeLessThanOrEqual(SOFT_WINDOW_MAX_MS);
|
|
134
|
+
|
|
135
|
+
// Sanity check (warn-only): the first reply should be brief — suggests it's
|
|
136
|
+
// a poke-driven update, not the final "done" answer after all
|
|
137
|
+
// three sleeps finished naturally. ~400 char ceiling allows a
|
|
138
|
+
// verbose model to add a sentence of context. Bump this if it
|
|
139
|
+
// flakes on perfectly valid short answers.
|
|
140
|
+
if (firstReply.text.length > 400) {
|
|
141
|
+
console.warn(
|
|
142
|
+
`[silence-poke] first reply at ${elapsed}ms is ${firstReply.text.length} ` +
|
|
143
|
+
`chars — longer than expected for a poke-driven update. The ` +
|
|
144
|
+
`window assertion still passed, but consider whether the model ` +
|
|
145
|
+
`bypassed the silence stretch (e.g. ran the sleeps in one ` +
|
|
146
|
+
`Bash call, dodging the per-call result poke chokepoint).`,
|
|
147
|
+
);
|
|
148
|
+
}
|
|
149
|
+
} finally {
|
|
150
|
+
await sc.tearDown();
|
|
151
|
+
}
|
|
152
|
+
},
|
|
153
|
+
240_000,
|
|
154
|
+
);
|
|
155
|
+
});
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Silent-end recovery scenario — the regression PR3 (#1126) introduced
|
|
3
|
+
* and PR1129 fixed.
|
|
4
|
+
*
|
|
5
|
+
* The bug: PR3 deleted the progress-card driver, and with it the
|
|
6
|
+
* `onSilentEnd` callback that wrote
|
|
7
|
+
* $TELEGRAM_STATE_DIR/silent-end-pending.json. The Stop hook
|
|
8
|
+
* (`silent-end-interrupt-stop.mjs`) reads that file to decide whether
|
|
9
|
+
* to block-and-re-prompt. With the writer gone, the hook always read
|
|
10
|
+
* "no silent-end pending" and allowed the stop. The model would
|
|
11
|
+
* produce an answer in its CLI session but never call `reply`, and
|
|
12
|
+
* the user got nothing back.
|
|
13
|
+
*
|
|
14
|
+
* This UAT exercises the outcome side directly: send a DM that
|
|
15
|
+
* SHOULD produce a reply, assert that a reply lands within a budget
|
|
16
|
+
* that covers (a) normal turn latency, (b) one Stop-hook re-prompt
|
|
17
|
+
* cycle (the agent goes silent → hook blocks → re-prompted → calls
|
|
18
|
+
* reply), and (c) worst-case framework fallback at 5 min.
|
|
19
|
+
*
|
|
20
|
+
* Why this scenario specifically:
|
|
21
|
+
* - The bug surfaced as "user gets no reply at all." The most
|
|
22
|
+
* defensible UAT assertion is "after asking, the user gets SOME
|
|
23
|
+
* reply within a reasonable bound." Anything that breaks this
|
|
24
|
+
* contract — silent-end gap, scaffold staleness, hook misconfig,
|
|
25
|
+
* gateway crash — fails this test.
|
|
26
|
+
* - Unlike `smoke-dm-reply.test.ts` (trivial inbound, fast reply),
|
|
27
|
+
* this scenario uses a tool-heavy prompt that pushes the model
|
|
28
|
+
* into the silent-end zone (lots of tool churn, easy to forget to
|
|
29
|
+
* call reply afterward). It's the actual JTBD-failure shape.
|
|
30
|
+
*
|
|
31
|
+
* Budget: 6 min outer, 5 min for the reply itself. Covers the
|
|
32
|
+
* 5-min framework fallback floor.
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
import { describe, it, expect } from "vitest";
|
|
36
|
+
import { spinUp } from "../harness.js";
|
|
37
|
+
|
|
38
|
+
// The prompt pushes the model into a tool-heavy state where it has
|
|
39
|
+
// produced "an answer" internally but hasn't yet realised it must
|
|
40
|
+
// surface that via `reply`. This is the shape of the gymbro
|
|
41
|
+
// regression: the model did the work (cat, pip install, garmin-pull,
|
|
42
|
+
// etc), produced a summary, then ended the turn without `reply`.
|
|
43
|
+
const TOOL_HEAVY_PROMPT = (
|
|
44
|
+
"Pick a directory under /tmp that doesn't exist yet. Create it. "
|
|
45
|
+
+ "List its contents (should be empty). Write a small file in it. "
|
|
46
|
+
+ "List again. Then report what you did in a one-line reply."
|
|
47
|
+
);
|
|
48
|
+
|
|
49
|
+
describe("uat: silent-end recovery", () => {
|
|
50
|
+
it(
|
|
51
|
+
"user asks → agent always replies (the gymbro regression must not return)",
|
|
52
|
+
async () => {
|
|
53
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
54
|
+
try {
|
|
55
|
+
const { messageId: inboundId } = await sc.sendDM(TOOL_HEAVY_PROMPT);
|
|
56
|
+
expect(inboundId).toBeGreaterThan(0);
|
|
57
|
+
|
|
58
|
+
// The core assertion: SOMETHING comes back from the bot
|
|
59
|
+
// within 5min. That covers the worst case of the
|
|
60
|
+
// silent-end-recovery ladder:
|
|
61
|
+
// t=0: inbound
|
|
62
|
+
// t<30s: normal reply if all is well
|
|
63
|
+
// t=75s: silence-poke #1 fires (model re-prompted)
|
|
64
|
+
// t=180s: silence-poke #2 fires
|
|
65
|
+
// t=300s: framework fallback ("still working… (no update
|
|
66
|
+
// from agent in 5 min)") fires from the gateway.
|
|
67
|
+
// If we still get nothing by 300s+slack the bug is back.
|
|
68
|
+
const reply = await sc.expectMessage(/\S/, {
|
|
69
|
+
from: "bot",
|
|
70
|
+
timeout: 320_000,
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
expect(reply.text.length).toBeGreaterThan(0);
|
|
74
|
+
expect(reply.senderUserId).toBe(sc.botUserId);
|
|
75
|
+
|
|
76
|
+
// Subtler regression catch: if the reply is the framework
|
|
77
|
+
// fallback wording ("still working… (no update from agent
|
|
78
|
+
// in N min)") that means the silent-end loop fired AND the
|
|
79
|
+
// model didn't recover. Acceptable outcome — the user got
|
|
80
|
+
// something — but a design-health alarm. Log it.
|
|
81
|
+
if (/no update from agent/i.test(reply.text)) {
|
|
82
|
+
console.warn(
|
|
83
|
+
`[silent-end-recovery] reply was the framework fallback — `
|
|
84
|
+
+ `model never replied on its own. Reply text: ${JSON.stringify(reply.text.slice(0, 200))}`,
|
|
85
|
+
);
|
|
86
|
+
}
|
|
87
|
+
} finally {
|
|
88
|
+
await sc.tearDown();
|
|
89
|
+
}
|
|
90
|
+
},
|
|
91
|
+
// Outer budget = inner deadline (320s) + spinUp overhead
|
|
92
|
+
// (~12s mtcute connect + DEFAULT_SETTLE_MS) + headroom.
|
|
93
|
+
360_000,
|
|
94
|
+
);
|
|
95
|
+
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smoke scenario — driver DMs the test bot, bot replies.
|
|
3
|
+
*
|
|
4
|
+
* Part of: https://github.com/switchroom/switchroom/issues/866
|
|
5
|
+
*
|
|
6
|
+
* Runs against real Telegram. Requires:
|
|
7
|
+
* - test-harness agent running (see uat/SETUP.md §5)
|
|
8
|
+
* - TELEGRAM_API_ID / TELEGRAM_API_HASH / TELEGRAM_UAT_DRIVER_SESSION
|
|
9
|
+
* in the env (operator script in SETUP.md §6)
|
|
10
|
+
* - TELEGRAM_TEST_BOT_USERNAME (defaults to `meken_switchroom_test_bot`)
|
|
11
|
+
*
|
|
12
|
+
* Invoke via `bun run test:uat` from `telegram-plugin/`. Default
|
|
13
|
+
* `bun test` / vitest do NOT discover this file — see
|
|
14
|
+
* vitest.config.ts.
|
|
15
|
+
*
|
|
16
|
+
* This is intentionally the simplest possible end-to-end check —
|
|
17
|
+
* just confirms the DM round-trip works. Richer assertions
|
|
18
|
+
* (reactions, progress card, edits) roll in with #866 Phase 2b.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { describe, it, expect } from "vitest";
|
|
22
|
+
import { spinUp } from "../harness.js";
|
|
23
|
+
|
|
24
|
+
const SMOKE_INBOUND = `uat-smoke ${new Date().toISOString()}`;
|
|
25
|
+
|
|
26
|
+
describe("uat: DM round-trip smoke", () => {
|
|
27
|
+
it(
|
|
28
|
+
"driver DMs the test bot and observes a bot reply",
|
|
29
|
+
async () => {
|
|
30
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
31
|
+
|
|
32
|
+
try {
|
|
33
|
+
await sc.sendDM(SMOKE_INBOUND);
|
|
34
|
+
|
|
35
|
+
// 90s wall-clock budget: tolerates one rate-limit retry on the
|
|
36
|
+
// bot side + a normal Claude turn. If the agent is healthy the
|
|
37
|
+
// reply arrives in <20s.
|
|
38
|
+
const reply = await sc.expectMessage(/.+/, {
|
|
39
|
+
from: "bot",
|
|
40
|
+
timeout: 90_000,
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
expect(reply.text.length).toBeGreaterThan(0);
|
|
44
|
+
expect(reply.senderUserId).toBe(sc.botUserId);
|
|
45
|
+
} finally {
|
|
46
|
+
await sc.tearDown();
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
// Per-test budget — must exceed the 90s inner expectMessage
|
|
50
|
+
// deadline plus spinUp overhead (~3s mtcute connect +
|
|
51
|
+
// DEFAULT_SETTLE_MS gap + unpin), so add ~12s of headroom on top
|
|
52
|
+
// for symmetry with progress-card-dm. bun:test's default of 5s
|
|
53
|
+
// would otherwise cut the test off on any turn that takes longer
|
|
54
|
+
// than a few seconds.
|
|
55
|
+
110_000,
|
|
56
|
+
);
|
|
57
|
+
});
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Issue #1116 — subagent-watcher must not re-fire "✓ Worker done"
|
|
3
|
+
* after the terminal-cleanup grace window elapses.
|
|
4
|
+
*
|
|
5
|
+
* Pre-fix repro (validated by RCA on `clerk` DM, 2026-05-12): once a
|
|
6
|
+
* background sub-agent completed, `cleanupTerminalAgent` ran ~30s
|
|
7
|
+
* later, deleting the agent's filePath from `knownFiles` and its row
|
|
8
|
+
* from `registry`. The JSONL itself stayed on disk, so the next
|
|
9
|
+
* `rescanSubagentDirs` poll rediscovered it, re-registered the agent
|
|
10
|
+
* with `completionNotified=false`, read the terminal `turn_duration`
|
|
11
|
+
* line, and emitted a fresh `✓ Worker done: …` notification. The loop
|
|
12
|
+
* ran indefinitely — operator saw the same 4 sub-agents (30/2/15/105
|
|
13
|
+
* tools) re-announcing completion every ~6 minutes.
|
|
14
|
+
*
|
|
15
|
+
* Post-fix invariant: each completed sub-agent emits exactly ONE
|
|
16
|
+
* `✓ Worker done` notification for the lifetime of the gateway.
|
|
17
|
+
*
|
|
18
|
+
* As a side-benefit, this scenario also catches the original RFC's
|
|
19
|
+
* "raw HTML tags rendered in card text" symptom (Bug C in the RCA):
|
|
20
|
+
* any bot message containing a literal `<b>` / `<i>` / `<code>`
|
|
21
|
+
* substring during the window is flagged. The watcher's own
|
|
22
|
+
* notification path is HTML-correct on `main`, so this assertion is
|
|
23
|
+
* a regression detector — if a future change starts leaking raw
|
|
24
|
+
* tags via a fall-through send site, this scenario goes red.
|
|
25
|
+
*
|
|
26
|
+
* Requires the same env as the other DM scenarios (see SETUP.md §6)
|
|
27
|
+
* and the test-harness override `progress_card.delay_ms: 1000` so a
|
|
28
|
+
* short DM turn actually pins a card (SETUP.md §5).
|
|
29
|
+
*
|
|
30
|
+
* Time budget: the bg sub-agent does two ~10s sleeps (~20s total)
|
|
31
|
+
* + we listen for an extra 75s post-completion (>30s grace +
|
|
32
|
+
* generous rescan slack) to catch a rerun. Plus parent-turn ack
|
|
33
|
+
* latency and Telegram-edit settle. Outer cap 240s.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
import { describe, expect, it } from "vitest";
|
|
37
|
+
import { spinUp } from "../harness.js";
|
|
38
|
+
|
|
39
|
+
// Same Option-1 explicit-dispatch prompt as bg-sub-agent-dispatch-dm.test.ts
|
|
40
|
+
// — naming the tool + run_in_background flag keeps the model
|
|
41
|
+
// deterministic. The inner sleeps are shorter here (2×10s = ~20s
|
|
42
|
+
// background phase) so the outer budget stays sane: we only need
|
|
43
|
+
// the sub-agent to *complete* once. The duplicate-detection window
|
|
44
|
+
// is what makes the test meaningful, not the bg phase duration.
|
|
45
|
+
const BG_DISPATCH_PROMPT =
|
|
46
|
+
`Use the Agent tool with subagent_type "general-purpose" and ` +
|
|
47
|
+
`run_in_background: true to dispatch a worker with this exact task: ` +
|
|
48
|
+
`"Run \`sleep 10\` via the Bash tool, then \`echo step1\`, then ` +
|
|
49
|
+
`\`sleep 10\` again, then \`echo step2\`, then \`echo done\`. ` +
|
|
50
|
+
`That's two separate Bash sleeps and three echoes." After ` +
|
|
51
|
+
`dispatching, send a brief reply saying you've kicked off the ` +
|
|
52
|
+
`background worker so I can watch the progress card.`;
|
|
53
|
+
|
|
54
|
+
const WORKER_DONE_RE = /✓\s*Worker done/;
|
|
55
|
+
const RAW_HTML_TAG_RE = /<\/?(b|i|code|pre|strong|em)>/i;
|
|
56
|
+
|
|
57
|
+
describe("uat: issue #1116 — subagent-watcher does not re-fire Worker done", () => {
|
|
58
|
+
it(
|
|
59
|
+
"emits exactly one ✓ Worker done per bg sub-agent and no raw HTML leaks",
|
|
60
|
+
async () => {
|
|
61
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
62
|
+
try {
|
|
63
|
+
await sc.sendDM(BG_DISPATCH_PROMPT);
|
|
64
|
+
|
|
65
|
+
// Wait for the bg sub-agent to complete — the watcher's
|
|
66
|
+
// `✓ Worker done: …` notification is what we're locking
|
|
67
|
+
// behaviour around. Generous timeout: parent ack + bg sleeps
|
|
68
|
+
// + completion plumbing.
|
|
69
|
+
const firstDone = await sc.expectMessage(WORKER_DONE_RE, {
|
|
70
|
+
from: "bot",
|
|
71
|
+
timeout: 120_000,
|
|
72
|
+
});
|
|
73
|
+
expect(firstDone.text).toMatch(WORKER_DONE_RE);
|
|
74
|
+
|
|
75
|
+
// Snapshot bot-side messages observed after the first done.
|
|
76
|
+
// Pre-fix the same notification re-fired every ~30s
|
|
77
|
+
// (TERMINAL_CLEANUP_GRACE_MS + rescan). 75s gives us a
|
|
78
|
+
// comfortable >2 grace windows worth of observation.
|
|
79
|
+
const collected: Array<{ text: string; messageId: number }> = [];
|
|
80
|
+
const observer = sc.driver
|
|
81
|
+
.observeMessages(sc.botUserId)
|
|
82
|
+
[Symbol.asyncIterator]();
|
|
83
|
+
const deadline = Date.now() + 75_000;
|
|
84
|
+
try {
|
|
85
|
+
while (Date.now() < deadline) {
|
|
86
|
+
const remaining = deadline - Date.now();
|
|
87
|
+
if (remaining <= 0) break;
|
|
88
|
+
const winner = await Promise.race([
|
|
89
|
+
observer.next(),
|
|
90
|
+
new Promise<{ value?: undefined; done: true }>((resolve) =>
|
|
91
|
+
setTimeout(() => resolve({ done: true }), remaining),
|
|
92
|
+
),
|
|
93
|
+
]);
|
|
94
|
+
if (winner.done) break;
|
|
95
|
+
const msg = winner.value;
|
|
96
|
+
if (!msg) continue;
|
|
97
|
+
// Only count bot-sent messages (filter out anything the
|
|
98
|
+
// driver itself echoed in this window).
|
|
99
|
+
if (msg.fromUserId === sc.driverUserId) continue;
|
|
100
|
+
collected.push({ text: msg.text ?? "", messageId: msg.messageId });
|
|
101
|
+
}
|
|
102
|
+
} finally {
|
|
103
|
+
// Closing the iterator unregisters the mtcute listeners.
|
|
104
|
+
await observer.return?.();
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// Invariant 1: no DUPLICATE Worker-done with the same shape
|
|
108
|
+
// as the first one. We compare text rather than message_id
|
|
109
|
+
// because the bug emits FRESH messages (not edits), so each
|
|
110
|
+
// re-fire has a new message_id but identical text.
|
|
111
|
+
const reruns = collected.filter((m) => WORKER_DONE_RE.test(m.text));
|
|
112
|
+
expect(
|
|
113
|
+
reruns,
|
|
114
|
+
`Expected zero re-fires of "Worker done" in the ${75}s post-completion window, got ${reruns.length}: ${JSON.stringify(reruns.slice(0, 4).map((r) => r.text.slice(0, 80)))}`,
|
|
115
|
+
).toHaveLength(0);
|
|
116
|
+
|
|
117
|
+
// Invariant 2: no raw HTML tags in any bot text — including
|
|
118
|
+
// the original `firstDone` notification. Catches Bug C
|
|
119
|
+
// (RCA's third symptom) as a regression detector.
|
|
120
|
+
const allBotTexts = [firstDone.text, ...collected.map((m) => m.text)];
|
|
121
|
+
for (const text of allBotTexts) {
|
|
122
|
+
expect(
|
|
123
|
+
text,
|
|
124
|
+
`Raw HTML tag leaked into bot text: ${text.slice(0, 120)}`,
|
|
125
|
+
).not.toMatch(RAW_HTML_TAG_RE);
|
|
126
|
+
}
|
|
127
|
+
} finally {
|
|
128
|
+
await sc.tearDown();
|
|
129
|
+
}
|
|
130
|
+
},
|
|
131
|
+
// Outer budget: 120s wait-for-done + 75s observation window +
|
|
132
|
+
// ~12s spinUp settle + slack. Round up.
|
|
133
|
+
240_000,
|
|
134
|
+
);
|
|
135
|
+
});
|