switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* End-to-end UAT for the #1150 button-UX audit's three invariants on a
|
|
3
|
+
* surface that requires NO vault state mutation: the `ask_user` MCP
|
|
4
|
+
* tool.
|
|
5
|
+
*
|
|
6
|
+
* Flow:
|
|
7
|
+
* 1. Driver asks the agent to use `ask_user` with 2 fixed options.
|
|
8
|
+
* 2. Agent emits the question + inline keyboard.
|
|
9
|
+
* 3. Driver locates the buttons and presses one.
|
|
10
|
+
* 4. Driver re-reads the message — assert:
|
|
11
|
+
* - keyboard is gone (invariant 2: atomic strip)
|
|
12
|
+
* - message text appends `✅ <choice>` (invariant 2: status line) — TODO: not asserted yet; the test body only re-reads the keyboard
|
|
13
|
+
* 5. Driver waits for a fresh bot turn referencing the chosen option
|
|
14
|
+
* (invariant 3: gateway forwarded the answer; agent continued).
|
|
15
|
+
*
|
|
16
|
+
* Why this scenario over a vault-state mutation one (the existing
|
|
17
|
+
* `vault-grant-auto-resume-dm.test.ts` covers the load-bearing #1052
|
|
18
|
+
* path but is `describe.skip`'d because it mutates the operator's
|
|
19
|
+
* vault): `ask_user` has zero side effects on switchroom state. The
|
|
20
|
+
* scenario is repeatable and cleanup-free.
|
|
21
|
+
*
|
|
22
|
+
* What's pinned:
|
|
23
|
+
* - The `ask_user` tool's callback flow (`gateway.ts:11113-11152`)
|
|
24
|
+
* routes through the same three-invariant pattern PR #1152
|
|
25
|
+
* formalized in `finalizeCallback`. Pre-audit the keyboard strip
|
|
26
|
+
* + status line already existed for `ask_user`; the audit kept
|
|
27
|
+
* that surface in the "OK today" column. This UAT pins the
|
|
28
|
+
* existing behaviour against future regressions.
|
|
29
|
+
*
|
|
30
|
+
* Per-test wall-clock budget: 180s. The agent has two turns to
|
|
31
|
+
* complete:
|
|
32
|
+
* - Turn 1: receive driver prompt → call `ask_user` (~20s typical).
|
|
33
|
+
* - Turn 2: receive operator answer → reply confirming the choice
|
|
34
|
+
* (~15s typical).
|
|
35
|
+
* Plus spinUp settle + mtcute connect overhead. 180s gives ~3x
|
|
36
|
+
* headroom for a slow run.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
import { describe, it, expect } from "vitest";
|
|
40
|
+
import { spinUp } from "../harness.js";
|
|
41
|
+
|
|
42
|
+
const OPTION_A = "spaghetti";
|
|
43
|
+
const OPTION_B = "salad";
|
|
44
|
+
const CHOSEN = OPTION_A;
|
|
45
|
+
|
|
46
|
+
describe("uat: ask_user button-tap → keyboard strip + status line + agent continues (#1150 audit)", () => {
|
|
47
|
+
it(
|
|
48
|
+
"tapping an ask_user option strips the keyboard, appends ✅ <choice>, and the agent acknowledges the answer in a follow-up turn",
|
|
49
|
+
async () => {
|
|
50
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
51
|
+
try {
|
|
52
|
+
// Prompt: ask the agent to call `ask_user` with two fixed
|
|
53
|
+
// options. The wording is explicit so the model picks the
|
|
54
|
+
// right tool on the first try — fuzz-style "use ask_user
|
|
55
|
+
// somehow" prompts have a ~20% rate of the model
|
|
56
|
+
// free-styling a regular reply instead.
|
|
57
|
+
await sc.sendDM(
|
|
58
|
+
`Please use your ask_user MCP tool to ask me which I'd ` +
|
|
59
|
+
`prefer for dinner. Two options exactly: "${OPTION_A}" ` +
|
|
60
|
+
`and "${OPTION_B}". After I tap one, reply with a single ` +
|
|
61
|
+
`short line confirming the choice (e.g. "Got it, ${OPTION_A} it is.").`,
|
|
62
|
+
);
|
|
63
|
+
|
|
64
|
+
// ── 1. Wait for the ask_user card. ──────────────────────────
|
|
65
|
+
// Matches the agent's question text containing both options.
|
|
66
|
+
const card = await sc.expectMessage(
|
|
67
|
+
new RegExp(`${OPTION_A}.*${OPTION_B}|${OPTION_B}.*${OPTION_A}`, "s"),
|
|
68
|
+
{ from: "bot", timeout: 120_000 },
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
// ── 2. Pull the keyboard, locate the chosen-option button. ──
|
|
72
|
+
const kb = await sc.driver.getKeyboard(sc.botUserId, card.messageId);
|
|
73
|
+
expect(kb).not.toBeNull();
|
|
74
|
+
const buttons = kb!.flat();
|
|
75
|
+
// Each option's button text might be styled (e.g. "🍝 spaghetti").
|
|
76
|
+
// Match on case-insensitive substring rather than equality.
|
|
77
|
+
const chosenBtn = buttons.find(
|
|
78
|
+
(b) => b.callbackData != null && b.text.toLowerCase().includes(CHOSEN.toLowerCase()),
|
|
79
|
+
);
|
|
80
|
+
expect(
|
|
81
|
+
chosenBtn,
|
|
82
|
+
`expected a button containing ${JSON.stringify(CHOSEN)} (got ${JSON.stringify(buttons.map((b) => b.text))})`,
|
|
83
|
+
).toBeDefined();
|
|
84
|
+
|
|
85
|
+
// ── 3. Tap. ────────────────────────────────────────────────
|
|
86
|
+
await sc.driver.pressButton(
|
|
87
|
+
sc.botUserId,
|
|
88
|
+
card.messageId,
|
|
89
|
+
chosenBtn!.callbackData!,
|
|
90
|
+
);
|
|
91
|
+
|
|
92
|
+
// ── 4. Re-read the original card. Invariants 2a + 2b. ──────
|
|
93
|
+
//
|
|
94
|
+
// The edit + ack are best-effort on the gateway side; allow a
|
|
95
|
+
// short window for both to propagate before re-fetching.
|
|
96
|
+
await new Promise((r) => setTimeout(r, 1500));
|
|
97
|
+
const edited = await sc.driver.getKeyboard(sc.botUserId, card.messageId);
|
|
98
|
+
// Invariant 2a: keyboard collapses to empty (or vanishes
|
|
99
|
+
// entirely — getKeyboard returns null when reply_markup is
|
|
100
|
+
// missing). Either shape counts as "stripped".
|
|
101
|
+
const stripped =
|
|
102
|
+
edited == null ||
|
|
103
|
+
(Array.isArray(edited) && (edited.length === 0 || edited.flat().length === 0));
|
|
104
|
+
expect(
|
|
105
|
+
stripped,
|
|
106
|
+
`expected stripped keyboard after tap; got ${JSON.stringify(edited)}`,
|
|
107
|
+
).toBe(true);
|
|
108
|
+
|
|
109
|
+
// ── 5. Wait for the agent's confirmation reply. Invariant 3. ─
|
|
110
|
+
// The agent receives the answer as a channel event and starts
|
|
111
|
+
// a new turn. We expect a reply mentioning the choice within
|
|
112
|
+
// ~60s.
|
|
113
|
+
//
|
|
114
|
+
// Predicate matcher filters out the EDITED card. The driver's
|
|
115
|
+
// `observeMessages` (driver.ts:252-263) dispatches BOTH new
|
|
116
|
+
// messages AND edit events through the same stream, with
|
|
117
|
+
// `ObservedMessage.edited` set accordingly. Without this
|
|
118
|
+
// filter the race between (a) the gateway's edit landing
|
|
119
|
+
// post-sleep and (b) the agent's confirmation turn would
|
|
120
|
+
// catch the edited card as the "match" — false-positive on
|
|
121
|
+
// invariant 3 if the edit's network round-trip beat the
|
|
122
|
+
// turn-completion. Predicate guards against that without
|
|
123
|
+
// depending on a sleep duration. (PR #1167 review item D.)
|
|
124
|
+
const confirmation = await sc.expectMessage(
|
|
125
|
+
(m) => !m.edited && new RegExp(CHOSEN, "i").test(m.text),
|
|
126
|
+
{ from: "bot", timeout: 60_000 },
|
|
127
|
+
);
|
|
128
|
+
// Defense in depth: the confirmation message id must be
|
|
129
|
+
// greater than the card's. A fresh turn always produces a
|
|
130
|
+
// new id; same id implies the edited card slipped through
|
|
131
|
+
// the predicate (e.g. if `edited` wasn't set on the
|
|
132
|
+
// observation). Secondary check — predicate is the primary
|
|
133
|
+
// guard.
|
|
134
|
+
expect(confirmation.messageId).toBeGreaterThan(card.messageId);
|
|
135
|
+
} finally {
|
|
136
|
+
await sc.tearDown();
|
|
137
|
+
}
|
|
138
|
+
},
|
|
139
|
+
180_000,
|
|
140
|
+
);
|
|
141
|
+
});
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Background sub-agent visibility scenario — closes #709 / #776 / #782 / #788
|
|
3
|
+
* (the four-issue family analysed in `reference/sub-agent-visibility-rfc.md`).
|
|
4
|
+
*
|
|
5
|
+
* Verifies three acceptance criteria from the RFC in a single run because
|
|
6
|
+
* they share setup:
|
|
7
|
+
*
|
|
8
|
+
* AC-1 — Background-dispatch-and-continue: card stays pinned past
|
|
9
|
+
* parent `turn_end`; fleet zone surfaces the running sub-agent.
|
|
10
|
+
* AC-2 — Done semantics: header reads 🌀 Background (not ✅ Done)
|
|
11
|
+
* while the bg sub-agent runs; flips to ✅ Done after it
|
|
12
|
+
* terminates.
|
|
13
|
+
* AC-3 — Live activity: card body materially changes across a 15s
|
|
14
|
+
* window while bg work is in flight (elapsed counter or fleet
|
|
15
|
+
* row's `last activity` advances) — proves the heartbeat +
|
|
16
|
+
* subagent-watcher are actually feeding the renderer.
|
|
17
|
+
*
|
|
18
|
+
* Prompt strategy: **Option 1 (explicit tool-naming)** per the RFC §
|
|
19
|
+
* "Background-dispatch prompt". An earlier Option-2 (naturalistic)
|
|
20
|
+
* attempt produced exactly the failure mode the RFC predicted —
|
|
21
|
+
* model ran the sleeps inline via Bash, card never reached Background
|
|
22
|
+
* phase. This test verifies the *visibility infra*, not the LLM's
|
|
23
|
+
* delegation judgment; pinning the tool name and arg keeps the
|
|
24
|
+
* scenario deterministic.
|
|
25
|
+
*
|
|
26
|
+
* Requires the same env as the other DM scenarios (see SETUP.md §6)
|
|
27
|
+
* and the test-harness override `progress_card.delay_ms: 1000` so the
|
|
28
|
+
* card actually fires on a short turn (SETUP.md §5).
|
|
29
|
+
*
|
|
30
|
+
* Runtime budget is generous — the inner deadlines sum to ~225s
|
|
31
|
+
* worst-case (15s pin + 30s parent-ack + 45s background phase + 15s
|
|
32
|
+
* delta-snapshot + 120s done) plus ~12s spinUp overhead. The outer
|
|
33
|
+
* `it()` timeout absorbs the lot.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
import { describe, expect, it } from "vitest";
|
|
37
|
+
import { spinUp } from "../harness.js";
|
|
38
|
+
|
|
39
|
+
// Explicit dispatch prompt (Option 1 per the RFC §"Background-dispatch
|
|
40
|
+
// prompt"). The naturalistic Option-2 version didn't reliably get the
|
|
41
|
+
// model to use the Agent tool with run_in_background:true — first
|
|
42
|
+
// attempt produced the failure mode the RFC predicted (parent ran the
|
|
43
|
+
// sleeps inline via Bash; card never transitioned to Background).
|
|
44
|
+
//
|
|
45
|
+
// This test asserts the VISIBILITY INFRA works, not that the model
|
|
46
|
+
// makes good delegation judgments. Naming the tool + the arg lets the
|
|
47
|
+
// scenario be deterministic. If the model can't be made to use the
|
|
48
|
+
// Agent tool even with this prompt, that's an unrelated bug (model
|
|
49
|
+
// alignment / tool registration) and the scenario fails distinctly
|
|
50
|
+
// from the visibility-infra failure modes we're trying to catch.
|
|
51
|
+
//
|
|
52
|
+
// Time profile: ~60s of bg work, paced with three separate sleeps so
|
|
53
|
+
// the worker emits multiple tool_use events the subagent-watcher can
|
|
54
|
+
// surface as fresh `last activity` updates. We need the Background
|
|
55
|
+
// phase to last long enough that we can take a snapshot, wait one
|
|
56
|
+
// heartbeat tick (5s default), and snapshot again.
|
|
57
|
+
const BG_DISPATCH_PROMPT =
|
|
58
|
+
`Use the Agent tool with subagent_type "general-purpose" and ` +
|
|
59
|
+
`run_in_background: true to dispatch a worker with this exact task: ` +
|
|
60
|
+
`"Run \`sleep 20\` via the Bash tool, then \`echo step1\`, then ` +
|
|
61
|
+
`\`sleep 20\` again, then \`echo step2\`, then \`sleep 20\` a third ` +
|
|
62
|
+
`time, then \`echo done\`. That's three separate Bash tool calls ` +
|
|
63
|
+
`with sleeps between echoes." After dispatching, send a brief reply ` +
|
|
64
|
+
`saying you've kicked off the background worker so I can watch the ` +
|
|
65
|
+
`progress card.`;
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* STATUS (historical; see the post-#1105 update below): was red — surfaced two real production bugs the
|
|
69
|
+
* RFC §Risks predicted as possible-but-unverified. Marked `it.fails`
|
|
70
|
+
* so a future fix flips it green and a regression flips it red again.
|
|
71
|
+
*
|
|
72
|
+
* Bug 1 — orphan correlation. The parent's `Agent` tool_use_id
|
|
73
|
+
* doesn't get matched to the spawned `sub_agent_started`
|
|
74
|
+
* event. Gateway log: `pendingSpawns=0 correlated=orphan`.
|
|
75
|
+
* Result: `isBackgroundDispatch` is never set on the fleet
|
|
76
|
+
* member; the card's header phase transitions to Background
|
|
77
|
+
* only by accident (orphans defer too, but they don't carry
|
|
78
|
+
* the bg flag).
|
|
79
|
+
*
|
|
80
|
+
* Bug 2 — subagent-watcher can't track the worker. Gateway log:
|
|
81
|
+
* `subagent-watcher: liveness skip <agentId> — row not in
|
|
82
|
+
* DB yet (Phase 2 Pre hook pending)`. Result: no
|
|
83
|
+
* sub_agent_tool_use events reach the fleet member; the
|
|
84
|
+
* fleet row's `last activity` field never updates with the
|
|
85
|
+
* worker's actual tool calls. The card edits we see are
|
|
86
|
+
* just elapsed-counter ticks from the heartbeat.
|
|
87
|
+
*
|
|
88
|
+
* Both bugs are real and live on `main`. The scenario above passes
|
|
89
|
+
* AC-1 (card stays pinned), partially passes AC-2 (Background phase
|
|
90
|
+
* fires) and AC-3 (card body changes — from heartbeat alone), and
|
|
91
|
+
* fails AC-2's closing half (card never reaches Done in 120s because
|
|
92
|
+
* the orphan never terminates from the gateway's view).
|
|
93
|
+
*
|
|
94
|
+
* When Bug 1 + Bug 2 are fixed, change `describe.skip` to `describe`
|
|
95
|
+
* below — the assertions are correct; only the production code is
|
|
96
|
+
* wrong.
|
|
97
|
+
*
|
|
98
|
+
* Update post-#1105: all seven RFC bugs (1–5 in earlier PRs, 6–7 in
|
|
99
|
+
* #1105) merged. Unskipped here for the next UAT re-run. If 6/6 ACs
|
|
100
|
+
* pass, close #709 / #776 / #782 / #788.
|
|
101
|
+
*/
|
|
102
|
+
describe("uat: background sub-agent visibility (#709/#776/#782/#788)", () => {
  it(
    "card stays pinned with 🌀 Background header + live fleet activity, then flips to ✅ Done",
    async () => {
      // One heartbeat tick (5s default) + 1s slack — used by the AC-3
      // "card body keeps changing" check below.
      const HEARTBEAT_WAIT_MS = 6_000;

      const sc = await spinUp({ agent: "test-harness" });
      try {
        await sc.sendDM(BG_DISPATCH_PROMPT);

        // AC-1 step 1: card pins quickly (delay_ms: 1000 on test-harness).
        // Generous timeout so a slow first-turn doesn't false-flag.
        const card = await sc.expectPinnedCard({ timeout: 15_000 });
        expect(card.messageId).toBeGreaterThan(0);

        // Parent ack reply. Note: we DON'T strictly require the model
        // to mention "dispatch" in the reply — naturalistic prompt means
        // the model picks the wording. We just need *some* bot reply
        // so we know the parent turn closed (which is the point where
        // pre-fix the card would unpin).
        await sc.expectMessage(/.+/, { from: "bot", timeout: 30_000 });

        // AC-2: header MUST be 🌀 Background (post-#1039) or, if the
        // bg dispatch happened so fast the worker hasn't started yet,
        // it might still be ⚙️ Working with the parent zone done. We
        // poll for the background phase with a 45s budget — long
        // enough for the worker to actually start firing tools, short
        // enough that "we never saw Background" surfaces as a real
        // bug, not a timeout-tuning issue.
        //
        // The dual-acceptable phases below model the realistic flow:
        // parent reply lands → header should be Background (or
        // briefly still Working if the parent's `done` event lags
        // the bg dispatch's tool_use).
        const bgPhaseCard = await sc.waitForCardPhase(card, "background", {
          timeout: 45_000,
        });
        expect(bgPhaseCard.text).toMatch(/🌀|Background/i);
        // The negative — Done MUST NOT have fired before bg started.
        // Asserts the defer-gate is doing its job. If this trips, the
        // `hasLiveBackground` correlation at progress-card-driver.ts:1108
        // is broken (or the bg dispatch never registered as a fleet
        // member at all — see RFC §Phase 2 diagnosis paths).
        expect(bgPhaseCard.text).not.toMatch(/✅|\bDone\b/i);

        // AC-3: card edits land regularly while bg runs. Snapshot
        // the current card body, wait one heartbeat tick (5s default
        // + 1s slack), then fetch the card body again. The body MUST
        // differ (elapsed counter, fleet last-activity age, etc.).
        //
        // We re-fetch the SAME message via `driver.getMessage(chatId,
        // cardId)` rather than `expectPinnedCard` because the latter
        // listens for NEW pin events. Once the card is pinned, no
        // further pin event fires — `expectPinnedCard` would wait
        // for an event that never comes and time out spuriously even
        // though the card is alive and being edited (caught in the
        // first run of this scenario).
        //
        // If the card freezes — heartbeat dead, subagent-watcher not
        // flushing, fleet member never registered — the re-fetched
        // text will equal `beforeDelta` and surface the bug cleanly.
        // If the card was unpinned by an over-eager defer-gate
        // release, `getMessage` returns nothing and we surface it
        // with a clear error.
        const beforeDelta = bgPhaseCard.text;
        await new Promise<void>((resolve) => setTimeout(resolve, HEARTBEAT_WAIT_MS));
        const afterDeltaMsg = await sc.driver.getMessage(
          sc.botUserId,
          bgPhaseCard.messageId,
        );
        // Explicit guard instead of a non-null assertion: this narrows
        // the type for the compiler and also catches `undefined`, which
        // `.not.toBeNull()` would have let through to a TypeError.
        if (afterDeltaMsg == null) {
          throw new Error("card message disappeared mid-flight (AC-1 regression)");
        }
        expect(afterDeltaMsg.text).not.toBe(beforeDelta);

        // AC-2 closing half: bg terminates → header flips to ✅ Done.
        // Generous budget — the inner sleeps sum to ~60s but
        // post-completion the deferred-completion gate plus the
        // heartbeat cadence can add another 5-30s before the card
        // finalises.
        const doneCard = await sc.waitForCardPhase(bgPhaseCard, "done", {
          timeout: 120_000,
        });
        expect(doneCard.text).toMatch(/✅|Done/i);
      } finally {
        // Always release the harness, even when an assertion above threw.
        await sc.tearDown();
      }
    },
    // Outer per-test budget: the explicit waits in this test sum to
    // 15 + 30 + 45 + 6 + 120 = 216s; add spinUp settle (~12s) + slack.
    // Rounded up so that an inner-deadline error stays visible if any
    // of them trip, instead of being masked by the outer timeout.
    300_000,
  );
});
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extended probabilistic fuzz — second pass, categories the first
|
|
3
|
+
* fuzz file didn't cover.
|
|
4
|
+
*
|
|
5
|
+
* Same invariants as `fuzz-random-prompts-dm.test.ts`:
|
|
6
|
+
* 1. Reply landed (user not ghosted)
|
|
7
|
+
* 2. No agent crash (next case still runs)
|
|
8
|
+
* 3. No credential leak in the reply text
|
|
9
|
+
* 4. Non-empty reply
|
|
10
|
+
*
|
|
11
|
+
* Categories here:
|
|
12
|
+
* - Markdown / formatting stress (nested code blocks, broken HTML,
|
|
13
|
+
* bold/italic in unexpected places)
|
|
14
|
+
* - Command-shaped prompts (slash prefixes that aren't `/queue`)
|
|
15
|
+
* - Repeat-fire (repeated content sent as a single inbound)
|
|
16
|
+
* - Unicode normalisation edge cases
|
|
17
|
+
* - Mixed-language code switching
|
|
18
|
+
* - Number / math edge cases (very large, very small, scientific)
|
|
19
|
+
* - Polite trivials (good morning, thanks, ok cool)
|
|
20
|
+
*
|
|
21
|
+
* Avoids the rapid-followup wedge surfaced in overnight UAT
|
|
22
|
+
* (#1122 follow-up): every case here is a SINGLE inbound, so we
|
|
23
|
+
* dodge the queued-vs-steering classification issue and the
|
|
24
|
+
* crash-loop pathology that surfaced in the test-harness when
|
|
25
|
+
* driving multiple inbounds within the same coalesce / queue
|
|
26
|
+
* window.
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
import { describe, it, expect } from "vitest";
|
|
30
|
+
import { spinUp } from "../harness.js";
|
|
31
|
+
|
|
32
|
+
/** One end-to-end fuzz scenario: a single DM and how long to wait for the reply. */
interface FuzzCase {
  /** Human-readable label, interpolated into the generated test title. */
  name: string;
  /** Exact text sent to the bot as one inbound message. */
  prompt: string;
  /** Milliseconds to wait for the bot's reply before the case fails. */
  timeout: number;
}

/** Reply budget used by most cases. */
const STANDARD_REPLY_TIMEOUT_MS = 45_000;
/** Longer reply budget used by a handful of cases. */
const EXTENDED_REPLY_TIMEOUT_MS = 60_000;

/**
 * Second-pass fuzz corpus. Every entry is sent as ONE inbound message.
 *
 * Prompts that exercise Unicode normalisation spell their combining
 * marks with `\u` escapes so the intended code points cannot be
 * silently recomposed by an editor, formatter, or VCS filter.
 */
const FUZZ_CASES: readonly FuzzCase[] = [
  // ─── Markdown / formatting stress ─────────────────────────────
  {
    name: "nested code blocks",
    prompt: "what's wrong with this:\n```python\ndef foo():\n return ```bash\n echo hi\n ```\n```",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "broken HTML",
    prompt: "what does <em>this <b>do</em> mean?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "markdown bold attempt",
    prompt: "**hello** _world_ — is this bold?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "table-shape",
    prompt: "format this as a table:\n| name | role |\n| ken | dev |",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },

  // ─── Command-shaped prompts (NOT /queue) ──────────────────────
  {
    name: "slash command — /help",
    prompt: "/help",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "slash command — /start",
    prompt: "/start",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "slash command — /memory",
    prompt: "/memory",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "slash command — bare /",
    prompt: "/",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },

  // ─── Repeat-fire (repetition inside ONE inbound) ──────────────
  // Multi-inbound rapid-fire wedges the agent, so the repetition is
  // carried inside a single message instead of several sends.
  {
    name: "repeated content",
    prompt: "hi hi hi hi hi hi hi hi",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },

  // ─── Unicode normalisation ────────────────────────────────────
  {
    name: "decomposed accents (NFD)",
    // "café" in NFD form: c, a, f, e + U+0301 COMBINING ACUTE ACCENT.
    // Written with escapes so the decomposed form is guaranteed.
    prompt: "what does cafe\u0301 (with NFD-decomposed e\u0301) mean?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "combining diacritics stack",
    // a + 3 combining accents above: U+0301 (acute), U+0302
    // (circumflex), U+0303 (tilde).
    prompt: "interpret a\u0301\u0302\u0303 — does it confuse you?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },

  // ─── Mixed-language code switching ────────────────────────────
  {
    name: "Spanish/English mix",
    prompt: "hola, can you ayudarme entender what este código does? print('hello')",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },
  {
    name: "Japanese in middle",
    prompt: "what does 申し訳ありません mean and when is it used?",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },

  // ─── Number / math edges ──────────────────────────────────────
  {
    name: "huge number",
    prompt: "what is 10^100 called?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "scientific notation",
    prompt: "is 1.5e-10 the same as 0.00000000015?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },

  // ─── Polite trivials ──────────────────────────────────────────
  {
    name: "good morning",
    prompt: "good morning",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },
  {
    name: "thanks",
    prompt: "thanks",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },
  {
    name: "ok cool",
    prompt: "ok cool",
    timeout: EXTENDED_REPLY_TIMEOUT_MS,
  },

  // ─── Status-ask classifier variants (CC-7 fuzz coverage) ──────
  //
  // The conservative regex set in `telegram-plugin/inbound-classifier.ts`
  // captures 10 standalone "ping" patterns that count toward the
  // primary lagging KPI `inbound_status_query`. Each fire is a JTBD
  // failure (`reference/know-what-my-agent-is-doing.md`), so we
  // want every variant to (a) reach the agent unchanged, (b)
  // produce a sensible reply (no crash, no loop, no ghosting).
  // Tracks cause class CC-7 from
  // `docs/status-ask-cause-classes.md`.
  //
  // Unit-level classifier behavior is covered in
  // `telegram-plugin/tests/inbound-classifier.test.ts`; this fuzz
  // block is the end-to-end "does the agent survive each of them"
  // regression net.
  {
    name: "status-ask: bare ?",
    prompt: "?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: status?",
    prompt: "status?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: update?",
    prompt: "update?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: any update?",
    prompt: "any update?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: still there?",
    prompt: "still there?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: still working?",
    prompt: "still working?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: are you there?",
    prompt: "are you there?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: you there?",
    prompt: "you there?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: hello?",
    prompt: "hello?",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
  {
    name: "status-ask: hey??",
    prompt: "hey??",
    timeout: STANDARD_REPLY_TIMEOUT_MS,
  },
];
|
|
213
|
+
|
|
214
|
+
/**
 * Credential-shaped patterns that must never appear in a bot reply.
 *
 * None of the expressions carry the `g`/`y` flag, so `RegExp#test` is
 * stateless and the same instances can be reused across calls.
 */
const SECRET_PATTERNS: readonly RegExp[] = [
  // `sk-` followed by a long purely-alphanumeric run.
  /sk-[a-zA-Z0-9]{30,}/,
  // Vendor-prefixed `sk-` keys (e.g. `sk-ant-…`, `sk-proj-…`) contain
  // `-` / `_` inside the body, which the pattern above cannot cross.
  /\bsk-(?:ant|proj)-[A-Za-z0-9_-]{20,}/,
  // Long alphanumeric segment immediately followed by a base64url JSON
  // object (`eyJ…` is the encoding of `{"`).
  /[a-zA-Z0-9]{40,}\.eyJ[a-zA-Z0-9]/,
  // Compact JWT: three dot-separated base64url segments whose first two
  // start with `eyJ`. The common `{"alg":"HS256","typ":"JWT"}` header is
  // only 36 characters, so the 40+ pattern above does not catch it.
  /\beyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}/,
  // `AKIA` + 16 upper-case alphanumerics (AWS access-key-ID shape).
  /AKIA[A-Z0-9]{16}/,
  // `ghp_` + 36 or more alphanumerics (GitHub token shape).
  /ghp_[A-Za-z0-9]{36,}/,
];

/**
 * Scans `text` for anything that looks like a leaked credential.
 *
 * @param text - Reply text to inspect.
 * @returns `{ leaked: true, pattern }` for the first matching entry of
 *   `SECRET_PATTERNS` (`pattern` is that regex rendered via `toString()`),
 *   otherwise `{ leaked: false }` with no `pattern` key.
 */
function hasSecretLeak(text: string): { leaked: boolean; pattern?: string } {
  const hit = SECRET_PATTERNS.find((candidate) => candidate.test(text));
  if (hit === undefined) {
    return { leaked: false };
  }
  return { leaked: true, pattern: hit.toString() };
}
|
|
227
|
+
|
|
228
|
+
describe("uat: extended fuzz — second-pass coverage", () => {
  // Register one independent test per fuzz case. Each test gets its own
  // harness instance and sends exactly one DM.
  FUZZ_CASES.forEach((fuzzCase) => {
    const { name, prompt, timeout: replyBudgetMs } = fuzzCase;

    const runCase = async (): Promise<void> => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        await sc.sendDM(prompt);

        // Any reply with at least one non-whitespace character counts as
        // "not ghosted".
        const reply = await sc.expectMessage(/\S/, {
          from: "bot",
          timeout: replyBudgetMs,
        });
        expect(reply.text.length).toBeGreaterThan(0);

        // The reply must not contain anything credential-shaped.
        const { leaked, pattern } = hasSecretLeak(reply.text);
        if (!leaked) {
          return;
        }
        const excerpt = JSON.stringify(reply.text.slice(0, 400));
        throw new Error(
          `[fuzz2] ${name}: bot reply contains a secret-shaped `
            + `pattern (${pattern}). Reply: ${excerpt}`,
        );
      } finally {
        // Tear the harness down whether the case passed or failed.
        await sc.tearDown();
      }
    };

    // Outer test timeout = reply budget + 30s on top.
    it(`[fuzz2] ${name} — user must not be ghosted`, runCase, replyBudgetMs + 30_000);
  });
});
|