switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD scenario — soft-commit for slow turns.
|
|
3
|
+
*
|
|
4
|
+
* The new conversational-pacing prompt (#1122) instructs the agent
|
|
5
|
+
* to send a one-liner "let me check, back in a few" before slow
|
|
6
|
+
* work. This UAT exercises that behaviour: send a prompt that
|
|
7
|
+
* obviously needs >15s, expect the FIRST outbound to be a short
|
|
8
|
+
* soft-commit message, with the final answer landing later.
|
|
9
|
+
*
|
|
10
|
+
* Not strict — the agent's allowed to skip the soft-commit if it
|
|
11
|
+
* judges the work is fast enough. The assertion is "the user does
|
|
12
|
+
* NOT see a long silent gap before the first sign of life": either
|
|
13
|
+
* a soft-commit OR the actual reply lands within 30s.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { describe, it, expect } from "vitest";
|
|
17
|
+
import { spinUp } from "../harness.js";
|
|
18
|
+
|
|
19
|
+
// A prompt that needs real work (file reads / web search-ish / some
|
|
20
|
+
// thinking) so the model is incentivised to soft-commit.
|
|
21
|
+
const SLOW_PROMPT = (
|
|
22
|
+
"Read /etc/hostname and /etc/os-release, then summarise this "
|
|
23
|
+
+ "machine in a single sentence (what OS family, what hostname). "
|
|
24
|
+
+ "Take your time."
|
|
25
|
+
);
|
|
26
|
+
|
|
27
|
+
describe("uat: soft-commit pacing", () => {
|
|
28
|
+
it(
|
|
29
|
+
"user asks slow question → first reply lands within 20s",
|
|
30
|
+
async () => {
|
|
31
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
32
|
+
try {
|
|
33
|
+
const sendStart = Date.now();
|
|
34
|
+
await sc.sendDM(SLOW_PROMPT);
|
|
35
|
+
|
|
36
|
+
// 30s wall-clock budget gives mtcute polling jitter + the
|
|
37
|
+
// agent's first tool call enough headroom that a "near-miss
|
|
38
|
+
// soft commit" (model thinks for 25s then sends) still passes.
|
|
39
|
+
// Previous 25s/22s pair sat exactly in the model's natural
|
|
40
|
+
// think-then-respond window and produced flake unrelated to
|
|
41
|
+
// any real bug.
|
|
42
|
+
const firstReply = await sc.expectMessage(/\S/, {
|
|
43
|
+
from: "bot",
|
|
44
|
+
timeout: 30_000,
|
|
45
|
+
});
|
|
46
|
+
const ttfo = Date.now() - sendStart;
|
|
47
|
+
|
|
48
|
+
expect(firstReply.text.length).toBeGreaterThan(0);
|
|
49
|
+
expect(ttfo).toBeLessThan(30_000);
|
|
50
|
+
|
|
51
|
+
// If the first reply IS the final answer (short, complete),
|
|
52
|
+
// the model skipped soft-commit ceremony — fine, just note.
|
|
53
|
+
if (firstReply.text.length > 200) {
|
|
54
|
+
console.log(
|
|
55
|
+
`[soft-commit] model produced a long final answer as the `
|
|
56
|
+
+ `first message (${firstReply.text.length} chars, ${ttfo}ms). `
|
|
57
|
+
+ `Conversational pacing prompt would prefer a soft-commit `
|
|
58
|
+
+ `first — but this is a soft preference, not a contract.`,
|
|
59
|
+
);
|
|
60
|
+
}
|
|
61
|
+
} finally {
|
|
62
|
+
await sc.tearDown();
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
50_000,
|
|
66
|
+
);
|
|
67
|
+
});
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD scenario — `status?` inbound classifier.
|
|
3
|
+
*
|
|
4
|
+
* The conversational-pacing redesign (#1122 PR1) wired a primary
|
|
5
|
+
* lagging KPI: `inbound_status_query`, the count of users typing
|
|
6
|
+
* "status?", "still there?", "?", etc — every fire is a JTBD
|
|
7
|
+
* failure. We assert the classifier triggers AND the agent
|
|
8
|
+
* gracefully responds (i.e. doesn't crash, doesn't ignore, doesn't
|
|
9
|
+
* loop on it).
|
|
10
|
+
*
|
|
11
|
+
* Note: the classifier is fire-and-forget — it emits a runtime
|
|
12
|
+
* metric event but doesn't change routing. So all we can assert
|
|
13
|
+
* from the driver side is "the agent still replies sensibly" —
|
|
14
|
+
* the metric emission is verified by the unit tests in
|
|
15
|
+
* `tests/inbound-classifier.test.ts`. This UAT exists for
|
|
16
|
+
* end-to-end safety: "sending status? doesn't break anything."
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { describe, it, expect } from "vitest";
|
|
20
|
+
import { spinUp } from "../harness.js";
|
|
21
|
+
|
|
22
|
+
const STATUS_QUERIES = ["status?", "still there?", "any update?", "?"];
|
|
23
|
+
|
|
24
|
+
describe("uat: status-query inbound", () => {
|
|
25
|
+
for (const query of STATUS_QUERIES) {
|
|
26
|
+
it(
|
|
27
|
+
`user sends ${JSON.stringify(query)} → agent replies sensibly`,
|
|
28
|
+
async () => {
|
|
29
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
30
|
+
try {
|
|
31
|
+
await sc.sendDM(query);
|
|
32
|
+
|
|
33
|
+
// Any non-empty reply within 60s is acceptable. The
|
|
34
|
+
// interesting thing is the classifier metric fired —
|
|
35
|
+
// verified at the unit-test level. Here we just want
|
|
36
|
+
// "no crash, no silent-end, sensible reply."
|
|
37
|
+
const reply = await sc.expectMessage(/\S/, {
|
|
38
|
+
from: "bot",
|
|
39
|
+
timeout: 60_000,
|
|
40
|
+
});
|
|
41
|
+
expect(reply.text.length).toBeGreaterThan(0);
|
|
42
|
+
} finally {
|
|
43
|
+
await sc.tearDown();
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
90_000,
|
|
47
|
+
);
|
|
48
|
+
}
|
|
49
|
+
});
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Location-inbound scenario — driver shares a geolocation with the
|
|
3
|
+
* test bot. Exercises the new `message:location` handler from #1077
|
|
4
|
+
* end-to-end: gateway parses the lat/lon, builds a `(location: …)`
|
|
5
|
+
* envelope, forwards to the agent, agent replies.
|
|
6
|
+
*
|
|
7
|
+
* Requires the same env as `smoke-dm-reply.test.ts` (see
|
|
8
|
+
* `uat/SETUP.md` §6).
|
|
9
|
+
*
|
|
10
|
+
* Coordinates are intentionally a well-known landmark (Sydney Opera
|
|
11
|
+
* House) so a failure trace makes "what was shared" obvious — and so
|
|
12
|
+
* a chatbot persona has something semantically grounded to respond to,
|
|
13
|
+
* which makes the bot's reply check more meaningful than asserting
|
|
14
|
+
* `.+`. We still tolerate ANY reply text — the goal is to prove the
|
|
15
|
+
* gateway forwarded the location, not to grade the agent's geography.
|
|
16
|
+
*
|
|
17
|
+
* Other 12 inbound types from #1077 are covered structurally in
|
|
18
|
+
* `tests/inbound-message-types.test.ts`. End-to-end UAT for them
|
|
19
|
+
* (contact, venue, dice, poll, web_app_data, users_shared,
|
|
20
|
+
* chat_shared, game, story, paid_media, successful_payment,
|
|
21
|
+
* passport_data) is deferred — most require either a custom bot
|
|
22
|
+
* setup (mini-app, payments provider) or a Telegram client gesture
|
|
23
|
+
* (story share, dice roll) that the mtcute driver does not script
|
|
24
|
+
* cleanly enough to be worth the brittleness.
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import { describe, it, expect } from "vitest";
|
|
28
|
+
import { spinUp } from "../harness.js";
|
|
29
|
+
|
|
30
|
+
// Sydney Opera House — recognizable, non-sensitive, stable across runs.
|
|
31
|
+
const SYDNEY_OPERA_HOUSE_LAT = -33.8568;
|
|
32
|
+
const SYDNEY_OPERA_HOUSE_LON = 151.2153;
|
|
33
|
+
|
|
34
|
+
describe("uat: location-inbound DM round-trip", () => {
|
|
35
|
+
it(
|
|
36
|
+
"driver shares a geolocation, bot replies within 90s",
|
|
37
|
+
async () => {
|
|
38
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
39
|
+
|
|
40
|
+
try {
|
|
41
|
+
await sc.driver.sendLocation(
|
|
42
|
+
sc.botUserId,
|
|
43
|
+
SYDNEY_OPERA_HOUSE_LAT,
|
|
44
|
+
SYDNEY_OPERA_HOUSE_LON,
|
|
45
|
+
);
|
|
46
|
+
|
|
47
|
+
// Same budget as smoke-dm-reply: 90s tolerates the gateway's
|
|
48
|
+
// coalescing window + one normal Claude turn. A healthy agent
|
|
49
|
+
// replies in <20s.
|
|
50
|
+
const reply = await sc.expectMessage(/.+/, {
|
|
51
|
+
from: "bot",
|
|
52
|
+
timeout: 90_000,
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
expect(reply.text.length).toBeGreaterThan(0);
|
|
56
|
+
expect(reply.senderUserId).toBe(sc.botUserId);
|
|
57
|
+
} finally {
|
|
58
|
+
await sc.tearDown();
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
// Mirrors smoke-dm-reply's 110s outer budget — must exceed the
|
|
62
|
+
// 90s inner deadline plus spinUp overhead.
|
|
63
|
+
110_000,
|
|
64
|
+
);
|
|
65
|
+
});
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mid-turn `disable_notification` scenario.
|
|
3
|
+
*
|
|
4
|
+
* Goal context: cause class CC-2 in `docs/status-ask-cause-classes.md`
|
|
5
|
+
* — the L2 conversational layer. The conversational-pacing prompt
|
|
6
|
+
* (`profiles/_shared/telegram-style.md.hbs:10`) instructs the model to
|
|
7
|
+
* pass `disable_notification: true` on mid-turn `reply` calls so the
|
|
8
|
+
* user only gets a device ping on the FINAL answer. If that contract
|
|
9
|
+
* silently degrades — model regression, prompt drift, or a gateway
|
|
10
|
+
* code path that drops the flag — every mid-turn reply pings. Users
|
|
11
|
+
* mute the bot. They then can't tell working from done. They ask
|
|
12
|
+
* "are you alive?" — `inbound_status_query` ticks.
|
|
13
|
+
*
|
|
14
|
+
* The flag is observable on the receiving side via mtcute's
|
|
15
|
+
* `message.isSilent` getter (corresponds to Telegram's
|
|
16
|
+
* `message.silent` flag, set by sender's `disable_notification` Bot
|
|
17
|
+
* API param). The driver was extended in this PR to surface it on
|
|
18
|
+
* `ObservedMessage.silent`.
|
|
19
|
+
*
|
|
20
|
+
* ## What the scenario asserts
|
|
21
|
+
*
|
|
22
|
+
* 1. Send a prompt that should produce multiple bot outbounds (a
|
|
23
|
+
* soft commit + mid-turn updates + a final answer). The prompt
|
|
24
|
+
* is explicit about wanting paced updates so the model doesn't
|
|
25
|
+
* optimize to a single reply.
|
|
26
|
+
* 2. Collect every bot message in the turn (waits for quiescence:
|
|
27
|
+
* no fresh bot message for `QUIESCENCE_MS`).
|
|
28
|
+
* 3. Assert: every bot message EXCEPT THE LAST has `silent === true`.
|
|
29
|
+
* 4. Assert: the LAST bot message has `silent === false` (the final
|
|
30
|
+
* answer should ping).
|
|
31
|
+
*
|
|
32
|
+
* ## Tolerances
|
|
33
|
+
*
|
|
34
|
+
* - If the turn has only one bot message (model judged the work fast
|
|
35
|
+
* enough to skip pacing), the mid-turn assertion is vacuous and we
|
|
36
|
+
* only check that the single final message is NOT silent. The
|
|
37
|
+
* prompt is engineered to be slow enough that this is unlikely,
|
|
38
|
+
* but we don't fail on it.
|
|
39
|
+
* - Quiescence window is 12s — long enough that a paused model isn't
|
|
40
|
+
* mistaken for "done", short enough that test wall-clock stays
|
|
41
|
+
* reasonable.
|
|
42
|
+
* - Edits don't count as fresh messages — we observe `edited === false`
|
|
43
|
+
* only. This matches the production semantic: an edit doesn't push
|
|
44
|
+
* a notification.
|
|
45
|
+
*
|
|
46
|
+
* ## Failure shapes
|
|
47
|
+
*
|
|
48
|
+
* 1. Mid-turn ping degrade — at least one non-last message has
|
|
49
|
+
* `silent === false`. The error message names the offending
|
|
50
|
+
* message index + text preview.
|
|
51
|
+
* 2. Final-answer silent — the last message has `silent === true`.
|
|
52
|
+
* Means the final answer doesn't ping; user might miss the
|
|
53
|
+
* reply landing.
|
|
54
|
+
* 3. No bot messages within timeout — distinct failure: agent
|
|
55
|
+
* isn't responding at all.
|
|
56
|
+
*
|
|
57
|
+
* Requires the same env as `smoke-dm-reply.test.ts` (see
|
|
58
|
+
* `uat/SETUP.md` §6).
|
|
59
|
+
*/
|
|
60
|
+
|
|
61
|
+
import { describe, expect, it } from "vitest";
|
|
62
|
+
import { spinUp } from "../harness.js";
|
|
63
|
+
import type { ObservedMessage } from "../driver.js";
|
|
64
|
+
|
|
65
|
+
const QUIESCENCE_MS = 12_000;
|
|
66
|
+
const OVERALL_DEADLINE_MS = 120_000;
|
|
67
|
+
|
|
68
|
+
// Multi-step prompt with explicit pacing expectations. Engineered so
|
|
69
|
+
// a well-behaved model produces:
|
|
70
|
+
// 1. soft commit ("on it" / "let me check")
|
|
71
|
+
// 2. mid-turn update after each file (with disable_notification: true)
|
|
72
|
+
// 3. final answer
|
|
73
|
+
//
|
|
74
|
+
// The work itself is two trivial file reads + a one-sentence
|
|
75
|
+
// summary. If the model collapses this to a single reply, the test
|
|
76
|
+
// still asserts the disable_notification contract on what it does
|
|
77
|
+
// emit; the vacuous-mid-turn path is allowed.
|
|
78
|
+
const PACED_PROMPT =
|
|
79
|
+
"Please follow this exact pacing protocol for this turn:\n" +
|
|
80
|
+
" 1. First send a brief 'on it' reply so I know you started.\n" +
|
|
81
|
+
" 2. Read /etc/hostname, then send a brief mid-turn update saying " +
|
|
82
|
+
"what the hostname is. Use disable_notification:true on that update.\n" +
|
|
83
|
+
" 3. Read /etc/os-release, then send a brief mid-turn update saying " +
|
|
84
|
+
"what the OS family is. Use disable_notification:true on that update.\n" +
|
|
85
|
+
" 4. Finally send a single-sentence summary as your final answer " +
|
|
86
|
+
"(no disable_notification flag — this one should ping me).\n" +
|
|
87
|
+
"Keep each message short.";
|
|
88
|
+
|
|
89
|
+
describe("uat: mid-turn replies pass disable_notification (CC-2)", () => {
|
|
90
|
+
it(
|
|
91
|
+
"every mid-turn bot reply is silent; only the final answer pings",
|
|
92
|
+
async () => {
|
|
93
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
94
|
+
try {
|
|
95
|
+
await sc.sendDM(PACED_PROMPT);
|
|
96
|
+
|
|
97
|
+
const collected: ObservedMessage[] = [];
|
|
98
|
+
const overallDeadline = Date.now() + OVERALL_DEADLINE_MS;
|
|
99
|
+
let quiescenceDeadline = Date.now() + 30_000; // first message
|
|
100
|
+
// gets a bigger budget (30s) than the QUIESCENCE_MS window
|
|
101
|
+
|
|
102
|
+
// Drain bot messages until QUIESCENCE_MS passes with no
|
|
103
|
+
// fresh non-edit observation, or the overall deadline hits.
|
|
104
|
+
while (Date.now() < overallDeadline) {
|
|
105
|
+
const remaining = Math.min(
|
|
106
|
+
quiescenceDeadline - Date.now(),
|
|
107
|
+
overallDeadline - Date.now(),
|
|
108
|
+
);
|
|
109
|
+
if (remaining <= 0) break;
|
|
110
|
+
try {
|
|
111
|
+
const msg = await sc.expectMessage(
|
|
112
|
+
(m: ObservedMessage) => m.fromBot && !m.edited,
|
|
113
|
+
{ from: "bot", timeout: remaining },
|
|
114
|
+
);
|
|
115
|
+
collected.push(msg);
|
|
116
|
+
quiescenceDeadline = Date.now() + QUIESCENCE_MS;
|
|
117
|
+
} catch {
|
|
118
|
+
// Treated as a timeout — the quiescence signal we wanted. Note: any other expectMessage error is swallowed here too.
|
|
119
|
+
break;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
expect(
|
|
124
|
+
collected.length,
|
|
125
|
+
`no bot messages observed within ${OVERALL_DEADLINE_MS}ms — ` +
|
|
126
|
+
`agent isn't responding at all (distinct failure from CC-2).`,
|
|
127
|
+
).toBeGreaterThan(0);
|
|
128
|
+
|
|
129
|
+
const trail = collected
|
|
130
|
+
.map(
|
|
131
|
+
(m, i) =>
|
|
132
|
+
` [${i}] silent=${m.silent} text=${JSON.stringify(
|
|
133
|
+
m.text.slice(0, 80),
|
|
134
|
+
)}`,
|
|
135
|
+
)
|
|
136
|
+
.join("\n");
|
|
137
|
+
|
|
138
|
+
// Final answer should ping.
|
|
139
|
+
const last = collected[collected.length - 1];
|
|
140
|
+
expect(
|
|
141
|
+
last.silent,
|
|
142
|
+
`final answer (message ${collected.length - 1}) was marked ` +
|
|
143
|
+
`silent — the user won't get pinged when the turn finishes. ` +
|
|
144
|
+
`Trail:\n${trail}`,
|
|
145
|
+
).toBe(false);
|
|
146
|
+
|
|
147
|
+
// Mid-turn updates should NOT ping. Vacuous when the model
|
|
148
|
+
// emitted only the final answer; meaningful when paced.
|
|
149
|
+
const midTurn = collected.slice(0, -1);
|
|
150
|
+
const loudMidTurn = midTurn.filter((m) => !m.silent);
|
|
151
|
+
expect(
|
|
152
|
+
loudMidTurn.length,
|
|
153
|
+
`${loudMidTurn.length} mid-turn message(s) were NOT silent — ` +
|
|
154
|
+
`each one pings the user's device. Conversational pacing ` +
|
|
155
|
+
`requires disable_notification:true on mid-turn replies. ` +
|
|
156
|
+
`Trail:\n${trail}`,
|
|
157
|
+
).toBe(0);
|
|
158
|
+
|
|
159
|
+
if (midTurn.length === 0) {
|
|
160
|
+
console.warn(
|
|
161
|
+
`[midturn-silent] model produced only 1 bot reply — the ` +
|
|
162
|
+
`mid-turn assertion was vacuous. Prompt may not be ` +
|
|
163
|
+
`slow enough to force pacing, or the model is ignoring ` +
|
|
164
|
+
`the explicit step-by-step instructions. This is not a ` +
|
|
165
|
+
`failure of CC-2, but the scenario didn't cover its ` +
|
|
166
|
+
`intended ground.`,
|
|
167
|
+
);
|
|
168
|
+
}
|
|
169
|
+
} finally {
|
|
170
|
+
await sc.tearDown();
|
|
171
|
+
}
|
|
172
|
+
},
|
|
173
|
+
OVERALL_DEADLINE_MS + 30_000,
|
|
174
|
+
);
|
|
175
|
+
});
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reaction lifecycle scenario — driver DMs the test bot, bot reacts
|
|
3
|
+
* to the inbound message through the lifecycle and lands a terminal
|
|
4
|
+
* emoji once the reply ships.
|
|
5
|
+
*
|
|
6
|
+
* Part of: https://github.com/switchroom/switchroom/issues/866
|
|
7
|
+
* Goal context: cause class CC-1 / CC-6 in
|
|
8
|
+
* `docs/status-ask-cause-classes.md` (the L1 ambient layer should
|
|
9
|
+
* deliver a definitively-done terminal emoji within a few seconds
|
|
10
|
+
* of the bot's final reply — otherwise the user looks at their
|
|
11
|
+
* inbound, sees it still wearing 🤔, and asks "you done?").
|
|
12
|
+
*
|
|
13
|
+
* History: this scenario was previously `describe.skip` with a
|
|
14
|
+
* rationale that the pinned progress card "renders INSTEAD of
|
|
15
|
+
* reactions". The card was retired in #1126; the card-vs-reaction
|
|
16
|
+
* branch in the gateway is dead. We can now exercise the full
|
|
17
|
+
* lifecycle end-to-end without the two-agent split.
|
|
18
|
+
*
|
|
19
|
+
* What we assert (in priority order):
|
|
20
|
+
*
|
|
21
|
+
* 1. Within the turn, the driver sees AT LEAST ONE `+` reaction
|
|
22
|
+
* op (the L1 "I'm alive" signal). Fast turns may collapse
|
|
23
|
+
* intermediate states, so we only require *one* add, not a
|
|
24
|
+
* specific emoji.
|
|
25
|
+
* 2. By the time the bot has sent a final reply (+ a short tail
|
|
26
|
+
* for Telegram to deliver the terminal-emoji replace), the
|
|
27
|
+
* LAST observed `+` op is in the `done` set (`👍 / 💯 / 🎉`).
|
|
28
|
+
*
|
|
29
|
+
* Why "last `+` op wins" rather than `expectReaction(['👍'])` with
|
|
30
|
+
* a literal sequence: `setMessageReaction` REPLACES the prior emoji
|
|
31
|
+
* atomically. mtcute's update stream can deliver the replace as a
|
|
32
|
+
* `-prev` followed by a `+next`, or as a single coalesced event,
|
|
33
|
+
* depending on server batching. The "last add wins" shape matches
|
|
34
|
+
* the production semantics — whatever's *currently* on the message
|
|
35
|
+
* is what the user actually sees.
|
|
36
|
+
*
|
|
37
|
+
* The observer must be attached BEFORE the reply lands so we
|
|
38
|
+
* capture the queued / working reactions, not just the terminal
|
|
39
|
+
* one. Pattern: `observeReactions` immediately after `sendDM`
|
|
40
|
+
* returns the messageId, drain into a trail array while we wait
|
|
41
|
+
* for the reply, then run a short tail to catch the terminal
|
|
42
|
+
* after the reply.
|
|
43
|
+
*
|
|
44
|
+
* Requires the same env as `smoke-dm-reply.test.ts` (see
|
|
45
|
+
* `uat/SETUP.md` §6).
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
import { describe, expect, it } from "vitest";
|
|
49
|
+
import { spinUp } from "../harness.js";
|
|
50
|
+
|
|
51
|
+
const TERMINAL_DONE_EMOJI = new Set(["👍", "💯", "🎉"]);
|
|
52
|
+
const TAIL_AFTER_REPLY_MS = 8_000;
|
|
53
|
+
|
|
54
|
+
const INBOUND = (): string => `uat-reactions ${new Date().toISOString()}`;
|
|
55
|
+
|
|
56
|
+
interface ObservedOp {
|
|
57
|
+
emoji: string;
|
|
58
|
+
op: "+" | "-";
|
|
59
|
+
at: number;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
describe("uat: reaction lifecycle on driver DM", () => {
|
|
63
|
+
it(
|
|
64
|
+
"driver sees an alive reaction, then a terminal-done emoji by reply tail",
|
|
65
|
+
async () => {
|
|
66
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
67
|
+
try {
|
|
68
|
+
const sent = await sc.sendDM(INBOUND());
|
|
69
|
+
|
|
70
|
+
// Attach the observer immediately so the queued (👀) and
|
|
71
|
+
// working reactions don't fire before the listener exists.
|
|
72
|
+
const trail: ObservedOp[] = [];
|
|
73
|
+
const iter = sc.driver
|
|
74
|
+
.observeReactions(sc.botUserId, { messageId: sent.messageId })
|
|
75
|
+
[Symbol.asyncIterator]();
|
|
76
|
+
let pump: Promise<void> | null = null;
|
|
77
|
+
let stopPump = false;
|
|
78
|
+
pump = (async () => {
|
|
79
|
+
while (!stopPump) {
|
|
80
|
+
const next = await iter.next();
|
|
81
|
+
if (next.done === true) return;
|
|
82
|
+
trail.push({
|
|
83
|
+
emoji: next.value.emoji,
|
|
84
|
+
op: next.value.op,
|
|
85
|
+
at: Date.now(),
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
})();
|
|
89
|
+
|
|
90
|
+
try {
|
|
91
|
+
// Wait for the bot's reply (any content). Gives the L1
|
|
92
|
+
// lifecycle time to traverse queued → working → done.
|
|
93
|
+
const reply = await sc.expectMessage(/\S/, {
|
|
94
|
+
from: "bot",
|
|
95
|
+
timeout: 60_000,
|
|
96
|
+
});
|
|
97
|
+
expect(reply.text.length).toBeGreaterThan(0);
|
|
98
|
+
|
|
99
|
+
// Tail after the reply for Telegram to deliver the
|
|
100
|
+
// terminal-emoji replace. In practice <1s on a healthy bot;
|
|
101
|
+
// 8s ceiling absorbs server batching jitter.
|
|
102
|
+
await new Promise((resolve) =>
|
|
103
|
+
setTimeout(resolve, TAIL_AFTER_REPLY_MS),
|
|
104
|
+
);
|
|
105
|
+
} finally {
|
|
106
|
+
stopPump = true;
|
|
107
|
+
await iter.return?.();
|
|
108
|
+
if (pump) {
|
|
109
|
+
await pump.catch(() => {
|
|
110
|
+
/* generator return triggers rejection on pending iter.next() — ignore */
|
|
111
|
+
});
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// L1 alive signal: at least one `+` op landed during the turn.
|
|
116
|
+
const adds = trail.filter((o) => o.op === "+");
|
|
117
|
+
expect(
|
|
118
|
+
adds.length,
|
|
119
|
+
`expected at least one reaction-add during the turn, got 0. ` +
|
|
120
|
+
`Full trail: ${trail.map((o) => `${o.op}${o.emoji}`).join(" ") || "(empty)"}`,
|
|
121
|
+
).toBeGreaterThan(0);
|
|
122
|
+
|
|
123
|
+
// L1 terminal: the LAST `+` op should be a terminal-done emoji.
|
|
124
|
+
// Extra `-` ops after the final `+` are tolerated (Telegram
|
|
125
|
+
// sometimes emits a bare clean-up `-`); the last `+` is what
|
|
126
|
+
// the user actually sees.
|
|
127
|
+
const lastAdd = adds[adds.length - 1];
|
|
128
|
+
expect(
|
|
129
|
+
TERMINAL_DONE_EMOJI.has(lastAdd.emoji),
|
|
130
|
+
`expected last reaction-add to be one of ${[
|
|
131
|
+
...TERMINAL_DONE_EMOJI,
|
|
132
|
+
].join(", ")}, got ${lastAdd.emoji}. Full trail: ${trail
|
|
133
|
+
.map((o) => `${o.op}${o.emoji}`)
|
|
134
|
+
.join(" ")}`,
|
|
135
|
+
).toBe(true);
|
|
136
|
+
} finally {
|
|
137
|
+
await sc.tearDown();
|
|
138
|
+
}
|
|
139
|
+
},
|
|
140
|
+
90_000,
|
|
141
|
+
);
|
|
142
|
+
});
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UAT scenario — driver reacts to a bot DM with a trigger emoji and
|
|
3
|
+
* observes the agent process a synthetic inbound turn (#1074).
|
|
4
|
+
*
|
|
5
|
+
* Flow:
|
|
6
|
+
* 1. Driver sends a DM that will provoke a bot reply.
|
|
7
|
+
* 2. Bot replies — driver observes the reply message id.
|
|
8
|
+
* 3. Driver places a 👎 reaction on the bot's reply.
|
|
9
|
+
* 4. Assert: the agent emits a subsequent action (another outbound
|
|
10
|
+
* message). The reaction-trigger pipeline synthesizes a new
|
|
11
|
+
* `<channel source="reaction">` inbound turn, which the agent's
|
|
12
|
+
* Claude session treats as a normal turn and (per profile
|
|
13
|
+
* guidance) acknowledges or course-corrects.
|
|
14
|
+
*
|
|
15
|
+
* Negative:
|
|
16
|
+
* - Driver also places a ❤️ reaction (not in the default
|
|
17
|
+
* allowlist) on a separate bot message.
|
|
18
|
+
* - Assert: NO new agent action within the negative-budget window.
|
|
19
|
+
*
|
|
20
|
+
* Requires the same env as `smoke-dm-reply.test.ts` — see
|
|
21
|
+
* `uat/SETUP.md` §6.
|
|
22
|
+
*
|
|
23
|
+
* NOTE: this scenario depends on the test-harness agent having the
|
|
24
|
+
* default `reactions:` config (allowlist includes 👎). If an operator
|
|
25
|
+
* has narrowed the allowlist, this case will fail with a message.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import { describe, it, expect } from "vitest";
|
|
29
|
+
import { spinUp } from "../harness.js";
|
|
30
|
+
|
|
31
|
+
const TRIGGER_INBOUND = `uat-reaction-trigger ${new Date().toISOString()}`;
|
|
32
|
+
const NEGATIVE_INBOUND = `uat-reaction-trigger-negative ${new Date().toISOString()}`;
|
|
33
|
+
|
|
34
|
+
describe("uat: bot reaction triggers synthetic agent turn (#1074)", () => {
|
|
35
|
+
it(
|
|
36
|
+
"👎 on a bot reply dispatches a new agent turn; ❤️ does not",
|
|
37
|
+
async () => {
|
|
38
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
39
|
+
try {
|
|
40
|
+
// 1. Drive the first bot reply we'll react to.
|
|
41
|
+
await sc.sendDM(TRIGGER_INBOUND);
|
|
42
|
+
const firstReply = await sc.expectMessage(/.+/, {
|
|
43
|
+
from: "bot",
|
|
44
|
+
timeout: 90_000,
|
|
45
|
+
});
|
|
46
|
+
expect(firstReply.senderUserId).toBe(sc.botUserId);
|
|
47
|
+
|
|
48
|
+
// 2. React 👎 to the bot's reply. Default allowlist includes 👎,
|
|
49
|
+
// so the gateway should dispatch a synthetic inbound after
|
|
50
|
+
// the debounce window elapses.
|
|
51
|
+
await sc.driver.sendReaction(sc.botUserId, firstReply.messageId, "👎");
|
|
52
|
+
|
|
53
|
+
// 3. Wait for the agent to emit ANY subsequent message. The
|
|
54
|
+
// debounce window is 30s by default, plus a Claude turn —
|
|
55
|
+
// budget 120s to be safe.
|
|
56
|
+
const triggeredReply = await sc.expectMessage(/.+/, {
|
|
57
|
+
from: "bot",
|
|
58
|
+
timeout: 120_000,
|
|
59
|
+
});
|
|
60
|
+
expect(triggeredReply.messageId).not.toBe(firstReply.messageId);
|
|
61
|
+
expect(triggeredReply.senderUserId).toBe(sc.botUserId);
|
|
62
|
+
|
|
63
|
+
// ── Negative case ───────────────────────────────────────────────
|
|
64
|
+
await sc.sendDM(NEGATIVE_INBOUND);
|
|
65
|
+
const secondReply = await sc.expectMessage(/.+/, {
|
|
66
|
+
from: "bot",
|
|
67
|
+
timeout: 90_000,
|
|
68
|
+
});
|
|
69
|
+
await sc.driver.sendReaction(sc.botUserId, secondReply.messageId, "❤️");
|
|
70
|
+
|
|
71
|
+
// Wait the debounce window (30s by default) plus a 15s margin. If
|
|
72
|
+
// a new turn fires within this window, the negative case has
|
|
73
|
+
// failed (the allowlist leaked).
|
|
74
|
+
const NEGATIVE_BUDGET_MS = 45_000;
|
|
75
|
+
let leaked = false;
|
|
76
|
+
try {
|
|
77
|
+
await sc.expectMessage(/.+/, {
|
|
78
|
+
from: "bot",
|
|
79
|
+
timeout: NEGATIVE_BUDGET_MS,
|
|
80
|
+
});
|
|
81
|
+
leaked = true;
|
|
82
|
+
} catch {
|
|
83
|
+
// Expected — no new message within the negative window.
|
|
84
|
+
}
|
|
85
|
+
expect(leaked).toBe(false);
|
|
86
|
+
} finally {
|
|
87
|
+
await sc.tearDown();
|
|
88
|
+
}
|
|
89
|
+
},
|
|
90
|
+
// Per-test budget — must cover trigger turn + debounce + agent
|
|
91
|
+
// reply + negative-budget window + spinUp overhead. 5 minutes is
|
|
92
|
+
// generous but on the order of `progress-card-dm.test.ts` which
|
|
93
|
+
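// also has multi-phase waits. NOTE(review): the expectMessage timeouts in this test sum to 345s (90+120+90+45), which exceeds this budget in the worst case — confirm.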
|
|
94
|
+
300_000,
|
|
95
|
+
);
|
|
96
|
+
});
|