switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Agent-self-sufficiency UAT runner.
|
|
4
|
+
*
|
|
5
|
+
* Drives a real Telegram user-account against the live agent fleet to
|
|
6
|
+
* verify the four acceptance criteria from the
|
|
7
|
+
* "agent-self-sufficiency" goal:
|
|
8
|
+
*
|
|
9
|
+
* 1. Self-management (skill_list, cron_list, audit_tail, config_get)
|
|
10
|
+
* 2. Identity awareness (honest self-ID, knows its name, knows peers)
|
|
11
|
+
* 3. Admin surface (non-admin refusal naming the admin agent)
|
|
12
|
+
* — admin reads (3a/3b) are covered by the hostd vitest suite
|
|
13
|
+
* rather than live fuzz, because they require a docker stub.
|
|
14
|
+
* 4. The fuzzy UAT IS this runner.
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
*
|
|
18
|
+
* bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \
|
|
19
|
+
* --agent klanker:@klanker_bot \
|
|
20
|
+
* --agent scribe:@scribe_bot \
|
|
21
|
+
* --agent doc:@doc_bot \
|
|
22
|
+
* --admin-agent klanker \
|
|
23
|
+
* --report ./uat-report.md
|
|
24
|
+
*
|
|
25
|
+
* # OR — discover from env (CI-friendly):
|
|
26
|
+
* UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \
|
|
27
|
+
* UAT_ADMIN_AGENTS="klanker" \
|
|
28
|
+
* bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
|
|
29
|
+
*
|
|
30
|
+
* Auth env (same as the existing uat harness — see
|
|
31
|
+
* telegram-plugin/uat/SETUP.md):
|
|
32
|
+
*
|
|
33
|
+
* TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
|
|
34
|
+
*
|
|
35
|
+
* **Why a user-account session, not bot tokens.** The acceptance-
|
|
36
|
+
* criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
|
|
37
|
+
* Telegram's Bot API forbids bots from reading other bots' messages
|
|
38
|
+
* (https://core.telegram.org/bots/faq) — a bot can send to another
|
|
39
|
+
* bot's chat but can't observe the reply. The only way to drive the
|
|
40
|
+
* fleet AND capture every agent's reply is an mtcute user-account
|
|
41
|
+
* session, which is what the existing telegram-plugin/uat harness
|
|
42
|
+
* uses. This runner inherits that machinery wholesale; the env-var
|
|
43
|
+
* rename is forced by the platform, not a design choice.
|
|
44
|
+
*
|
|
45
|
+
* Missing creds fail loud, not silent — the goal explicitly demands
|
|
46
|
+
* no silent skips on missing UAT credentials.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
import { writeFileSync } from "node:fs";
|
|
50
|
+
import { Driver, type ObservedMessage } from "../driver.js";
|
|
51
|
+
import { loadUatEnv } from "../load-env.js";
|
|
52
|
+
import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
|
|
53
|
+
import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
|
|
54
|
+
import { renderMarkdown } from "./report.js";
|
|
55
|
+
|
|
56
|
+
loadUatEnv();
|
|
57
|
+
|
|
58
|
+
// ─── CLI / env parsing ─────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/** One agent under test, declared as `<name>:@<bot-username>`. */
interface AgentTarget {
  /** Agent name — the part before the colon in `--agent` / `UAT_FLEET`. */
  name: string;
  /** Telegram bot username — the part after the colon. */
  botUsername: string;
  /** True when the name was also listed via `--admin-agent` / `UAT_ADMIN_AGENTS`. */
  admin: boolean;
}
|
|
65
|
+
|
|
66
|
+
/** Resolved runner settings, merged from environment variables and CLI flags. */
interface CliConfig {
  /** Agents to drive, in the order they were first declared. */
  agents: AgentTarget[];
  /** Where the Markdown report is written. */
  reportPath: string;
  /** Where the JSON sidecar with all results is written. */
  jsonPath: string;
  /** How long to wait for a reply to each case, in milliseconds (default 60s). */
  replyTimeoutMs: number;
  /**
   * Pause between messages, in milliseconds (default 4s). Keeps the runner
   * under Telegram's global outbound rate cap and lets the agent finish its
   * previous turn before the next inbound arrives.
   */
  settleMs: number;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Build the runner configuration from environment variables and CLI flags.
 *
 * Environment values (`UAT_FLEET`, `UAT_ADMIN_AGENTS`, `UAT_REPORT`,
 * `UAT_REPORT_JSON`, `UAT_REPLY_TIMEOUT_MS`, `UAT_SETTLE_MS`) are read first;
 * flags in `argv` are applied afterwards and therefore win. An agent declared
 * twice under the same name keeps the last declaration.
 *
 * Exits the process via `fail` on: an unknown `--flag`, a flag with no value,
 * a malformed `--agent`, a millisecond value that is not a non-negative
 * integer, or an empty agent list. `--help` / `-h` prints usage and exits 0.
 *
 * @param argv Command-line arguments without the runtime and script entries.
 * @returns The resolved configuration.
 */
function parseCli(argv: readonly string[]): CliConfig {
  const agents = new Map<string, AgentTarget>();
  const adminSet = new Set<string>();
  let reportPath = process.env.UAT_REPORT ?? "./uat-agent-self-sufficiency.md";
  let jsonPath = process.env.UAT_REPORT_JSON ?? "./uat-agent-self-sufficiency.json";

  // A NaN timeout would make every case report "timeout" instantly (the
  // wait-loop deadline becomes NaN), so reject bad numbers up front.
  const parseMs = (raw: string, label: string): number => {
    const parsed = Number.parseInt(raw, 10);
    if (!Number.isFinite(parsed) || parsed < 0) {
      fail(`${label} expects a non-negative integer (milliseconds); got "${raw}"`);
    }
    return parsed;
  };
  // An env var that is set but empty is treated as unset.
  const envMs = (key: string, fallback: number): number => {
    const raw = process.env[key]?.trim();
    return raw ? parseMs(raw, key) : fallback;
  };
  let replyTimeoutMs = envMs("UAT_REPLY_TIMEOUT_MS", 60000);
  let settleMs = envMs("UAT_SETTLE_MS", 4000);

  const envFleet = process.env.UAT_FLEET;
  if (envFleet) {
    for (const tok of envFleet.split(",")) {
      const [name, bot] = tok.split(":").map((s) => s.trim());
      if (name && bot) {
        agents.set(name, { name, botUsername: bot, admin: false });
      } else if (tok.trim()) {
        // Still skipped (as before), but no longer silently.
        process.stderr.write(
          `[uat] WARNING: ignoring malformed UAT_FLEET entry "${tok.trim()}" (expected "<name>:@<bot-username>").\n`,
        );
      }
    }
  }
  const envAdmin = process.env.UAT_ADMIN_AGENTS;
  if (envAdmin) {
    for (const tok of envAdmin.split(",")) {
      const name = tok.trim();
      if (name) adminSet.add(name);
    }
  }

  for (let i = 0; i < argv.length; i++) {
    const tok = argv[i]!;
    // Consumes the argument following the current flag.
    const next = (): string => {
      const v = argv[++i];
      if (!v) fail(`${tok}: missing value`);
      return v;
    };
    switch (tok) {
      case "--agent": {
        const v = next();
        const [name, bot] = v.split(":").map((s) => s.trim());
        if (!name || !bot)
          fail(`--agent expects "<name>:@<bot-username>"; got "${v}"`);
        agents.set(name, { name, botUsername: bot, admin: false });
        break;
      }
      case "--admin-agent": {
        adminSet.add(next());
        break;
      }
      case "--report":
        reportPath = next();
        break;
      case "--json":
        jsonPath = next();
        break;
      case "--reply-timeout-ms":
        replyTimeoutMs = parseMs(next(), "--reply-timeout-ms");
        break;
      case "--settle-ms":
        settleMs = parseMs(next(), "--settle-ms");
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        if (tok.startsWith("--")) fail(`unknown flag: ${tok}`);
    }
  }

  // Admin flags are applied last so declaration order does not matter.
  for (const name of adminSet) {
    const target = agents.get(name);
    if (target) {
      target.admin = true;
    } else {
      process.stderr.write(
        `[uat] WARNING: admin agent "${name}" is not among the targeted agents; ignoring.\n`,
      );
    }
  }

  if (agents.size === 0) {
    fail(
      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
    );
  }
  if (agents.size < 3) {
    process.stderr.write(
      `[uat] WARNING: only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.\n`,
    );
  }

  return {
    agents: [...agents.values()],
    reportPath,
    jsonPath,
    replyTimeoutMs,
    settleMs,
  };
}
|
|
167
|
+
|
|
168
|
+
/**
 * Report a fatal error on stderr (prefixed with `[uat] `) and terminate the
 * process with exit status 2. Never returns.
 */
function fail(msg: string): never {
  const line = "[uat] " + msg + "\n";
  process.stderr.write(line);
  return process.exit(2);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Print usage to stdout: required credentials, every flag the CLI parser
 * accepts (including `-h` / `--help`), and the environment equivalents.
 * Only writes; the caller decides whether to exit.
 */
function printHelp(): void {
  process.stdout.write(`agent-self-sufficiency UAT runner

Required env (or fail loud):
  TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION

Flags:
  --agent NAME:@BOT       Add an agent target. Repeatable.
  --admin-agent NAME      Mark NAME as admin: true (skips 3d for that agent). Repeatable.
  --report PATH           Markdown report path. Default ./uat-agent-self-sufficiency.md
  --json PATH             JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json
  --reply-timeout-ms N    Per-case timeout. Default 60000.
  --settle-ms N           Inter-message settle. Default 4000.
  -h, --help              Show this help and exit.

Env equivalents:
  UAT_FLEET="name1:@bot1,name2:@bot2,..."
  UAT_ADMIN_AGENTS="name1,name2"
  UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS
`);
}
|
|
193
|
+
|
|
194
|
+
// ─── Driver wrapper: send + observe ─────────────────────────────────────────
|
|
195
|
+
|
|
196
|
+
/** What one prompt/reply exchange with an agent produced. */
interface ReplyOutcome {
  /** Reply text that was scored; empty when sending failed or nothing arrived. */
  reply: string;
  /** Verdict for the case: the scorer's result, or `error` / `timeout`. */
  outcome: Outcome;
  /** Elapsed wall-clock time for the exchange, in milliseconds. */
  durationMs: number;
  /** Set only when the outbound send itself failed. */
  errorMessage?: string;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Send one inbound to the agent and wait for a meaningful reply.
 *
 * We subscribe to the chat's message stream BEFORE sending so we don't
 * miss the bot's reply if it lands faster than we can start observing.
 * Then:
 *
 *   1. Send the inbound. A failed send yields outcome `error` with the
 *      failure text in `errorMessage`.
 *   2. Consume the stream until the first non-empty message that is not
 *      from the driver account and has messageId > the sent messageId.
 *      That's the reply head.
 *   3. Keep consuming for an edit window (3s) to absorb edits or follow-up
 *      chunks (stream-reply pattern: bot sends "thinking…" then edits with
 *      the final answer). The most recent qualifying text is what we score.
 *   4. Yield outcome `timeout` if no head arrives before the deadline.
 *
 * The stream is closed (best-effort) on every exit path.
 *
 * @param driver       Connected UAT driver used to send and observe.
 * @param botUserId    Chat/bot id to send to and observe.
 * @param driverUserId The driver account's own id; its messages are ignored.
 * @param spec         Criterion the reply is scored against.
 * @param prompt       Text to send.
 * @param agentName    Agent name passed through to the scorer.
 * @param timeoutMs    Overall budget for the exchange, measured from the start.
 * @returns The scored reply, or an `error` / `timeout` outcome.
 */
async function sendAndScore(
  driver: Driver,
  botUserId: number,
  driverUserId: number,
  spec: CriterionSpec,
  prompt: string,
  agentName: string,
  timeoutMs: number,
): Promise<ReplyOutcome> {
  const startedAt = Date.now();
  // Start observing FIRST so we don't race the bot's reply.
  const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();

  // Closing the observer is best-effort: a failure here must not mask the
  // result of the exchange itself.
  const closeStream = async (): Promise<void> => {
    try {
      await stream.return?.(undefined);
    } catch {
      /* ignore */
    }
  };

  let sentMessageId: number;
  try {
    const sent = await driver.sendText(botUserId, prompt);
    sentMessageId = sent.messageId;
  } catch (err: unknown) {
    await closeStream();
    // Anything can be thrown; don't assume an Error instance.
    const detail = err instanceof Error ? err.message : String(err);
    return {
      reply: "",
      outcome: "error",
      durationMs: Date.now() - startedAt,
      errorMessage: `send failed: ${detail}`,
    };
  }

  const deadline = startedAt + timeoutMs;
  const EDIT_WINDOW_MS = 3000;
  let headSeenAt = 0;
  let replyMessageId = 0;
  let replyText = "";

  try {
    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      // Before the head: wait for the rest of the budget. After the head:
      // wait only for what is left of the edit window.
      const winSize = headSeenAt
        ? Math.max(0, EDIT_WINDOW_MS - (Date.now() - headSeenAt))
        : remaining;
      if (headSeenAt && winSize === 0) break;
      const slice = await pullOneWithTimeout(stream, Math.min(remaining, Math.max(250, winSize)));
      if (slice === "timeout") {
        if (headSeenAt) break; // edit window elapsed
        continue;
      }
      if (slice === "done") break;
      const m: ObservedMessage = slice;
      if (m.senderUserId === driverUserId) continue; // our own outbound
      if (m.messageId <= sentMessageId) continue; // predates this prompt
      const t = (m.text ?? "").trim();
      if (!t) continue;
      // Either this is the head, or it's an edit/replacement of the
      // bot's reply. Track the most recent.
      replyMessageId = m.messageId;
      replyText = t;
      if (!headSeenAt) headSeenAt = Date.now();
    }
  } finally {
    await closeStream();
  }

  const durationMs = Date.now() - startedAt;
  if (!replyMessageId) {
    return { reply: "", outcome: "timeout", durationMs };
  }
  const outcome = scoreReply(spec, replyText, { agentName });
  return { reply: replyText, outcome, durationMs };
}
|
|
295
|
+
|
|
296
|
+
/**
 * In-flight `next()` per iterator. When a pull times out, its `next()` call
 * is still outstanding; remembering it lets the following pull pick up that
 * same result instead of issuing a second `next()` and dropping the first
 * item on the floor.
 */
const pendingPulls = new WeakMap<
  AsyncIterator<ObservedMessage>,
  Promise<ObservedMessage | "done">
>();

/**
 * Race the next stream item against a timeout. Returns the item, or
 * the literal `"timeout"` / `"done"` sentinels. `done` is rare in
 * practice — the observer doesn't naturally close until we tell it to.
 * A rejected `next()` is also reported as `"done"`.
 *
 * At most one `next()` is outstanding per iterator: after a `"timeout"`,
 * the following call waits on the same pending `next()`, so no item is
 * consumed and discarded between calls.
 *
 * @param it Iterator to pull from.
 * @param ms Maximum time to wait for this pull, in milliseconds.
 */
async function pullOneWithTimeout(
  it: AsyncIterator<ObservedMessage>,
  ms: number,
): Promise<ObservedMessage | "timeout" | "done"> {
  let pull = pendingPulls.get(it);
  if (!pull) {
    pull = it.next().then(
      (r): ObservedMessage | "done" => (r.done ? "done" : r.value),
      (): "done" => "done",
    );
    pendingPulls.set(it, pull);
  }

  let timer: ReturnType<typeof setTimeout> | undefined;
  const timedOut = new Promise<"timeout">((resolve) => {
    timer = setTimeout(() => resolve("timeout"), ms);
  });

  try {
    const winner = await Promise.race([pull, timedOut]);
    // Only a delivered result retires the pull; on timeout it stays cached
    // for the next call.
    if (winner !== "timeout") pendingPulls.delete(it);
    return winner;
  } finally {
    clearTimeout(timer);
  }
}
|
|
329
|
+
|
|
330
|
+
// ─── Main orchestration ─────────────────────────────────────────────────────
|
|
331
|
+
|
|
332
|
+
/**
 * Orchestrates one UAT run end to end:
 *
 *  1. parse CLI options and read the Telegram credentials from the
 *     environment, calling `fail()` for each one that is missing;
 *  2. connect the driver account and resolve every agent's bot user id up
 *     front (a resolution failure exits with code 3);
 *  3. send every paraphrase of every criterion to every agent, scoring each
 *     reply and pausing `cli.settleMs` between messages;
 *  4. write the markdown and JSON reports and exit 0 when every case
 *     passed, 1 otherwise.
 */
async function main(): Promise<void> {
  const say = (line: string): void => {
    process.stdout.write(line);
  };
  const pause = (ms: number): Promise<unknown> =>
    new Promise((wake) => setTimeout(wake, ms));

  const cli = parseCli(process.argv.slice(2));

  // Hard-fail on missing UAT creds — goal: never silently skip.
  // NOTE(review): fail() presumably does not return — confirm.
  const apiId = Number.parseInt(process.env.TELEGRAM_API_ID ?? "", 10);
  if (!Number.isFinite(apiId)) {
    fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
  }
  const apiHash = process.env.TELEGRAM_API_HASH ?? "";
  if (apiHash.length === 0) fail("TELEGRAM_API_HASH missing — see SETUP.md");
  const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
  if (session.length === 0) {
    fail(
      "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
    );
  }

  say(`[uat] connecting to Telegram as the UAT driver account...\n`);
  const driver = new Driver({ apiId, apiHash, session });
  await driver.connect();
  const driverUserId = await driver.getMyUserId();
  say(`[uat] driver user_id=${driverUserId}\n`);

  // Resolve all bot user ids before sending anything, so a bad username
  // aborts the run before any time is spent on it.
  const resolved: { target: AgentTarget; botUserId: number }[] = [];
  for (const agent of cli.agents) {
    try {
      const botUserId = await driver.resolveBotUserId(agent.botUsername);
      resolved.push({ target: agent, botUserId });
      const adminSuffix = agent.admin ? " (admin)" : "";
      say(
        `[uat] resolved ${agent.name} ${agent.botUsername} → bot_user_id=${botUserId}${adminSuffix}\n`,
      );
    } catch (err) {
      process.stderr.write(
        `[uat] FAILED to resolve ${agent.botUsername} for agent ${agent.name}: ${(err as Error).message}\n`,
      );
      process.exit(3);
    }
  }

  // Run!
  const startedAt = new Date();
  const t0 = Date.now();
  const results: CaseResult[] = [];

  for (const { target, botUserId } of resolved) {
    say(`\n[uat] ─── agent: ${target.name} ─────────────\n`);

    for (const spec of CRITERIA) {
      // 3d (non-admin refusal) does not apply to admin agents: they can
      // legitimately perform those operations, so a refusal would be wrong.
      if (target.admin && spec.id === "3d_admin_refusal") {
        say(`[uat] skip ${spec.id} on ${target.name} (admin: true)\n`);
        continue;
      }

      for (const para of spec.paraphrases) {
        const scored = await sendAndScore(
          driver,
          botUserId,
          driverUserId,
          spec,
          para.text,
          target.name,
          cli.replyTimeoutMs,
        );

        let tag = "·";
        if (scored.outcome === "pass") tag = "✓";
        else if (scored.outcome === "fail") tag = "✗";
        say(
          `[uat] ${tag} ${spec.id}/${para.label} (${scored.outcome}, ${scored.durationMs}ms)\n`,
        );

        results.push({
          agent: target.name,
          criterion: spec.id,
          paraphrase: para,
          outcome: scored.outcome,
          reply: scored.reply,
          durationMs: scored.durationMs,
          ...(scored.errorMessage ? { errorMessage: scored.errorMessage } : {}),
        });

        // Inter-message settle: stay below Telegram's user-account outbound
        // cap and give the agent time to finish its previous turn.
        await pause(cli.settleMs);
      }
    }
  }

  const durationSeconds = (Date.now() - t0) / 1000;
  // Best-effort disconnect: a failure here must not lose the report.
  await driver.disconnect().catch(() => undefined);

  const agentNames = resolved.map((entry) => entry.target.name);
  const md = renderMarkdown(results, {
    startedAt,
    durationSeconds,
    agents: agentNames,
  });
  writeFileSync(cli.reportPath, md, "utf-8");

  const jsonReport = JSON.stringify(
    { startedAt: startedAt.toISOString(), durationSeconds, results },
    null,
    2,
  );
  writeFileSync(cli.jsonPath, jsonReport, "utf-8");
  say(`\n[uat] report → ${cli.reportPath}\n`);
  say(`[uat] json → ${cli.jsonPath}\n`);

  let passes = 0;
  for (const entry of results) {
    if (entry.outcome === "pass") passes += 1;
  }
  const total = results.length;
  const percent = total > 0 ? ((passes / total) * 100).toFixed(1) : "0";
  say(`[uat] overall: ${passes}/${total} passed (${percent}%)\n`);

  // Exit non-zero if anything failed, so the runner is CI-actionable.
  process.exit(passes === total ? 0 : 1);
}
|
|
453
|
+
|
|
454
|
+
// Last-resort handler: anything main() lets escape is reported on stderr
// (stack trace when available) and the process exits with code 4.
main().catch((err) => {
  const detail = (err as Error).stack ?? err;
  process.stderr.write(`[uat] FATAL: ${detail}\n`);
  process.exit(4);
});
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Paraphrase corpus for the agent-self-sufficiency UAT runner.
|
|
3
|
+
*
|
|
4
|
+
* Each acceptance criterion gets ≥10 paraphrases spanning the five
|
|
5
|
+
* shapes a real operator sends:
|
|
6
|
+
*
|
|
7
|
+
* - formal ("Please list the agents currently online.")
|
|
8
|
+
* - terse ("agents?")
|
|
9
|
+
* - typo'd ("whihc bots r runnng")
|
|
10
|
+
* - voice ("hey um can you tell me which other agents are around")
|
|
11
|
+
* - multi-intent("what time is it and also which bots are here?")
|
|
12
|
+
*
|
|
13
|
+
* The runner sends every paraphrase of each acceptance criterion to each agent
|
|
14
|
+
* and scores the reply against a per-criterion heuristic. Failures
|
|
15
|
+
* are listed verbatim in the report's triage table.
|
|
16
|
+
*
|
|
17
|
+
* Why ≥10 per criterion: a single prompt that "works" can mask brittle
|
|
18
|
+
* pattern-matching. Variants prove the agent actually understood the
|
|
19
|
+
* intent rather than memorizing a magic string.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
 * Identifier of one acceptance criterion. The leading number/letter is the
 * criterion's section tag; the remainder names what is being checked.
 */
export type CriterionId =
  // 1 — self-management and introspection
  | "1a_skill_list"
  | "1b_cron_list"
  | "1c_audit_tail"
  | "1c_config_get"
  // 2 — identity and peer awareness
  | "2a_what_are_you"
  | "2b_your_name"
  | "2c_peers"
  // 3 — refusal of admin-only operations
  | "3d_admin_refusal";
|
|
31
|
+
|
|
32
|
+
/**
 * One phrasing of a criterion's prompt.
 *
 * A paraphrase carries only the prompt and its reporting metadata; the
 * regex a reply must match lives on the owning criterion's `passPattern`.
 * Those matchers are deliberately permissive — any reply containing the key
 * term passes. Strict format-matching is the job of the underlying MCP
 * tools (config_get returns JSON), not the agent's prose reply.
 */
export interface Paraphrase {
  /** Short label shown in the report's triage table. */
  label: string;
  /** Stylistic shape of the prompt; drives the report's pass-rate breakdown. */
  shape: "formal" | "terse" | "typo" | "voice" | "multi";
  /** Exact text sent to the agent via DM. */
  text: string;
}
|
|
46
|
+
|
|
47
|
+
/** One acceptance criterion: its pass heuristic plus the prompts that probe it. */
export interface CriterionSpec {
  /** Stable identifier used in reports and for per-criterion special cases. */
  id: CriterionId;
  /** One-line description shown in the report header. */
  description: string;
  /**
   * Heuristic regex a reply must match to pass. According to the original
   * note, the runner applies it *after* stripping markdown and collapsing
   * whitespace, so the regex need not account for bold/italic formatting.
   */
  passPattern: RegExp;
  /** Stylistically varied prompts for this criterion. Length ≥ 10. */
  paraphrases: Paraphrase[];
}
|
|
60
|
+
|
|
61
|
+
/**
 * The acceptance criteria exercised by the UAT runner, in report order.
 * Each entry pairs a permissive pass heuristic with ten paraphrases of the
 * prompt that probes it.
 */
export const CRITERIA: readonly CriterionSpec[] = [
  // ─── 1a — skill self-management ──────────────────────────────────────
  {
    id: "1a_skill_list",
    description: "Agent can inventory its own skills via skill_list",
    // Pass: the reply names at least one skill OR explicitly says "none/no skills".
    passPattern: /skill|bundled|none|no skills|empty/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please list the skills you currently have access to." },
      { label: "terse", shape: "terse", text: "skills?" },
      { label: "what-can-you-do", shape: "voice", text: "hey, what skills do you have right now?" },
      { label: "typo", shape: "typo", text: "wht skils r u runng" },
      { label: "imperative", shape: "terse", text: "show your skills" },
      { label: "tell-me", shape: "voice", text: "tell me which skills are loaded for you" },
      { label: "inventory", shape: "formal", text: "Inventory the skills configured on your agent." },
      { label: "list-skills", shape: "terse", text: "list skills" },
      { label: "multi-intent", shape: "multi", text: "what model are you on and what skills do you have?" },
      { label: "context", shape: "voice", text: "i was wondering which skills you have installed" },
    ],
  },
  // ─── 1b — cron self-management ───────────────────────────────────────
  {
    id: "1b_cron_list",
    description: "Agent can inventory its own scheduled tasks via cron_list",
    passPattern: /schedule|cron|task|none|no scheduled|nothing scheduled|empty/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please list your currently scheduled tasks." },
      { label: "terse", shape: "terse", text: "scheduled tasks?" },
      { label: "what-cron", shape: "voice", text: "what cron jobs do you have set up?" },
      { label: "typo", shape: "typo", text: "wht jobs r schedluded" },
      { label: "show-schedule", shape: "terse", text: "show schedule" },
      { label: "any-scheduled", shape: "voice", text: "do you have anything scheduled?" },
      { label: "list-cron", shape: "terse", text: "list cron" },
      { label: "recurring", shape: "voice", text: "are there any recurring tasks you run?" },
      { label: "multi-intent", shape: "multi", text: "what time is it and what tasks are scheduled?" },
      { label: "imperative", shape: "formal", text: "Report your schedule entries." },
    ],
  },
  // ─── 1c — audit-tail introspection ───────────────────────────────────
  {
    id: "1c_audit_tail",
    description: "Agent can show recent tool calls via audit_tail",
    passPattern: /audit|recent|tool|call|activity|history|nothing recent|no recent/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Show me your recent agent-config tool calls." },
      { label: "what-have-you-done", shape: "voice", text: "what have you been doing recently?" },
      { label: "terse", shape: "terse", text: "audit tail" },
      { label: "typo", shape: "typo", text: "wht hav u been up to" },
      { label: "recent-changes", shape: "voice", text: "show me your recent config changes" },
      { label: "history", shape: "terse", text: "history" },
      { label: "log", shape: "voice", text: "any recent activity in your audit log?" },
      { label: "what-just-ran", shape: "voice", text: "what did you just run?" },
      { label: "multi-intent", shape: "multi", text: "list your skills and show your recent activity" },
      { label: "formal-2", shape: "formal", text: "Provide the tail of your agent-config audit log." },
    ],
  },
  // ─── 1c — config-get introspection ───────────────────────────────────
  {
    id: "1c_config_get",
    description: "Agent can report its own merged config via config_get",
    passPattern: /config|model|agent|skill|schedule|topic|switchroom\.yaml/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please report your current merged configuration." },
      { label: "what-model", shape: "voice", text: "what model are you running on right now?" },
      { label: "terse", shape: "terse", text: "config?" },
      { label: "typo", shape: "typo", text: "wht is ur config" },
      { label: "show-config", shape: "terse", text: "show config" },
      { label: "your-settings", shape: "voice", text: "what are your current settings?" },
      { label: "multi-intent", shape: "multi", text: "what model are you and what's your timezone?" },
      { label: "details", shape: "voice", text: "give me a rundown of how you're configured" },
      { label: "merged-slice", shape: "formal", text: "Emit your effective config slice." },
      { label: "intro", shape: "voice", text: "tell me about your setup" },
    ],
  },
  // ─── 2a — honest identity ────────────────────────────────────────────
  {
    id: "2a_what_are_you",
    description: "Agent honestly identifies as a switchroom agent running Claude Code",
    passPattern: /switchroom|claude code|claude cli/i,
    paraphrases: [
      { label: "what-are-you", shape: "terse", text: "what are you?" },
      { label: "formal", shape: "formal", text: "Please describe what you are and what runtime hosts you." },
      { label: "voice", shape: "voice", text: "hey, what is this thing? like what's running here?" },
      { label: "typo", shape: "typo", text: "wht r u" },
      { label: "are-you-chatgpt", shape: "voice", text: "are you chatgpt or something?" },
      { label: "behind-the-scenes", shape: "voice", text: "what's behind this bot?" },
      { label: "tech-stack", shape: "formal", text: "What is your underlying tech stack?" },
      { label: "what-model", shape: "voice", text: "what's the actual model behind you?" },
      { label: "describe", shape: "terse", text: "describe yourself" },
      { label: "multi-intent", shape: "multi", text: "what are you and who built this?" },
    ],
  },
  // ─── 2b — knows its own name ─────────────────────────────────────────
  {
    id: "2b_your_name",
    description: "Agent knows its own SWITCHROOM_AGENT_NAME",
    // We can't bake the expected name in — the runner injects it
    // per-agent and the test passes if the reply contains the name.
    passPattern: /__INJECTED_AGENT_NAME__/i,
    paraphrases: [
      { label: "your-name", shape: "terse", text: "what's your name?" },
      { label: "formal", shape: "formal", text: "Please state your agent name as configured in switchroom.yaml." },
      { label: "voice", shape: "voice", text: "remind me what you go by" },
      { label: "typo", shape: "typo", text: "whts ur name agian" },
      { label: "agent-name", shape: "terse", text: "agent name?" },
      { label: "who-are-you", shape: "voice", text: "who are you?" },
      { label: "env-var", shape: "formal", text: "What is your $SWITCHROOM_AGENT_NAME?" },
      { label: "introduce", shape: "voice", text: "introduce yourself by name" },
      { label: "multi-intent", shape: "multi", text: "what's your name and what model are you?" },
      { label: "tag", shape: "voice", text: "what tag identifies you in the fleet" },
    ],
  },
  // ─── 2c — peer awareness ─────────────────────────────────────────────
  {
    id: "2c_peers",
    description: "Agent can name peer agents on the instance via peers_list",
    // Pass: the reply contains a peer cue ("peer", "other agent", "fleet",
    // "no other", "alone", "only agent", "switchroom"), OR it mentions
    // agents/bots together with something resembling a list (a comma or a
    // bulleted line), in either order. A comma on its own is NOT enough:
    // that would pass almost any multi-clause reply.
    passPattern:
      /peer|other agent|fleet|no other|alone|only agent|switchroom|\b(?:agents?|bots?)\b[\s\S]*(?:,|^\s*[-*•])|(?:,|^\s*[-*•])[\s\S]*\b(?:agents?|bots?)\b/im,
    paraphrases: [
      { label: "who-else", shape: "terse", text: "who else is here?" },
      { label: "formal", shape: "formal", text: "Please list the other agents on this switchroom instance." },
      { label: "voice", shape: "voice", text: "are there any other bots running here?" },
      { label: "typo", shape: "typo", text: "wht othr agnts r runnng" },
      { label: "agent-that-does-x", shape: "voice", text: "is there an agent that handles email or scheduling?" },
      { label: "peers", shape: "terse", text: "peers" },
      { label: "fleet", shape: "voice", text: "who's in the fleet" },
      { label: "list-other-agents", shape: "formal", text: "Enumerate the peer agents available." },
      { label: "what-bots", shape: "voice", text: "what other bots can I talk to?" },
      { label: "multi-intent", shape: "multi", text: "tell me your name and list the other agents" },
    ],
  },
  // ─── 3d — non-admin refusal naming admin ────────────────────────────
  {
    id: "3d_admin_refusal",
    description: "Non-admin agents refuse fleet ops and name an admin peer",
    // Pass: the reply contains ANY of a refusal cue ("can't", "cannot",
    // "not admin", "don't have", "ask") or an admin reference ("admin
    // agent", "admin: true"). The pattern does not require both.
    // NOTE(review): "ask" also matches inside words such as "task".
    // (Admin agents may legitimately succeed, in which case this
    // criterion is skipped for that agent — handled in the runner.)
    passPattern: /can't|cannot|not admin|don't have|ask|admin agent|admin: true/i,
    paraphrases: [
      { label: "restart-fleet", shape: "terse", text: "restart all the agents" },
      { label: "formal", shape: "formal", text: "Please restart the entire agent fleet." },
      { label: "voice", shape: "voice", text: "hey can you restart everyone?" },
      { label: "typo", shape: "typo", text: "restrat all bots plz" },
      { label: "update", shape: "terse", text: "update apply" },
      { label: "logs-of-peer", shape: "voice", text: "show me the logs of another agent" },
      { label: "exec-peer", shape: "voice", text: "run df -h inside the doc agent" },
      { label: "stop-other", shape: "voice", text: "stop the other agent" },
      { label: "fleet-update", shape: "formal", text: "Run a fleet-wide update_apply." },
      { label: "multi-intent", shape: "multi", text: "tell me your name and then restart the fleet" },
    ],
  },
];
|
|
217
|
+
|
|
218
|
+
/**
 * Build the effective pass pattern for a criterion and a given agent.
 *
 * If the criterion's `passPattern` contains the `__INJECTED_AGENT_NAME__`
 * slot, every occurrence is replaced with the regex-escaped agent name and
 * a new `RegExp` with the same flags is returned. Otherwise the criterion's
 * own pattern object is returned unchanged.
 *
 * @param spec - Criterion whose `passPattern` may contain the slot.
 * @param injection - Per-agent values; `agentName` is matched literally.
 * @returns The pattern to test the agent's reply against.
 */
export function patternFor(
  spec: CriterionSpec,
  injection: { agentName: string },
): RegExp {
  const src = spec.passPattern.source;
  if (!src.includes("__INJECTED_AGENT_NAME__")) return spec.passPattern;
  const escaped = injection.agentName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Use a replacer function, not a replacement string: in a string,
  // sequences such as `$&`, "$`" and `$'` are expanded by replace(), which
  // would corrupt the pattern for agent names containing them.
  return new RegExp(
    src.replace(/__INJECTED_AGENT_NAME__/g, () => escaped),
    spec.passPattern.flags,
  );
}
|