switchroom 0.8.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/README.md +49 -57
  2. package/bin/timezone-hook.sh +9 -7
  3. package/dist/agent-scheduler/index.js +285 -45
  4. package/dist/auth-broker/index.js +13932 -0
  5. package/dist/cli/switchroom.js +15931 -12778
  6. package/dist/host-control/main.js +582 -43
  7. package/dist/vault/approvals/kernel-server.js +276 -47
  8. package/dist/vault/broker/server.js +333 -69
  9. package/examples/minimal.yaml +63 -0
  10. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  11. package/examples/personal-google-workspace-mcp/README.md +194 -0
  12. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  13. package/examples/switchroom.yaml +220 -0
  14. package/package.json +6 -4
  15. package/profiles/_base/start.sh.hbs +3 -3
  16. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  17. package/profiles/default/CLAUDE.md +10 -0
  18. package/profiles/default/CLAUDE.md.hbs +16 -0
  19. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  20. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  21. package/skills/buildkite-api/SKILL.md +31 -8
  22. package/skills/buildkite-cli/SKILL.md +27 -9
  23. package/skills/buildkite-migration/SKILL.md +22 -9
  24. package/skills/buildkite-pipelines/SKILL.md +26 -9
  25. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  26. package/skills/buildkite-test-engine/SKILL.md +25 -8
  27. package/skills/docx/SKILL.md +1 -1
  28. package/skills/file-bug/SKILL.md +34 -6
  29. package/skills/humanizer/SKILL.md +15 -0
  30. package/skills/humanizer-calibrate/SKILL.md +7 -1
  31. package/skills/mcp-builder/SKILL.md +1 -1
  32. package/skills/pdf/SKILL.md +1 -1
  33. package/skills/pptx/SKILL.md +1 -1
  34. package/skills/skill-creator/SKILL.md +21 -1
  35. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  36. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  37. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  38. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  39. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  40. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  41. package/skills/switchroom-cli/SKILL.md +63 -64
  42. package/skills/switchroom-health/SKILL.md +23 -10
  43. package/skills/switchroom-install/SKILL.md +3 -3
  44. package/skills/switchroom-manage/SKILL.md +26 -19
  45. package/skills/switchroom-runtime/SKILL.md +67 -15
  46. package/skills/switchroom-status/SKILL.md +26 -1
  47. package/skills/telegram-test-harness/SKILL.md +3 -0
  48. package/skills/webapp-testing/SKILL.md +31 -1
  49. package/skills/xlsx/SKILL.md +1 -1
  50. package/telegram-plugin/admin-commands/index.ts +7 -5
  51. package/telegram-plugin/dist/gateway/gateway.js +13042 -12844
  52. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  53. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  54. package/telegram-plugin/gateway/auth-command.ts +794 -0
  55. package/telegram-plugin/gateway/auth-line.ts +123 -0
  56. package/telegram-plugin/gateway/boot-card.ts +22 -36
  57. package/telegram-plugin/gateway/boot-probes.ts +3 -3
  58. package/telegram-plugin/gateway/gateway.ts +313 -798
  59. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  60. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  61. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  62. package/telegram-plugin/permission-title.ts +56 -0
  63. package/telegram-plugin/quota-check.ts +19 -41
  64. package/telegram-plugin/scripts/build.mjs +0 -1
  65. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  66. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  67. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  68. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  69. package/telegram-plugin/tests/boot-probes.test.ts +11 -4
  70. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  71. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  72. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  73. package/telegram-plugin/uat/SETUP.md +31 -1
  74. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  75. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  76. package/telegram-plugin/uat/runners/report.ts +150 -0
  77. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  78. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  79. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  80. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  81. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  82. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
  83. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
  84. package/telegram-plugin/auth-dashboard.ts +0 -1104
  85. package/telegram-plugin/auth-slot-parser.ts +0 -497
  86. package/telegram-plugin/dist/foreman/foreman.js +0 -31358
  87. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  88. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  89. package/telegram-plugin/foreman/foreman.ts +0 -1165
  90. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  91. package/telegram-plugin/foreman/setup-state.ts +0 -239
  92. package/telegram-plugin/foreman/state.ts +0 -203
  93. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  94. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  95. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  96. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  97. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  98. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  99. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  100. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  101. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  102. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  103. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  104. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  105. package/telegram-plugin/tests/setup-state.test.ts +0 -146
@@ -103,4 +103,35 @@ describe('summarizeToolForTitle (#186)', () => {
103
103
  const input = JSON.stringify({ skill: 'mail', name: 'wrong' })
104
104
  expect(summarizeToolForTitle('Skill', input)).toBe('Skill (mail)')
105
105
  })
106
+
107
+ test('MCP curated: agent-config tools render as human verb-phrases (#1215)', () => {
108
+ expect(summarizeToolForTitle('mcp__agent-config__skill_list', undefined)).toBe(
109
+ 'List its own installed skills',
110
+ )
111
+ expect(summarizeToolForTitle('mcp__agent-config__cron_list', undefined)).toBe(
112
+ 'List its own scheduled tasks',
113
+ )
114
+ expect(summarizeToolForTitle('mcp__agent-config__peers_list', undefined)).toBe(
115
+ 'List the other agents on this instance',
116
+ )
117
+ })
118
+
119
+ test('MCP curated: hostd tools render as human verb-phrases (#1215)', () => {
120
+ expect(summarizeToolForTitle('mcp__hostd__agent_logs', undefined)).toBe(
121
+ "Read another agent's container logs",
122
+ )
123
+ expect(summarizeToolForTitle('mcp__hostd__agent_exec', undefined)).toBe(
124
+ 'Run a read-only inspection inside another agent',
125
+ )
126
+ })
127
+
128
+ test('MCP fallback: unknown mcp tool renders as `<server>: <verb with spaces>`', () => {
129
+ expect(summarizeToolForTitle('mcp__some-server__do_thing', undefined)).toBe(
130
+ 'some-server: do thing',
131
+ )
132
+ })
133
+
134
+ test('MCP malformed: bare mcp__ prefix without __<server>__<verb> shape is left alone', () => {
135
+ expect(summarizeToolForTitle('mcp__bad', undefined)).toBe('mcp__bad')
136
+ })
106
137
  })
@@ -380,41 +380,11 @@ describe('fetchAccountQuota — cache + token resolution', () => {
380
380
  }
381
381
  })
382
382
 
383
- it('persists the snapshot under the supplied home, not the real homedir (issue #708 regression)', async () => {
384
- const home = makeAccountHome({
385
- 'work@example.com': { accessToken: 'tok' },
386
- })
387
- const fakeFetch = async () =>
388
- new Response('{}', {
389
- status: 200,
390
- headers: {
391
- 'anthropic-ratelimit-unified-5h-utilization': '0.42',
392
- 'anthropic-ratelimit-unified-7d-utilization': '0.17',
393
- },
394
- })
395
- try {
396
- const r = await fetchAccountQuota('work@example.com', {
397
- home,
398
- fetchImpl: fakeFetch as typeof fetch,
399
- })
400
- expect(r.ok).toBe(true)
401
- const snapPath = join(
402
- home,
403
- '.switchroom',
404
- 'accounts',
405
- 'work@example.com',
406
- 'quota.json',
407
- )
408
- // The bug: writeAccountQuota was called without opts.home, so the
409
- // snapshot landed under the real $HOME instead of the test home.
410
- expect(existsSync(snapPath)).toBe(true)
411
- const snap = JSON.parse(readFileSync(snapPath, 'utf-8'))
412
- expect(snap.fiveHourPct).toBeCloseTo(42, 0)
413
- expect(snap.sevenDayPct).toBeCloseTo(17, 0)
414
- } finally {
415
- rmSync(home, { recursive: true, force: true })
416
- }
417
- })
383
+ // Removed in RFC H: per-account quota.json disk persistence is gone.
384
+ // switchroom-auth-broker holds canonical quota state and exposes it
385
+ // via list-state; the gateway's in-process cache is enough between
386
+ // restarts (and the broker survives gateway restarts, so the state
387
+ // is preserved at the broker side anyway).
418
388
  })
419
389
 
420
390
  describe('getCachedAccountQuota + prefetchAccountQuotaIfStale', () => {
@@ -297,7 +297,37 @@ as a long-lived secret.
297
297
  When all three are checked, the env block above + `bun run test:uat`
298
298
  is safe to run.
299
299
 
300
- ## 8. Port allocator vs unix sockets (Phase 1 scaffold note)
300
+ ## 8. CI gate `:robot: UAT fuzz` Buildkite step
301
+
302
+ Since the Buildkite gate landed, the fuzz subset of scenarios
303
+ (`fuzz-random-prompts-dm.test.ts`, `fuzz-extended-dm.test.ts`,
304
+ `fuzz-human-style-dm.test.ts`) runs automatically on every PR that
305
+ touches `telegram-plugin/`, `src/agents/`, or `telegram-plugin/uat/`.
306
+
307
+ The step runs on a self-hosted Buildkite agent tagged
308
+ `queue=uat-host` that lives on the same box as the `test-harness`
309
+ agent. Secrets come from the Buildkite cluster secret store, not
310
+ from the local vault. See `.buildkite/README.md` § "UAT fuzz step" for
311
+ agent setup + secret rotation.
312
+
313
+ **Scope (CI):**
314
+
315
+ | Scenario | In CI? | Why |
316
+ |---|---|---|
317
+ | `fuzz-random-prompts-dm` | ✅ gates PRs | JTBD-floor invariants; PR #1132. |
318
+ | `fuzz-extended-dm` | ✅ gates PRs | Second-pass categories; PR #1134. |
319
+ | `fuzz-human-style-dm` | ✅ gates PRs | Human-shape inbounds + meaningful-reply floor. |
320
+ | `silent-end-recovery-dm` | ❌ local only | Passes, but the 5-min worst-case budget makes it costly to run on every PR. Run nightly + ad-hoc. |
321
+ | `jtbd-status-query-dm` | ❌ local only | Passes; defer to a follow-up that batches the cheap JTBD scenarios. |
322
+ | `jtbd-soft-commit-dm` | ❌ local only | Already budget-tuned but real-Telegram timing flake risk; defer until we have flake telemetry. |
323
+ | `jtbd-interrupt-marker-dm` | ❌ `describe.skip` | Suspected real bug per #1132 overnight. Investigate before unskipping. |
324
+ | `jtbd-rapid-followup-dm` | ❌ `describe.skip` | Suspected real classification bug per #1132 overnight. Investigate before unskipping. |
325
+ | vault / secret-redaction / voice / location / reactions / progress-card | ❌ local only | Need specific surfaces / config overrides not wired into the gate yet. |
326
+
327
+ A local `bun run test:uat` runs the full include glob minus the two
328
+ `describe.skip`'d JTBDs.
329
+
330
+ ## 9. Port allocator vs unix sockets (Phase 1 scaffold note)
301
331
 
302
332
  The Phase 1 `port-allocator.ts` is held in reserve for Phase 2b's
303
333
  child-process flow — Phase 2a (standard-runtime agent) doesn't need
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Agent-self-sufficiency UAT runner.
4
+ *
5
+ * Drives a real Telegram user-account against the live agent fleet to
6
+ * verify the four acceptance criteria from the
7
+ * "agent-self-sufficiency" goal:
8
+ *
9
+ * 1. Self-management (skill_list, cron_list, audit_tail, config_get)
10
+ * 2. Identity awareness (honest self-ID, knows its name, knows peers)
11
+ * 3. Admin surface (non-admin refusal naming the admin agent)
12
+ * — admin reads (3a/3b) are covered by the hostd vitest suite
13
+ * rather than live fuzz, because they require a docker stub.
14
+ * 4. The fuzzy UAT IS this runner.
15
+ *
16
+ * Usage:
17
+ *
18
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \
19
+ * --agent klanker:@klanker_bot \
20
+ * --agent scribe:@scribe_bot \
21
+ * --agent doc:@doc_bot \
22
+ * --admin-agent klanker \
23
+ * --report ./uat-report.md
24
+ *
25
+ * # OR — discover from env (CI-friendly):
26
+ * UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \
27
+ * UAT_ADMIN_AGENTS="klanker" \
28
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
29
+ *
30
+ * Auth env (same as the existing uat harness — see
31
+ * telegram-plugin/uat/SETUP.md):
32
+ *
33
+ * TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
34
+ *
35
+ * **Why a user-account session, not bot tokens.** The acceptance-
36
+ * criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
37
+ * Telegram's Bot API forbids bots from reading other bots' messages
38
+ * (https://core.telegram.org/bots/faq) — a bot can send to another
39
+ * bot's chat but can't observe the reply. The only way to drive the
40
+ * fleet AND capture every agent's reply is an mtcute user-account
41
+ * session, which is what the existing telegram-plugin/uat harness
42
+ * uses. This runner inherits that machinery wholesale; the env-var
43
+ * rename is forced by the platform, not a design choice.
44
+ *
45
+ * Missing creds fail loud, not silent — the goal explicitly demands
46
+ * no silent skips on missing UAT credentials.
47
+ */
48
+
49
+ import { writeFileSync } from "node:fs";
50
+ import { Driver, type ObservedMessage } from "../driver.js";
51
+ import { loadUatEnv } from "../load-env.js";
52
+ import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
53
+ import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
54
+ import { renderMarkdown } from "./report.js";
55
+
56
// Runs at module load, before main()/parseCli() read process.env.
// NOTE(review): presumably populates the UAT_* / TELEGRAM_* variables
// from a local env file — confirm against ../load-env.js.
+ loadUatEnv();
57
+
58
+ // ─── CLI / env parsing ─────────────────────────────────────────────────────
59
+
60
/** One agent under test, as named on the command line or in `UAT_FLEET`. */
interface AgentTarget {
  /** Agent name; used as the map key in parseCli and as the label in logs and results. */
  name: string;
  /** Bot username from the `<name>:@<bot-username>` pair; resolved to a user id before the run. */
  botUsername: string;
  /** True when listed via `--admin-agent` / `UAT_ADMIN_AGENTS`; such agents skip `3d_admin_refusal`. */
  admin: boolean;
}
65
+
66
/** Fully-resolved runner configuration produced by parseCli. */
interface CliConfig {
  /** Agents to exercise, in insertion order (env entries first, then flags). */
  agents: AgentTarget[];
  /** Destination of the Markdown report. */
  reportPath: string;
  /** Destination of the JSON sidecar holding every case result. */
  jsonPath: string;
  /** How long to wait for a reply to a single case, in milliseconds (default 60s). */
  replyTimeoutMs: number;
  /**
   * Pause between consecutive cases, in milliseconds (default 4s). It keeps
   * the driver under Telegram's global outbound rate cap and lets the agent
   * finish its previous turn before the next inbound arrives.
   */
  settleMs: number;
}
77
+
78
/**
 * Build the runner configuration from the environment and CLI flags.
 *
 * Environment variables seed the configuration first (`UAT_FLEET`,
 * `UAT_ADMIN_AGENTS`, `UAT_REPORT`, `UAT_REPORT_JSON`,
 * `UAT_REPLY_TIMEOUT_MS`, `UAT_SETTLE_MS`). Flags are applied afterwards,
 * so a flag overrides its env equivalent while `--agent` and
 * `--admin-agent` extend the env-provided sets. An env variable that is
 * set but blank is treated as unset.
 *
 * Terminates the process through `fail()` (exit status 2) on an unknown
 * flag, a flag without a value, a malformed `--agent` value, an invalid
 * millisecond value, or when no agent was configured. `--help` / `-h`
 * prints the usage text and exits 0.
 *
 * @param argv Arguments following the script path (`process.argv.slice(2)`).
 * @returns The resolved configuration; it always contains at least one agent.
 */
function parseCli(argv: readonly string[]): CliConfig {
  const agents = new Map<string, AgentTarget>();
  const adminSet = new Set<string>();

  // Blank env values (e.g. `UAT_REPORT=` from a CI template) fall back to
  // the default instead of producing an empty path or a NaN duration.
  const envOr = (key: string, fallback: string): string => {
    const raw = process.env[key]?.trim();
    return raw ? raw : fallback;
  };

  // parseInt yields NaN for garbage. Left unchecked, a NaN reply timeout
  // makes every case time out immediately and a NaN settle removes the
  // inter-message pause, so reject bad values up front.
  const parseMs = (raw: string, source: string, min: number): number => {
    const value = Number.parseInt(raw, 10);
    if (Number.isNaN(value) || value < min) {
      fail(`${source}: expected an integer >= ${min} (milliseconds); got "${raw}"`);
    }
    return value;
  };

  // Split "<name>:@<bot-username>"; undefined when either half is missing.
  const splitTarget = (spec: string): AgentTarget | undefined => {
    const [name, bot] = spec.split(":").map((s) => s.trim());
    return name && bot ? { name, botUsername: bot, admin: false } : undefined;
  };

  let reportPath = envOr("UAT_REPORT", "./uat-agent-self-sufficiency.md");
  let jsonPath = envOr("UAT_REPORT_JSON", "./uat-agent-self-sufficiency.json");
  let replyTimeoutMs = parseMs(envOr("UAT_REPLY_TIMEOUT_MS", "60000"), "UAT_REPLY_TIMEOUT_MS", 1);
  let settleMs = parseMs(envOr("UAT_SETTLE_MS", "4000"), "UAT_SETTLE_MS", 0);

  for (const tok of (process.env.UAT_FLEET ?? "").split(",")) {
    const entry = tok.trim();
    if (!entry) continue; // tolerate trailing or doubled commas
    const target = splitTarget(entry);
    if (target) {
      agents.set(target.name, target);
    } else {
      // Dropping an agent silently would shrink the run without anyone noticing.
      process.stderr.write(
        `[uat] WARNING: ignoring malformed UAT_FLEET entry "${entry}" (expected "<name>:@<bot-username>")\n`,
      );
    }
  }

  for (const tok of (process.env.UAT_ADMIN_AGENTS ?? "").split(",")) {
    const name = tok.trim();
    if (name) adminSet.add(name);
  }

  for (let i = 0; i < argv.length; i++) {
    const tok = argv[i]!;
    // Consume the value that follows the current flag.
    const next = (): string => {
      const v = argv[++i];
      if (!v) fail(`${tok}: missing value`);
      return v;
    };
    switch (tok) {
      case "--agent": {
        const v = next();
        const target = splitTarget(v);
        if (!target) fail(`--agent expects "<name>:@<bot-username>"; got "${v}"`);
        agents.set(target.name, target);
        break;
      }
      case "--admin-agent": {
        adminSet.add(next());
        break;
      }
      case "--report":
        reportPath = next();
        break;
      case "--json":
        jsonPath = next();
        break;
      case "--reply-timeout-ms":
        replyTimeoutMs = parseMs(next(), tok, 1);
        break;
      case "--settle-ms":
        settleMs = parseMs(next(), tok, 0);
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        // Bare positional arguments are ignored; only unknown flags are errors.
        if (tok.startsWith("--")) fail(`unknown flag: ${tok}`);
    }
  }

  for (const name of adminSet) {
    const t = agents.get(name);
    if (t) {
      t.admin = true;
    } else {
      // A typo here would make the real admin agent run the refusal check.
      process.stderr.write(
        `[uat] WARNING: admin agent "${name}" does not match any targeted agent; ignoring.\n`,
      );
    }
  }

  if (agents.size === 0) {
    fail(
      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
    );
  }
  if (agents.size < 3) {
    process.stderr.write(
      `[uat] WARNING: only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.\n`,
    );
  }

  return {
    agents: [...agents.values()],
    reportPath,
    jsonPath,
    replyTimeoutMs,
    settleMs,
  };
}
167
+
168
/**
 * Report a fatal usage or configuration problem and stop the process.
 *
 * Writes `msg` to stderr with the `[uat]` prefix, then exits with status 2.
 * Never returns.
 */
function fail(msg: string): never {
  const line = `[uat] ${msg}\n`;
  process.stderr.write(line);
  return process.exit(2);
}
172
+
173
/**
 * Write the usage text (required env, flags, env equivalents) to stdout.
 * Only prints; it does not exit — the `--help` / `-h` branch of parseCli
 * calls process.exit(0) afterwards.
 */
+ function printHelp(): void {
174
+ process.stdout.write(`agent-self-sufficiency UAT runner
175
+
176
+ Required env (or fail loud):
177
+ TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
178
+
179
+ Flags:
180
+ --agent NAME:@BOT Add an agent target. Repeatable.
181
+ --admin-agent NAME Mark NAME as admin: true (skips 3d for that agent).
182
+ --report PATH Markdown report path. Default ./uat-agent-self-sufficiency.md
183
+ --json PATH JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json
184
+ --reply-timeout-ms N Per-case timeout. Default 60000.
185
+ --settle-ms N Inter-message settle. Default 4000.
186
+
187
+ Env equivalents:
188
+ UAT_FLEET="name1:@bot1,name2:@bot2,..."
189
+ UAT_ADMIN_AGENTS="name1,name2"
190
+ UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS
191
+ `);
192
+ }
193
+
194
+ // ─── Driver wrapper: send + observe ─────────────────────────────────────────
195
+
196
/** Result of sending one prompt to an agent and observing its reply. */
interface ReplyOutcome {
  /** Trimmed text of the last observed reply message; empty when none was seen. */
  reply: string;
  /** Score from scoreReply, or `"timeout"` / `"error"` when no reply could be scored. */
  outcome: Outcome;
  /** Wall-clock time from just before sending until the result was produced, in ms. */
  durationMs: number;
  /** Failure detail; set only on the error path. */
  errorMessage?: string;
}
202
+
203
/**
 * Send one inbound to the agent and wait for a meaningful reply.
 *
 * The chat's message stream is opened BEFORE sending so the bot's reply
 * cannot land before we start observing. Then:
 *
 *   1. Send the inbound.
 *   2. Consume the stream until the first non-empty message that is not
 *      from the driver and has messageId > the sent messageId. That is
 *      the reply head.
 *   3. Keep consuming for an edit window (3s, measured from the head) to
 *      absorb edits or follow-up chunks (stream-reply pattern: the bot
 *      sends "thinking…" and then edits in the final answer). The most
 *      recent text is what gets scored.
 *   4. Report `timeout` if no head is seen before the deadline.
 *
 * Failures are reported, not thrown: a failed send yields `outcome:
 * "error"` with a `send failed:` message, and a message stream that
 * rejects before any reply was captured yields `outcome: "error"` with a
 * `stream failed:` message (instead of being mislabelled as a timeout).
 * If the stream fails after a reply was captured, that reply is still
 * scored.
 *
 * @param driver Connected UAT driver used to send and observe.
 * @param botUserId User id of the bot being exercised.
 * @param driverUserId User id of the driver account; its own messages are ignored.
 * @param spec Criterion the reply is scored against.
 * @param prompt Text to send.
 * @param agentName Agent name handed to the scorer.
 * @param timeoutMs Overall budget for this case, measured from just before the send.
 * @returns The reply text (possibly empty), its outcome and the elapsed time.
 */
async function sendAndScore(
  driver: Driver,
  botUserId: number,
  driverUserId: number,
  spec: CriterionSpec,
  prompt: string,
  agentName: string,
  timeoutMs: number,
): Promise<ReplyOutcome> {
  const startedAt = Date.now();
  // Start observing FIRST so we don't race the bot's reply.
  const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();

  // Observer teardown is best-effort: a failure to close must not mask
  // the outcome of the case itself.
  const closeStream = async (): Promise<void> => {
    try {
      await stream.return?.(undefined);
    } catch {
      /* ignore */
    }
  };

  let sentMessageId: number;
  try {
    const sent = await driver.sendText(botUserId, prompt);
    sentMessageId = sent.messageId;
  } catch (err) {
    await closeStream();
    return {
      reply: "",
      outcome: "error",
      durationMs: Date.now() - startedAt,
      errorMessage: `send failed: ${(err as Error).message}`,
    };
  }

  const deadline = startedAt + timeoutMs;
  const EDIT_WINDOW_MS = 3000;
  let headSeenAt = 0;
  let replyMessageId = 0;
  let replyText = "";
  // Rejections from the message stream, captured so they can be reported.
  const streamErrors: unknown[] = [];

  try {
    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      // Before the head: wait out the whole budget. After the head: only
      // whatever is left of the edit window.
      const winSize = headSeenAt
        ? Math.max(0, EDIT_WINDOW_MS - (Date.now() - headSeenAt))
        : remaining;
      if (headSeenAt && winSize === 0) break;
      const slice = await pullOneWithTimeout(
        stream,
        Math.min(remaining, Math.max(250, winSize)),
        (err) => {
          streamErrors.push(err);
        },
      );
      if (slice === "timeout") {
        if (headSeenAt) break; // edit window elapsed
        continue;
      }
      if (slice === "done") break;
      const m: ObservedMessage = slice;
      if (m.senderUserId === driverUserId) continue; // our own outbound
      if (m.messageId <= sentMessageId) continue; // predates this prompt
      const t = (m.text ?? "").trim();
      if (!t) continue;
      // Either this is the head, or it's an edit/replacement of the
      // bot's reply. Track the most recent.
      replyMessageId = m.messageId;
      replyText = t;
      if (!headSeenAt) headSeenAt = Date.now();
    }
  } finally {
    await closeStream();
  }

  const durationMs = Date.now() - startedAt;
  if (!replyMessageId) {
    if (streamErrors.length > 0) {
      const cause = streamErrors[0];
      const detail = cause instanceof Error ? cause.message : String(cause);
      return {
        reply: "",
        outcome: "error",
        durationMs,
        errorMessage: `stream failed: ${detail}`,
      };
    }
    return { reply: "", outcome: "timeout", durationMs };
  }
  const outcome = scoreReply(spec, replyText, { agentName });
  return { reply: replyText, outcome, durationMs };
}

/**
 * Race the next stream item against a timeout.
 *
 * Resolves with the item, or with the literal `"timeout"` / `"done"`
 * sentinels; it never rejects. `"done"` covers both a finished stream
 * and a stream whose `next()` rejected. Pass `onError` to tell the two
 * apart: it is called with the rejection reason before `"done"` is
 * returned. A rejection that arrives after the timeout already fired is
 * ignored.
 *
 * Note that a timed-out `next()` stays pending on the iterator; callers
 * should stop pulling (or close the iterator) after a `"timeout"` they
 * do not intend to retry.
 *
 * @param it Iterator to pull from.
 * @param ms Maximum time to wait, in milliseconds.
 * @param onError Optional observer for a rejected `next()`.
 */
async function pullOneWithTimeout(
  it: AsyncIterator<ObservedMessage>,
  ms: number,
  onError?: (err: unknown) => void,
): Promise<ObservedMessage | "timeout" | "done"> {
  return new Promise((resolve) => {
    let settled = false;
    const timer = setTimeout(() => {
      if (settled) return;
      settled = true;
      resolve("timeout");
    }, ms);
    it.next().then(
      (r) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        if (r.done) resolve("done");
        else resolve(r.value);
      },
      (err) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        onError?.(err);
        resolve("done");
      },
    );
  });
}
329
+
330
+ // ─── Main orchestration ─────────────────────────────────────────────────────
331
+
332
/**
 * Run the whole UAT: parse configuration, connect the driver account,
 * resolve every bot, send every paraphrase of every criterion to every
 * agent, then write the Markdown report and the JSON sidecar.
 *
 * Exit status:
 *   0 — at least one case ran and every case passed
 *   1 — one or more cases did not pass, or no case ran at all
 *   2 — configuration / credential error (via `fail()`)
 *   3 — a bot username could not be resolved
 */
async function main(): Promise<void> {
  const cli = parseCli(process.argv.slice(2));

  // Hard-fail on missing UAT creds — goal: never silently skip.
  const apiId = Number.parseInt(process.env.TELEGRAM_API_ID ?? "", 10);
  if (!Number.isFinite(apiId)) {
    fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
  }
  const apiHash = process.env.TELEGRAM_API_HASH ?? "";
  if (!apiHash) fail("TELEGRAM_API_HASH missing — see SETUP.md");
  const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
  if (!session) {
    fail(
      "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
    );
  }

  process.stdout.write(
    `[uat] connecting to Telegram as the UAT driver account...\n`,
  );
  const driver = new Driver({ apiId, apiHash, session });
  await driver.connect();
  const driverUserId = await driver.getMyUserId();
  process.stdout.write(`[uat] driver user_id=${driverUserId}\n`);

  // Resolve every agent's bot user_id up front so a missing username
  // fails before we waste any time on the run.
  const resolved: { target: AgentTarget; botUserId: number }[] = [];
  for (const a of cli.agents) {
    try {
      const id = await driver.resolveBotUserId(a.botUsername);
      resolved.push({ target: a, botUserId: id });
      process.stdout.write(
        `[uat] resolved ${a.name} ${a.botUsername} → bot_user_id=${id}` +
          (a.admin ? " (admin)" : "") +
          "\n",
      );
    } catch (err) {
      process.stderr.write(
        `[uat] FAILED to resolve ${a.botUsername} for agent ${a.name}: ${(err as Error).message}\n`,
      );
      process.exit(3);
    }
  }

  // Run!
  const startedAt = new Date();
  const t0 = Date.now();
  const results: CaseResult[] = [];

  for (const { target, botUserId } of resolved) {
    process.stdout.write(`\n[uat] ─── agent: ${target.name} ─────────────\n`);
    for (const spec of CRITERIA) {
      // Skip 3d (non-admin refusal) on admin agents — they're legitimately
      // capable of those operations, so a "I can't" reply would be wrong.
      if (spec.id === "3d_admin_refusal" && target.admin) {
        process.stdout.write(
          `[uat] skip ${spec.id} on ${target.name} (admin: true)\n`,
        );
        continue;
      }

      // Cases run strictly one at a time: each reply is attributed to the
      // prompt that preceded it in the same chat.
      for (const para of spec.paraphrases) {
        const r = await sendAndScore(
          driver,
          botUserId,
          driverUserId,
          spec,
          para.text,
          target.name,
          cli.replyTimeoutMs,
        );
        const tag =
          r.outcome === "pass" ? "✓" : r.outcome === "fail" ? "✗" : "·";
        process.stdout.write(
          `[uat] ${tag} ${spec.id}/${para.label} (${r.outcome}, ${r.durationMs}ms)\n`,
        );
        results.push({
          agent: target.name,
          criterion: spec.id,
          paraphrase: para,
          outcome: r.outcome,
          reply: r.reply,
          durationMs: r.durationMs,
          ...(r.errorMessage ? { errorMessage: r.errorMessage } : {}),
        });
        // Inter-message settle: keep below Telegram's user-account
        // outbound cap and let the agent finish its prior turn.
        await new Promise((res) => setTimeout(res, cli.settleMs));
      }
    }
  }

  const durationSeconds = (Date.now() - t0) / 1000;
  // Disconnect is best-effort; the reports below matter more.
  await driver.disconnect().catch(() => undefined);

  const md = renderMarkdown(results, {
    startedAt,
    durationSeconds,
    agents: resolved.map((r) => r.target.name),
  });
  writeFileSync(cli.reportPath, md, "utf-8");
  writeFileSync(
    cli.jsonPath,
    JSON.stringify(
      { startedAt: startedAt.toISOString(), durationSeconds, results },
      null,
      2,
    ),
    "utf-8",
  );
  process.stdout.write(`\n[uat] report → ${cli.reportPath}\n`);
  process.stdout.write(`[uat] json → ${cli.jsonPath}\n`);

  const passes = results.filter((r) => r.outcome === "pass").length;
  process.stdout.write(
    `[uat] overall: ${passes}/${results.length} passed (${results.length > 0 ? ((passes / results.length) * 100).toFixed(1) : "0"}%)\n`,
  );

  // A run that executed nothing proves nothing: `passes === results.length`
  // is vacuously true for zero cases, so treat it as a failure rather than
  // letting CI go green on an empty run.
  if (results.length === 0) {
    process.stderr.write(
      "[uat] no cases were executed — treating the run as failed\n",
    );
    process.exit(1);
  }

  // Exit non-zero if anything failed, so the runner is CI-actionable.
  process.exit(passes === results.length ? 0 : 1);
}
453
+
454
// Last-resort handler: anything main() did not turn into a specific exit
// code is reported with its stack and mapped to exit status 4.
main().catch((err) => {
  // Optional chaining: a rejection with null/undefined must not throw
  // inside this handler, or the exit code below would be lost.
  process.stderr.write(`[uat] FATAL: ${(err as Error | undefined)?.stack ?? err}\n`);
  process.exit(4);
});