switchroom 0.8.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/README.md +54 -61
  2. package/bin/timezone-hook.sh +9 -7
  3. package/dist/agent-scheduler/index.js +285 -45
  4. package/dist/auth-broker/index.js +13932 -0
  5. package/dist/cli/drive-write-pretool.mjs +5418 -0
  6. package/dist/cli/switchroom.js +8890 -5560
  7. package/dist/host-control/main.js +582 -43
  8. package/dist/vault/approvals/kernel-server.js +276 -47
  9. package/dist/vault/broker/server.js +333 -69
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +6 -4
  16. package/profiles/_base/start.sh.hbs +3 -3
  17. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  18. package/profiles/default/CLAUDE.md +10 -0
  19. package/profiles/default/CLAUDE.md.hbs +16 -0
  20. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  21. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  22. package/skills/buildkite-api/SKILL.md +31 -8
  23. package/skills/buildkite-cli/SKILL.md +27 -9
  24. package/skills/buildkite-migration/SKILL.md +22 -9
  25. package/skills/buildkite-pipelines/SKILL.md +26 -9
  26. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  27. package/skills/buildkite-test-engine/SKILL.md +25 -8
  28. package/skills/docx/SKILL.md +1 -1
  29. package/skills/file-bug/SKILL.md +34 -6
  30. package/skills/humanizer/SKILL.md +15 -0
  31. package/skills/humanizer-calibrate/SKILL.md +7 -1
  32. package/skills/mcp-builder/SKILL.md +1 -1
  33. package/skills/pdf/SKILL.md +1 -1
  34. package/skills/pptx/SKILL.md +1 -1
  35. package/skills/skill-creator/SKILL.md +21 -1
  36. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  37. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  38. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  39. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  40. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  41. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  42. package/skills/switchroom-cli/SKILL.md +63 -64
  43. package/skills/switchroom-health/SKILL.md +23 -10
  44. package/skills/switchroom-install/SKILL.md +3 -3
  45. package/skills/switchroom-manage/SKILL.md +26 -19
  46. package/skills/switchroom-runtime/SKILL.md +67 -15
  47. package/skills/switchroom-status/SKILL.md +26 -1
  48. package/skills/telegram-test-harness/SKILL.md +3 -0
  49. package/skills/webapp-testing/SKILL.md +31 -1
  50. package/skills/xlsx/SKILL.md +1 -1
  51. package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
  52. package/telegram-plugin/admin-commands/index.ts +9 -5
  53. package/telegram-plugin/auth-snapshot-format.ts +612 -0
  54. package/telegram-plugin/auto-fallback-fleet.ts +215 -0
  55. package/telegram-plugin/auto-fallback.ts +28 -301
  56. package/telegram-plugin/dist/gateway/gateway.js +17453 -15100
  57. package/telegram-plugin/fleet-fallback-gate.ts +105 -0
  58. package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
  59. package/telegram-plugin/gateway/approval-callback.ts +31 -3
  60. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  61. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  62. package/telegram-plugin/gateway/auth-command.ts +905 -0
  63. package/telegram-plugin/gateway/auth-line.ts +123 -0
  64. package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
  65. package/telegram-plugin/gateway/boot-card.ts +23 -37
  66. package/telegram-plugin/gateway/boot-probes.ts +9 -12
  67. package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
  68. package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
  69. package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
  70. package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
  71. package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
  72. package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
  73. package/telegram-plugin/gateway/gateway.ts +1156 -938
  74. package/telegram-plugin/gateway/hostd-dispatch.ts +244 -0
  75. package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
  76. package/telegram-plugin/gateway/ipc-server.ts +69 -0
  77. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
  78. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  79. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  80. package/telegram-plugin/model-unavailable.ts +28 -12
  81. package/telegram-plugin/permission-title.ts +56 -0
  82. package/telegram-plugin/quota-check.ts +19 -41
  83. package/telegram-plugin/scripts/build.mjs +0 -1
  84. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  85. package/telegram-plugin/silence-poke.ts +153 -1
  86. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  87. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  88. package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
  89. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  90. package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
  91. package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
  92. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
  93. package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
  94. package/telegram-plugin/tests/boot-probes.test.ts +27 -22
  95. package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
  96. package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
  97. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  98. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  99. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
  100. package/telegram-plugin/tests/silence-poke.test.ts +237 -0
  101. package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
  102. package/telegram-plugin/turn-flush-safety.ts +55 -1
  103. package/telegram-plugin/uat/SETUP.md +35 -1
  104. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  105. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  106. package/telegram-plugin/uat/runners/report.ts +150 -0
  107. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  108. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  109. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  110. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  111. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  112. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
  113. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
  114. package/telegram-plugin/auth-dashboard.ts +0 -1104
  115. package/telegram-plugin/auth-slot-parser.ts +0 -497
  116. package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
  117. package/telegram-plugin/dist/foreman/foreman.js +0 -31358
  118. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  119. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  120. package/telegram-plugin/foreman/foreman.ts +0 -1165
  121. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  122. package/telegram-plugin/foreman/setup-state.ts +0 -239
  123. package/telegram-plugin/foreman/state.ts +0 -203
  124. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  125. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  126. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  127. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  128. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  129. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  130. package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
  131. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  132. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  133. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  134. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  135. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  136. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  137. package/telegram-plugin/tests/setup-state.test.ts +0 -146
@@ -57,6 +57,7 @@ export type FlushDecision =
57
57
  export type FlushSkipReason =
58
58
  | 'flag-disabled'
59
59
  | 'reply-called'
60
+ | 'reply-called-no-new-text'
60
61
  | 'no-inbound-chat'
61
62
  | 'empty-text'
62
63
  | 'silent-marker'
@@ -71,10 +72,33 @@ export interface FlushDecisionInput {
71
72
  /** Raw text content blocks accumulated from assistant events across the
72
73
  * turn. Joined + trimmed internally. */
73
74
  capturedText: string[]
75
+ /** Snapshot of `capturedText.length` at the moment of the most recent
76
+ * reply / stream_reply tool call in this turn. Indices
77
+ * `[capturedTextLenAtLastReply, capturedText.length)` are the post-reply tail
78
+ * — substantive content the model emitted AFTER the reply (e.g. soft
79
+ * commit "on it, back in a few" followed by the real answer in
80
+ * terminal text only, the #1291 repro). When the tail meets
81
+ * `replyCalledTailMinChars` we flush it; otherwise we skip.
82
+ *
83
+ * Defaults to `capturedText.length` (treat all captured text as
84
+ * pre-reply, preserve the pre-#1291 behaviour where any reply tool
85
+ * call suppressed flush entirely) so callers that don't track the
86
+ * marker keep the old contract. */
87
+ capturedTextLenAtLastReply?: number
88
+ /** Minimum trimmed-tail length to qualify a post-reply tail flush.
89
+ * Defaults to `REPLY_CALLED_TAIL_MIN_CHARS` (40). Below this we skip
90
+ * with `reply-called-no-new-text` — typical for trailing markdown
91
+ * artifacts or a one-word afterthought. */
92
+ replyCalledTailMinChars?: number
74
93
  /** Feature flag — defaults to true. Pass `false` to force skip everywhere. */
75
94
  flushEnabled?: boolean
76
95
  }
77
96
 
97
+ /** Default minimum trimmed length for the post-reply tail to be flushed
98
+ * as a follow-up message. Below this we treat the tail as noise / artifact
99
+ * and skip silently. */
100
+ export const REPLY_CALLED_TAIL_MIN_CHARS = 40
101
+
78
102
  /**
79
103
  * Pure decision: should the gateway deterministically send the model's
80
104
  * captured assistant text at turn_end? Returns `{kind: 'flush', text}` with
@@ -82,11 +106,41 @@ export interface FlushDecisionInput {
82
106
  *
83
107
  * Ordering of checks is deliberate: cheapest/strongest first so logs
84
108
  * attribute a skip to the most specific cause.
109
+ *
110
+ * #1291 — when `replyCalled` is true we no longer suppress unconditionally.
111
+ * The model may have emitted a soft-commit reply ("on it, back in a few")
112
+ * followed by the real substantive answer in terminal text only. Using
113
+ * `capturedTextLenAtLastReply` we isolate the post-reply tail and flush
114
+ * it if it's substantive enough; otherwise we skip with
115
+ * `reply-called-no-new-text` (logged) or `reply-called` (silent, no tail).
85
116
  */
86
117
  export function decideTurnFlush(input: FlushDecisionInput): FlushDecision {
87
118
  const flushEnabled = input.flushEnabled !== false
88
119
  if (!flushEnabled) return { kind: 'skip', reason: 'flag-disabled' }
89
- if (input.replyCalled) return { kind: 'skip', reason: 'reply-called' }
120
+
121
+ if (input.replyCalled) {
122
+ const tailIdx = input.capturedTextLenAtLastReply ?? input.capturedText.length
123
+ const tail = input.capturedText.slice(tailIdx).join('\n').trim()
124
+ const minChars = input.replyCalledTailMinChars ?? REPLY_CALLED_TAIL_MIN_CHARS
125
+ if (tail.length === 0) {
126
+ // The reply tool was called and nothing of substance came after —
127
+ // the turn is fully served by the reply. Skip silently (the gateway
128
+ // WARN gate excludes this reason from logs).
129
+ return { kind: 'skip', reason: 'reply-called' }
130
+ }
131
+ if (tail.length < minChars) {
132
+ // Post-reply tail exists but is below the substantive-content
133
+ // threshold — typically trailing markdown artifacts or a one-word
134
+ // afterthought. Skip but with a distinct reason so this case IS
135
+ // logged (auditable for #1291 regressions, vs the silent
136
+ // 'reply-called' which is the expected steady state).
137
+ return { kind: 'skip', reason: 'reply-called-no-new-text' }
138
+ }
139
+ if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
140
+ if (isSilentFlushMarker(tail)) return { kind: 'skip', reason: 'silent-marker' }
141
+ return { kind: 'flush', text: tail }
142
+ }
143
+
90
144
  if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
91
145
  const joined = input.capturedText.join('\n').trim()
92
146
  if (joined.length === 0) return { kind: 'skip', reason: 'empty-text' }
@@ -297,7 +297,41 @@ as a long-lived secret.
297
297
  When all three are checked, the env block above + `bun run test:uat`
298
298
  is safe to run.
299
299
 
300
- ## 8. Port allocator vs unix sockets (Phase 1 scaffold note)
300
+ ## 8. CI gate: the `ci-uat` GitHub Actions workflow
301
+
302
+ Since the GHA gate landed (replacing the original Buildkite gate),
303
+ the fuzz subset of scenarios (`fuzz-random-prompts-dm.test.ts`,
304
+ `fuzz-extended-dm.test.ts`, `fuzz-human-style-dm.test.ts`) runs
305
+ automatically on every PR that touches `telegram-plugin/`,
306
+ `src/agents/`, or `telegram-plugin/uat/`.
307
+
308
+ The workflow (`.github/workflows/ci-uat.yml`) runs on a self-hosted
309
+ GHA runner labelled `[self-hosted, uat-host]` that lives on the
310
+ same box as the `test-harness` agent. Gating: the `UAT_GATE_ENABLED`
311
+ repository variable must be `true` AND the four Telegram secrets
312
+ (`TELEGRAM_API_ID`, `TELEGRAM_API_HASH`, `TELEGRAM_UAT_DRIVER_SESSION`,
313
+ `TELEGRAM_TEST_BOT_USERNAME`) must be present as GitHub Actions
314
+ secrets. The workflow's header docstring covers agent setup + secret
315
+ rotation.
316
+
317
+ **Scope (CI):**
318
+
319
+ | Scenario | In CI? | Why |
320
+ |---|---|---|
321
+ | `fuzz-random-prompts-dm` | ✅ gates PRs | JTBD-floor invariants; PR #1132. |
322
+ | `fuzz-extended-dm` | ✅ gates PRs | Second-pass categories; PR #1134. |
323
+ | `fuzz-human-style-dm` | ✅ gates PRs | Human-shape inbounds + meaningful-reply floor. |
324
+ | `silent-end-recovery-dm` | ❌ local only | Passes, but the 5-min worst-case budget makes it costly to run every PR. Run nightly + ad-hoc. |
325
+ | `jtbd-status-query-dm` | ❌ local only | Passes; defer to a follow-up that batches the cheap JTBD scenarios. |
326
+ | `jtbd-soft-commit-dm` | ❌ local only | Already budget-tuned but real-Telegram timing flake risk; defer until we have flake telemetry. |
327
+ | `jtbd-interrupt-marker-dm` | ❌ `describe.skip` | Suspected real bug per #1132 overnight. Investigate before unskipping. |
328
+ | `jtbd-rapid-followup-dm` | ❌ `describe.skip` | Suspected real classification bug per #1132 overnight. Investigate before unskipping. |
329
+ | vault / secret-redaction / voice / location / reactions / progress-card | ❌ local only | Need specific surfaces / config overrides not wired into the gate yet. |
330
+
331
+ A local `bun run test:uat` runs the full include glob minus the two
332
+ `describe.skip`'d JTBDs.
333
+
334
+ ## 9. Port allocator vs unix sockets (Phase 1 scaffold note)
301
335
 
302
336
  The Phase 1 `port-allocator.ts` is held in reserve for Phase 2b's
303
337
  child-process flow — Phase 2a (standard-runtime agent) doesn't need
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Agent-self-sufficiency UAT runner.
4
+ *
5
+ * Drives a real Telegram user-account against the live agent fleet to
6
+ * verify the four acceptance criteria from the
7
+ * "agent-self-sufficiency" goal:
8
+ *
9
+ * 1. Self-management (skill_list, cron_list, audit_tail, config_get)
10
+ * 2. Identity awareness (honest self-ID, knows its name, knows peers)
11
+ * 3. Admin surface (non-admin refusal naming the admin agent)
12
+ * — admin reads (3a/3b) are covered by the hostd vitest suite
13
+ * rather than live fuzz, because they require a docker stub.
14
+ * 4. The fuzzy UAT IS this runner.
15
+ *
16
+ * Usage:
17
+ *
18
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \
19
+ * --agent klanker:@klanker_bot \
20
+ * --agent scribe:@scribe_bot \
21
+ * --agent doc:@doc_bot \
22
+ * --admin-agent klanker \
23
+ * --report ./uat-report.md
24
+ *
25
+ * # OR — discover from env (CI-friendly):
26
+ * UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \
27
+ * UAT_ADMIN_AGENTS="klanker" \
28
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
29
+ *
30
+ * Auth env (same as the existing uat harness — see
31
+ * telegram-plugin/uat/SETUP.md):
32
+ *
33
+ * TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
34
+ *
35
+ * **Why a user-account session, not bot tokens.** The acceptance-
36
+ * criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
37
+ * Telegram's Bot API forbids bots from reading other bots' messages
38
+ * (https://core.telegram.org/bots/faq) — a bot can send to another
39
+ * bot's chat but can't observe the reply. The only way to drive the
40
+ * fleet AND capture every agent's reply is an mtcute user-account
41
+ * session, which is what the existing telegram-plugin/uat harness
42
+ * uses. This runner inherits that machinery wholesale; the env-var
43
+ * rename is forced by the platform, not a design choice.
44
+ *
45
+ * Missing creds fail loud, not silent — the goal explicitly demands
46
+ * no silent skips on missing UAT credentials.
47
+ */
48
+
49
+ import { writeFileSync } from "node:fs";
50
+ import { Driver, type ObservedMessage } from "../driver.js";
51
+ import { loadUatEnv } from "../load-env.js";
52
+ import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
53
+ import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
54
+ import { renderMarkdown } from "./report.js";
55
+
56
+ loadUatEnv();
57
+
58
+ // ─── CLI / env parsing ─────────────────────────────────────────────────────
59
+
60
/** One agent under test, as named by `--agent` or the `UAT_FLEET` env var. */
interface AgentTarget {
  /** Agent name: the part before the colon in `name:@bot`. Admin names are
   *  matched against this value. */
  name: string;
  /** Bot username: the part after the colon. It is resolved to a bot user id
   *  before any case is run. */
  botUsername: string;
  /** Set when the agent was listed via `--admin-agent` / `UAT_ADMIN_AGENTS`.
   *  The `3d_admin_refusal` criterion is skipped for such agents. */
  admin: boolean;
}
65
+
66
/** Fully resolved runner settings, as produced by `parseCli`. */
interface CliConfig {
  /** Agents to exercise, one entry per distinct agent name. */
  agents: AgentTarget[];
  /** Destination of the Markdown report. */
  reportPath: string;
  /** Destination of the JSON sidecar holding every case result. */
  jsonPath: string;
  /** How long to wait for a reply to each prompt, in milliseconds.
   *  Defaults to 60000 (60s). */
  replyTimeoutMs: number;
  /** Pause after each case, in milliseconds. Defaults to 4000 (4s). Keeps
   *  sending under Telegram's global outbound rate cap and gives the agent
   *  time to finish its previous turn before the next inbound arrives. */
  settleMs: number;
}
77
+
78
/**
 * Build the runner configuration from environment variables and CLI flags.
 *
 * Environment values (`UAT_FLEET`, `UAT_ADMIN_AGENTS`, `UAT_REPORT`,
 * `UAT_REPORT_JSON`, `UAT_REPLY_TIMEOUT_MS`, `UAT_SETTLE_MS`) are read first;
 * flags in `argv` are applied afterwards and override them. An agent named by
 * both sources keeps the bot username given on the command line.
 *
 * Terminates via `fail` on an unknown `--flag`, a flag with no value, a
 * malformed `--agent`, an invalid timeout/settle value, or when no agent ends
 * up targeted. `--help` / `-h` prints usage and exits 0. Targeting fewer than
 * three agents, or naming an admin that is not a targeted agent, only emits a
 * warning on stderr.
 *
 * @param argv Arguments after the script path (`process.argv.slice(2)`).
 * @returns The resolved configuration.
 */
function parseCli(argv: readonly string[]): CliConfig {
  const agents = new Map<string, AgentTarget>();
  const adminSet = new Set<string>();
  let reportPath = process.env.UAT_REPORT ?? "./uat-agent-self-sufficiency.md";
  let jsonPath = process.env.UAT_REPORT_JSON ?? "./uat-agent-self-sufficiency.json";
  // Parsed leniently here and validated once, after flags have had the chance
  // to override them, so a bad env value that a flag replaces is not fatal.
  let replyTimeoutMs = Number.parseInt(process.env.UAT_REPLY_TIMEOUT_MS ?? "60000", 10);
  let settleMs = Number.parseInt(process.env.UAT_SETTLE_MS ?? "4000", 10);

  const envFleet = process.env.UAT_FLEET;
  if (envFleet) {
    for (const tok of envFleet.split(",")) {
      const [name, bot] = tok.split(":").map((s) => s.trim());
      if (name && bot) agents.set(name, { name, botUsername: bot, admin: false });
    }
  }
  const envAdmin = process.env.UAT_ADMIN_AGENTS;
  if (envAdmin) {
    for (const tok of envAdmin.split(",")) {
      const name = tok.trim();
      if (name) adminSet.add(name);
    }
  }

  for (let i = 0; i < argv.length; i++) {
    const tok = argv[i]!;
    // Consumes the token following the current flag as its value.
    const next = (): string => {
      const v = argv[++i];
      if (!v) fail(`${tok}: missing value`);
      return v;
    };
    switch (tok) {
      case "--agent": {
        const v = next();
        const [name, bot] = v.split(":").map((s) => s.trim());
        if (!name || !bot)
          fail(`--agent expects "<name>:@<bot-username>"; got "${v}"`);
        agents.set(name, { name, botUsername: bot, admin: false });
        break;
      }
      case "--admin-agent": {
        adminSet.add(next());
        break;
      }
      case "--report":
        reportPath = next();
        break;
      case "--json":
        jsonPath = next();
        break;
      case "--reply-timeout-ms":
        replyTimeoutMs = Number.parseInt(next(), 10);
        break;
      case "--settle-ms":
        settleMs = Number.parseInt(next(), 10);
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        if (tok.startsWith("--")) fail(`unknown flag: ${tok}`);
    }
  }

  // `Number.parseInt` returns NaN for non-numeric input. A NaN or non-positive
  // reply timeout puts the per-case deadline at or before "now", so every case
  // would be recorded as `timeout` without waiting for a reply. Reject bad
  // values loudly instead of producing a misleading report.
  if (!Number.isInteger(replyTimeoutMs) || replyTimeoutMs <= 0) {
    fail(
      `reply timeout must be a positive integer number of ms (--reply-timeout-ms / UAT_REPLY_TIMEOUT_MS); got ${replyTimeoutMs}`,
    );
  }
  if (!Number.isInteger(settleMs) || settleMs < 0) {
    fail(
      `settle delay must be a non-negative integer number of ms (--settle-ms / UAT_SETTLE_MS); got ${settleMs}`,
    );
  }

  for (const name of adminSet) {
    const target = agents.get(name);
    if (target) {
      target.admin = true;
    } else {
      // A mistyped admin name would otherwise leave the real admin agent
      // unmarked with no hint as to why.
      process.stderr.write(
        `[uat] WARNING: admin agent "${name}" is not among the targeted agents; ignoring.\n`,
      );
    }
  }

  if (agents.size === 0) {
    fail(
      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
    );
  }
  if (agents.size < 3) {
    process.stderr.write(
      `[uat] WARNING: only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.\n`,
    );
  }

  return {
    agents: [...agents.values()],
    reportPath,
    jsonPath,
    replyTimeoutMs,
    settleMs,
  };
}
167
+
168
/**
 * Report a fatal usage or configuration problem and stop the process.
 *
 * Writes `msg` to stderr with the `[uat]` prefix, then exits with status 2.
 * Never returns.
 */
function fail(msg: string): never {
  const line = `[uat] ${msg}\n`;
  process.stderr.write(line);
  return process.exit(2);
}
172
+
173
+ function printHelp(): void {
174
+ process.stdout.write(`agent-self-sufficiency UAT runner
175
+
176
+ Required env (or fail loud):
177
+ TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
178
+
179
+ Flags:
180
+ --agent NAME:@BOT Add an agent target. Repeatable.
181
+ --admin-agent NAME Mark NAME as admin: true (skips 3d for that agent).
182
+ --report PATH Markdown report path. Default ./uat-agent-self-sufficiency.md
183
+ --json PATH JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json
184
+ --reply-timeout-ms N Per-case timeout. Default 60000.
185
+ --settle-ms N Inter-message settle. Default 4000.
186
+
187
+ Env equivalents:
188
+ UAT_FLEET="name1:@bot1,name2:@bot2,..."
189
+ UAT_ADMIN_AGENTS="name1,name2"
190
+ UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS
191
+ `);
192
+ }
193
+
194
+ // ─── Driver wrapper: send + observe ─────────────────────────────────────────
195
+
196
/** Result of one send-and-wait exchange with an agent. */
interface ReplyOutcome {
  /** Trimmed text of the most recent reply observed; empty when no reply was
   *  seen or the send failed. */
  reply: string;
  /** Scorer verdict for the reply, or `"timeout"` / `"error"` when there was
   *  nothing to score. */
  outcome: Outcome;
  /** Elapsed time for the whole exchange, in milliseconds. */
  durationMs: number;
  /** Set only when sending the prompt failed. */
  errorMessage?: string;
}
202
+
203
+ /**
204
+ * Send one inbound to the agent and wait for a meaningful reply.
205
+ *
206
+ * We subscribe to the chat's message stream BEFORE sending so we don't
207
+ * miss the bot's reply if it lands faster than we can start observing
208
+ * (yes, this happens). Then:
209
+ *
210
+ * 1. Send the inbound.
211
+ * 2. Consume the stream until we see the first non-empty bot message
212
+ * with messageId > our sent.messageId. That's the reply head.
213
+ * 3. Continue consuming for an "edit window" (fixed at 3s) to
214
+ * absorb any edits the gateway makes to its first chunk (stream-
215
+ * reply pattern: bot sends "thinking…" then edits with the final
216
+ * answer). The final post-edit text is what we score.
217
+ * 4. Bail out with `timeout` if we never see a head.
218
+ */
219
+ async function sendAndScore(
220
+ driver: Driver,
221
+ botUserId: number,
222
+ driverUserId: number,
223
+ spec: CriterionSpec,
224
+ prompt: string,
225
+ agentName: string,
226
+ timeoutMs: number,
227
+ ): Promise<ReplyOutcome> {
228
+ const startedAt = Date.now();
229
+ // Start observing FIRST so we don't race the bot's reply.
230
+ const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();
231
+
232
+ let sentMessageId: number;
233
+ try {
234
+ const sent = await driver.sendText(botUserId, prompt);
235
+ sentMessageId = sent.messageId;
236
+ } catch (err) {
237
+ try {
238
+ await stream.return?.(undefined);
239
+ } catch {
240
+ /* ignore */
241
+ }
242
+ return {
243
+ reply: "",
244
+ outcome: "error",
245
+ durationMs: Date.now() - startedAt,
246
+ errorMessage: `send failed: ${(err as Error).message}`,
247
+ };
248
+ }
249
+
250
+ const deadline = startedAt + timeoutMs;
251
+ const EDIT_WINDOW_MS = 3000;
252
+ let headSeenAt = 0;
253
+ let replyMessageId = 0;
254
+ let replyText = "";
255
+
256
+ try {
257
+ while (Date.now() < deadline) {
258
+ const remaining = deadline - Date.now();
259
+ const winSize = headSeenAt
260
+ ? Math.max(0, EDIT_WINDOW_MS - (Date.now() - headSeenAt))
261
+ : remaining;
262
+ if (headSeenAt && winSize === 0) break;
263
+ const slice = await pullOneWithTimeout(stream, Math.min(remaining, Math.max(250, winSize)));
264
+ if (slice === "timeout") {
265
+ if (headSeenAt) break; // edit window elapsed
266
+ continue;
267
+ }
268
+ if (slice === "done") break;
269
+ const m: ObservedMessage = slice;
270
+ if (m.senderUserId === driverUserId) continue;
271
+ if (m.messageId <= sentMessageId) continue;
272
+ const t = (m.text ?? "").trim();
273
+ if (!t) continue;
274
+ // Either this is the head, or it's an edit/replacement of the
275
+ // bot's reply. Track the most recent.
276
+ replyMessageId = m.messageId;
277
+ replyText = t;
278
+ if (!headSeenAt) headSeenAt = Date.now();
279
+ }
280
+ } finally {
281
+ try {
282
+ await stream.return?.(undefined);
283
+ } catch {
284
+ /* ignore */
285
+ }
286
+ }
287
+
288
+ const durationMs = Date.now() - startedAt;
289
+ if (!replyMessageId) {
290
+ return { reply: "", outcome: "timeout", durationMs };
291
+ }
292
+ const outcome = scoreReply(spec, replyText, { agentName });
293
+ return { reply: replyText, outcome, durationMs };
294
+ }
295
+
296
/**
 * `next()` calls that were still outstanding when a pull timed out, keyed by
 * iterator. Reusing them on the following pull stops a late-arriving item
 * from being consumed by an orphaned promise and silently discarded.
 */
const inFlightPulls = new WeakMap<
  AsyncIterator<ObservedMessage>,
  Promise<IteratorResult<ObservedMessage>>
>();

/**
 * Race the next stream item against a timeout.
 *
 * At most one `next()` is outstanding per iterator: if an earlier call timed
 * out, this call waits on that same pending `next()` rather than issuing a
 * second one, so no item is lost across a timeout. Not intended for
 * concurrent calls on the same iterator.
 *
 * @param it Stream to pull from.
 * @param ms Maximum time to wait, in milliseconds.
 * @returns The next item; `"timeout"` if none arrived within `ms`; or `"done"`
 *   if the iterator finished or its `next()` rejected (the rejection reason is
 *   not surfaced). `done` is rare in practice — the observer doesn't naturally
 *   close until we tell it to.
 */
async function pullOneWithTimeout(
  it: AsyncIterator<ObservedMessage>,
  ms: number,
): Promise<ObservedMessage | "timeout" | "done"> {
  let pending = inFlightPulls.get(it);
  if (pending === undefined) {
    pending = it.next();
    inFlightPulls.set(it, pending);
  }
  const pull = pending;

  return new Promise((resolve) => {
    let settled = false;
    const timer = setTimeout(() => {
      if (settled) return;
      settled = true;
      // Leave `pull` registered: the next call picks it up.
      resolve("timeout");
    }, ms);
    pull.then(
      (r) => {
        // Timed out already — the result stays cached for the next call.
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        inFlightPulls.delete(it);
        resolve(r.done ? "done" : r.value);
      },
      () => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        inFlightPulls.delete(it);
        resolve("done");
      },
    );
  });
}
329
+
330
+ // ─── Main orchestration ─────────────────────────────────────────────────────
331
+
332
+ async function main(): Promise<void> {
333
+ const cli = parseCli(process.argv.slice(2));
334
+
335
+ // Hard-fail on missing UAT creds — goal: never silently skip.
336
+ const apiId = Number.parseInt(process.env.TELEGRAM_API_ID ?? "", 10);
337
+ if (!Number.isFinite(apiId)) {
338
+ fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
339
+ }
340
+ const apiHash = process.env.TELEGRAM_API_HASH ?? "";
341
+ if (!apiHash) fail("TELEGRAM_API_HASH missing — see SETUP.md");
342
+ const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
343
+ if (!session)
344
+ fail(
345
+ "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
346
+ );
347
+
348
+ process.stdout.write(
349
+ `[uat] connecting to Telegram as the UAT driver account...\n`,
350
+ );
351
+ const driver = new Driver({ apiId, apiHash, session });
352
+ await driver.connect();
353
+ const driverUserId = await driver.getMyUserId();
354
+ process.stdout.write(`[uat] driver user_id=${driverUserId}\n`);
355
+
356
+ // Resolve every agent's bot user_id up front so a missing username
357
+ // fails before we waste any time on the run.
358
+ const resolved: { target: AgentTarget; botUserId: number }[] = [];
359
+ for (const a of cli.agents) {
360
+ try {
361
+ const id = await driver.resolveBotUserId(a.botUsername);
362
+ resolved.push({ target: a, botUserId: id });
363
+ process.stdout.write(
364
+ `[uat] resolved ${a.name} ${a.botUsername} → bot_user_id=${id}` +
365
+ (a.admin ? " (admin)" : "") +
366
+ "\n",
367
+ );
368
+ } catch (err) {
369
+ process.stderr.write(
370
+ `[uat] FAILED to resolve ${a.botUsername} for agent ${a.name}: ${(err as Error).message}\n`,
371
+ );
372
+ process.exit(3);
373
+ }
374
+ }
375
+
376
+ // Run!
377
+ const startedAt = new Date();
378
+ const t0 = Date.now();
379
+ const results: CaseResult[] = [];
380
+
381
+ for (const { target, botUserId } of resolved) {
382
+ process.stdout.write(`\n[uat] ─── agent: ${target.name} ─────────────\n`);
383
+ for (const spec of CRITERIA) {
384
+ // Skip 3d (non-admin refusal) on admin agents — they're legitimately
385
+ // capable of those operations, so a "I can't" reply would be wrong.
386
+ if (spec.id === "3d_admin_refusal" && target.admin) {
387
+ process.stdout.write(
388
+ `[uat] skip ${spec.id} on ${target.name} (admin: true)\n`,
389
+ );
390
+ continue;
391
+ }
392
+
393
+ for (const para of spec.paraphrases) {
394
+ const r = await sendAndScore(
395
+ driver,
396
+ botUserId,
397
+ driverUserId,
398
+ spec,
399
+ para.text,
400
+ target.name,
401
+ cli.replyTimeoutMs,
402
+ );
403
+ const tag =
404
+ r.outcome === "pass" ? "✓" : r.outcome === "fail" ? "✗" : "·";
405
+ process.stdout.write(
406
+ `[uat] ${tag} ${spec.id}/${para.label} (${r.outcome}, ${r.durationMs}ms)\n`,
407
+ );
408
+ results.push({
409
+ agent: target.name,
410
+ criterion: spec.id,
411
+ paraphrase: para,
412
+ outcome: r.outcome,
413
+ reply: r.reply,
414
+ durationMs: r.durationMs,
415
+ ...(r.errorMessage ? { errorMessage: r.errorMessage } : {}),
416
+ });
417
+ // Inter-message settle: keep below Telegram's user-account
418
+ // outbound cap and let the agent finish its prior turn.
419
+ await new Promise((res) => setTimeout(res, cli.settleMs));
420
+ }
421
+ }
422
+ }
423
+
424
+ const durationSeconds = (Date.now() - t0) / 1000;
425
+ await driver.disconnect().catch(() => undefined);
426
+
427
+ const md = renderMarkdown(results, {
428
+ startedAt,
429
+ durationSeconds,
430
+ agents: resolved.map((r) => r.target.name),
431
+ });
432
+ writeFileSync(cli.reportPath, md, "utf-8");
433
+ writeFileSync(
434
+ cli.jsonPath,
435
+ JSON.stringify(
436
+ { startedAt: startedAt.toISOString(), durationSeconds, results },
437
+ null,
438
+ 2,
439
+ ),
440
+ "utf-8",
441
+ );
442
+ process.stdout.write(`\n[uat] report → ${cli.reportPath}\n`);
443
+ process.stdout.write(`[uat] json → ${cli.jsonPath}\n`);
444
+
445
+ const passes = results.filter((r) => r.outcome === "pass").length;
446
+ process.stdout.write(
447
+ `[uat] overall: ${passes}/${results.length} passed (${results.length > 0 ? ((passes / results.length) * 100).toFixed(1) : "0"}%)\n`,
448
+ );
449
+
450
+ // Exit non-zero unless every recorded case passed (timeouts and errors count as failures), so the runner is CI-actionable.
451
+ process.exit(passes === results.length ? 0 : 1);
452
+ }
453
+
454
+ main().catch((err) => {
455
+ process.stderr.write(`[uat] FATAL: ${(err as Error).stack ?? err}\n`);
456
+ process.exit(4);
457
+ });