npm - switchroom - Versions diffs - 0.8.1 → 0.11.0 - Mend

switchroom 0.8.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137) hide show

package/README.md +54 -61
package/bin/timezone-hook.sh +9 -7
package/dist/agent-scheduler/index.js +285 -45
package/dist/auth-broker/index.js +13932 -0
package/dist/cli/drive-write-pretool.mjs +5418 -0
package/dist/cli/switchroom.js +8890 -5560
package/dist/host-control/main.js +582 -43
package/dist/vault/approvals/kernel-server.js +276 -47
package/dist/vault/broker/server.js +333 -69
package/examples/minimal.yaml +63 -0
package/examples/personal-google-workspace-mcp/.env.example +34 -0
package/examples/personal-google-workspace-mcp/README.md +194 -0
package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
package/examples/switchroom.yaml +220 -0
package/package.json +6 -4
package/profiles/_base/start.sh.hbs +3 -3
package/profiles/_shared/agent-self-service.md.hbs +126 -0
package/profiles/default/CLAUDE.md +10 -0
package/profiles/default/CLAUDE.md.hbs +16 -0
package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
package/skills/buildkite-agent-runtime/SKILL.md +44 -11
package/skills/buildkite-api/SKILL.md +31 -8
package/skills/buildkite-cli/SKILL.md +27 -9
package/skills/buildkite-migration/SKILL.md +22 -9
package/skills/buildkite-pipelines/SKILL.md +26 -9
package/skills/buildkite-secure-delivery/SKILL.md +23 -9
package/skills/buildkite-test-engine/SKILL.md +25 -8
package/skills/docx/SKILL.md +1 -1
package/skills/file-bug/SKILL.md +34 -6
package/skills/humanizer/SKILL.md +15 -0
package/skills/humanizer-calibrate/SKILL.md +7 -1
package/skills/mcp-builder/SKILL.md +1 -1
package/skills/pdf/SKILL.md +1 -1
package/skills/pptx/SKILL.md +1 -1
package/skills/skill-creator/SKILL.md +21 -1
package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
package/skills/switchroom-cli/SKILL.md +63 -64
package/skills/switchroom-health/SKILL.md +23 -10
package/skills/switchroom-install/SKILL.md +3 -3
package/skills/switchroom-manage/SKILL.md +26 -19
package/skills/switchroom-runtime/SKILL.md +67 -15
package/skills/switchroom-status/SKILL.md +26 -1
package/skills/telegram-test-harness/SKILL.md +3 -0
package/skills/webapp-testing/SKILL.md +31 -1
package/skills/xlsx/SKILL.md +1 -1
package/telegram-plugin/admin-commands/dispatch.test.ts +1 -1
package/telegram-plugin/admin-commands/index.ts +9 -5
package/telegram-plugin/auth-snapshot-format.ts +612 -0
package/telegram-plugin/auto-fallback-fleet.ts +215 -0
package/telegram-plugin/auto-fallback.ts +28 -301
package/telegram-plugin/dist/gateway/gateway.js +17453 -15100
package/telegram-plugin/fleet-fallback-gate.ts +105 -0
package/telegram-plugin/gateway/approval-callback.test.ts +104 -0
package/telegram-plugin/gateway/approval-callback.ts +31 -3
package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
package/telegram-plugin/gateway/auth-command.ts +905 -0
package/telegram-plugin/gateway/auth-line.ts +123 -0
package/telegram-plugin/gateway/auth-status-adapter.ts +101 -0
package/telegram-plugin/gateway/boot-card.ts +23 -37
package/telegram-plugin/gateway/boot-probes.ts +9 -12
package/telegram-plugin/gateway/diff-preview-card.test.ts +192 -0
package/telegram-plugin/gateway/diff-preview-card.ts +170 -0
package/telegram-plugin/gateway/drive-write-approval.test.ts +312 -0
package/telegram-plugin/gateway/drive-write-approval.ts +243 -0
package/telegram-plugin/gateway/folder-picker-handler.test.ts +314 -0
package/telegram-plugin/gateway/folder-picker-handler.ts +348 -0
package/telegram-plugin/gateway/gateway.ts +1156 -938
package/telegram-plugin/gateway/hostd-dispatch.ts +244 -0
package/telegram-plugin/gateway/ipc-protocol.ts +83 -2
package/telegram-plugin/gateway/ipc-server.ts +69 -0
package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +103 -12
package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
package/telegram-plugin/model-unavailable.ts +28 -12
package/telegram-plugin/permission-title.ts +56 -0
package/telegram-plugin/quota-check.ts +19 -41
package/telegram-plugin/scripts/build.mjs +0 -1
package/telegram-plugin/shared/bot-runtime.ts +5 -4
package/telegram-plugin/silence-poke.ts +153 -1
package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
package/telegram-plugin/tests/auth-command-format2.test.ts +156 -0
package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
package/telegram-plugin/tests/auth-snapshot-format.test.ts +429 -0
package/telegram-plugin/tests/auth-status-adapter.test.ts +129 -0
package/telegram-plugin/tests/auto-fallback-fleet.test.ts +211 -0
package/telegram-plugin/tests/auto-fallback.test.ts +60 -358
package/telegram-plugin/tests/boot-probes.test.ts +27 -22
package/telegram-plugin/tests/fleet-fallback-gate.test.ts +197 -0
package/telegram-plugin/tests/model-unavailable.test.ts +30 -5
package/telegram-plugin/tests/permission-title.test.ts +31 -0
package/telegram-plugin/tests/quota-check.test.ts +5 -35
package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +212 -2
package/telegram-plugin/tests/silence-poke.test.ts +237 -0
package/telegram-plugin/tests/turn-flush-safety.test.ts +112 -0
package/telegram-plugin/turn-flush-safety.ts +55 -1
package/telegram-plugin/uat/SETUP.md +35 -1
package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
package/telegram-plugin/uat/runners/report.ts +150 -0
package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
package/telegram-plugin/uat/runners/scorer.ts +106 -0
package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
package/telegram-plugin/auth-dashboard.ts +0 -1104
package/telegram-plugin/auth-slot-parser.ts +0 -497
package/telegram-plugin/auto-fallback-dispatcher.ts +0 -68
package/telegram-plugin/dist/foreman/foreman.js +0 -31358
package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
package/telegram-plugin/foreman/foreman.ts +0 -1165
package/telegram-plugin/foreman/setup-flow.ts +0 -345
package/telegram-plugin/foreman/setup-state.ts +0 -239
package/telegram-plugin/foreman/state.ts +0 -203
package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
package/telegram-plugin/tests/auto-fallback-dispatcher.e2e.test.ts +0 -183
package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
package/telegram-plugin/tests/foreman-state.test.ts +0 -164
package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
package/telegram-plugin/tests/setup-flow.test.ts +0 -510
package/telegram-plugin/tests/setup-state.test.ts +0 -146

package/telegram-plugin/turn-flush-safety.ts CHANGED Viewed

@@ -57,6 +57,7 @@ export type FlushDecision =
 export type FlushSkipReason =
   | 'flag-disabled'
   | 'reply-called'
+  | 'reply-called-no-new-text'
   | 'no-inbound-chat'
   | 'empty-text'
   | 'silent-marker'
@@ -71,10 +72,33 @@ export interface FlushDecisionInput {
   /** Raw text content blocks accumulated from assistant events across the
    * turn. Joined + trimmed internally. */
   capturedText: string[]
+  /** Snapshot of `capturedText.length` at the moment of the most recent
+   * reply / stream_reply tool call in this turn. Indices
+   * `[capturedTextLenAtLastReply, capturedText.length)` are the post-reply tail
+   * — substantive content the model emitted AFTER the reply (e.g. soft
+   * commit "on it, back in a few" followed by the real answer in
+   * terminal text only, the #1291 repro). When the tail meets
+   * `replyCalledTailMinChars` we flush it; otherwise we skip.
+   *
+   * Defaults to `capturedText.length` (treat all captured text as
+   * pre-reply, preserve the pre-#1291 behaviour where any reply tool
+   * call suppressed flush entirely) so callers that don't track the
+   * marker keep the old contract. */
+  capturedTextLenAtLastReply?: number
+  /** Minimum trimmed-tail length to qualify a post-reply tail flush.
+   * Defaults to `REPLY_CALLED_TAIL_MIN_CHARS` (40). Below this we skip
+   * with `reply-called-no-new-text` — typical for trailing markdown
+   * artifacts or a one-word afterthought. */
+  replyCalledTailMinChars?: number
   /** Feature flag — defaults to true. Pass `false` to force skip everywhere. */
   flushEnabled?: boolean
 }
+/** Default minimum trimmed length for the post-reply tail to be flushed
+ * as a follow-up message. Below this we treat the tail as noise / artifact
+ * and skip silently. */
+export const REPLY_CALLED_TAIL_MIN_CHARS = 40
 /**
  * Pure decision: should the gateway deterministically send the model's
  * captured assistant text at turn_end? Returns `{kind: 'flush', text}` with
@@ -82,11 +106,41 @@ export interface FlushDecisionInput {
  *
  * Ordering of checks is deliberate: cheapest/strongest first so logs
  * attribute a skip to the most specific cause.
+ *
+ * #1291 — when `replyCalled` is true we no longer suppress unconditionally.
+ * The model may have emitted a soft-commit reply ("on it, back in a few")
+ * followed by the real substantive answer in terminal text only. Using
+ * `capturedTextLenAtLastReply` we isolate the post-reply tail and flush
+ * it if it's substantive enough; otherwise we skip with
+ * `reply-called-no-new-text` (logged) or `reply-called` (silent, no tail).
  */
 export function decideTurnFlush(input: FlushDecisionInput): FlushDecision {
   const flushEnabled = input.flushEnabled !== false
   if (!flushEnabled) return { kind: 'skip', reason: 'flag-disabled' }
-  if (input.replyCalled) return { kind: 'skip', reason: 'reply-called' }
+  if (input.replyCalled) {
+    const tailIdx = input.capturedTextLenAtLastReply ?? input.capturedText.length
+    const tail = input.capturedText.slice(tailIdx).join('\n').trim()
+    const minChars = input.replyCalledTailMinChars ?? REPLY_CALLED_TAIL_MIN_CHARS
+    if (tail.length === 0) {
+      // The reply tool was called and nothing of substance came after —
+      // the turn is fully served by the reply. Skip silently (the gateway
+      // WARN gate excludes this reason from logs).
+      return { kind: 'skip', reason: 'reply-called' }
+    }
+    if (tail.length < minChars) {
+      // Post-reply tail exists but is below the substantive-content
+      // threshold — typically trailing markdown artifacts or a one-word
+      // afterthought. Skip but with a distinct reason so this case IS
+      // logged (auditable for #1291 regressions, vs the silent
+      // 'reply-called' which is the expected steady state).
+      return { kind: 'skip', reason: 'reply-called-no-new-text' }
+    }
+    if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
+    if (isSilentFlushMarker(tail)) return { kind: 'skip', reason: 'silent-marker' }
+    return { kind: 'flush', text: tail }
+  }
   if (input.chatId == null) return { kind: 'skip', reason: 'no-inbound-chat' }
   const joined = input.capturedText.join('\n').trim()
   if (joined.length === 0) return { kind: 'skip', reason: 'empty-text' }

package/telegram-plugin/uat/SETUP.md CHANGED Viewed

@@ -297,7 +297,41 @@ as a long-lived secret.
 When all three are checked, the env block above + `bun run test:uat`
 is safe to run.
-## 8. Port allocator vs unix sockets (Phase 1 scaffold note)
+## 8. CI gate — `ci-uat` GitHub Actions workflow
+Since the GHA gate landed (replacing the original Buildkite gate),
+the fuzz subset of scenarios (`fuzz-random-prompts-dm.test.ts`,
+`fuzz-extended-dm.test.ts`, `fuzz-human-style-dm.test.ts`) runs
+automatically on every PR that touches `telegram-plugin/`,
+`src/agents/`, or `telegram-plugin/uat/`.
+The workflow (`.github/workflows/ci-uat.yml`) runs on a self-hosted
+GHA runner labelled `[self-hosted, uat-host]` that lives on the
+same box as the `test-harness` agent. Gating: the `UAT_GATE_ENABLED`
+repository variable must be `true` AND the four Telegram secrets
+(`TELEGRAM_API_ID`, `TELEGRAM_API_HASH`, `TELEGRAM_UAT_DRIVER_SESSION`,
+`TELEGRAM_TEST_BOT_USERNAME`) must be present as GitHub Actions
+secrets. The workflow's header docstring covers agent setup + secret
+rotation.
+**Scope (CI):**
+| Scenario | In CI? | Why |
+|---|---|---|
+| `fuzz-random-prompts-dm` | ✅ gates PRs | JTBD-floor invariants; PR #1132. |
+| `fuzz-extended-dm` | ✅ gates PRs | Second-pass categories; PR #1134. |
+| `fuzz-human-style-dm` | ✅ gates PRs | Human-shape inbounds + meaningful-reply floor. |
+| `silent-end-recovery-dm` | ❌ local only | Passes, but the 5-min worst-case budget makes it costly to run every PR. Run nightly + ad-hoc. |
+| `jtbd-status-query-dm` | ❌ local only | Passes; defer to a follow-up that batches the cheap JTBD scenarios. |
+| `jtbd-soft-commit-dm` | ❌ local only | Already budget-tuned but real-Telegram timing flake risk; defer until we have flake telemetry. |
+| `jtbd-interrupt-marker-dm` | ❌ `describe.skip` | Suspected real bug per #1132 overnight. Investigate before unskipping. |
+| `jtbd-rapid-followup-dm` | ❌ `describe.skip` | Suspected real classification bug per #1132 overnight. Investigate before unskipping. |
+| vault / secret-redaction / voice / location / reactions / progress-card | ❌ local only | Need specific surfaces / config overrides not wired into the gate yet. |
+A local `bun run test:uat` runs the full include glob minus the two
+`describe.skip`'d JTBDs.
+## 9. Port allocator vs unix sockets (Phase 1 scaffold note)
 The Phase 1 `port-allocator.ts` is held in reserve for Phase 2b's
 child-process flow — Phase 2a (standard-runtime agent) doesn't need

package/telegram-plugin/uat/runners/agent-self-sufficiency.ts ADDED Viewed

@@ -0,0 +1,457 @@
+#!/usr/bin/env bun
+/**
+ * Agent-self-sufficiency UAT runner.
+ *
+ * Drives a real Telegram user-account against the live agent fleet to
+ * verify the four acceptance criteria from the
+ * "agent-self-sufficiency" goal:
+ *
+ *   1. Self-management (skill_list, cron_list, audit_tail, config_get)
+ *   2. Identity awareness (honest self-ID, knows its name, knows peers)
+ *   3. Admin surface (non-admin refusal naming the admin agent)
+ *      — admin reads (3a/3b) are covered by the hostd vitest suite
+ *        rather than live fuzz, because they require a docker stub.
+ *   4. The fuzzy UAT IS this runner.
+ *
+ * Usage:
+ *
+ *   bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \
+ *       --agent klanker:@klanker_bot \
+ *       --agent scribe:@scribe_bot \
+ *       --agent doc:@doc_bot \
+ *       --admin-agent klanker \
+ *       --report ./uat-report.md
+ *
+ *   # OR — discover from env (CI-friendly):
+ *   UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \
+ *   UAT_ADMIN_AGENTS="klanker" \
+ *   bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
+ *
+ * Auth env (same as the existing uat harness — see
+ * telegram-plugin/uat/SETUP.md):
+ *
+ *   TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
+ *
+ * **Why a user-account session, not bot tokens.** The acceptance-
+ * criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
+ * Telegram's Bot API forbids bots from reading other bots' messages
+ * (https://core.telegram.org/bots/faq) — a bot can send to another
+ * bot's chat but can't observe the reply. The only way to drive the
+ * fleet AND capture every agent's reply is an mtcute user-account
+ * session, which is what the existing telegram-plugin/uat harness
+ * uses. This runner inherits that machinery wholesale; the env-var
+ * rename is forced by the platform, not a design choice.
+ *
+ * Missing creds fail loud, not silent — the goal explicitly demands
+ * no silent skips on missing UAT credentials.
+ */
+import { writeFileSync } from "node:fs";
+import { Driver, type ObservedMessage } from "../driver.js";
+import { loadUatEnv } from "../load-env.js";
+import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
+import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
+import { renderMarkdown } from "./report.js";
+// Runs at module load, before parseCli()/main() read process.env.
+// NOTE(review): presumably this populates the UAT_* / TELEGRAM_* variables
+// from a local env file — the helper lives in ../load-env.js; confirm there.
+loadUatEnv();
+// ─── CLI / env parsing ─────────────────────────────────────────────────────
+/** One agent under test together with the bot account it is reached through. */
+interface AgentTarget {
+  /** Agent name from `--agent NAME:@BOT` / `UAT_FLEET`; also the key used
+   *  to de-duplicate targets while parsing. */
+  name: string;
+  /** The bot-username half of the `NAME:@BOT` pair; resolved to a numeric
+   *  user id via `driver.resolveBotUserId` before the run starts. */
+  botUsername: string;
+  /** True when the agent was named by `--admin-agent` / `UAT_ADMIN_AGENTS`.
+   *  The run loop skips criterion `3d_admin_refusal` for such agents. */
+  admin: boolean;
+}
+/** Fully resolved run configuration produced by `parseCli`. */
+interface CliConfig {
+  /** Agents to exercise, one entry per distinct agent name. */
+  agents: AgentTarget[];
+  /** Where the Markdown report is written (`--report` / `UAT_REPORT`). */
+  reportPath: string;
+  /** Where the JSON sidecar is written (`--json` / `UAT_REPORT_JSON`). */
+  jsonPath: string;
+  /** Per-case reply timeout, ms. Default 60s. */
+  replyTimeoutMs: number;
+  /** Inter-message settle, ms. Default 4s — keeps us under Telegram's
+   *  global outbound rate cap and gives the agent time to finish its
+   *  previous turn before the next inbound. */
+  settleMs: number;
+}
+/**
+ * Build the run configuration from environment variables and CLI flags.
+ *
+ * Environment values are read first (`UAT_FLEET`, `UAT_ADMIN_AGENTS`,
+ * `UAT_REPORT`, `UAT_REPORT_JSON`, `UAT_REPLY_TIMEOUT_MS`, `UAT_SETTLE_MS`);
+ * flags are applied afterwards, so a flag overrides the matching env value
+ * and `--agent` replaces an env-declared agent of the same name.
+ *
+ * Terminates the process through `fail` on: an unknown `--flag`, a flag
+ * missing its value, a malformed `--agent`, a timing value that is not an
+ * integer in range, or an empty agent list. `--help` / `-h` prints usage and
+ * exits 0. Malformed `UAT_FLEET` entries, admin names that match no targeted
+ * agent, and a fleet smaller than three only produce a warning on stderr.
+ *
+ * @param argv Command-line arguments after the script name.
+ * @returns The resolved configuration.
+ */
+function parseCli(argv: readonly string[]): CliConfig {
+  const warn = (msg: string): void => {
+    process.stderr.write(`[uat] WARNING: ${msg}\n`);
+  };
+  // Number.parseInt returns NaN for unparsable input. A NaN timeout makes
+  // every case time out instantly and a NaN settle collapses to no delay,
+  // so reject such values here instead of letting them through.
+  const parseMs = (raw: string, label: string, min: number): number => {
+    const n = Number.parseInt(raw, 10);
+    if (!Number.isInteger(n) || n < min)
+      fail(`${label}: expected an integer >= ${min} (milliseconds); got "${raw}"`);
+    return n;
+  };
+  const agents = new Map<string, AgentTarget>();
+  const adminSet = new Set<string>();
+  let reportPath = process.env.UAT_REPORT ?? "./uat-agent-self-sufficiency.md";
+  let jsonPath = process.env.UAT_REPORT_JSON ?? "./uat-agent-self-sufficiency.json";
+  // Timing values stay as raw text (plus where they came from) until all
+  // flags are consumed, so a valid flag can still override a bad env value.
+  let replyTimeout = {
+    raw: process.env.UAT_REPLY_TIMEOUT_MS ?? "60000",
+    label: "UAT_REPLY_TIMEOUT_MS",
+  };
+  let settle = { raw: process.env.UAT_SETTLE_MS ?? "4000", label: "UAT_SETTLE_MS" };
+  const envFleet = process.env.UAT_FLEET;
+  if (envFleet) {
+    for (const tok of envFleet.split(",")) {
+      const [name, bot] = tok.split(":").map((s) => s.trim());
+      if (name && bot) agents.set(name, { name, botUsername: bot, admin: false });
+      else if (tok.trim())
+        warn(
+          `ignoring malformed UAT_FLEET entry "${tok.trim()}" (expected "<name>:@<bot-username>")`,
+        );
+    }
+  }
+  const envAdmin = process.env.UAT_ADMIN_AGENTS;
+  if (envAdmin) {
+    for (const tok of envAdmin.split(",")) {
+      const name = tok.trim();
+      if (name) adminSet.add(name);
+    }
+  }
+  for (let i = 0; i < argv.length; i++) {
+    const tok = argv[i]!;
+    // Consumes the argument following the current flag as its value.
+    const next = (): string => {
+      const v = argv[++i];
+      if (!v) fail(`${tok}: missing value`);
+      return v;
+    };
+    switch (tok) {
+      case "--agent": {
+        const v = next();
+        const [name, bot] = v.split(":").map((s) => s.trim());
+        if (!name || !bot)
+          fail(`--agent expects "<name>:@<bot-username>"; got "${v}"`);
+        agents.set(name, { name, botUsername: bot, admin: false });
+        break;
+      }
+      case "--admin-agent": {
+        adminSet.add(next());
+        break;
+      }
+      case "--report":
+        reportPath = next();
+        break;
+      case "--json":
+        jsonPath = next();
+        break;
+      case "--reply-timeout-ms":
+        replyTimeout = { raw: next(), label: tok };
+        break;
+      case "--settle-ms":
+        settle = { raw: next(), label: tok };
+        break;
+      case "--help":
+      case "-h":
+        printHelp();
+        process.exit(0);
+        break;
+      default:
+        // Bare (non `--`) arguments are ignored, as before.
+        if (tok.startsWith("--")) fail(`unknown flag: ${tok}`);
+    }
+  }
+  // Admin marking is applied last so it also covers agents added by flags.
+  for (const name of adminSet) {
+    const t = agents.get(name);
+    if (t) t.admin = true;
+    else warn(`admin agent "${name}" is not among the targeted agents; ignoring it.`);
+  }
+  if (agents.size === 0) {
+    fail(
+      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
+    );
+  }
+  if (agents.size < 3) {
+    process.stderr.write(
+      `[uat] WARNING: only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.\n`,
+    );
+  }
+  return {
+    agents: [...agents.values()],
+    reportPath,
+    jsonPath,
+    replyTimeoutMs: parseMs(replyTimeout.raw, replyTimeout.label, 1),
+    settleMs: parseMs(settle.raw, settle.label, 0),
+  };
+}
+/**
+ * Report a fatal usage/configuration problem and stop the process.
+ *
+ * Writes the message to stderr with the `[uat] ` prefix and a trailing
+ * newline, then exits with status 2. Never returns.
+ */
+function fail(msg: string): never {
+  const line = "[uat] " + msg + "\n";
+  process.stderr.write(line);
+  process.exit(2);
+}
+/** Write the usage text (required env, flags, env equivalents) to stdout. */
+function printHelp(): void {
+  const usage = `agent-self-sufficiency UAT runner
+Required env (or fail loud):
+  TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
+Flags:
+  --agent NAME:@BOT      Add an agent target. Repeatable.
+  --admin-agent NAME     Mark NAME as admin: true (skips 3d for that agent).
+  --report PATH          Markdown report path. Default ./uat-agent-self-sufficiency.md
+  --json PATH            JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json
+  --reply-timeout-ms N   Per-case timeout. Default 60000.
+  --settle-ms N          Inter-message settle. Default 4000.
+Env equivalents:
+  UAT_FLEET="name1:@bot1,name2:@bot2,..."
+  UAT_ADMIN_AGENTS="name1,name2"
+  UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS
+`;
+  process.stdout.write(usage);
+}
+// ─── Driver wrapper: send + observe ─────────────────────────────────────────
+/** Result of a single prompt/reply exchange, as returned by `sendAndScore`. */
+interface ReplyOutcome {
+  /** Trimmed text of the last qualifying bot message observed; empty when
+   *  the send failed or no reply arrived in time. */
+  reply: string;
+  /** Verdict from `scoreReply`, or `"error"` (send threw) / `"timeout"`
+   *  (no reply head seen before the deadline). */
+  outcome: Outcome;
+  /** Wall-clock milliseconds from the start of `sendAndScore` to its return. */
+  durationMs: number;
+  /** Present only when the send itself threw. */
+  errorMessage?: string;
+}
+/**
+ * Deliver one prompt to an agent's bot and score whatever comes back.
+ *
+ * Observation of the chat starts before the prompt is sent, so a reply
+ * that lands very quickly cannot slip past. After the send:
+ *
+ *   - messages authored by the driver account, messages whose id is not
+ *     greater than the sent message's id, and blank messages are ignored;
+ *   - the first remaining message is the reply head and opens a fixed
+ *     3 s edit window, measured from the moment the head was seen;
+ *   - every further qualifying message inside that window replaces the
+ *     tracked text (covers the "thinking…" placeholder that is later
+ *     edited into the final answer);
+ *   - the text held when the window closes — or when the overall
+ *     deadline passes — is what gets scored.
+ *
+ * The observer stream is always closed before returning; errors while
+ * closing are ignored.
+ *
+ * @returns outcome `"error"` when the send throws, `"timeout"` when no
+ *   reply head arrives within `timeoutMs`, otherwise `scoreReply`'s verdict.
+ */
+async function sendAndScore(
+  driver: Driver,
+  botUserId: number,
+  driverUserId: number,
+  spec: CriterionSpec,
+  prompt: string,
+  agentName: string,
+  timeoutMs: number,
+): Promise<ReplyOutcome> {
+  const EDIT_WINDOW_MS = 3000;
+  const startedAt = Date.now();
+  // Subscribe before sending so the reply cannot race the observer.
+  const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();
+  // Best-effort shutdown of the observer; a failure here must not mask the result.
+  const closeStream = async (): Promise<void> => {
+    try {
+      await stream.return?.(undefined);
+    } catch {
+      /* ignore */
+    }
+  };
+  let sentMessageId: number;
+  try {
+    sentMessageId = (await driver.sendText(botUserId, prompt)).messageId;
+  } catch (err) {
+    await closeStream();
+    return {
+      reply: "",
+      outcome: "error",
+      durationMs: Date.now() - startedAt,
+      errorMessage: `send failed: ${(err as Error).message}`,
+    };
+  }
+  const deadline = startedAt + timeoutMs;
+  let headSeenAt = 0; // 0 = no reply head observed yet
+  let replyMessageId = 0;
+  let replyText = "";
+  try {
+    while (Date.now() < deadline) {
+      const budgetMs = deadline - Date.now();
+      let waitMs = budgetMs;
+      if (headSeenAt) {
+        // Once the head is in, only the rest of the edit window matters.
+        waitMs = Math.max(0, EDIT_WINDOW_MS - (Date.now() - headSeenAt));
+        if (waitMs === 0) break;
+      }
+      // Never poll for less than 250 ms, and never beyond the overall deadline.
+      const pulled = await pullOneWithTimeout(
+        stream,
+        Math.min(budgetMs, Math.max(250, waitMs)),
+      );
+      if (pulled === "done") break;
+      if (pulled === "timeout") {
+        if (headSeenAt) break; // edit window elapsed
+        continue;
+      }
+      const fromDriver = pulled.senderUserId === driverUserId;
+      const notAfterPrompt = pulled.messageId <= sentMessageId;
+      const body = (pulled.text ?? "").trim();
+      if (fromDriver || notAfterPrompt || !body) continue;
+      // Head or a later edit/replacement — either way keep the newest text.
+      replyMessageId = pulled.messageId;
+      replyText = body;
+      if (!headSeenAt) headSeenAt = Date.now();
+    }
+  } finally {
+    await closeStream();
+  }
+  const durationMs = Date.now() - startedAt;
+  if (!replyMessageId) {
+    return { reply: "", outcome: "timeout", durationMs };
+  }
+  return {
+    reply: replyText,
+    outcome: scoreReply(spec, replyText, { agentName }),
+    durationMs,
+  };
+}
+/**
+ * Wait for the iterator's next item, but no longer than `ms` milliseconds.
+ *
+ * Resolves with the item itself, with `"timeout"` when the timer wins, or
+ * with `"done"` when the iterator reports completion or its `next()`
+ * rejects. The timer is cleared as soon as the race is decided. When the
+ * timer wins, the `next()` call already issued stays pending and its
+ * eventual result is discarded.
+ */
+async function pullOneWithTimeout(
+  it: AsyncIterator<ObservedMessage>,
+  ms: number,
+): Promise<ObservedMessage | "timeout" | "done"> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  const expired = new Promise<"timeout">((resolve) => {
+    timer = setTimeout(() => resolve("timeout"), ms);
+  });
+  const pulled = it.next().then(
+    (step): ObservedMessage | "done" => (step.done ? "done" : step.value),
+    (): "done" => "done",
+  );
+  try {
+    return await Promise.race([pulled, expired]);
+  } finally {
+    clearTimeout(timer);
+  }
+}
+// ─── Main orchestration ─────────────────────────────────────────────────────
+/**
+ * Run the whole UAT: parse configuration, connect the driver account,
+ * resolve every bot, send each criterion's paraphrases to each agent in
+ * sequence, then write the Markdown and JSON reports.
+ *
+ * Always ends the process itself. Exit status:
+ *   0 — at least one case ran and every case passed;
+ *   1 — any case did not pass, or no case ran at all;
+ *   2 — bad configuration or missing credentials (through `fail`);
+ *   3 — a bot username could not be resolved.
+ */
+async function main(): Promise<void> {
+  const cli = parseCli(process.argv.slice(2));
+  // Hard-fail on missing UAT creds — goal: never silently skip.
+  const apiId = Number.parseInt(process.env.TELEGRAM_API_ID ?? "", 10);
+  if (!Number.isFinite(apiId)) {
+    fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
+  }
+  const apiHash = process.env.TELEGRAM_API_HASH ?? "";
+  if (!apiHash) fail("TELEGRAM_API_HASH missing — see SETUP.md");
+  const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
+  if (!session)
+    fail(
+      "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
+    );
+  process.stdout.write(
+    `[uat] connecting to Telegram as the UAT driver account...\n`,
+  );
+  const driver = new Driver({ apiId, apiHash, session });
+  await driver.connect();
+  const driverUserId = await driver.getMyUserId();
+  process.stdout.write(`[uat] driver user_id=${driverUserId}\n`);
+  // Resolve every agent's bot user_id up front so a missing username
+  // fails before we waste any time on the run.
+  const resolved: { target: AgentTarget; botUserId: number }[] = [];
+  for (const a of cli.agents) {
+    try {
+      const id = await driver.resolveBotUserId(a.botUsername);
+      resolved.push({ target: a, botUserId: id });
+      process.stdout.write(
+        `[uat] resolved ${a.name} ${a.botUsername} → bot_user_id=${id}` +
+          (a.admin ? " (admin)" : "") +
+          "\n",
+      );
+    } catch (err) {
+      process.stderr.write(
+        `[uat] FAILED to resolve ${a.botUsername} for agent ${a.name}: ${(err as Error).message}\n`,
+      );
+      process.exit(3);
+    }
+  }
+  // Run! Cases are strictly sequential: one prompt, one scored reply,
+  // one settle pause, then the next.
+  const startedAt = new Date();
+  const t0 = Date.now();
+  const results: CaseResult[] = [];
+  for (const { target, botUserId } of resolved) {
+    process.stdout.write(`\n[uat] ─── agent: ${target.name} ─────────────\n`);
+    for (const spec of CRITERIA) {
+      // Skip 3d (non-admin refusal) on admin agents — they're legitimately
+      // capable of those operations, so a "I can't" reply would be wrong.
+      if (spec.id === "3d_admin_refusal" && target.admin) {
+        process.stdout.write(
+          `[uat]   skip ${spec.id} on ${target.name} (admin: true)\n`,
+        );
+        continue;
+      }
+      for (const para of spec.paraphrases) {
+        const r = await sendAndScore(
+          driver,
+          botUserId,
+          driverUserId,
+          spec,
+          para.text,
+          target.name,
+          cli.replyTimeoutMs,
+        );
+        // ✓ pass, ✗ fail, · anything else (timeout / error).
+        const tag =
+          r.outcome === "pass" ? "✓" : r.outcome === "fail" ? "✗" : "·";
+        process.stdout.write(
+          `[uat]   ${tag} ${spec.id}/${para.label} (${r.outcome}, ${r.durationMs}ms)\n`,
+        );
+        results.push({
+          agent: target.name,
+          criterion: spec.id,
+          paraphrase: para,
+          outcome: r.outcome,
+          reply: r.reply,
+          durationMs: r.durationMs,
+          ...(r.errorMessage ? { errorMessage: r.errorMessage } : {}),
+        });
+        // Inter-message settle: keep below Telegram's user-account
+        // outbound cap and let the agent finish its prior turn.
+        await new Promise((res) => setTimeout(res, cli.settleMs));
+      }
+    }
+  }
+  const durationSeconds = (Date.now() - t0) / 1000;
+  // A failed disconnect must not prevent the reports from being written.
+  await driver.disconnect().catch(() => undefined);
+  const md = renderMarkdown(results, {
+    startedAt,
+    durationSeconds,
+    agents: resolved.map((r) => r.target.name),
+  });
+  writeFileSync(cli.reportPath, md, "utf-8");
+  writeFileSync(
+    cli.jsonPath,
+    JSON.stringify(
+      { startedAt: startedAt.toISOString(), durationSeconds, results },
+      null,
+      2,
+    ),
+    "utf-8",
+  );
+  process.stdout.write(`\n[uat] report → ${cli.reportPath}\n`);
+  process.stdout.write(`[uat] json   → ${cli.jsonPath}\n`);
+  const passes = results.filter((r) => r.outcome === "pass").length;
+  process.stdout.write(
+    `[uat] overall: ${passes}/${results.length} passed (${results.length > 0 ? ((passes / results.length) * 100).toFixed(1) : "0"}%)\n`,
+  );
+  // An empty result set would satisfy `passes === results.length` and
+  // report success without having verified anything — treat it as a failure.
+  if (results.length === 0) {
+    process.stderr.write(
+      "[uat] no cases were executed — treating the run as failed\n",
+    );
+  }
+  // Exit non-zero if anything failed, so the runner is CI-actionable.
+  process.exit(results.length > 0 && passes === results.length ? 0 : 1);
+}
+// Entry point. Anything main() lets escape is reported with its stack
+// (or the raw thrown value when there is no stack) and ends the process
+// with status 4.
+main().catch((cause) => {
+  const detail = (cause as Error).stack ?? cause;
+  process.stderr.write(`[uat] FATAL: ${detail}\n`);
+  process.exit(4);
+});