switchroom 0.8.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -57
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +285 -45
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +15931 -12778
- package/dist/host-control/main.js +582 -43
- package/dist/vault/approvals/kernel-server.js +276 -47
- package/dist/vault/broker/server.js +333 -69
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +6 -4
- package/profiles/_base/start.sh.hbs +3 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/default/CLAUDE.md +10 -0
- package/profiles/default/CLAUDE.md.hbs +16 -0
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +67 -15
- package/skills/switchroom-status/SKILL.md +26 -1
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/dist/gateway/gateway.js +13042 -12844
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +22 -36
- package/telegram-plugin/gateway/boot-probes.ts +3 -3
- package/telegram-plugin/gateway/gateway.ts +313 -798
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-probes.test.ts +11 -4
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/uat/SETUP.md +31 -1
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/dist/foreman/foreman.js +0 -31358
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
|
@@ -103,4 +103,35 @@ describe('summarizeToolForTitle (#186)', () => {
|
|
|
103
103
|
const input = JSON.stringify({ skill: 'mail', name: 'wrong' })
|
|
104
104
|
expect(summarizeToolForTitle('Skill', input)).toBe('Skill (mail)')
|
|
105
105
|
})
|
|
106
|
+
|
|
107
|
+
test('MCP curated: agent-config tools render as human verb-phrases (#1215)', () => {
|
|
108
|
+
expect(summarizeToolForTitle('mcp__agent-config__skill_list', undefined)).toBe(
|
|
109
|
+
'List its own installed skills',
|
|
110
|
+
)
|
|
111
|
+
expect(summarizeToolForTitle('mcp__agent-config__cron_list', undefined)).toBe(
|
|
112
|
+
'List its own scheduled tasks',
|
|
113
|
+
)
|
|
114
|
+
expect(summarizeToolForTitle('mcp__agent-config__peers_list', undefined)).toBe(
|
|
115
|
+
'List the other agents on this instance',
|
|
116
|
+
)
|
|
117
|
+
})
|
|
118
|
+
|
|
119
|
+
test('MCP curated: hostd tools render as human verb-phrases (#1215)', () => {
|
|
120
|
+
expect(summarizeToolForTitle('mcp__hostd__agent_logs', undefined)).toBe(
|
|
121
|
+
"Read another agent's container logs",
|
|
122
|
+
)
|
|
123
|
+
expect(summarizeToolForTitle('mcp__hostd__agent_exec', undefined)).toBe(
|
|
124
|
+
'Run a read-only inspection inside another agent',
|
|
125
|
+
)
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
test('MCP fallback: unknown mcp tool renders as `<server>: <verb with spaces>`', () => {
|
|
129
|
+
expect(summarizeToolForTitle('mcp__some-server__do_thing', undefined)).toBe(
|
|
130
|
+
'some-server: do thing',
|
|
131
|
+
)
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
test('MCP malformed: bare mcp__ prefix without __<server>__<verb> shape is left alone', () => {
|
|
135
|
+
expect(summarizeToolForTitle('mcp__bad', undefined)).toBe('mcp__bad')
|
|
136
|
+
})
|
|
106
137
|
})
|
|
@@ -380,41 +380,11 @@ describe('fetchAccountQuota — cache + token resolution', () => {
|
|
|
380
380
|
}
|
|
381
381
|
})
|
|
382
382
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
new Response('{}', {
|
|
389
|
-
status: 200,
|
|
390
|
-
headers: {
|
|
391
|
-
'anthropic-ratelimit-unified-5h-utilization': '0.42',
|
|
392
|
-
'anthropic-ratelimit-unified-7d-utilization': '0.17',
|
|
393
|
-
},
|
|
394
|
-
})
|
|
395
|
-
try {
|
|
396
|
-
const r = await fetchAccountQuota('work@example.com', {
|
|
397
|
-
home,
|
|
398
|
-
fetchImpl: fakeFetch as typeof fetch,
|
|
399
|
-
})
|
|
400
|
-
expect(r.ok).toBe(true)
|
|
401
|
-
const snapPath = join(
|
|
402
|
-
home,
|
|
403
|
-
'.switchroom',
|
|
404
|
-
'accounts',
|
|
405
|
-
'work@example.com',
|
|
406
|
-
'quota.json',
|
|
407
|
-
)
|
|
408
|
-
// The bug: writeAccountQuota was called without opts.home, so the
|
|
409
|
-
// snapshot landed under the real $HOME instead of the test home.
|
|
410
|
-
expect(existsSync(snapPath)).toBe(true)
|
|
411
|
-
const snap = JSON.parse(readFileSync(snapPath, 'utf-8'))
|
|
412
|
-
expect(snap.fiveHourPct).toBeCloseTo(42, 0)
|
|
413
|
-
expect(snap.sevenDayPct).toBeCloseTo(17, 0)
|
|
414
|
-
} finally {
|
|
415
|
-
rmSync(home, { recursive: true, force: true })
|
|
416
|
-
}
|
|
417
|
-
})
|
|
383
|
+
// Removed in RFC H: per-account quota.json disk persistence is gone.
|
|
384
|
+
// switchroom-auth-broker holds canonical quota state and exposes it
|
|
385
|
+
// via list-state; the gateway's in-process cache is enough between
|
|
386
|
+
// restarts (and the broker survives gateway restarts, so the state
|
|
387
|
+
// is preserved at the broker side anyway).
|
|
418
388
|
})
|
|
419
389
|
|
|
420
390
|
describe('getCachedAccountQuota + prefetchAccountQuotaIfStale', () => {
|
|
@@ -297,7 +297,37 @@ as a long-lived secret.
|
|
|
297
297
|
When all three are checked, the env block above + `bun run test:uat`
|
|
298
298
|
is safe to run.
|
|
299
299
|
|
|
300
|
-
## 8.
|
|
300
|
+
## 8. CI gate — `:robot: UAT fuzz` Buildkite step
|
|
301
|
+
|
|
302
|
+
Since the Buildkite gate landed, the fuzz subset of scenarios
|
|
303
|
+
(`fuzz-random-prompts-dm.test.ts`, `fuzz-extended-dm.test.ts`,
|
|
304
|
+
`fuzz-human-style-dm.test.ts`) runs automatically on every PR that
|
|
305
|
+
touches `telegram-plugin/`, `src/agents/`, or `telegram-plugin/uat/`.
|
|
306
|
+
|
|
307
|
+
The step runs on a self-hosted Buildkite agent tagged
|
|
308
|
+
`queue=uat-host` that lives on the same box as the `test-harness`
|
|
309
|
+
agent. Secrets come from the Buildkite cluster secret store, not
|
|
310
|
+
from local vault. See `.buildkite/README.md` § "UAT fuzz step" for
|
|
311
|
+
agent setup + secret rotation.
|
|
312
|
+
|
|
313
|
+
**Scope (CI):**
|
|
314
|
+
|
|
315
|
+
| Scenario | In CI? | Why |
|
|
316
|
+
|---|---|---|
|
|
317
|
+
| `fuzz-random-prompts-dm` | ✅ gates PRs | JTBD-floor invariants; PR #1132. |
|
|
318
|
+
| `fuzz-extended-dm` | ✅ gates PRs | Second-pass categories; PR #1134. |
|
|
319
|
+
| `fuzz-human-style-dm` | ✅ gates PRs | Human-shape inbounds + meaningful-reply floor. |
|
|
320
|
+
| `silent-end-recovery-dm` | ❌ local only | Passes, but the 5-min worst-case budget makes it costly to run every PR. Run nightly + ad-hoc. |
|
|
321
|
+
| `jtbd-status-query-dm` | ❌ local only | Passes; defer to a follow-up that batches the cheap JTBD scenarios. |
|
|
322
|
+
| `jtbd-soft-commit-dm` | ❌ local only | Already budget-tuned but real-Telegram timing flake risk; defer until we have flake telemetry. |
|
|
323
|
+
| `jtbd-interrupt-marker-dm` | ❌ `describe.skip` | Suspected real bug per #1132 overnight. Investigate before unskipping. |
|
|
324
|
+
| `jtbd-rapid-followup-dm` | ❌ `describe.skip` | Suspected real classification bug per #1132 overnight. Investigate before unskipping. |
|
|
325
|
+
| vault / secret-redaction / voice / location / reactions / progress-card | ❌ local only | Need specific surfaces / config overrides not wired into the gate yet. |
|
|
326
|
+
|
|
327
|
+
A local `bun run test:uat` runs the full include glob minus the two
|
|
328
|
+
`describe.skip`'d JTBDs.
|
|
329
|
+
|
|
330
|
+
## 9. Port allocator vs unix sockets (Phase 1 scaffold note)
|
|
301
331
|
|
|
302
332
|
The Phase 1 `port-allocator.ts` is held in reserve for Phase 2b's
|
|
303
333
|
child-process flow — Phase 2a (standard-runtime agent) doesn't need
|
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Agent-self-sufficiency UAT runner.
|
|
4
|
+
*
|
|
5
|
+
* Drives a real Telegram user-account against the live agent fleet to
|
|
6
|
+
* verify the four acceptance criteria from the
|
|
7
|
+
* "agent-self-sufficiency" goal:
|
|
8
|
+
*
|
|
9
|
+
* 1. Self-management (skill_list, cron_list, audit_tail, config_get)
|
|
10
|
+
* 2. Identity awareness (honest self-ID, knows its name, knows peers)
|
|
11
|
+
* 3. Admin surface (non-admin refusal naming the admin agent)
|
|
12
|
+
* — admin reads (3a/3b) are covered by the hostd vitest suite
|
|
13
|
+
* rather than live fuzz, because they require a docker stub.
|
|
14
|
+
* 4. The fuzzy UAT IS this runner.
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
*
|
|
18
|
+
* bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \
|
|
19
|
+
* --agent klanker:@klanker_bot \
|
|
20
|
+
* --agent scribe:@scribe_bot \
|
|
21
|
+
* --agent doc:@doc_bot \
|
|
22
|
+
* --admin-agent klanker \
|
|
23
|
+
* --report ./uat-report.md
|
|
24
|
+
*
|
|
25
|
+
* # OR — discover from env (CI-friendly):
|
|
26
|
+
* UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \
|
|
27
|
+
* UAT_ADMIN_AGENTS="klanker" \
|
|
28
|
+
* bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
|
|
29
|
+
*
|
|
30
|
+
* Auth env (same as the existing uat harness — see
|
|
31
|
+
* telegram-plugin/uat/SETUP.md):
|
|
32
|
+
*
|
|
33
|
+
* TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
|
|
34
|
+
*
|
|
35
|
+
* **Why a user-account session, not bot tokens.** The acceptance-
|
|
36
|
+
* criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
|
|
37
|
+
* Telegram's Bot API forbids bots from reading other bots' messages
|
|
38
|
+
* (https://core.telegram.org/bots/faq) — a bot can send to another
|
|
39
|
+
* bot's chat but can't observe the reply. The only way to drive the
|
|
40
|
+
* fleet AND capture every agent's reply is an mtcute user-account
|
|
41
|
+
* session, which is what the existing telegram-plugin/uat harness
|
|
42
|
+
* uses. This runner inherits that machinery wholesale; the env-var
|
|
43
|
+
* rename is forced by the platform, not a design choice.
|
|
44
|
+
*
|
|
45
|
+
* Missing creds fail loud, not silent — the goal explicitly demands
|
|
46
|
+
* no silent skips on missing UAT credentials.
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
import { writeFileSync } from "node:fs";
|
|
50
|
+
import { Driver, type ObservedMessage } from "../driver.js";
|
|
51
|
+
import { loadUatEnv } from "../load-env.js";
|
|
52
|
+
import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
|
|
53
|
+
import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
|
|
54
|
+
import { renderMarkdown } from "./report.js";
|
|
55
|
+
|
|
56
|
+
loadUatEnv();
|
|
57
|
+
|
|
58
|
+
// ─── CLI / env parsing ─────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/**
 * One agent the runner talks to.
 *
 * Built by `parseCli` from `--agent <name>:@<bot>` flags or the
 * `UAT_FLEET` environment variable.
 */
interface AgentTarget {
  /** Agent name; also the key used to match `--admin-agent` / `UAT_ADMIN_AGENTS`. */
  name: string;
  /** Bot username as given on the command line; resolved to a user id before the run. */
  botUsername: string;
  /** True when the agent was named as an admin; the 3d refusal criterion is skipped for it. */
  admin: boolean;
}
|
|
65
|
+
|
|
66
|
+
/** Fully-resolved runner configuration returned by `parseCli`. */
interface CliConfig {
  /** Agents to exercise, in the order they were first declared. */
  agents: AgentTarget[];
  /** Where the Markdown report is written. */
  reportPath: string;
  /** Where the JSON sidecar with every case result is written. */
  jsonPath: string;
  /** Per-case reply timeout in milliseconds (default 60000). */
  replyTimeoutMs: number;
  /**
   * Pause between consecutive cases in milliseconds (default 4000).
   * Keeps the driver under Telegram's outbound rate cap and lets the
   * agent finish its previous turn before the next inbound arrives.
   */
  settleMs: number;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Build the runner configuration from environment variables and CLI flags.
 *
 * Environment values (UAT_FLEET, UAT_ADMIN_AGENTS, UAT_REPORT,
 * UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS) are read first;
 * flags in `argv` are applied afterwards and win over the environment.
 *
 * Every invalid input terminates the process through `fail()`:
 * unknown `--flags`, a flag without a value, a malformed `--agent`
 * value, a non-integer / negative millisecond value, or an empty fleet.
 * A fleet smaller than three agents, or an admin name that matches no
 * targeted agent, only produces a warning on stderr.
 *
 * @param argv Command-line arguments without the interpreter/script prefix.
 * @returns The resolved configuration.
 */
function parseCli(argv: readonly string[]): CliConfig {
  // Strict millisecond parser. `Number.parseInt` alone would accept
  // "12abc" and turn "abc" into NaN, and a NaN timeout makes every case
  // time out instantly — so anything but a plain non-negative integer is fatal.
  const parseMs = (raw: string, label: string): number => {
    const text = raw.trim();
    const value = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(value)) {
      fail(`${label} expects a non-negative integer (milliseconds); got "${raw}"`);
    }
    return value;
  };
  // A blank environment variable is treated the same as an unset one.
  const envMs = (key: string, fallback: string): number => {
    const raw = process.env[key];
    return parseMs(raw !== undefined && raw.trim() !== "" ? raw : fallback, key);
  };

  const agents = new Map<string, AgentTarget>();
  const adminSet = new Set<string>();
  let reportPath = process.env.UAT_REPORT ?? "./uat-agent-self-sufficiency.md";
  let jsonPath = process.env.UAT_REPORT_JSON ?? "./uat-agent-self-sufficiency.json";
  let replyTimeoutMs = envMs("UAT_REPLY_TIMEOUT_MS", "60000");
  let settleMs = envMs("UAT_SETTLE_MS", "4000");

  // Environment fleet: malformed entries are skipped here; an empty
  // result is caught by the "no agents" check below.
  const envFleet = process.env.UAT_FLEET;
  if (envFleet) {
    for (const tok of envFleet.split(",")) {
      const [name, bot] = tok.split(":").map((s) => s.trim());
      if (name && bot) agents.set(name, { name, botUsername: bot, admin: false });
    }
  }
  const envAdmin = process.env.UAT_ADMIN_AGENTS;
  if (envAdmin) {
    for (const tok of envAdmin.split(",")) {
      const name = tok.trim();
      if (name) adminSet.add(name);
    }
  }

  for (let i = 0; i < argv.length; i++) {
    const tok = argv[i]!;
    // Consume the value that follows the current flag.
    const next = (): string => {
      const v = argv[++i];
      if (!v) fail(`${tok}: missing value`);
      return v;
    };
    switch (tok) {
      case "--agent": {
        const v = next();
        const [name, bot] = v.split(":").map((s) => s.trim());
        if (!name || !bot) {
          fail(`--agent expects "<name>:@<bot-username>"; got "${v}"`);
        }
        agents.set(name, { name, botUsername: bot, admin: false });
        break;
      }
      case "--admin-agent": {
        adminSet.add(next());
        break;
      }
      case "--report":
        reportPath = next();
        break;
      case "--json":
        jsonPath = next();
        break;
      case "--reply-timeout-ms":
        replyTimeoutMs = parseMs(next(), "--reply-timeout-ms");
        break;
      case "--settle-ms":
        settleMs = parseMs(next(), "--settle-ms");
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        // Bare (non `--`) tokens are ignored, as before.
        if (tok.startsWith("--")) fail(`unknown flag: ${tok}`);
    }
  }

  // Admin marking happens last so it applies no matter whether the agent
  // or the admin name was declared first, or where each came from.
  for (const name of adminSet) {
    const target = agents.get(name);
    if (target) {
      target.admin = true;
    } else {
      // A typo here would otherwise silently run the non-admin refusal
      // criterion against an agent that is in fact an admin.
      process.stderr.write(
        `[uat] WARNING: admin agent "${name}" is not in the targeted fleet; ignoring.\n`,
      );
    }
  }

  if (agents.size === 0) {
    fail(
      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
    );
  }
  if (agents.size < 3) {
    process.stderr.write(
      `[uat] WARNING: only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.\n`,
    );
  }

  return {
    agents: [...agents.values()],
    reportPath,
    jsonPath,
    replyTimeoutMs,
    settleMs,
  };
}
|
|
167
|
+
|
|
168
|
+
/**
 * Report a fatal usage/configuration error and terminate.
 *
 * Writes `msg` to stderr with the `[uat]` prefix and exits the process
 * with status 2; it never returns to the caller.
 */
function fail(msg: string): never {
  const line = "[uat] " + msg + "\n";
  process.stderr.write(line);
  return process.exit(2);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Print the usage summary (required environment, flags and their
 * environment equivalents) to stdout. Does not exit; the caller does.
 */
function printHelp(): void {
  const helpLines = [
    'agent-self-sufficiency UAT runner',
    '',
    'Required env (or fail loud):',
    'TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION',
    '',
    'Flags:',
    '--agent NAME:@BOT Add an agent target. Repeatable.',
    '--admin-agent NAME Mark NAME as admin: true (skips 3d for that agent).',
    '--report PATH Markdown report path. Default ./uat-agent-self-sufficiency.md',
    '--json PATH JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json',
    '--reply-timeout-ms N Per-case timeout. Default 60000.',
    '--settle-ms N Inter-message settle. Default 4000.',
    '',
    'Env equivalents:',
    'UAT_FLEET="name1:@bot1,name2:@bot2,..."',
    'UAT_ADMIN_AGENTS="name1,name2"',
    'UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS',
  ];
  // The text ends with a single trailing newline.
  process.stdout.write(helpLines.join("\n") + "\n");
}
|
|
193
|
+
|
|
194
|
+
// ─── Driver wrapper: send + observe ─────────────────────────────────────────
|
|
195
|
+
|
|
196
|
+
/** Result of a single prompt/reply exchange, as produced by `sendAndScore`. */
interface ReplyOutcome {
  /** Trimmed text of the last qualifying reply; empty on timeout or send error. */
  reply: string;
  /** Scorer verdict, or "timeout" / "error" when no reply could be scored. */
  outcome: Outcome;
  /** Wall-clock time from the start of the exchange, in milliseconds. */
  durationMs: number;
  /** Present only when sending the prompt failed. */
  errorMessage?: string;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Send one prompt to an agent, capture its reply and score it.
 *
 * Observation of the chat is opened BEFORE the prompt is sent so a very
 * fast reply cannot slip past us. After sending:
 *
 *   1. Messages are pulled until the first non-empty message that was
 *      not sent by the driver account and whose messageId is greater
 *      than the prompt's. That message is the reply head.
 *   2. Pulling continues for a fixed 3s edit window, measured from the
 *      head, so later edits/replacements (e.g. "thinking…" followed by
 *      the final answer) are absorbed. The most recent qualifying text
 *      is the one that gets scored.
 *   3. If no head shows up before `timeoutMs` elapses (counted from the
 *      start of the call) the outcome is `timeout`.
 *
 * A failure to send yields outcome `error` with `errorMessage` set. The
 * observation stream is always closed on the way out (best effort).
 */
async function sendAndScore(
  driver: Driver,
  botUserId: number,
  driverUserId: number,
  spec: CriterionSpec,
  prompt: string,
  agentName: string,
  timeoutMs: number,
): Promise<ReplyOutcome> {
  const startedAt = Date.now();
  // Subscribe first — see the note above about racing the bot.
  const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();

  // Closing the observer is best effort; a failure here must not mask the result.
  const closeStream = async (): Promise<void> => {
    try {
      await stream.return?.(undefined);
    } catch {
      /* ignore */
    }
  };

  let sentMessageId: number;
  try {
    sentMessageId = (await driver.sendText(botUserId, prompt)).messageId;
  } catch (err) {
    await closeStream();
    return {
      reply: "",
      outcome: "error",
      durationMs: Date.now() - startedAt,
      errorMessage: `send failed: ${(err as Error).message}`,
    };
  }

  const EDIT_WINDOW_MS = 3000;
  const deadline = startedAt + timeoutMs;
  let headSeenAt = 0; // 0 means "no reply head yet"
  let replyMessageId = 0;
  let replyText = "";

  try {
    while (Date.now() < deadline) {
      const msLeft = deadline - Date.now();
      let waitMs = msLeft;
      if (headSeenAt) {
        // Once the head is in, only the remainder of the edit window counts.
        const editLeft = Math.max(0, EDIT_WINDOW_MS - (Date.now() - headSeenAt));
        if (editLeft === 0) break;
        waitMs = editLeft;
      }

      // Never wait less than 250ms per pull, but never past the deadline.
      const item = await pullOneWithTimeout(stream, Math.min(msLeft, Math.max(250, waitMs)));
      if (item === "done") break;
      if (item === "timeout") {
        if (headSeenAt) break; // edit window elapsed
        continue;
      }

      // Ignore our own messages and anything at or before the prompt.
      if (item.senderUserId === driverUserId) continue;
      if (item.messageId <= sentMessageId) continue;
      const text = (item.text ?? "").trim();
      if (!text) continue;

      // Head or a later edit/replacement: keep whichever arrived last.
      replyMessageId = item.messageId;
      replyText = text;
      if (!headSeenAt) headSeenAt = Date.now();
    }
  } finally {
    await closeStream();
  }

  const durationMs = Date.now() - startedAt;
  if (!replyMessageId) {
    return { reply: "", outcome: "timeout", durationMs };
  }
  return { reply: replyText, outcome: scoreReply(spec, replyText, { agentName }), durationMs };
}
|
|
295
|
+
|
|
296
|
+
/**
 * Wait for the next item of `it`, but no longer than `ms` milliseconds.
 *
 * Resolves with the item, with `"timeout"` when the timer wins, or with
 * `"done"` when the iterator finishes or its `next()` rejects. Never
 * rejects for an asynchronous iterator failure.
 *
 * Note: each call issues a fresh `it.next()`. If the timer wins, the
 * result of that pending `next()` is discarded when it eventually
 * settles, so callers should stop pulling after a timeout they act on.
 */
async function pullOneWithTimeout(
  it: AsyncIterator<ObservedMessage>,
  ms: number,
): Promise<ObservedMessage | "timeout" | "done"> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const expired = new Promise<"timeout">((resolve) => {
    timer = setTimeout(() => resolve("timeout"), ms);
  });
  // Map both natural completion and iterator failure to the "done" sentinel.
  const pulled = it.next().then(
    (step): ObservedMessage | "done" => (step.done ? "done" : step.value),
    (): "done" => "done",
  );
  try {
    return await Promise.race([pulled, expired]);
  } finally {
    clearTimeout(timer);
  }
}
|
|
329
|
+
|
|
330
|
+
// ─── Main orchestration ─────────────────────────────────────────────────────
|
|
331
|
+
|
|
332
|
+
/**
 * Run the whole UAT: parse configuration, connect the driver account,
 * resolve every bot, send each paraphrase of each criterion to each
 * agent, write the Markdown + JSON reports and exit.
 *
 * Exit codes set here: 0 when at least one case ran and all passed,
 * 1 when any case did not pass or no case ran at all, 3 when a bot
 * username cannot be resolved. Missing credentials go through `fail()`.
 *
 * The driver is disconnected on every path out of the run, including
 * resolution failures and unexpected exceptions.
 */
async function main(): Promise<void> {
  const cli = parseCli(process.argv.slice(2));

  // Hard-fail on missing UAT creds — goal: never silently skip.
  const apiId = Number.parseInt(process.env.TELEGRAM_API_ID ?? "", 10);
  if (!Number.isFinite(apiId)) {
    fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
  }
  const apiHash = process.env.TELEGRAM_API_HASH ?? "";
  if (!apiHash) fail("TELEGRAM_API_HASH missing — see SETUP.md");
  const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
  if (!session) {
    fail(
      "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
    );
  }

  process.stdout.write(
    `[uat] connecting to Telegram as the UAT driver account...\n`,
  );
  const driver = new Driver({ apiId, apiHash, session });
  await driver.connect();

  const resolved: { target: AgentTarget; botUserId: number }[] = [];
  const results: CaseResult[] = [];
  let startedAt = new Date();
  let durationSeconds = 0;

  try {
    const driverUserId = await driver.getMyUserId();
    process.stdout.write(`[uat] driver user_id=${driverUserId}\n`);

    // Resolve every agent's bot user_id up front so a missing username
    // fails before we waste any time on the run.
    for (const a of cli.agents) {
      try {
        const id = await driver.resolveBotUserId(a.botUsername);
        resolved.push({ target: a, botUserId: id });
        process.stdout.write(
          `[uat] resolved ${a.name} ${a.botUsername} → bot_user_id=${id}` +
            (a.admin ? " (admin)" : "") +
            "\n",
        );
      } catch (err) {
        process.stderr.write(
          `[uat] FAILED to resolve ${a.botUsername} for agent ${a.name}: ${(err as Error).message}\n`,
        );
        // process.exit() skips `finally`, so release the session explicitly.
        await driver.disconnect().catch(() => undefined);
        process.exit(3);
      }
    }

    // Run!
    startedAt = new Date();
    const t0 = Date.now();

    for (const { target, botUserId } of resolved) {
      process.stdout.write(`\n[uat] ─── agent: ${target.name} ─────────────\n`);
      for (const spec of CRITERIA) {
        // Skip 3d (non-admin refusal) on admin agents — they're legitimately
        // capable of those operations, so a "I can't" reply would be wrong.
        if (spec.id === "3d_admin_refusal" && target.admin) {
          process.stdout.write(
            `[uat] skip ${spec.id} on ${target.name} (admin: true)\n`,
          );
          continue;
        }

        for (const para of spec.paraphrases) {
          const r = await sendAndScore(
            driver,
            botUserId,
            driverUserId,
            spec,
            para.text,
            target.name,
            cli.replyTimeoutMs,
          );
          const tag =
            r.outcome === "pass" ? "✓" : r.outcome === "fail" ? "✗" : "·";
          process.stdout.write(
            `[uat] ${tag} ${spec.id}/${para.label} (${r.outcome}, ${r.durationMs}ms)\n`,
          );
          results.push({
            agent: target.name,
            criterion: spec.id,
            paraphrase: para,
            outcome: r.outcome,
            reply: r.reply,
            durationMs: r.durationMs,
            ...(r.errorMessage ? { errorMessage: r.errorMessage } : {}),
          });
          // Inter-message settle: keep below Telegram's user-account
          // outbound cap and let the agent finish its prior turn.
          await new Promise((res) => setTimeout(res, cli.settleMs));
        }
      }
    }

    durationSeconds = (Date.now() - t0) / 1000;
  } finally {
    // Best effort: a failing disconnect must not hide the run's outcome.
    await driver.disconnect().catch(() => undefined);
  }

  const md = renderMarkdown(results, {
    startedAt,
    durationSeconds,
    agents: resolved.map((r) => r.target.name),
  });
  writeFileSync(cli.reportPath, md, "utf-8");
  writeFileSync(
    cli.jsonPath,
    JSON.stringify(
      { startedAt: startedAt.toISOString(), durationSeconds, results },
      null,
      2,
    ),
    "utf-8",
  );
  process.stdout.write(`\n[uat] report → ${cli.reportPath}\n`);
  process.stdout.write(`[uat] json → ${cli.jsonPath}\n`);

  const passes = results.filter((r) => r.outcome === "pass").length;
  process.stdout.write(
    `[uat] overall: ${passes}/${results.length} passed (${results.length > 0 ? ((passes / results.length) * 100).toFixed(1) : "0"}%)\n`,
  );

  // A run that executed nothing proves nothing: treat it as a failure
  // instead of letting 0/0 count as "all passed".
  if (results.length === 0) {
    process.stderr.write(`[uat] no cases were executed — treating the run as failed\n`);
  }

  // Exit non-zero if anything failed, so the runner is CI-actionable.
  process.exit(results.length > 0 && passes === results.length ? 0 : 1);
}
|
|
453
|
+
|
|
454
|
+
// Entry point. Anything main() did not handle itself is reported with
// its stack trace (when there is one) and mapped to exit code 4.
const reportFatal = (err: unknown): void => {
  const detail = (err as Error).stack ?? err;
  process.stderr.write(`[uat] FATAL: ${detail}\n`);
  process.exit(4);
};

main().catch(reportFatal);
|