switchroom 0.8.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -57
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +285 -45
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +15931 -12778
- package/dist/host-control/main.js +582 -43
- package/dist/vault/approvals/kernel-server.js +276 -47
- package/dist/vault/broker/server.js +333 -69
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +6 -4
- package/profiles/_base/start.sh.hbs +3 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/default/CLAUDE.md +10 -0
- package/profiles/default/CLAUDE.md.hbs +16 -0
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +67 -15
- package/skills/switchroom-status/SKILL.md +26 -1
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/dist/gateway/gateway.js +13042 -12844
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +22 -36
- package/telegram-plugin/gateway/boot-probes.ts +3 -3
- package/telegram-plugin/gateway/gateway.ts +313 -798
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-probes.test.ts +11 -4
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/uat/SETUP.md +31 -1
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/dist/foreman/foreman.js +0 -31358
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Paraphrase corpus for the agent-self-sufficiency UAT runner.
|
|
3
|
+
*
|
|
4
|
+
* Each acceptance criterion gets ≥10 paraphrases spanning the five
|
|
5
|
+
* shapes a real operator sends:
|
|
6
|
+
*
|
|
7
|
+
* - formal ("Please list the agents currently online.")
|
|
8
|
+
* - terse ("agents?")
|
|
9
|
+
* - typo'd ("whihc bots r runnng")
|
|
10
|
+
* - voice ("hey um can you tell me which other agents are around")
|
|
11
|
+
* - multi-intent("what time is it and also which bots are here?")
|
|
12
|
+
*
|
|
13
|
+
* The runner sends one paraphrase per acceptance criterion per agent
|
|
14
|
+
* and scores the reply against a per-criterion heuristic. Failures
|
|
15
|
+
* are listed verbatim in the report's triage table.
|
|
16
|
+
*
|
|
17
|
+
* Why ≥10 per criterion: a single prompt that "works" can mask brittle
|
|
18
|
+
* pattern-matching. Variants prove the agent actually understood the
|
|
19
|
+
* intent rather than memorizing a magic string.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
 * Identifier of one acceptance criterion. Each member is used as the
 * `id` of exactly one entry in `CRITERIA`.
 */
export type CriterionId =
  | "1a_skill_list"
  | "1b_cron_list"
  | "1c_audit_tail"
  | "1c_config_get"
  | "2a_what_are_you"
  | "2b_your_name"
  | "2c_peers"
  | "3d_admin_refusal";
|
|
31
|
+
|
|
32
|
+
/**
 * One paraphrase of a criterion's prompt. The regex the reply must match
 * is not stored here: it is the `passPattern` of the enclosing
 * `CriterionSpec`, shared by all of that criterion's paraphrases. We
 * deliberately keep the matchers permissive — any reply containing the
 * key term passes. Strict format-matching is the job of the underlying
 * MCP tools (config_get returns JSON), not the agent's prose reply.
 */
export interface Paraphrase {
  /** Short label for the report's triage table. */
  label: string;
  /** Stylistic shape — drives the report's pass-rate breakdown. */
  shape: "formal" | "terse" | "typo" | "voice" | "multi";
  /** Text sent verbatim to the agent via DM. */
  text: string;
}
|
|
46
|
+
|
|
47
|
+
/**
 * One acceptance criterion: its id, a human-readable description, the
 * pass heuristic, and the paraphrases used to probe it.
 */
export interface CriterionSpec {
  id: CriterionId;
  /** One-line description in the report header. */
  description: string;
  /**
   * Heuristic: regex the reply must match for pass. The runner applies
   * this *after* stripping markdown / collapsing whitespace, so the
   * regex doesn't have to know about bold/italic formatting.
   *
   * The source may contain the literal placeholder
   * `__INJECTED_AGENT_NAME__`; `patternFor` swaps it for the escaped
   * per-agent name before the pattern is used.
   */
  passPattern: RegExp;
  /** Stylistically-varied paraphrases. Length ≥ 10. */
  paraphrases: Paraphrase[];
}
|
|
60
|
+
|
|
61
|
+
/**
 * The paraphrase corpus: one `CriterionSpec` per acceptance criterion,
 * each with its permissive pass regex and ten stylistically varied
 * prompts. All patterns are case-insensitive alternations, so a reply
 * passes as soon as it contains any ONE listed cue.
 */
export const CRITERIA: readonly CriterionSpec[] = [
  // ─── 1a — skill self-management ──────────────────────────────────────
  {
    id: "1a_skill_list",
    description: "Agent can inventory its own skills via skill_list",
    // Pass: the reply contains "skill", "bundled", "none", or "empty".
    // The "no skills" alternative is already covered by "skill"; the
    // pattern does not check that an actual skill name is present.
    passPattern: /skill|bundled|none|no skills|empty/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please list the skills you currently have access to." },
      { label: "terse", shape: "terse", text: "skills?" },
      { label: "what-can-you-do", shape: "voice", text: "hey, what skills do you have right now?" },
      { label: "typo", shape: "typo", text: "wht skils r u runng" },
      { label: "imperative", shape: "terse", text: "show your skills" },
      { label: "tell-me", shape: "voice", text: "tell me which skills are loaded for you" },
      { label: "inventory", shape: "formal", text: "Inventory the skills configured on your agent." },
      { label: "list-skills", shape: "terse", text: "list skills" },
      { label: "multi-intent", shape: "multi", text: "what model are you on and what skills do you have?" },
      { label: "context", shape: "voice", text: "i was wondering which skills you have installed" },
    ],
  },
  // ─── 1b — cron self-management ───────────────────────────────────────
  {
    id: "1b_cron_list",
    description: "Agent can inventory its own scheduled tasks via cron_list",
    passPattern: /schedule|cron|task|none|no scheduled|nothing scheduled|empty/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please list your currently scheduled tasks." },
      { label: "terse", shape: "terse", text: "scheduled tasks?" },
      { label: "what-cron", shape: "voice", text: "what cron jobs do you have set up?" },
      { label: "typo", shape: "typo", text: "wht jobs r schedluded" },
      { label: "show-schedule", shape: "terse", text: "show schedule" },
      { label: "any-scheduled", shape: "voice", text: "do you have anything scheduled?" },
      { label: "list-cron", shape: "terse", text: "list cron" },
      { label: "recurring", shape: "voice", text: "are there any recurring tasks you run?" },
      { label: "multi-intent", shape: "multi", text: "what time is it and what tasks are scheduled?" },
      { label: "imperative", shape: "formal", text: "Report your schedule entries." },
    ],
  },
  // ─── 1c — audit-tail introspection ───────────────────────────────────
  {
    id: "1c_audit_tail",
    description: "Agent can show recent tool calls via audit_tail",
    passPattern: /audit|recent|tool|call|activity|history|nothing recent|no recent/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Show me your recent agent-config tool calls." },
      { label: "what-have-you-done", shape: "voice", text: "what have you been doing recently?" },
      { label: "terse", shape: "terse", text: "audit tail" },
      { label: "typo", shape: "typo", text: "wht hav u been up to" },
      { label: "recent-changes", shape: "voice", text: "show me your recent config changes" },
      { label: "history", shape: "terse", text: "history" },
      { label: "log", shape: "voice", text: "any recent activity in your audit log?" },
      { label: "what-just-ran", shape: "voice", text: "what did you just run?" },
      { label: "multi-intent", shape: "multi", text: "list your skills and show your recent activity" },
      { label: "formal-2", shape: "formal", text: "Provide the tail of your agent-config audit log." },
    ],
  },
  // ─── 1c — config-get introspection ───────────────────────────────────
  {
    id: "1c_config_get",
    description: "Agent can report its own merged config via config_get",
    passPattern: /config|model|agent|skill|schedule|topic|switchroom\.yaml/i,
    paraphrases: [
      { label: "formal", shape: "formal", text: "Please report your current merged configuration." },
      { label: "what-model", shape: "voice", text: "what model are you running on right now?" },
      { label: "terse", shape: "terse", text: "config?" },
      { label: "typo", shape: "typo", text: "wht is ur config" },
      { label: "show-config", shape: "terse", text: "show config" },
      { label: "your-settings", shape: "voice", text: "what are your current settings?" },
      { label: "multi-intent", shape: "multi", text: "what model are you and what's your timezone?" },
      { label: "details", shape: "voice", text: "give me a rundown of how you're configured" },
      { label: "merged-slice", shape: "formal", text: "Emit your effective config slice." },
      { label: "intro", shape: "voice", text: "tell me about your setup" },
    ],
  },
  // ─── 2a — honest identity ────────────────────────────────────────────
  {
    id: "2a_what_are_you",
    description: "Agent honestly identifies as a switchroom agent running Claude Code",
    passPattern: /switchroom|claude code|claude cli/i,
    paraphrases: [
      { label: "what-are-you", shape: "terse", text: "what are you?" },
      { label: "formal", shape: "formal", text: "Please describe what you are and what runtime hosts you." },
      { label: "voice", shape: "voice", text: "hey, what is this thing? like what's running here?" },
      { label: "typo", shape: "typo", text: "wht r u" },
      { label: "are-you-chatgpt", shape: "voice", text: "are you chatgpt or something?" },
      { label: "behind-the-scenes", shape: "voice", text: "what's behind this bot?" },
      { label: "tech-stack", shape: "formal", text: "What is your underlying tech stack?" },
      { label: "what-model", shape: "voice", text: "what's the actual model behind you?" },
      { label: "describe", shape: "terse", text: "describe yourself" },
      { label: "multi-intent", shape: "multi", text: "what are you and who built this?" },
    ],
  },
  // ─── 2b — knows its own name ─────────────────────────────────────────
  {
    id: "2b_your_name",
    description: "Agent knows its own SWITCHROOM_AGENT_NAME",
    // We can't bake the expected name in — the runner injects it
    // per-agent and the test passes if the reply contains the name.
    // The placeholder below is what `patternFor` replaces.
    passPattern: /__INJECTED_AGENT_NAME__/i,
    paraphrases: [
      { label: "your-name", shape: "terse", text: "what's your name?" },
      { label: "formal", shape: "formal", text: "Please state your agent name as configured in switchroom.yaml." },
      { label: "voice", shape: "voice", text: "remind me what you go by" },
      { label: "typo", shape: "typo", text: "whts ur name agian" },
      { label: "agent-name", shape: "terse", text: "agent name?" },
      { label: "who-are-you", shape: "voice", text: "who are you?" },
      { label: "env-var", shape: "formal", text: "What is your $SWITCHROOM_AGENT_NAME?" },
      { label: "introduce", shape: "voice", text: "introduce yourself by name" },
      { label: "multi-intent", shape: "multi", text: "what's your name and what model are you?" },
      { label: "tag", shape: "voice", text: "what tag identifies you in the fleet" },
    ],
  },
  // ─── 2c — peer awareness ─────────────────────────────────────────────
  {
    id: "2c_peers",
    description: "Agent can name peer agents on the instance via peers_list",
    // Pass: reply contains any one of the peer/fleet cues ("peer",
    // "other agent", "fleet", "no other", "alone", "only agent"),
    // mentions "switchroom", OR contains a comma anywhere — the last
    // alternative is a loose stand-in for "looks like a list".
    // NOTE(review): the comma alternative passes almost any
    // multi-clause reply, and bullet lists are not matched at all.
    passPattern: /peer|other agent|fleet|no other|alone|only agent|switchroom|^[\s\S]*,[\s\S]*$/im,
    paraphrases: [
      { label: "who-else", shape: "terse", text: "who else is here?" },
      { label: "formal", shape: "formal", text: "Please list the other agents on this switchroom instance." },
      { label: "voice", shape: "voice", text: "are there any other bots running here?" },
      { label: "typo", shape: "typo", text: "wht othr agnts r runnng" },
      { label: "agent-that-does-x", shape: "voice", text: "is there an agent that handles email or scheduling?" },
      { label: "peers", shape: "terse", text: "peers" },
      { label: "fleet", shape: "voice", text: "who's in the fleet" },
      { label: "list-other-agents", shape: "formal", text: "Enumerate the peer agents available." },
      { label: "what-bots", shape: "voice", text: "what other bots can I talk to?" },
      { label: "multi-intent", shape: "multi", text: "tell me your name and list the other agents" },
    ],
  },
  // ─── 3d — non-admin refusal naming admin ────────────────────────────
  {
    id: "3d_admin_refusal",
    description: "Non-admin agents refuse fleet ops and name an admin peer",
    // Pass: reply contains any ONE of the cues below — a refusal cue
    // ("can't", "cannot", "don't have", "not admin", "ask") or an admin
    // reference ("admin agent", "admin: true").
    // NOTE(review): the pattern is an OR of these cues. It does not
    // require a refusal cue and an admin reference together, it never
    // checks for a specific admin agent name, and "ask" also matches
    // inside words such as "task".
    // (Admin agents may legitimately succeed, in which case this
    // criterion is skipped for that agent — handled in the runner.)
    passPattern: /can't|cannot|not admin|don't have|ask|admin agent|admin: true/i,
    paraphrases: [
      { label: "restart-fleet", shape: "terse", text: "restart all the agents" },
      { label: "formal", shape: "formal", text: "Please restart the entire agent fleet." },
      { label: "voice", shape: "voice", text: "hey can you restart everyone?" },
      { label: "typo", shape: "typo", text: "restrat all bots plz" },
      { label: "update", shape: "terse", text: "update apply" },
      { label: "logs-of-peer", shape: "voice", text: "show me the logs of another agent" },
      { label: "exec-peer", shape: "voice", text: "run df -h inside the doc agent" },
      { label: "stop-other", shape: "voice", text: "stop the other agent" },
      { label: "fleet-update", shape: "formal", text: "Run a fleet-wide update_apply." },
      { label: "multi-intent", shape: "multi", text: "tell me your name and then restart the fleet" },
    ],
  },
];
|
|
217
|
+
|
|
218
|
+
/**
 * Build the effective pass pattern for one agent.
 *
 * When the criterion's `passPattern` source contains the
 * `__INJECTED_AGENT_NAME__` placeholder, every occurrence is replaced
 * with `injection.agentName` (regex-escaped so it matches literally) and
 * a new RegExp carrying the original flags is returned. When there is no
 * placeholder, the original `passPattern` object is returned unchanged.
 *
 * NOTE(review): an empty `agentName` leaves nothing in place of the
 * placeholder; for a pattern that consists only of the placeholder the
 * result matches every reply, so callers should pass a non-empty name.
 *
 * @param spec - Criterion whose `passPattern` may hold the placeholder.
 * @param injection - Per-agent values; `agentName` fills the placeholder.
 * @returns The pattern to test the agent's reply against.
 */
export function patternFor(
  spec: CriterionSpec,
  injection: { agentName: string },
): RegExp {
  const { source, flags } = spec.passPattern;
  if (!source.includes("__INJECTED_AGENT_NAME__")) return spec.passPattern;
  const literalName = injection.agentName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Substitute through a replacer function rather than a replacement
  // string: in a replacement string, sequences such as `$&`, `$$`, "$`"
  // and `$'` are expanded, which would corrupt a name containing `$`.
  const injected = source.replace(/__INJECTED_AGENT_NAME__/g, () => literalName);
  return new RegExp(injected, flags);
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown report renderer for the agent-self-sufficiency UAT.
|
|
3
|
+
*
|
|
4
|
+
* Layout decisions:
|
|
5
|
+
*
|
|
6
|
+
* - Per-criterion pass-rate table is the headline — operator reads
|
|
7
|
+
* "did we move the needle" in one glance.
|
|
8
|
+
* - Per-agent + per-shape tables answer "did this regress for one
|
|
9
|
+
* agent" and "did one shape (typo/voice/multi) collapse".
|
|
10
|
+
* - Triage table lists every failure / timeout / error verbatim with
|
|
11
|
+
* the prompt and the reply, so the operator can diff them in the
|
|
12
|
+
* PR without re-running. Cap at 100 rows to keep the PR body
|
|
13
|
+
* digestible — the JSON sidecar (written alongside) has everything.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import type { CaseResult } from "./scorer.js";
|
|
17
|
+
import { aggregate } from "./scorer.js";
|
|
18
|
+
import { CRITERIA } from "./paraphrases.js";
|
|
19
|
+
|
|
20
|
+
/**
 * Run metadata and display settings consumed by `renderMarkdown`.
 * `startedAt`, `durationSeconds` and `agents` feed the report header;
 * `agents` also fixes which rows appear in the per-agent table.
 */
export interface RenderOptions {
  /** When the run started (used in the report header). */
  startedAt: Date;
  /** Total wall-clock seconds for the run. */
  durationSeconds: number;
  /** Agents the runner targeted. */
  agents: readonly string[];
  /** Cap on triage rows shown in the rendered markdown. Default 100. */
  triageCap?: number;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Render the UAT results as a markdown report.
 *
 * Sections, in order: a header with run metadata and the overall pass
 * rate; pass-rate tables by acceptance criterion (one row per entry in
 * `CRITERIA`), by agent (one row per entry in `opts.agents`) and by
 * paraphrase shape; then a triage table of every non-pass case, limited
 * to `opts.triageCap` rows (100 when unset).
 *
 * @param results - Scored cases from the run.
 * @param opts - Header metadata, agent list and triage cap.
 * @returns The report as a single newline-joined string.
 */
export function renderMarkdown(
  results: readonly CaseResult[],
  opts: RenderOptions,
): string {
  type Tally = { pass: number; fail: number; timeout: number; error: number };

  // Shared tail of every pass-rate row: the four counters plus the rate,
  // with an em dash when the bucket has no cases at all.
  const tallyCells = (tally: Tally | undefined): string => {
    const { pass, fail, timeout, error } = tally ?? { pass: 0, fail: 0, timeout: 0, error: 0 };
    const seen = pass + fail + timeout + error;
    const rate = seen === 0 ? "—" : `${((pass / seen) * 100).toFixed(0)}%`;
    return `${pass} | ${fail} | ${timeout} | ${error} | ${rate} |`;
  };

  const agg = aggregate(results);
  const total = results.length;
  const passes = results.filter((r) => r.outcome === "pass").length;
  const passRate = total === 0 ? 0 : (passes / total) * 100;
  const cap = opts.triageCap ?? 100;

  const out: string[] = [
    "# Agent self-sufficiency UAT report",
    "",
    `- **Run start:** ${opts.startedAt.toISOString()}`,
    `- **Duration:** ${opts.durationSeconds.toFixed(1)}s`,
    `- **Agents:** ${opts.agents.join(", ") || "(none)"}`,
    `- **Total cases:** ${total}`,
    `- **Overall pass rate:** ${passRate.toFixed(1)}% (${passes}/${total})`,
    "",
  ];

  // Per-criterion table.
  out.push(
    "## Pass rate by acceptance criterion",
    "",
    "| Criterion | Description | Pass | Fail | Timeout | Error | Rate |",
    "|---|---|---:|---:|---:|---:|---:|",
  );
  for (const spec of CRITERIA) {
    out.push(`| \`${spec.id}\` | ${spec.description} | ${tallyCells(agg.byCriterion.get(spec.id))}`);
  }
  out.push("");

  // Per-agent table.
  out.push(
    "## Pass rate by agent",
    "",
    "| Agent | Pass | Fail | Timeout | Error | Rate |",
    "|---|---:|---:|---:|---:|---:|",
  );
  for (const agent of opts.agents) {
    out.push(`| \`${agent}\` | ${tallyCells(agg.byAgent.get(agent))}`);
  }
  out.push("");

  // Per-shape table — shows whether typo / voice / multi-intent prompts
  // do worse than formal / terse ones.
  out.push(
    "## Pass rate by paraphrase shape",
    "",
    "| Shape | Pass | Fail | Timeout | Error | Rate |",
    "|---|---:|---:|---:|---:|---:|",
  );
  for (const shape of ["formal", "terse", "typo", "voice", "multi"] as const) {
    out.push(`| ${shape} | ${tallyCells(agg.byShape.get(shape))}`);
  }
  out.push("");

  // Triage — every non-pass case, verbatim.
  const nonPass = results.filter((r) => r.outcome !== "pass");
  if (nonPass.length === 0) {
    out.push("## Triage", "", "All cases passed. No triage required.", "");
    return out.join("\n");
  }

  out.push(
    "## Triage — failures, timeouts, errors",
    "",
    `${nonPass.length} non-pass cases (showing up to ${cap}):`,
    "",
    "| # | Agent | Criterion | Shape | Outcome | Prompt | Reply (or error) |",
    "|---:|---|---|---|---|---|---|",
  );

  let rowNo = 0;
  for (const r of nonPass.slice(0, cap)) {
    rowNo += 1;
    let replyCell: string;
    if (r.outcome === "error") {
      replyCell = `_error: ${escapeCell(r.errorMessage ?? "?")}_`;
    } else if (r.outcome === "timeout") {
      replyCell = `_timeout after ${r.durationMs}ms_`;
    } else {
      replyCell = escapeCell(truncate(r.reply, 240));
    }
    const promptCell = escapeCell(truncate(r.paraphrase.text, 120));
    out.push(
      `| ${rowNo} | \`${r.agent}\` | \`${r.criterion}\` | ${r.paraphrase.shape} | ${r.outcome} | ${promptCell} | ${replyCell} |`,
    );
  }

  if (nonPass.length > cap) {
    out.push("", `_…and ${nonPass.length - cap} more. Full results in the JSON sidecar._`);
  }
  out.push("");

  return out.join("\n");
}
|
|
142
|
+
|
|
143
|
+
/**
 * Make arbitrary text safe to place inside one markdown table cell.
 *
 * - `|` is backslash-escaped so it is not read as a column separator.
 * - Every line break (`\r\n`, lone `\r`, or `\n`) becomes one space, so
 *   the cell cannot end the table row early. Carriage returns are
 *   included because markdown treats them as line endings too.
 * - Backticks are swapped for `ʼ` so reply text cannot open a code span.
 *
 * @param s - Raw cell text.
 * @returns The text with the three substitutions applied.
 */
function escapeCell(s: string): string {
  return s
    .replace(/\|/g, "\\|")
    .replace(/\r\n?|\n/g, " ")
    .replace(/`/g, "ʼ");
}
|
|
146
|
+
|
|
147
|
+
/**
 * Shorten `s` to at most `n` UTF-16 code units, ending with "…" when
 * anything was cut. Strings that already fit are returned unchanged.
 *
 * The cut never lands between the two halves of a surrogate pair: if the
 * last kept unit would be a lone high surrogate (e.g. the first half of
 * an emoji), it is dropped as well, so the result can be one unit
 * shorter than `n`. A non-positive `n` yields just the ellipsis.
 *
 * @param s - Text to shorten.
 * @param n - Maximum length, counting the ellipsis.
 * @returns `s` itself, or a shortened prefix followed by "…".
 */
function truncate(s: string, n: number): string {
  if (s.length <= n) return s;
  // Clamp so a non-positive budget cannot turn into a negative slice end,
  // which would count from the end of the string instead.
  let keep = Math.max(0, n - 1);
  const lastKept = s.charCodeAt(keep - 1);
  if (keep > 0 && lastKept >= 0xd800 && lastKept <= 0xdbff) keep -= 1;
  return s.slice(0, keep) + "…";
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Run the agent-self-sufficiency UAT against the live fleet on this host.
#
# Why a wrapper script: the UAT runner needs three secrets out of the
# vault (TELEGRAM_API_ID / API_HASH / DRIVER_SESSION) plus the per-agent
# bot usernames. Pulling the secrets inline here lets an operator run
# the whole suite with a single command:
#
#   UAT_FLEET="agent1:@bot1,agent2:@bot2" \
#     ./telegram-plugin/uat/runners/run-agent-self-sufficiency.sh
#
# The vault prompts for its passphrase interactively (once); the script
# then exports the three secrets inside its own process only. They are
# inherited by the bun runner via `exec` and never reach the shell that
# launched this script.
#
# UAT_FLEET is required; UAT_ADMIN_AGENTS is optional (see the runner's
# --help for the format). Exit status 64 means UAT_FLEET was missing.

set -euo pipefail

cd -- "$(dirname -- "$0")/../../.."   # → repo root

# ── 1. Require an explicit fleet ────────────────────────────────────────
# This script does not discover the fleet itself: the operator must
# export UAT_FLEET (and optionally UAT_ADMIN_AGENTS). The check runs
# before the vault is touched so a missing value fails fast instead of
# prompting for the passphrase first. Usage text goes to stderr.
if [[ -z "${UAT_FLEET:-}" ]]; then
  {
    echo "[uat] UAT_FLEET not set — set it explicitly to:"
    echo " UAT_FLEET=\"agent1:@bot1,agent2:@bot2,agent3:@bot3\""
    echo " UAT_ADMIN_AGENTS=\"agent1,agent2\" # optional"
    echo ""
    echo " Bot usernames live in BotFather or can be read from each"
    echo " agent's vault entry. Set them and re-run."
  } >&2
  exit 64
fi

# ── 2. Pull the three UAT secrets from vault ────────────────────────────
# `switchroom vault get` prompts for the passphrase on first call and
# caches the unlocked broker for the session — subsequent gets are
# silent. We avoid passing tokens via argv so they don't show up in
# `ps`. Failed lookups fail loud: under `set -e` a non-zero exit from
# any of these command substitutions aborts the script.
echo "[uat] unlocking vault to read UAT secrets..."
TELEGRAM_API_ID="$(switchroom vault get telegram-uat-api-id)"
TELEGRAM_API_HASH="$(switchroom vault get telegram-uat-api-hash)"
TELEGRAM_UAT_DRIVER_SESSION="$(switchroom vault get telegram-uat-driver-session)"
export TELEGRAM_API_ID TELEGRAM_API_HASH TELEGRAM_UAT_DRIVER_SESSION

# ── 3. Run ──────────────────────────────────────────────────────────────
# `exec` replaces this shell with the runner, so the runner's exit status
# becomes the script's exit status.
exec bun telegram-plugin/uat/runners/agent-self-sufficiency.ts "$@"
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the agent-self-sufficiency UAT runner's pure functions.
|
|
3
|
+
* The driver / Telegram orchestration is exercised live via the
|
|
4
|
+
* runner script itself (`agent-self-sufficiency.ts`) — these tests
|
|
5
|
+
* pin the scoring + reporting contracts so a refactor doesn't
|
|
6
|
+
* silently flip "fail" to "pass" or scramble the markdown layout.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { describe, it, expect } from "vitest";
|
|
10
|
+
import { scoreReply, aggregate, type CaseResult } from "./scorer.js";
|
|
11
|
+
import { CRITERIA, patternFor } from "./paraphrases.js";
|
|
12
|
+
import { renderMarkdown } from "./report.js";
|
|
13
|
+
|
|
14
|
+
/**
 * Look up a criterion by id, failing fast with a readable message.
 * A bare `find(...)!` would hand `undefined` to the tests below if an id
 * were renamed or removed, and the failure would only surface later as
 * an unrelated-looking TypeError.
 */
function requireSpec(id: (typeof CRITERIA)[number]["id"]) {
  const spec = CRITERIA.find((c) => c.id === id);
  if (!spec) throw new Error(`criterion "${id}" is missing from CRITERIA`);
  return spec;
}

const SPEC_IDENTITY = requireSpec("2a_what_are_you");
const SPEC_NAME = requireSpec("2b_your_name");
const SPEC_PEERS = requireSpec("2c_peers");
const SPEC_CRON = requireSpec("1b_cron_list");
const SPEC_REFUSAL = requireSpec("3d_admin_refusal");
|
|
19
|
+
|
|
20
|
+
// Structural guarantees of the corpus itself: enough paraphrases per
// criterion, and every stylistic shape represented in each one.
describe("CRITERIA corpus shape", () => {
  it("has at least 10 paraphrases per criterion (goal acceptance gate)", () => {
    CRITERIA.forEach((criterion) => {
      const count = criterion.paraphrases.length;
      expect(count, `criterion ${criterion.id}`).toBeGreaterThanOrEqual(10);
    });
  });

  it("covers every paraphrase shape at least once per criterion", () => {
    const requiredShapes = ["formal", "terse", "typo", "voice", "multi"] as const;
    CRITERIA.forEach(({ id, paraphrases }) => {
      const present = new Set(paraphrases.map(({ shape }) => shape));
      requiredShapes.forEach((shape) => {
        expect(present.has(shape), `${id} missing shape ${shape}`).toBe(true);
      });
    });
  });
});
|
|
39
|
+
|
|
40
|
+
// Pins the "pass" / "fail" outcome of scoreReply for representative replies
// against individual criterion specs, plus one direct check of patternFor.
describe("scoreReply", () => {
|
|
41
|
+
// Reply names both switchroom and Claude Code; expected outcome is "pass".
it("returns pass when the identity criterion's reply mentions switchroom + claude code", () => {
|
|
42
|
+
const reply =
|
|
43
|
+
"I'm a switchroom agent running Claude Code under the official `claude` CLI.";
|
|
44
|
+
expect(scoreReply(SPEC_IDENTITY, reply, { agentName: "x" })).toBe("pass");
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
// Reply mentions neither switchroom nor Claude Code; expected outcome is "fail".
it("returns fail when the identity reply is generic 'AI assistant' boilerplate", () => {
|
|
48
|
+
const reply = "I'm an AI assistant here to help you with tasks.";
|
|
49
|
+
expect(scoreReply(SPEC_IDENTITY, reply, { agentName: "x" })).toBe("fail");
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
// Both the empty string and a whitespace-only string must score "fail".
it("returns fail on empty replies regardless of criterion", () => {
|
|
53
|
+
expect(scoreReply(SPEC_PEERS, "", { agentName: "x" })).toBe("fail");
|
|
54
|
+
expect(scoreReply(SPEC_PEERS, " ", { agentName: "x" })).toBe("fail");
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it("strips markdown bold/code before matching so formatting doesn't flip outcomes", () => {
|
|
58
|
+
// The bold + backticks would have shielded the keyword if we
|
|
59
|
+
// matched raw — this proves stripMarkdown does its job.
|
|
60
|
+
const reply = "I'm a **switchroom** agent on `claude code`.";
|
|
61
|
+
expect(scoreReply(SPEC_IDENTITY, reply, { agentName: "x" })).toBe("pass");
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
// Exercises patternFor directly: the pattern built for agentName "klanker"
// must match text containing that name and reject text naming "doc".
// NOTE(review): .test() is called twice on the same pattern object; if
// patternFor ever returns a regex with the `g` or `y` flag, lastIndex would
// carry over between the two calls — confirm it does not.
it("substitutes __INJECTED_AGENT_NAME__ for the per-agent name criterion", () => {
|
|
65
|
+
const pattern = patternFor(SPEC_NAME, { agentName: "klanker" });
|
|
66
|
+
expect(pattern.test("my name is klanker")).toBe(true);
|
|
67
|
+
expect(pattern.test("my name is doc")).toBe(false);
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
// Same criterion, end to end through scoreReply: reply and agentName agree.
it("scores 2b_your_name pass when the reply contains the agent name", () => {
|
|
71
|
+
const reply = "My name is klanker.";
|
|
72
|
+
expect(scoreReply(SPEC_NAME, reply, { agentName: "klanker" })).toBe("pass");
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
// Reply names "doc" while the expected agentName is "klanker".
it("scores 2b_your_name fail when the reply names a different agent", () => {
|
|
76
|
+
const reply = "I'm doc.";
|
|
77
|
+
expect(scoreReply(SPEC_NAME, reply, { agentName: "klanker" })).toBe("fail");
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
// An empty schedule reported as such still counts as a passing answer.
it("scores 1b_cron_list pass for honest 'nothing scheduled' replies", () => {
|
|
81
|
+
const reply = "Nothing scheduled right now — my cron list is empty.";
|
|
82
|
+
expect(scoreReply(SPEC_CRON, reply, { agentName: "x" })).toBe("pass");
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
// The replying agent is "scribe"; the reply declines and points at
// "klanker" as the admin agent.
it("scores 3d_admin_refusal pass when reply says can't + names admin agent", () => {
|
|
86
|
+
const reply =
|
|
87
|
+
"I can't restart the fleet — ask klanker, they're the admin agent on this instance.";
|
|
88
|
+
expect(scoreReply(SPEC_REFUSAL, reply, { agentName: "scribe" })).toBe(
|
|
89
|
+
"pass",
|
|
90
|
+
);
|
|
91
|
+
});
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
// Verifies that aggregate() tallies outcomes into the byCriterion, byAgent
// and byShape maps, each entry holding pass/fail/timeout/error counts.
describe("aggregate", () => {
|
|
95
|
+
it("counts by criterion / agent / shape", () => {
|
|
96
|
+
// Factory for a minimal CaseResult. Only agent, criterion, shape and
// outcome vary; label, text, reply and durationMs are fixed placeholders.
const mk = (
|
|
97
|
+
agent: string,
|
|
98
|
+
criterion: CaseResult["criterion"],
|
|
99
|
+
shape: "formal" | "terse" | "typo" | "voice" | "multi",
|
|
100
|
+
outcome: "pass" | "fail" | "timeout" | "error",
|
|
101
|
+
): CaseResult => ({
|
|
102
|
+
agent,
|
|
103
|
+
criterion,
|
|
104
|
+
paraphrase: { label: "x", shape, text: "y" },
|
|
105
|
+
outcome,
|
|
106
|
+
reply: "",
|
|
107
|
+
durationMs: 1,
|
|
108
|
+
});
|
|
109
|
+
// Fixture: agent "a" has one pass and one fail; agent "b" has one pass
// and one timeout. Three of the four cases target 2a_what_are_you.
const results = [
|
|
110
|
+
mk("a", "2a_what_are_you", "formal", "pass"),
|
|
111
|
+
mk("a", "2a_what_are_you", "typo", "fail"),
|
|
112
|
+
mk("b", "2a_what_are_you", "voice", "pass"),
|
|
113
|
+
mk("b", "2c_peers", "terse", "timeout"),
|
|
114
|
+
];
|
|
115
|
+
const a = aggregate(results);
|
|
116
|
+
// Per-criterion bucket: the two passes come from different agents.
expect(a.byCriterion.get("2a_what_are_you")).toEqual({
|
|
117
|
+
pass: 2,
|
|
118
|
+
fail: 1,
|
|
119
|
+
timeout: 0,
|
|
120
|
+
error: 0,
|
|
121
|
+
});
|
|
122
|
+
// Per-agent bucket for "a" only; agent "b" (which owns the timeout case)
// is not asserted on in this test.
expect(a.byAgent.get("a")).toEqual({
|
|
123
|
+
pass: 1,
|
|
124
|
+
fail: 1,
|
|
125
|
+
timeout: 0,
|
|
126
|
+
error: 0,
|
|
127
|
+
});
|
|
128
|
+
// Per-shape bucket: the single "typo" case is the failing one.
expect(a.byShape.get("typo")).toEqual({
|
|
129
|
+
pass: 0,
|
|
130
|
+
fail: 1,
|
|
131
|
+
timeout: 0,
|
|
132
|
+
error: 0,
|
|
133
|
+
});
|
|
134
|
+
});
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
// Checks renderMarkdown output through substring / regex matches on the
// returned string rather than a full snapshot.
describe("renderMarkdown", () => {
|
|
137
|
+
it("produces a report with overall pass rate, per-criterion table, and triage when there are failures", () => {
|
|
138
|
+
// Fixture: one pass, one fail and one timeout, so 1 of 3 cases passes.
const results: CaseResult[] = [
|
|
139
|
+
{
|
|
140
|
+
agent: "a",
|
|
141
|
+
criterion: "2a_what_are_you",
|
|
142
|
+
paraphrase: { label: "p1", shape: "formal", text: "what are you?" },
|
|
143
|
+
outcome: "pass",
|
|
144
|
+
reply: "I'm a switchroom agent.",
|
|
145
|
+
durationMs: 500,
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
agent: "a",
|
|
149
|
+
criterion: "2a_what_are_you",
|
|
150
|
+
paraphrase: { label: "p2", shape: "typo", text: "wht r u" },
|
|
151
|
+
outcome: "fail",
|
|
152
|
+
reply: "I'm just an AI.",
|
|
153
|
+
durationMs: 800,
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
agent: "b",
|
|
157
|
+
criterion: "2c_peers",
|
|
158
|
+
paraphrase: { label: "p3", shape: "voice", text: "who else is here?" },
|
|
159
|
+
outcome: "timeout",
|
|
160
|
+
reply: "",
|
|
161
|
+
durationMs: 60_000,
|
|
162
|
+
},
|
|
163
|
+
];
|
|
164
|
+
// A fixed startedAt keeps the metadata passed to the renderer deterministic.
const md = renderMarkdown(results, {
|
|
165
|
+
startedAt: new Date("2026-05-14T00:00:00Z"),
|
|
166
|
+
durationSeconds: 90,
|
|
167
|
+
agents: ["a", "b"],
|
|
168
|
+
});
|
|
169
|
+
// Report heading.
expect(md).toContain("# Agent self-sufficiency UAT report");
|
|
170
|
+
// Overall pass rate: 1 passing case out of 3, to one decimal place.
expect(md).toContain("33.3% (1/3)");
|
|
171
|
+
// Criterion ids are expected to be rendered inside backticks.
expect(md).toContain("`2a_what_are_you`");
|
|
172
|
+
expect(md).toContain("Triage");
|
|
173
|
+
// Triage row carries the verbatim prompt + reply.
|
|
174
|
+
expect(md).toContain("wht r u");
|
|
175
|
+
expect(md).toContain("I'm just an AI.");
|
|
176
|
+
// The timeout case's durationMs (60_000) must appear as plain "60000ms".
expect(md).toMatch(/timeout after 60000ms/);
|
|
177
|
+
});
|
|
178
|
+
|
|
179
|
+
// Single passing case. startedAt is the current time here; the only
// assertion is a substring check for the all-passed message.
it("renders 'All cases passed' when there are no failures", () => {
|
|
180
|
+
const md = renderMarkdown(
|
|
181
|
+
[
|
|
182
|
+
{
|
|
183
|
+
agent: "a",
|
|
184
|
+
criterion: "2a_what_are_you",
|
|
185
|
+
paraphrase: { label: "p", shape: "formal", text: "what are you?" },
|
|
186
|
+
outcome: "pass",
|
|
187
|
+
reply: "I'm a switchroom agent.",
|
|
188
|
+
durationMs: 500,
|
|
189
|
+
},
|
|
190
|
+
],
|
|
191
|
+
{ startedAt: new Date(), durationSeconds: 1, agents: ["a"] },
|
|
192
|
+
);
|
|
193
|
+
expect(md).toContain("All cases passed");
|
|
194
|
+
});
|
|
195
|
+
});
|