switchroom 0.7.15 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +51 -59
- package/bin/run-hook.sh +27 -11
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +410 -133
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +26937 -5601
- package/dist/host-control/main.js +12702 -0
- package/dist/vault/approvals/kernel-server.js +467 -184
- package/dist/vault/broker/server.js +1430 -724
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +7 -4
- package/profiles/_base/settings.json.hbs +20 -5
- package/profiles/_base/start.sh.hbs +16 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/_shared/telegram-style.md.hbs +20 -90
- package/profiles/_shared/vault-protocol.md.hbs +68 -0
- package/profiles/default/CLAUDE.md +50 -96
- package/profiles/default/CLAUDE.md.hbs +36 -6
- package/profiles/default/workspace/SOUL.md.hbs +12 -5
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +191 -0
- package/skills/switchroom-status/SKILL.md +27 -2
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/token-helpers/SKILL.md +24 -1
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/analytics-posthog.ts +191 -0
- package/telegram-plugin/bridge/bridge.ts +69 -0
- package/telegram-plugin/bridge/ipc-client.ts +4 -1
- package/telegram-plugin/dist/bridge/bridge.js +194 -119
- package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
- package/telegram-plugin/dist/server.js +245 -189
- package/telegram-plugin/first-paint.ts +3 -24
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +169 -40
- package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
- package/telegram-plugin/gateway/boot-probes.ts +166 -123
- package/telegram-plugin/gateway/boot-reason.ts +41 -7
- package/telegram-plugin/gateway/boot-version.ts +66 -0
- package/telegram-plugin/gateway/gateway.ts +3499 -1885
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
- package/telegram-plugin/gateway/quarantine.ts +69 -0
- package/telegram-plugin/gateway/quota-cache.ts +9 -4
- package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
- package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
- package/telegram-plugin/gateway/recent-denials.ts +77 -0
- package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
- package/telegram-plugin/history.ts +91 -0
- package/telegram-plugin/hooks/hooks.json +10 -0
- package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/inbound-classifier.ts +50 -0
- package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
- package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
- package/telegram-plugin/package.json +4 -2
- package/telegram-plugin/permission-rule.ts +51 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/registry/reaper.ts +223 -0
- package/telegram-plugin/retry-api-call.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +177 -0
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/secret-detect/index.ts +24 -0
- package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
- package/telegram-plugin/secret-detect/vault-error.ts +78 -11
- package/telegram-plugin/secret-detect/vault-write.ts +14 -2
- package/telegram-plugin/server.js +41795 -0
- package/telegram-plugin/session-tail.ts +6 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/silence-poke.ts +420 -0
- package/telegram-plugin/silent-end.ts +174 -0
- package/telegram-plugin/stream-controller.ts +13 -0
- package/telegram-plugin/stream-reply-handler.ts +7 -0
- package/telegram-plugin/subagent-watcher.ts +213 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
- package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
- package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
- package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
- package/telegram-plugin/tests/boot-probes.test.ts +216 -10
- package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
- package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
- package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
- package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
- package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
- package/telegram-plugin/tests/history-reaper.test.ts +378 -0
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
- package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
- package/telegram-plugin/tests/issues-card.test.ts +49 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
- package/telegram-plugin/tests/permission-rule.test.ts +80 -1
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/tests/races.test.ts +179 -0
- package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
- package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
- package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
- package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
- package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
- package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
- package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
- package/telegram-plugin/tests/silence-poke.test.ts +493 -0
- package/telegram-plugin/tests/silent-end.test.ts +206 -0
- package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
- package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
- package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
- package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
- package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
- package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
- package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
- package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
- package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
- package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
- package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
- package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
- package/telegram-plugin/turn-signal-tracker.ts +100 -24
- package/telegram-plugin/uat/SETUP.md +210 -35
- package/telegram-plugin/uat/assertions.ts +264 -37
- package/telegram-plugin/uat/driver-info.ts +57 -0
- package/telegram-plugin/uat/driver.ts +590 -51
- package/telegram-plugin/uat/harness.ts +140 -94
- package/telegram-plugin/uat/load-env.test.ts +72 -0
- package/telegram-plugin/uat/load-env.ts +48 -0
- package/telegram-plugin/uat/login.ts +96 -53
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
- package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
- package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
- package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
- package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
- package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
- package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
- package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
- package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
- package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
- package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
- package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
- package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
- package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
- package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
- package/telegram-plugin/vault-approval-posture.ts +42 -0
- package/telegram-plugin/welcome-text.ts +1 -0
- package/telegram-plugin/active-pins-sweep.ts +0 -204
- package/telegram-plugin/active-pins.ts +0 -146
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/card-event-log.ts +0 -138
- package/telegram-plugin/dist/foreman/foreman.js +0 -31106
- package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
- package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/pin-event-log.ts +0 -76
- package/telegram-plugin/progress-card-driver.ts +0 -2886
- package/telegram-plugin/progress-card-pin-manager.ts +0 -589
- package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
- package/telegram-plugin/progress-card.ts +0 -1409
- package/telegram-plugin/tests/HARNESS.md +0 -340
- package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
- package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
- package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
- package/telegram-plugin/tests/active-pins.test.ts +0 -187
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/card-event-log.test.ts +0 -145
- package/telegram-plugin/tests/first-paint.test.ts +0 -257
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
- package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
- package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
- package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
- package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
- package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
- package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
- package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
- package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
- package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
- package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
- package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
- package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
- package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
- package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
- package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
- package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
- package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
- package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
- package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
- package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
- package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
- package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
- package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
- package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
- package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
- package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
- package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
- package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
- package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
- package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
- package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
- package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
- package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
- package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
- package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
- package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
- package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
- package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
- package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
- package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
- package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
- package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
- package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
- package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
- package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
- package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
- package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
- package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
- package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
- package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
- package/telegram-plugin/two-zone-card.ts +0 -269
- package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown report renderer for the agent-self-sufficiency UAT.
|
|
3
|
+
*
|
|
4
|
+
* Layout decisions:
|
|
5
|
+
*
|
|
6
|
+
* - Per-criterion pass-rate table is the headline — operator reads
|
|
7
|
+
* "did we move the needle" in one glance.
|
|
8
|
+
* - Per-agent + per-shape tables answer "did this regress for one
|
|
9
|
+
* agent" and "did one shape (typo/voice/multi) collapse".
|
|
10
|
+
* - Triage table lists every failure / timeout / error verbatim with
|
|
11
|
+
* the prompt and the reply, so the operator can diff them in the
|
|
12
|
+
* PR without re-running. Cap at 100 rows to keep the PR body
|
|
13
|
+
* digestible — the JSON sidecar (written alongside) has everything.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import type { CaseResult } from "./scorer.js";
|
|
17
|
+
import { aggregate } from "./scorer.js";
|
|
18
|
+
import { CRITERIA } from "./paraphrases.js";
|
|
19
|
+
|
|
20
|
+
/**
 * Run-level metadata and rendering knobs accepted by `renderMarkdown`
 * alongside the case results.
 */
export interface RenderOptions {
  /**
   * Moment the run began; shown in the report header.
   */
  startedAt: Date;

  /**
   * Wall-clock length of the whole run, expressed in seconds.
   */
  durationSeconds: number;

  /**
   * Names of the agents the runner targeted.
   */
  agents: readonly string[];

  /**
   * Upper bound on the number of triage rows in the rendered
   * markdown. Defaults to 100 when omitted.
   */
  triageCap?: number;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Renders the trailing cells shared by every pass-rate table row: the
 * four outcome counts followed by the pass percentage.
 *
 * A missing bucket (nothing recorded under that key) is rendered as
 * all zeros, and an empty bucket shows "—" instead of dividing by zero.
 */
function renderTallyCells(
  tally: { pass: number; fail: number; timeout: number; error: number } | undefined,
): string {
  const { pass, fail, timeout, error } = tally ?? { pass: 0, fail: 0, timeout: 0, error: 0 };
  const n = pass + fail + timeout + error;
  const rate = n === 0 ? "—" : `${((pass / n) * 100).toFixed(0)}%`;
  return `${pass} | ${fail} | ${timeout} | ${error} | ${rate} |`;
}

/**
 * Normalises the caller-supplied triage cap to a non-negative whole
 * number. Without this, a negative cap would make `slice(0, cap)` drop
 * rows from the END of the list and inflate the "…and N more" count,
 * and a fractional cap would print a fractional row count.
 * `Infinity` is left alone so callers can still ask for "no cap".
 */
function resolveTriageCap(requested: number | undefined): number {
  const raw = requested ?? 100;
  if (Number.isNaN(raw)) return 100;
  return Math.max(0, Math.floor(raw));
}

/**
 * Builds the markdown report for one UAT run.
 *
 * Sections, in order: a header with run metadata and the overall pass
 * rate; pass-rate tables by acceptance criterion, by agent, and by
 * paraphrase shape; and a triage table listing non-pass cases (capped
 * at `opts.triageCap`, default 100) or a note that everything passed.
 *
 * @param results - Every case result from the run.
 * @param opts - Run metadata and rendering options.
 * @returns The report lines joined with `\n`.
 */
export function renderMarkdown(
  results: readonly CaseResult[],
  opts: RenderOptions,
): string {
  const agg = aggregate(results);
  const total = results.length;
  const passes = results.filter((r) => r.outcome === "pass").length;
  // An empty run reports 0% rather than NaN.
  const passRate = total === 0 ? 0 : (passes / total) * 100;
  const cap = resolveTriageCap(opts.triageCap);

  const lines: string[] = [];
  lines.push("# Agent self-sufficiency UAT report");
  lines.push("");
  lines.push(`- **Run start:** ${opts.startedAt.toISOString()}`);
  lines.push(`- **Duration:** ${opts.durationSeconds.toFixed(1)}s`);
  lines.push(`- **Agents:** ${opts.agents.join(", ") || "(none)"}`);
  lines.push(`- **Total cases:** ${total}`);
  lines.push(`- **Overall pass rate:** ${passRate.toFixed(1)}% (${passes}/${total})`);
  lines.push("");

  // Per-criterion table. Iterates CRITERIA (not the aggregate's keys)
  // so every criterion gets a row even when it has no results.
  lines.push("## Pass rate by acceptance criterion");
  lines.push("");
  lines.push("| Criterion | Description | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---|---:|---:|---:|---:|---:|");
  for (const spec of CRITERIA) {
    lines.push(
      `| \`${spec.id}\` | ${spec.description} | ${renderTallyCells(agg.byCriterion.get(spec.id))}`,
    );
  }
  lines.push("");

  // Per-agent table, one row per targeted agent in the order given.
  lines.push("## Pass rate by agent");
  lines.push("");
  lines.push("| Agent | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---:|---:|---:|---:|---:|");
  for (const agent of opts.agents) {
    lines.push(`| \`${agent}\` | ${renderTallyCells(agg.byAgent.get(agent))}`);
  }
  lines.push("");

  // Per-shape table — does the corpus's typo / voice / multi-intent
  // styles regress relative to formal / terse?
  lines.push("## Pass rate by paraphrase shape");
  lines.push("");
  lines.push("| Shape | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---:|---:|---:|---:|---:|");
  for (const shape of ["formal", "terse", "typo", "voice", "multi"] as const) {
    lines.push(`| ${shape} | ${renderTallyCells(agg.byShape.get(shape))}`);
  }
  lines.push("");

  // Triage — every non-pass, up to the cap.
  const triage = results.filter((r) => r.outcome !== "pass");
  if (triage.length > 0) {
    lines.push("## Triage — failures, timeouts, errors");
    lines.push("");
    lines.push(`${triage.length} non-pass cases (showing up to ${cap}):`);
    lines.push("");
    lines.push("| # | Agent | Criterion | Shape | Outcome | Prompt | Reply (or error) |");
    lines.push("|---:|---|---|---|---|---|---|");
    triage.slice(0, cap).forEach((r, i) => {
      // Errors and timeouts have no meaningful reply, so the last
      // column shows the error message or the elapsed time instead.
      const reply =
        r.outcome === "error"
          ? `_error: ${escapeCell(r.errorMessage ?? "?")}_`
          : r.outcome === "timeout"
            ? `_timeout after ${r.durationMs}ms_`
            : escapeCell(truncate(r.reply, 240));
      lines.push(
        `| ${i + 1} | \`${r.agent}\` | \`${r.criterion}\` | ${r.paraphrase.shape} | ${r.outcome} | ${escapeCell(truncate(r.paraphrase.text, 120))} | ${reply} |`,
      );
    });
    if (triage.length > cap) {
      lines.push("");
      lines.push(`_…and ${triage.length - cap} more. Full results in the JSON sidecar._`);
    }
    lines.push("");
  } else {
    lines.push("## Triage");
    lines.push("");
    lines.push("All cases passed. No triage required.");
    lines.push("");
  }

  return lines.join("\n");
}
|
|
142
|
+
|
|
143
|
+
/**
 * Makes arbitrary text safe to embed in a single markdown table cell.
 *
 * - `|` is backslash-escaped so it is not read as a column separator.
 * - Every line break — `\r\n`, a lone `\r`, or a lone `\n` — becomes
 *   one space. Handling only `\n` would leave a carriage return in
 *   the cell, which markdown also treats as a line ending and which
 *   would split the table row.
 * - Backticks are swapped for `ʼ` (presumably so a stray backtick
 *   cannot open a code span that swallows the rest of the row).
 *
 * @param s - Raw cell text.
 * @returns The text with pipes escaped, line breaks flattened, and
 *   backticks replaced.
 */
function escapeCell(s: string): string {
  return s
    .replace(/\|/g, "\\|")
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/`/g, "ʼ");
}
|
|
146
|
+
|
|
147
|
+
/**
 * Shortens `s` to at most `n` UTF-16 code units, ending with "…" when
 * anything was cut.
 *
 * @param s - Text to shorten.
 * @param n - Maximum length of the result, ellipsis included.
 * @returns `s` unchanged when it already fits; otherwise a prefix of
 *   `s` followed by "…". Returns "" when `n` is zero or negative and
 *   `s` does not fit.
 */
function truncate(s: string, n: number): string {
  if (s.length <= n) return s;
  // A non-positive budget would otherwise turn into a negative slice
  // index and return almost the whole string.
  if (n <= 0) return "";
  let end = n - 1;
  if (end > 0) {
    // Don't cut between the two halves of a surrogate pair (e.g. an
    // emoji): a dangling high surrogate renders as a replacement glyph.
    const last = s.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  }
  return s.slice(0, end) + "…";
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Run the agent-self-sufficiency UAT against the live fleet on this host.
#
# Why a wrapper script: the UAT runner needs three secrets out of the
# vault (TELEGRAM_API_ID / API_HASH / DRIVER_SESSION) plus the per-agent
# bot usernames. Pulling them inline here so an operator can run the
# whole suite with a single command:
#
#   UAT_FLEET="agent1:@bot1,agent2:@bot2" \
#     ./telegram-plugin/uat/runners/run-agent-self-sufficiency.sh
#
# The vault prompts for its passphrase interactively (once); the script
# then exports the three secrets into its own environment, which only
# the bun subprocess inherits — never the shell that launched us.
#
# UAT_FLEET is required; UAT_ADMIN_AGENTS is optional (see the runner's
# --help for the format). Any arguments are passed through to the runner.

set -euo pipefail

cd -- "$(dirname -- "$0")/../../.."  # → repo root

# ── 1. Require an explicit fleet ────────────────────────────────────────
# There is no automatic discovery: the operator must say which agents
# and bot usernames to target. Checked BEFORE touching the vault so a
# missing UAT_FLEET doesn't cost a passphrase prompt first.
if [[ -z "${UAT_FLEET:-}" ]]; then
  {
    echo "[uat] UAT_FLEET not set — set it explicitly to:"
    echo "        UAT_FLEET=\"agent1:@bot1,agent2:@bot2,agent3:@bot3\""
    echo "        UAT_ADMIN_AGENTS=\"agent1,agent2\"   # optional"
    echo ""
    echo "      Bot usernames live in BotFather or can be read from each"
    echo "      agent's vault entry. Set them and re-run."
  } >&2
  exit 64  # EX_USAGE
fi

# ── 2. Pull the three UAT secrets from vault ────────────────────────────
# `switchroom vault get` prompts for the passphrase on first call and
# caches the unlocked broker for the session — subsequent gets are
# silent. We avoid passing tokens via argv so they don't show up in
# `ps`. Failed lookups fail loud: a non-zero exit aborts via `set -e`,
# and an empty value is rejected explicitly below.
echo "[uat] unlocking vault to read UAT secrets..."
TELEGRAM_API_ID="$(switchroom vault get telegram-uat-api-id)"
TELEGRAM_API_HASH="$(switchroom vault get telegram-uat-api-hash)"
TELEGRAM_UAT_DRIVER_SESSION="$(switchroom vault get telegram-uat-driver-session)"

for secret_var in TELEGRAM_API_ID TELEGRAM_API_HASH TELEGRAM_UAT_DRIVER_SESSION; do
  if [[ -z "${!secret_var}" ]]; then
    echo "[uat] vault returned an empty value for ${secret_var} — aborting." >&2
    exit 1
  fi
done
export TELEGRAM_API_ID TELEGRAM_API_HASH TELEGRAM_UAT_DRIVER_SESSION

# ── 3. Run ──────────────────────────────────────────────────────────────
exec bun telegram-plugin/uat/runners/agent-self-sufficiency.ts "$@"
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the agent-self-sufficiency UAT runner's pure functions.
|
|
3
|
+
* The driver / Telegram orchestration is exercised live via the
|
|
4
|
+
* runner script itself (`agent-self-sufficiency.ts`) — these tests
|
|
5
|
+
* pin the scoring + reporting contracts so a refactor doesn't
|
|
6
|
+
* silently flip "fail" to "pass" or scramble the markdown layout.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { describe, it, expect } from "vitest";
|
|
10
|
+
import { scoreReply, aggregate, type CaseResult } from "./scorer.js";
|
|
11
|
+
import { CRITERIA, patternFor } from "./paraphrases.js";
|
|
12
|
+
import { renderMarkdown } from "./report.js";
|
|
13
|
+
|
|
14
|
+
/**
 * Looks up a criterion spec by id and throws a descriptive error when
 * the corpus does not contain it. Replaces `find(...)!`, which would
 * hand `undefined` to every test below and surface a renamed or
 * removed criterion as an unrelated-looking failure.
 */
function requireSpec(id: (typeof CRITERIA)[number]["id"]): (typeof CRITERIA)[number] {
  const spec = CRITERIA.find((c) => c.id === id);
  if (spec === undefined) {
    throw new Error(`CRITERIA is missing expected criterion "${id}"`);
  }
  return spec;
}

// Criterion specs exercised by the scoring tests below.
const SPEC_IDENTITY = requireSpec("2a_what_are_you");
const SPEC_NAME = requireSpec("2b_your_name");
const SPEC_PEERS = requireSpec("2c_peers");
const SPEC_CRON = requireSpec("1b_cron_list");
const SPEC_REFUSAL = requireSpec("3d_admin_refusal");
|
|
19
|
+
|
|
20
|
+
// Structural checks on the paraphrase corpus itself: enough prompts
// per criterion, and every prompt style represented.
describe("CRITERIA corpus shape", () => {
  it("has at least 10 paraphrases per criterion (goal acceptance gate)", () => {
    CRITERIA.forEach((criterion) => {
      const count = criterion.paraphrases.length;
      expect(count, `criterion ${criterion.id}`).toBeGreaterThanOrEqual(10);
    });
  });

  it("covers every paraphrase shape at least once per criterion", () => {
    const requiredShapes = ["formal", "terse", "typo", "voice", "multi"] as const;
    CRITERIA.forEach((criterion) => {
      // Distinct shapes that actually occur in this criterion's corpus.
      const present = new Set(criterion.paraphrases.map((p) => p.shape));
      requiredShapes.forEach((shape) => {
        const label = `${criterion.id} missing shape ${shape}`;
        expect(present.has(shape), label).toBe(true);
      });
    });
  });
});
|
|
39
|
+
|
|
40
|
+
// Pins scoreReply's pass/fail decisions for representative replies to
// each criterion family.
describe("scoreReply", () => {
  // Builds a fresh scoring context for the named agent.
  const ctx = (agentName: string) => ({ agentName });

  it("returns pass when the identity criterion's reply mentions switchroom + claude code", () => {
    const outcome = scoreReply(
      SPEC_IDENTITY,
      "I'm a switchroom agent running Claude Code under the official `claude` CLI.",
      ctx("x"),
    );
    expect(outcome).toBe("pass");
  });

  it("returns fail when the identity reply is generic 'AI assistant' boilerplate", () => {
    const outcome = scoreReply(
      SPEC_IDENTITY,
      "I'm an AI assistant here to help you with tasks.",
      ctx("x"),
    );
    expect(outcome).toBe("fail");
  });

  it("returns fail on empty replies regardless of criterion", () => {
    const blank = scoreReply(SPEC_PEERS, "", ctx("x"));
    expect(blank).toBe("fail");
    const whitespaceOnly = scoreReply(SPEC_PEERS, "   ", ctx("x"));
    expect(whitespaceOnly).toBe("fail");
  });

  it("strips markdown bold/code before matching so formatting doesn't flip outcomes", () => {
    // Matched raw, the ** and backtick wrappers would hide the
    // keywords; a pass here shows the markdown is removed first.
    const outcome = scoreReply(
      SPEC_IDENTITY,
      "I'm a **switchroom** agent on `claude code`.",
      ctx("x"),
    );
    expect(outcome).toBe("pass");
  });

  it("substitutes __INJECTED_AGENT_NAME__ for the per-agent name criterion", () => {
    const namePattern = patternFor(SPEC_NAME, ctx("klanker"));
    const matchesOwnName = namePattern.test("my name is klanker");
    expect(matchesOwnName).toBe(true);
    const matchesOtherName = namePattern.test("my name is doc");
    expect(matchesOtherName).toBe(false);
  });

  it("scores 2b_your_name pass when the reply contains the agent name", () => {
    const outcome = scoreReply(SPEC_NAME, "My name is klanker.", ctx("klanker"));
    expect(outcome).toBe("pass");
  });

  it("scores 2b_your_name fail when the reply names a different agent", () => {
    const outcome = scoreReply(SPEC_NAME, "I'm doc.", ctx("klanker"));
    expect(outcome).toBe("fail");
  });

  it("scores 1b_cron_list pass for honest 'nothing scheduled' replies", () => {
    const outcome = scoreReply(
      SPEC_CRON,
      "Nothing scheduled right now — my cron list is empty.",
      ctx("x"),
    );
    expect(outcome).toBe("pass");
  });

  it("scores 3d_admin_refusal pass when reply says can't + names admin agent", () => {
    const outcome = scoreReply(
      SPEC_REFUSAL,
      "I can't restart the fleet — ask klanker, they're the admin agent on this instance.",
      ctx("scribe"),
    );
    expect(outcome).toBe("pass");
  });
});
|
|
93
|
+
|
|
94
|
+
describe("aggregate", () => {
  /** Build a minimal CaseResult; only the fields the aggregator buckets on vary. */
  function makeCase(
    agent: string,
    criterion: CaseResult["criterion"],
    shape: "formal" | "terse" | "typo" | "voice" | "multi",
    outcome: "pass" | "fail" | "timeout" | "error",
  ): CaseResult {
    return {
      agent,
      criterion,
      paraphrase: { label: "x", shape, text: "y" },
      outcome,
      reply: "",
      durationMs: 1,
    };
  }

  it("counts by criterion / agent / shape", () => {
    const totals = aggregate([
      makeCase("a", "2a_what_are_you", "formal", "pass"),
      makeCase("a", "2a_what_are_you", "typo", "fail"),
      makeCase("b", "2a_what_are_you", "voice", "pass"),
      makeCase("b", "2c_peers", "terse", "timeout"),
    ]);

    // Three cases share the criterion: two passes, one fail.
    const forCriterion = totals.byCriterion.get("2a_what_are_you");
    expect(forCriterion).toEqual({ pass: 2, fail: 1, timeout: 0, error: 0 });

    // Agent "a" contributed one pass and one fail.
    const forAgent = totals.byAgent.get("a");
    expect(forAgent).toEqual({ pass: 1, fail: 1, timeout: 0, error: 0 });

    // The only "typo" case failed.
    const forShape = totals.byShape.get("typo");
    expect(forShape).toEqual({ pass: 0, fail: 1, timeout: 0, error: 0 });
  });
});
|
|
136
|
+
|
|
137
|
+
describe("renderMarkdown", () => {
  it("produces a report with overall pass rate, per-criterion table, and triage when there are failures", () => {
    // One pass, one wrong answer and one timeout: 1/3 overall.
    const passing: CaseResult = {
      agent: "a",
      criterion: "2a_what_are_you",
      paraphrase: { label: "p1", shape: "formal", text: "what are you?" },
      outcome: "pass",
      reply: "I'm a switchroom agent.",
      durationMs: 500,
    };
    const wrongAnswer: CaseResult = {
      agent: "a",
      criterion: "2a_what_are_you",
      paraphrase: { label: "p2", shape: "typo", text: "wht r u" },
      outcome: "fail",
      reply: "I'm just an AI.",
      durationMs: 800,
    };
    const timedOut: CaseResult = {
      agent: "b",
      criterion: "2c_peers",
      paraphrase: { label: "p3", shape: "voice", text: "who else is here?" },
      outcome: "timeout",
      reply: "",
      durationMs: 60_000,
    };

    const report = renderMarkdown([passing, wrongAnswer, timedOut], {
      startedAt: new Date("2026-05-14T00:00:00Z"),
      durationSeconds: 90,
      agents: ["a", "b"],
    });

    const expectedFragments = [
      "# Agent self-sufficiency UAT report",
      "33.3% (1/3)",
      "`2a_what_are_you`",
      "Triage",
      // Triage row carries the verbatim prompt + reply.
      "wht r u",
      "I'm just an AI.",
    ];
    for (const fragment of expectedFragments) {
      expect(report).toContain(fragment);
    }
    expect(report).toMatch(/timeout after 60000ms/);
  });

  it("renders 'All cases passed' when there are no failures", () => {
    const onlyPass: CaseResult[] = [
      {
        agent: "a",
        criterion: "2a_what_are_you",
        paraphrase: { label: "p", shape: "formal", text: "what are you?" },
        outcome: "pass",
        reply: "I'm a switchroom agent.",
        durationMs: 500,
      },
    ];
    const report = renderMarkdown(onlyPass, {
      startedAt: new Date(),
      durationSeconds: 1,
      agents: ["a"],
    });
    expect(report).toContain("All cases passed");
  });
});
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heuristic pass/fail scoring for the agent-self-sufficiency UAT.
|
|
3
|
+
*
|
|
4
|
+
* Each result also carries the verbatim reply so the report's triage
|
|
5
|
+
* table can show the operator exactly what the agent said. Scoring is
|
|
6
|
+
* deliberately permissive — we're testing whether the agent
|
|
7
|
+
* understood the *intent* (and reached for the right tool), not
|
|
8
|
+
* whether the reply matches a specific phrasing.
|
|
9
|
+
*
|
|
10
|
+
* Failure modes the runner needs to distinguish from "wrong answer":
|
|
11
|
+
*
|
|
12
|
+
* - timeout: agent never replied within the budget. Could mean
|
|
13
|
+
* the agent is wedged, the bot token's wrong, or
|
|
14
|
+
* Telegram is throttling. Reported separately so the
|
|
15
|
+
* operator doesn't conflate "didn't reply" with
|
|
16
|
+
* "replied wrong".
|
|
17
|
+
* - send_error: driver couldn't even deliver the inbound (bot
|
|
18
|
+
* username missing, mtcute connection died, etc.).
|
|
19
|
+
* These bubble up as `error` results, not `fail`.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { CriterionSpec, Paraphrase } from "./paraphrases.js";
|
|
23
|
+
import { patternFor } from "./paraphrases.js";
|
|
24
|
+
|
|
25
|
+
export type Outcome = "pass" | "fail" | "timeout" | "error";
|
|
26
|
+
|
|
27
|
+
export interface CaseResult {
|
|
28
|
+
agent: string;
|
|
29
|
+
criterion: CriterionSpec["id"];
|
|
30
|
+
paraphrase: Paraphrase;
|
|
31
|
+
outcome: Outcome;
|
|
32
|
+
/** Verbatim reply text, empty for timeout/error. Trimmed; markdown
|
|
33
|
+
* preserved so the report can show what the user actually saw. */
|
|
34
|
+
reply: string;
|
|
35
|
+
/** Wall-clock ms from sendDM to first reply (or to timeout). */
|
|
36
|
+
durationMs: number;
|
|
37
|
+
/** Optional error message for `error` outcomes. */
|
|
38
|
+
errorMessage?: string;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
 * Decide whether one agent reply satisfies a criterion.
 *
 * Only meant for replies that actually arrived: the runner assigns
 * `timeout` / `error` outcomes itself and never routes them through here.
 *
 * @param spec - Criterion whose pass pattern the reply is checked against.
 * @param reply - Raw reply text as received; markdown is stripped and the
 *   text lowercased before matching.
 * @param injection - Values substituted into the criterion's pattern; the
 *   agent name fills `__INJECTED_AGENT_NAME__` for criteria such as
 *   `2b_your_name`.
 * @returns `"pass"` when the normalized reply matches, otherwise `"fail"`.
 *   A blank or whitespace-only reply is always `"fail"`.
 */
export function scoreReply(
  spec: CriterionSpec,
  reply: string,
  injection: { agentName: string },
): Outcome {
  // Nothing to match against: an empty answer never satisfies a criterion.
  if (reply.trim().length === 0) {
    return "fail";
  }

  const haystack = stripMarkdown(reply).toLowerCase();
  const matcher = patternFor(spec, injection);
  if (matcher.test(haystack)) {
    return "pass";
  }
  return "fail";
}
|
|
57
|
+
|
|
58
|
+
/**
 * Remove markdown formatting *markers* (code fences, inline code, bold,
 * italic) while keeping the text they wrap, then collapse whitespace runs
 * to single spaces and trim. Permissive on purpose — the scorer's regex
 * matches against words, not formatting.
 *
 * Two details matter for scoring:
 *
 *  - Fenced code keeps its body. Only the ``` markers and the optional
 *    info string on the opening fence (e.g. "bash") are dropped, so a
 *    reply that pastes tool output inside a fence is still matchable.
 *  - Underscore emphasis is only recognised when the underscores are not
 *    glued to a letter or digit. Intraword underscores (`my_agent`,
 *    `tool_use_id`) are left alone so snake_case names survive intact.
 *
 * @param s - Raw reply text, possibly containing markdown.
 * @returns The text with formatting markers removed and whitespace normalized.
 */
function stripMarkdown(s: string): string {
  return s
    // ```lang\nbody``` -> body (padded so it can't fuse with neighbouring words).
    .replace(/```(?:[^\n`]*\n)?([\s\S]*?)```/g, " $1 ")
    // `inline code` -> inline code
    .replace(/`([^`]+)`/g, "$1")
    // **bold** -> bold
    .replace(/\*\*([^*]+)\*\*/g, "$1")
    // __bold__ -> bold, but never inside a word.
    .replace(/(?<![\p{L}\p{N}])__([^_]+)__(?![\p{L}\p{N}])/gu, "$1")
    // *italic* -> italic
    .replace(/\*([^*]+)\*/g, "$1")
    // _italic_ -> italic, but never inside a word (keeps snake_case intact).
    .replace(/(?<![\p{L}\p{N}])_([^_]+)_(?![\p{L}\p{N}])/gu, "$1")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
74
|
+
|
|
75
|
+
/**
 * Tally of how many cases ended in each outcome for one bucket
 * (one criterion, one agent, or one paraphrase shape).
 */
export interface OutcomeCounts {
  pass: number;
  fail: number;
  timeout: number;
  error: number;
}

/**
 * Outcome counts grouped three ways. These are raw counts, not rates —
 * callers derive percentages from them. A key is present only if at
 * least one result fell into that bucket.
 */
export interface Aggregate {
  /** Counts keyed by criterion id. */
  byCriterion: Map<string, OutcomeCounts>;
  /** Counts keyed by agent name. */
  byAgent: Map<string, OutcomeCounts>;
  /** Counts keyed by paraphrase shape. */
  byShape: Map<string, OutcomeCounts>;
}

/**
 * Count outcomes per criterion, per agent and per paraphrase shape.
 * Pure function: the input is not modified and nothing outside the
 * returned maps is touched.
 *
 * @param results - Case results to tally; may be empty.
 * @returns Three maps of outcome counts; all empty for empty input.
 */
export function aggregate(results: readonly CaseResult[]): Aggregate {
  const acc: Aggregate = {
    byCriterion: new Map(),
    byAgent: new Map(),
    byShape: new Map(),
  };

  // Fresh zeroed row for a bucket seen for the first time.
  const emptyCounts = (): OutcomeCounts => ({ pass: 0, fail: 0, timeout: 0, error: 0 });

  // Increment `outcome` in bucket `k` of map `m`, creating the bucket on first use.
  const bump = (m: Map<string, OutcomeCounts>, k: string, outcome: Outcome): void => {
    const row = m.get(k) ?? emptyCounts();
    row[outcome] += 1;
    m.set(k, row);
  };

  for (const r of results) {
    bump(acc.byCriterion, r.criterion, r.outcome);
    bump(acc.byAgent, r.agent, r.outcome);
    bump(acc.byShape, r.paraphrase.shape, r.outcome);
  }
  return acc;
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for the skill-coverage UAT runner's pure pieces:
|
|
3
|
+
* label extractor + sidecar JSONL reader. Live driver/network paths
|
|
4
|
+
* are validated by operator-driven runs (see runbook).
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { describe, it, expect } from "vitest";
|
|
8
|
+
import {
|
|
9
|
+
extractSkillFromLabel,
|
|
10
|
+
readSkillRowsSince,
|
|
11
|
+
} from "./skill-coverage.js";
|
|
12
|
+
|
|
13
|
+
describe("extractSkillFromLabel", () => {
  it("pulls the slug from the hook's canonical label", () => {
    const slug = extractSkillFromLabel("Running skill switchroom-cli");
    expect(slug).toBe("switchroom-cli");
  });

  it("is case-insensitive on the label but lowercases the slug", () => {
    const slug = extractSkillFromLabel("RUNNING SKILL BUILDKITE-API");
    expect(slug).toBe("buildkite-api");
  });

  it("returns null for non-Skill labels", () => {
    // Labels emitted for other tools carry no skill slug at all.
    for (const label of ["Reading scaffold.ts", "Replying"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });

  it("returns null when the slug is missing or malformed", () => {
    // Right prefix, but nothing usable where the slug should be.
    for (const label of ["running skill", "running skill (and)"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });
});
|
|
36
|
+
|
|
37
|
+
describe("readSkillRowsSince", () => {
  /** Join lines the way a JSONL writer would: newline-separated with a trailing newline. */
  const asJsonl = (lines: string[]): string => lines.join("\n") + "\n";

  // In-memory stand-in for the sidecar directory, keyed by file name.
  const files: Record<string, string> = {
    "tool-labels-A.jsonl": asJsonl([
      // before sinceMs: ignored
      JSON.stringify({ ts: 100, tool_use_id: "u1", agent_id: "ag", label: "Running skill docx", tool_name: "Skill" }),
      // after sinceMs, Skill: kept
      JSON.stringify({ ts: 1500, tool_use_id: "u2", agent_id: "ag", label: "Running skill switchroom-cli", tool_name: "Skill" }),
      // after sinceMs, non-Skill: ignored
      JSON.stringify({ ts: 1600, tool_use_id: "u3", agent_id: "ag", label: "Reading foo.ts", tool_name: "Read" }),
    ]),
    "tool-labels-B.jsonl": asJsonl([
      JSON.stringify({ ts: 2000, tool_use_id: "u4", agent_id: "ag", label: "Running skill buildkite-cli", tool_name: "Skill" }),
      // malformed line: ignored
      "{not-json",
      "",
    ]),
    "other.jsonl": JSON.stringify({ ts: 2500, tool_name: "Skill", label: "Running skill x" }),
  };

  /** Directory lister that reports every fake file regardless of the path asked for. */
  const fakeReaddir = (_p: string): string[] => Object.keys(files);

  /** File reader that resolves by base name and throws for unknown files. */
  const fakeReadFile = (p: string): string => {
    const baseName = p.split("/").pop()!;
    const content = files[baseName];
    if (content === undefined) {
      throw new Error("ENOENT");
    }
    return content;
  };

  it("returns only Skill rows from tool-labels-*.jsonl with ts >= sinceMs", () => {
    const rows = readSkillRowsSince("/fake", 1000, fakeReaddir, fakeReadFile);
    const sortedLabels = rows.map((r) => r.label).sort();
    expect(sortedLabels).toEqual([
      "Running skill buildkite-cli",
      "Running skill switchroom-cli",
    ]);
  });

  it("returns [] when the dir read throws", () => {
    const throwingReaddir = (): string[] => {
      throw new Error("EACCES");
    };
    const rows = readSkillRowsSince("/fake", 0, throwingReaddir, fakeReadFile);
    expect(rows).toEqual([]);
  });

  it("skips files that fail to read but keeps siblings", () => {
    // Only file A is unreadable; file B must still be returned.
    const breakingRead = (p: string): string => {
      if (p.endsWith("tool-labels-A.jsonl")) {
        throw new Error("EACCES");
      }
      return fakeReadFile(p);
    };
    const rows = readSkillRowsSince("/fake", 0, fakeReaddir, breakingRead);
    const labels = rows.map((r) => r.label);
    expect(labels).toEqual(["Running skill buildkite-cli"]);
  });

  it("ignores files that don't match the tool-labels-*.jsonl pattern", () => {
    // The only Skill row lives in a file with the wrong name.
    const files2: Record<string, string> = {
      "other.jsonl": JSON.stringify({ ts: 100, tool_name: "Skill", label: "x" }),
      "tool-labels-A.jsonl": "",
    };
    const listFiles2 = (): string[] => Object.keys(files2);
    const readFiles2 = (p: string): string => files2[p.split("/").pop()!]!;
    const rows = readSkillRowsSince("/fake", 0, listFiles2, readFiles2);
    expect(rows).toEqual([]);
  });
});
|