switchroom 0.8.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -57
- package/bin/timezone-hook.sh +9 -7
- package/dist/agent-scheduler/index.js +285 -45
- package/dist/auth-broker/index.js +13932 -0
- package/dist/cli/switchroom.js +15931 -12778
- package/dist/host-control/main.js +582 -43
- package/dist/vault/approvals/kernel-server.js +276 -47
- package/dist/vault/broker/server.js +333 -69
- package/examples/minimal.yaml +63 -0
- package/examples/personal-google-workspace-mcp/.env.example +34 -0
- package/examples/personal-google-workspace-mcp/README.md +194 -0
- package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
- package/examples/switchroom.yaml +220 -0
- package/package.json +6 -4
- package/profiles/_base/start.sh.hbs +3 -3
- package/profiles/_shared/agent-self-service.md.hbs +126 -0
- package/profiles/default/CLAUDE.md +10 -0
- package/profiles/default/CLAUDE.md.hbs +16 -0
- package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
- package/skills/buildkite-agent-runtime/SKILL.md +44 -11
- package/skills/buildkite-api/SKILL.md +31 -8
- package/skills/buildkite-cli/SKILL.md +27 -9
- package/skills/buildkite-migration/SKILL.md +22 -9
- package/skills/buildkite-pipelines/SKILL.md +26 -9
- package/skills/buildkite-secure-delivery/SKILL.md +23 -9
- package/skills/buildkite-test-engine/SKILL.md +25 -8
- package/skills/docx/SKILL.md +1 -1
- package/skills/file-bug/SKILL.md +34 -6
- package/skills/humanizer/SKILL.md +15 -0
- package/skills/humanizer-calibrate/SKILL.md +7 -1
- package/skills/mcp-builder/SKILL.md +1 -1
- package/skills/pdf/SKILL.md +1 -1
- package/skills/pptx/SKILL.md +1 -1
- package/skills/skill-creator/SKILL.md +21 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
- package/skills/switchroom-cli/SKILL.md +63 -64
- package/skills/switchroom-health/SKILL.md +23 -10
- package/skills/switchroom-install/SKILL.md +3 -3
- package/skills/switchroom-manage/SKILL.md +26 -19
- package/skills/switchroom-runtime/SKILL.md +67 -15
- package/skills/switchroom-status/SKILL.md +26 -1
- package/skills/telegram-test-harness/SKILL.md +3 -0
- package/skills/webapp-testing/SKILL.md +31 -1
- package/skills/xlsx/SKILL.md +1 -1
- package/telegram-plugin/admin-commands/index.ts +7 -5
- package/telegram-plugin/dist/gateway/gateway.js +13042 -12844
- package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
- package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
- package/telegram-plugin/gateway/auth-command.ts +794 -0
- package/telegram-plugin/gateway/auth-line.ts +123 -0
- package/telegram-plugin/gateway/boot-card.ts +22 -36
- package/telegram-plugin/gateway/boot-probes.ts +3 -3
- package/telegram-plugin/gateway/gateway.ts +313 -798
- package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
- package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
- package/telegram-plugin/permission-title.ts +56 -0
- package/telegram-plugin/quota-check.ts +19 -41
- package/telegram-plugin/scripts/build.mjs +0 -1
- package/telegram-plugin/shared/bot-runtime.ts +5 -4
- package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
- package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
- package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
- package/telegram-plugin/tests/boot-probes.test.ts +11 -4
- package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
- package/telegram-plugin/tests/permission-title.test.ts +31 -0
- package/telegram-plugin/tests/quota-check.test.ts +5 -35
- package/telegram-plugin/uat/SETUP.md +31 -1
- package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
- package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
- package/telegram-plugin/uat/runners/report.ts +150 -0
- package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
- package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
- package/telegram-plugin/uat/runners/scorer.ts +106 -0
- package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
- package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
- package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +7 -1
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +7 -1
- package/telegram-plugin/auth-dashboard.ts +0 -1104
- package/telegram-plugin/auth-slot-parser.ts +0 -497
- package/telegram-plugin/dist/foreman/foreman.js +0 -31358
- package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
- package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
- package/telegram-plugin/foreman/foreman.ts +0 -1165
- package/telegram-plugin/foreman/setup-flow.ts +0 -345
- package/telegram-plugin/foreman/setup-state.ts +0 -239
- package/telegram-plugin/foreman/state.ts +0 -203
- package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
- package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
- package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
- package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
- package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
- package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
- package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
- package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
- package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
- package/telegram-plugin/tests/foreman-state.test.ts +0 -164
- package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
- package/telegram-plugin/tests/setup-flow.test.ts +0 -510
- package/telegram-plugin/tests/setup-state.test.ts +0 -146
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heuristic pass/fail scoring for the agent-self-sufficiency UAT.
|
|
3
|
+
*
|
|
4
|
+
* Each result also carries the verbatim reply so the report's triage
|
|
5
|
+
* table can show the operator exactly what the agent said. Scoring is
|
|
6
|
+
* deliberately permissive — we're testing whether the agent
|
|
7
|
+
* understood the *intent* (and reached for the right tool), not
|
|
8
|
+
* whether the reply matches a specific phrasing.
|
|
9
|
+
*
|
|
10
|
+
* Failure modes the runner needs to distinguish from "wrong answer":
|
|
11
|
+
*
|
|
12
|
+
* - timeout: agent never replied within the budget. Could mean
|
|
13
|
+
* the agent is wedged, the bot token's wrong, or
|
|
14
|
+
* Telegram is throttling. Reported separately so the
|
|
15
|
+
* operator doesn't conflate "didn't reply" with
|
|
16
|
+
* "replied wrong".
|
|
17
|
+
* - send_error: driver couldn't even deliver the inbound (bot
|
|
18
|
+
* username missing, mtcute connection died, etc.).
|
|
19
|
+
* These bubble up as `error` results, not `fail`.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import type { CriterionSpec, Paraphrase } from "./paraphrases.js";
|
|
23
|
+
import { patternFor } from "./paraphrases.js";
|
|
24
|
+
|
|
25
|
+
/**
 * Final verdict for a single UAT case.
 *
 * `pass` / `fail` describe a reply that arrived and was scored;
 * `timeout` and `error` are kept as separate values so a missing or
 * undeliverable exchange is never counted as a wrong answer.
 */
export type Outcome =
  | "pass"
  | "fail"
  | "timeout"
  | "error";
|
|
26
|
+
|
|
27
|
+
/**
 * One scored (agent, criterion, paraphrase) exchange, carrying enough
 * context for the report to show what was asked and what came back.
 */
export interface CaseResult {
  /** Name of the agent the paraphrase was sent to. */
  agent: string;

  /** Id of the criterion this case belongs to. */
  criterion: CriterionSpec["id"];

  /** The paraphrase that was sent for this case. */
  paraphrase: Paraphrase;

  /** Verdict for this case. */
  outcome: Outcome;

  /**
   * The reply exactly as received (trimmed, markdown left intact) so
   * the report shows what the user actually saw. Empty string when the
   * outcome is `timeout` or `error`.
   */
  reply: string;

  /** Wall-clock milliseconds from sendDM until the first reply, or until the timeout fired. */
  durationMs: number;

  /** Error detail; only meaningful when `outcome` is `error`. */
  errorMessage?: string;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Decide whether one agent reply satisfies a criterion.
 *
 * Intended for replies that actually arrived: the runner assigns
 * `timeout` and `error` outcomes itself and does not route those
 * cases through this function.
 *
 * @param spec      Criterion whose pass pattern the reply is tested against.
 * @param reply     Raw reply text as received from the agent.
 * @param injection Values substituted into the pattern. Criteria such as
 *                  `2b_your_name` carry `__INJECTED_AGENT_NAME__` in their
 *                  passPattern, so the caller supplies the agent name here.
 * @returns `"pass"` when the pattern matches the markdown-stripped,
 *          lower-cased reply; `"fail"` otherwise, including for a blank reply.
 */
export function scoreReply(
  spec: CriterionSpec,
  reply: string,
  injection: { agentName: string },
): Outcome {
  // A blank or whitespace-only reply can never satisfy a criterion.
  if (reply.trim().length === 0) {
    return "fail";
  }

  // Match against plain lower-cased words rather than formatting.
  const haystack = stripMarkdown(reply).toLowerCase();
  const matcher = patternFor(spec, injection);
  if (matcher.test(haystack)) {
    return "pass";
  }
  return "fail";
}
|
|
57
|
+
|
|
58
|
+
/**
 * Reduce a markdown-formatted reply to plain words for regex scoring.
 *
 * Steps, in order:
 *  1. Fenced code blocks (``` ... ```) are removed wholesale, content
 *     included, and replaced by a single space.
 *  2. Inline-code backticks are removed; the enclosed text is kept.
 *  3. Bold (`**x**`, `__x__`) and italic (`*x*`, `_x_`) markers are
 *     removed; the enclosed text is kept.
 *  4. Runs of whitespace collapse to one space and the result is trimmed.
 *
 * Underscores only count as emphasis when they are not glued to a
 * letter or digit on the outside. Intraword underscores are not
 * emphasis in CommonMark, and stripping them would rewrite identifiers
 * such as `tool_use_id` or an agent name like `switch_room_dev` into
 * different words, which the scorer's pattern would then fail to match.
 *
 * @param s Raw reply text.
 * @returns The reply with the markers above removed and whitespace normalised.
 */
function stripMarkdown(s: string): string {
  return s
    // Fenced blocks first, so their contents are not touched by later rules.
    .replace(/```[\s\S]*?```/g, " ")
    .replace(/`([^`]+)`/g, "$1")
    // Double markers before single ones, otherwise `**x**` would be
    // half-consumed by the single-asterisk rule.
    .replace(/\*\*([^*]+)\*\*/g, "$1")
    .replace(/(?<![A-Za-z0-9])__([^_]+)__(?![A-Za-z0-9])/g, "$1")
    .replace(/\*([^*]+)\*/g, "$1")
    // `_` is excluded from the flanks as well, so a leftover intraword
    // `__x__` is not half-stripped by this single-underscore rule.
    .replace(/(?<![A-Za-z0-9_])_([^_]+)_(?![A-Za-z0-9_])/g, "$1")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
74
|
+
|
|
75
|
+
/** Counters for one bucket: one running total per `Outcome` value. */
type OutcomeTally = Record<Outcome, number>;

/**
 * Outcome tallies for a whole run, bucketed three ways: by criterion,
 * by agent, and by paraphrase shape. Each map goes from the bucket key
 * to the pass / fail / timeout / error counts recorded under that key.
 */
export interface Aggregate {
  byCriterion: Map<string, OutcomeTally>;
  byAgent: Map<string, OutcomeTally>;
  byShape: Map<string, OutcomeTally>;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Roll a list of case results up into per-criterion, per-agent and
 * per-shape outcome counts.
 *
 * Pure: the input array is only read, and a freshly built `Aggregate`
 * is returned. Buckets appear in each map in the order their key is
 * first seen in `results`.
 *
 * @param results Case results to tally.
 * @returns Counts of each outcome, keyed three ways.
 */
export function aggregate(results: readonly CaseResult[]): Aggregate {
  const byCriterion: Aggregate["byCriterion"] = new Map();
  const byAgent: Aggregate["byAgent"] = new Map();
  const byShape: Aggregate["byShape"] = new Map();

  // Get-or-create the bucket for `key`, then count one more `outcome`.
  function tally(
    table: Aggregate["byCriterion"],
    key: string,
    outcome: Outcome,
  ): void {
    let counts = table.get(key);
    if (counts === undefined) {
      counts = { pass: 0, fail: 0, timeout: 0, error: 0 };
      table.set(key, counts);
    }
    counts[outcome] += 1;
  }

  results.forEach((result) => {
    tally(byCriterion, result.criterion, result.outcome);
    tally(byAgent, result.agent, result.outcome);
    tally(byShape, result.paraphrase.shape, result.outcome);
  });

  return { byCriterion, byAgent, byShape };
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for the skill-coverage UAT runner's pure pieces:
|
|
3
|
+
* label extractor + sidecar JSONL reader. Live driver/network paths
|
|
4
|
+
* are validated by operator-driven runs (see runbook).
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { describe, it, expect } from "vitest";
|
|
8
|
+
import {
|
|
9
|
+
extractSkillFromLabel,
|
|
10
|
+
readSkillRowsSince,
|
|
11
|
+
} from "./skill-coverage.js";
|
|
12
|
+
|
|
13
|
+
// Label extractor: turns a tool label such as "Running skill <slug>"
// into the lower-cased slug, or null when the label is not a Skill label.
describe("extractSkillFromLabel", () => {
  it("pulls the slug from the hook's canonical label", () => {
    const slug = extractSkillFromLabel("Running skill switchroom-cli");
    expect(slug).toBe("switchroom-cli");
  });

  it("is case-insensitive on the label but lowercases the slug", () => {
    const slug = extractSkillFromLabel("RUNNING SKILL BUILDKITE-API");
    expect(slug).toBe("buildkite-api");
  });

  it("returns null for non-Skill labels", () => {
    for (const label of ["Reading scaffold.ts", "Replying"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });

  it("returns null when the slug is missing or malformed", () => {
    for (const label of ["running skill", "running skill (and)"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });
});
|
|
36
|
+
|
|
37
|
+
// Sidecar JSONL reader, exercised through injected readdir/readFile
// fakes so no real filesystem is touched.
describe("readSkillRowsSince", () => {
  // Serialise one sidecar row; key order matches what the fixtures expect.
  const sidecarRow = (
    ts: number,
    toolUseId: string,
    label: string,
    toolName: string,
  ): string =>
    JSON.stringify({ ts, tool_use_id: toolUseId, agent_id: "ag", label, tool_name: toolName });

  // Join lines into JSONL text with a trailing newline.
  const asJsonl = (lines: string[]): string => lines.join("\n") + "\n";

  const files: Record<string, string> = {
    "tool-labels-A.jsonl": asJsonl([
      // before sinceMs: ignored
      sidecarRow(100, "u1", "Running skill docx", "Skill"),
      // after sinceMs, Skill: kept
      sidecarRow(1500, "u2", "Running skill switchroom-cli", "Skill"),
      // after sinceMs, non-Skill: ignored
      sidecarRow(1600, "u3", "Reading foo.ts", "Read"),
    ]),
    "tool-labels-B.jsonl": asJsonl([
      sidecarRow(2000, "u4", "Running skill buildkite-cli", "Skill"),
      // malformed line: ignored
      "{not-json",
      "",
    ]),
    // Wrong filename pattern: must never be picked up.
    "other.jsonl": JSON.stringify({ ts: 2500, tool_name: "Skill", label: "Running skill x" }),
  };

  // Last path segment, i.e. the bare file name the fixtures are keyed by.
  const baseName = (p: string): string => p.slice(p.lastIndexOf("/") + 1);

  const fakeReaddir = (_p: string): string[] => Object.keys(files);
  const fakeReadFile = (p: string): string => {
    const content = files[baseName(p)];
    if (content === undefined) {
      throw new Error("ENOENT");
    }
    return content;
  };

  it("returns only Skill rows from tool-labels-*.jsonl with ts >= sinceMs", () => {
    const rows = readSkillRowsSince("/fake", 1000, fakeReaddir, fakeReadFile);
    const labels = rows.map((row) => row.label);
    labels.sort();
    expect(labels).toEqual([
      "Running skill buildkite-cli",
      "Running skill switchroom-cli",
    ]);
  });

  it("returns [] when the dir read throws", () => {
    const throwingReaddir = (): string[] => {
      throw new Error("EACCES");
    };
    const rows = readSkillRowsSince("/fake", 0, throwingReaddir, fakeReadFile);
    expect(rows).toEqual([]);
  });

  it("skips files that fail to read but keeps siblings", () => {
    const breakingRead = (p: string): string => {
      if (p.endsWith("tool-labels-A.jsonl")) {
        throw new Error("EACCES");
      }
      return fakeReadFile(p);
    };
    const rows = readSkillRowsSince("/fake", 0, fakeReaddir, breakingRead);
    const labels = rows.map((row) => row.label);
    expect(labels).toEqual(["Running skill buildkite-cli"]);
  });

  it("ignores files that don't match the tool-labels-*.jsonl pattern", () => {
    const files2: Record<string, string> = {
      "other.jsonl": JSON.stringify({ ts: 100, tool_name: "Skill", label: "x" }),
      "tool-labels-A.jsonl": "",
    };
    const listFiles2 = (): string[] => Object.keys(files2);
    const readFiles2 = (p: string): string => files2[baseName(p)]!;
    const rows = readSkillRowsSince("/fake", 0, listFiles2, readFiles2);
    expect(rows).toEqual([]);
  });
});
|