role-os 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/trial.mjs ADDED
@@ -0,0 +1,252 @@
1
+ /**
2
+ * Role Execution Trial Runner
3
+ *
4
+ * Runs gold-task and rejection tests against individual roles to verify:
5
+ * 1. Artifact matches contract deliverable shape
6
+ * 2. Output is better than what a nearby role would produce
7
+ * 3. Handoff sets up the next role cleanly
8
+ * 4. Role escalates correctly when given out-of-lane work
9
+ * 5. No bluffing — role does not fake competence outside its mission
10
+ */
11
+
12
+ import { ROLE_CATALOG, scoreRole, MIN_SCORE_THRESHOLD } from "./route.mjs";
13
+ import { TOOL_PROFILES } from "./dispatch.mjs";
14
+ import { getRequirements } from "./evidence.mjs";
15
+ import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
16
+ import { join, resolve } from "node:path";
17
+
18
+ // ── Trial types ───────────────────────────────────────────────────────────────
19
+
20
+ /**
21
+ * @typedef {Object} TrialCase
22
+ * @property {string} id - Unique trial case ID
23
+ * @property {string} role - Role name being tested
24
+ * @property {string} type - 'gold-task' | 'rejection' | 'handoff'
25
+ * @property {string} prompt - Task description given to the role
26
+ * @property {string} expectedBehavior - What the role should do
27
+ * @property {string[]} mustInclude - Output must contain these elements
28
+ * @property {string[]} mustNotInclude - Output must NOT contain these elements
29
+ * @property {string} [escalateTo] - If rejection test, expected escalation target
+ * @property {string} [context] - Shared task context attached to each case by buildClusterTrials
30
+ */
31
+
32
+ /**
33
+ * @typedef {Object} TrialResult
34
+ * @property {string} caseId
35
+ * @property {string} role
36
+ * @property {string} type
37
+ * @property {boolean} passed
38
+ * @property {string[]} findings - What was observed
39
+ * @property {string[]} violations - Contract violations found
40
+ * @property {string} [escalationTarget] - Who the role should hand off to
41
+ */
42
+
43
+ // ── Role contract extractors ──────────────────────────────────────────────────
44
+
45
/**
 * Summarize a role's contract for trial evaluation.
 *
 * Combines the role's catalog entry with its tool profile and its evidence
 * requirements into one flat object.
 *
 * @param {string} roleName - Role name, matched exactly against ROLE_CATALOG entries.
 * @returns {Object|null} Contract summary, or null when no catalog entry has that name.
 */
export function getRoleContract(roleName) {
  const entry = ROLE_CATALOG.find((candidate) => candidate.name === roleName);
  if (!entry) {
    return null;
  }

  // Roles without a tool profile get an empty tool list.
  const tools = TOOL_PROFILES[roleName] || [];
  const { required, recommended, description } = getRequirements(roleName);
  const { pack, phase, keywords, triggers } = entry;

  return {
    name: roleName,
    pack,
    phase,
    keywords,
    triggers,
    tools,
    evidenceRequired: required,
    evidenceRecommended: recommended,
    evidenceDescription: description,
  };
}
67
+
68
+ // ── Trial case builder ────────────────────────────────────────────────────────
69
+
70
/**
 * Build the trial cases for a role cluster.
 *
 * Each role entry yields a gold-task case (when `goldTask` is provided) followed
 * by a rejection case (when `rejectionTask` is provided). Entries that define
 * only one of the two are supported; a missing task is skipped.
 *
 * @param {string} clusterName - Prefix used for every generated case ID.
 * @param {Array<{role: string, goldTask?: Object, rejectionTask?: Object}>} roles
 *   Role entries; each task carries `prompt`, `expected`, and optional
 *   `mustInclude` / `mustNotInclude` lists (rejection tasks also `escalateTo`).
 *   A nullish value is treated as an empty list.
 * @param {string} taskContext - Shared context attached to every case as `context`.
 * @returns {Object[]} Trial cases, in input order.
 */
export function buildClusterTrials(clusterName, roles, taskContext) {
  const trials = [];

  for (const { role, goldTask, rejectionTask } of roles ?? []) {
    // Case IDs use a lowercase, dash-separated form of the role name.
    const slug = role.replace(/\s+/g, "-").toLowerCase();

    // Gold-task test
    if (goldTask) {
      trials.push({
        id: `${clusterName}-gold-${slug}`,
        role,
        type: "gold-task",
        prompt: goldTask.prompt,
        expectedBehavior: goldTask.expected,
        mustInclude: goldTask.mustInclude || [],
        mustNotInclude: goldTask.mustNotInclude || [],
        context: taskContext,
      });
    }

    // Rejection test
    if (rejectionTask) {
      trials.push({
        id: `${clusterName}-reject-${slug}`,
        role,
        type: "rejection",
        prompt: rejectionTask.prompt,
        expectedBehavior: rejectionTask.expected,
        mustInclude: rejectionTask.mustInclude || [],
        mustNotInclude: rejectionTask.mustNotInclude || [],
        escalateTo: rejectionTask.escalateTo,
        context: taskContext,
      });
    }
  }

  return trials;
}
107
+
108
+ // ── Trial evaluation ──────────────────────────────────────────────────────────
109
+
110
// Phrases that indicate a role is declining work or passing it to another role.
// Covers "hand off", "handoff", "hand-off", "hands off", "handed off", "handing off".
const ESCALATION_PATTERN = /escalat|hand(?:s|ed|ing)?[\s-]?off|not my|outside my/;

/**
 * Evaluate a role's output against trial expectations.
 * This is a structural check (case-insensitive substring matching), not an LLM judge.
 *
 * @param {Object} trialCase - Trial case; reads `id`, `role`, `type`,
 *   `mustInclude`, `mustNotInclude` and `escalateTo`. Missing include/exclude
 *   lists are treated as empty.
 * @param {string} output - Text produced by the role. A nullish value is
 *   evaluated as an empty string; other non-strings are coerced with String().
 * @returns {{caseId: string, role: string, type: string, passed: boolean,
 *   findings: string[], violations: string[], escalationTarget: (string|undefined)}}
 *   `passed` is true only when no violations were recorded.
 */
export function evaluateTrialOutput(trialCase, output) {
  const violations = [];
  const findings = [];

  // Normalize once so every check runs against the same lowercase text.
  const haystack = (output == null ? "" : String(output)).toLowerCase();
  const mentions = (needle) => haystack.includes(String(needle).toLowerCase());

  // Check mustInclude
  for (const required of trialCase.mustInclude ?? []) {
    if (mentions(required)) {
      findings.push(`✓ Contains: "${required}"`);
    } else {
      violations.push(`Missing required element: "${required}"`);
    }
  }

  // Check mustNotInclude
  for (const forbidden of trialCase.mustNotInclude ?? []) {
    if (mentions(forbidden)) {
      violations.push(`Contains forbidden element: "${forbidden}"`);
    } else {
      findings.push(`✓ Avoids: "${forbidden}"`);
    }
  }

  // For rejection tests: naming the target role or using any escalation phrase counts.
  if (trialCase.type === "rejection" && trialCase.escalateTo) {
    if (mentions(trialCase.escalateTo) || ESCALATION_PATTERN.test(haystack)) {
      findings.push(`✓ Correctly identifies need to escalate/hand off`);
    } else {
      violations.push(`Should escalate to ${trialCase.escalateTo} but did not indicate escalation`);
    }
  }

  return {
    caseId: trialCase.id,
    role: trialCase.role,
    type: trialCase.type,
    passed: violations.length === 0,
    findings,
    violations,
    escalationTarget: trialCase.escalateTo,
  };
}
161
+
162
+ // ── Trial report ──────────────────────────────────────────────────────────────
163
+
164
/**
 * Render trial results as a plain-text report.
 *
 * The report has a title line, a pass-count summary, and then one section per
 * result listing its findings followed by its violations.
 *
 * @param {string} clusterName - Name shown in the report title.
 * @param {Array<{caseId: string, type: string, role: string, passed: boolean,
 *   findings: string[], violations: string[]}>} results - Evaluated trial results.
 * @returns {string} Newline-joined report text.
 */
export function formatTrialReport(clusterName, results) {
  const passCount = results.reduce((count, { passed }) => (passed ? count + 1 : count), 0);

  // Each result contributes its header, role, findings, violations and a blank separator.
  const sections = results.flatMap(({ caseId, type, role, passed, findings, violations }) => [
    `${passed ? "✓" : "✗"} ${caseId} (${type})`,
    ` Role: ${role}`,
    ...findings.map((finding) => ` ${finding}`),
    ...violations.map((violation) => ` ✗ ${violation}`),
    "",
  ]);

  return [
    `\n# Trial Report: ${clusterName}\n`,
    `Results: ${passCount}/${results.length} passed\n`,
    ...sections,
  ].join("\n");
}
196
+
197
+ // ── Pre-built cluster trials ──────────────────────────────────────────────────
198
+
199
// Shared scenario description attached to every product-cluster trial case.
const PRODUCT_CLUSTER_CONTEXT = "claude-guardian MCP server — tool-use analytics feature";

// Product Strategist: frames the problem and scope; detailed criteria go to Spec Writer.
const PRODUCT_STRATEGIST_TRIALS = {
  role: "Product Strategist",
  goldTask: {
    prompt: "We want to add tool-use analytics to claude-guardian. Help us decide what's worth building.",
    expected: "Problem framing, scope definition, non-goals, tradeoff analysis",
    mustInclude: ["scope", "tradeoff"],
    mustNotInclude: ["acceptance criteria", "edge case enumeration"],
  },
  rejectionTask: {
    prompt: "Write detailed acceptance criteria for the analytics feature.",
    expected: "Escalate to Spec Writer",
    escalateTo: "Spec Writer",
    mustInclude: [],
    mustNotInclude: [],
  },
};

// Spec Writer: turns approved scope into a spec; sequencing goes to Roadmap Prioritizer.
const SPEC_WRITER_TRIALS = {
  role: "Spec Writer",
  goldTask: {
    prompt: "Product Strategist approved scope: track tool call counts, failure rates, and tool sprawl detection. Write the execution-grade spec.",
    expected: "Acceptance criteria, edge cases, data schema, NFRs",
    mustInclude: ["acceptance criteria"],
    mustNotInclude: [],
  },
  rejectionTask: {
    prompt: "Which of these 4 features should we build first: analytics, crash recovery, disk monitoring, preview health?",
    expected: "Escalate to Roadmap Prioritizer",
    escalateTo: "Roadmap Prioritizer",
    mustInclude: [],
    mustNotInclude: [],
  },
};

// Roadmap Prioritizer: sequences the backlog; scope questions go to Product Strategist.
const ROADMAP_PRIORITIZER_TRIALS = {
  role: "Roadmap Prioritizer",
  goldTask: {
    prompt: "We have 4 guardian features in the backlog: tool-use analytics, crash recovery, disk monitoring, preview health. Prioritize them by leverage and dependency.",
    expected: "Sequenced list with leverage/risk rationale, dependency analysis",
    mustInclude: ["prioriti", "depend"],
    mustNotInclude: [],
  },
  rejectionTask: {
    prompt: "What should the analytics feature include? Define the scope.",
    expected: "Escalate to Product Strategist",
    escalateTo: "Product Strategist",
    mustInclude: [],
    mustNotInclude: [],
  },
};

/**
 * Pre-built gold-task and rejection trials for the product cluster
 * (Product Strategist, Spec Writer, Roadmap Prioritizer), in that order.
 */
export const PRODUCT_CLUSTER_TRIALS = buildClusterTrials(
  "product-cluster",
  [PRODUCT_STRATEGIST_TRIALS, SPEC_WRITER_TRIALS, ROADMAP_PRIORITIZER_TRIALS],
  PRODUCT_CLUSTER_CONTEXT,
);