role-os 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -0
- package/README.md +55 -23
- package/bin/roleos.mjs +8 -1
- package/package.json +13 -3
- package/src/conflicts.mjs +217 -0
- package/src/dispatch.mjs +310 -0
- package/src/escalation.mjs +288 -0
- package/src/evidence.mjs +288 -0
- package/src/packs-cmd.mjs +143 -0
- package/src/packs.mjs +331 -0
- package/src/review.mjs +12 -0
- package/src/route.mjs +477 -82
- package/src/trial.mjs +252 -0
package/src/trial.mjs
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Role Execution Trial Runner
|
|
3
|
+
*
|
|
4
|
+
* Runs gold-task and rejection tests against individual roles to verify:
|
|
5
|
+
* 1. Artifact matches contract deliverable shape
|
|
6
|
+
* 2. Output is better than what a nearby role would produce
|
|
7
|
+
* 3. Handoff sets up the next role cleanly
|
|
8
|
+
* 4. Role escalates correctly when given out-of-lane work
|
|
9
|
+
* 5. No bluffing — role does not fake competence outside its mission
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { ROLE_CATALOG, scoreRole, MIN_SCORE_THRESHOLD } from "./route.mjs";
|
|
13
|
+
import { TOOL_PROFILES } from "./dispatch.mjs";
|
|
14
|
+
import { getRequirements } from "./evidence.mjs";
|
|
15
|
+
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
16
|
+
import { join, resolve } from "node:path";
|
|
17
|
+
|
|
18
|
+
// ── Trial types ───────────────────────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @typedef {Object} TrialCase
|
|
22
|
+
* @property {string} id - Unique trial case ID
|
|
23
|
+
* @property {string} role - Role name being tested
|
|
24
|
+
* @property {string} type - 'gold-task' | 'rejection' | 'handoff'
|
|
25
|
+
* @property {string} prompt - Task description given to the role
|
|
26
|
+
* @property {string} expectedBehavior - What the role should do
|
|
27
|
+
* @property {string[]} mustInclude - Output must contain these elements
|
|
28
|
+
* @property {string[]} mustNotInclude - Output must NOT contain these elements
|
|
29
|
+
* @property {string} [escalateTo] - If rejection test, expected escalation target
* @property {string} [context] - Shared task context attached by buildClusterTrials
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* @typedef {Object} TrialResult
|
|
34
|
+
* @property {string} caseId
|
|
35
|
+
* @property {string} role
|
|
36
|
+
* @property {string} type
|
|
37
|
+
* @property {boolean} passed
|
|
38
|
+
* @property {string[]} findings - What was observed
|
|
39
|
+
* @property {string[]} violations - Contract violations found
|
|
40
|
+
* @property {string} [escalationTarget] - Who the role should hand off to
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
// ── Role contract extractors ──────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
/**
 * Summarize a role's contract for trial evaluation.
 *
 * Looks the role up in ROLE_CATALOG by exact `name` match and combines the
 * catalog entry with the role's tool profile and evidence requirements.
 *
 * @param {string} roleName - Name of the role to look up.
 * @returns {Object|null} Contract summary (name, pack, phase, keywords,
 *   triggers, tools, evidenceRequired, evidenceRecommended,
 *   evidenceDescription), or null when the catalog has no such role.
 */
export function getRoleContract(roleName) {
  const catalogEntry = ROLE_CATALOG.find((entry) => entry.name === roleName);
  if (!catalogEntry) {
    return null;
  }

  // Roles without a tool profile get an empty tool list.
  const toolProfile = TOOL_PROFILES[roleName] || [];
  const requirements = getRequirements(roleName);
  const { pack, phase, keywords, triggers } = catalogEntry;

  return {
    name: roleName,
    pack,
    phase,
    keywords,
    triggers,
    tools: toolProfile,
    evidenceRequired: requirements.required,
    evidenceRecommended: requirements.recommended,
    evidenceDescription: requirements.description,
  };
}
|
|
67
|
+
|
|
68
|
+
// ── Trial case builder ────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
/**
 * Build the trial cases for a role cluster.
 *
 * Every role entry yields one "gold-task" case; an entry that also carries a
 * `rejectionTask` yields a "rejection" case immediately after it.
 *
 * @param {string} clusterName - Prefix used in every generated case ID.
 * @param {Iterable<{role: string, goldTask: Object, rejectionTask?: Object}>} roles
 *   Role entries. Each task provides `prompt`, `expected`, and optionally
 *   `mustInclude` / `mustNotInclude`; a rejection task may add `escalateTo`.
 * @param {*} taskContext - Stored unchanged as `context` on every case.
 * @returns {Object[]} Trial cases in role order, gold before rejection.
 */
export function buildClusterTrials(clusterName, roles, taskContext) {
  // Fields that gold and rejection cases derive from a task in the same way.
  const describeTask = (task) => ({
    prompt: task.prompt,
    expectedBehavior: task.expected,
    mustInclude: task.mustInclude || [],
    mustNotInclude: task.mustNotInclude || [],
  });

  const cases = [];

  for (const entry of roles) {
    const { role, goldTask, rejectionTask } = entry;
    // Whitespace runs become single hyphens, then the name is lowercased.
    const slug = role.replace(/\s+/g, '-').toLowerCase();

    cases.push({
      id: `${clusterName}-gold-${slug}`,
      role,
      type: "gold-task",
      ...describeTask(goldTask),
      context: taskContext,
    });

    if (!rejectionTask) {
      continue;
    }

    cases.push({
      id: `${clusterName}-reject-${slug}`,
      role,
      type: "rejection",
      ...describeTask(rejectionTask),
      escalateTo: rejectionTask.escalateTo,
      context: taskContext,
    });
  }

  return cases;
}
|
|
107
|
+
|
|
108
|
+
// ── Trial evaluation ──────────────────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
/**
 * Evaluate a role's output against trial expectations.
 * This is a structural check, not an LLM judge: it only looks for
 * case-insensitive substrings in the output text.
 *
 * Checks performed:
 *  - every `mustInclude` entry appears in the output;
 *  - no `mustNotInclude` entry appears in the output;
 *  - for a "rejection" case with an `escalateTo` target, the output mentions
 *    the target or one of the generic hand-off phrases.
 *
 * @param {TrialCase} trialCase - Case to evaluate. Missing `mustInclude` /
 *   `mustNotInclude` lists are treated as empty.
 * @param {string} output - Text produced by the role. `null`/`undefined` is
 *   evaluated like empty text instead of throwing.
 * @returns {TrialResult} Result whose `passed` is true when no violation
 *   was recorded.
 */
export function evaluateTrialOutput(trialCase, output) {
  const violations = [];
  const findings = [];

  // Normalize once rather than lowercasing the full output for every check.
  const haystack = String(output ?? "").toLowerCase();
  const mentions = (needle) => haystack.includes(needle.toLowerCase());

  // Check mustInclude
  for (const required of trialCase.mustInclude ?? []) {
    if (mentions(required)) {
      findings.push(`✓ Contains: "${required}"`);
    } else {
      violations.push(`Missing required element: "${required}"`);
    }
  }

  // Check mustNotInclude
  for (const forbidden of trialCase.mustNotInclude ?? []) {
    if (mentions(forbidden)) {
      violations.push(`Contains forbidden element: "${forbidden}"`);
    } else {
      findings.push(`✓ Avoids: "${forbidden}"`);
    }
  }

  // For rejection tests: naming the target or using any generic hand-off
  // phrase counts as indicating escalation.
  if (trialCase.type === "rejection" && trialCase.escalateTo) {
    const escalationMarkers = [
      trialCase.escalateTo,
      "escalat",
      "hand off",
      "not my",
      "outside my",
    ];

    if (escalationMarkers.some((marker) => mentions(marker))) {
      findings.push(`✓ Correctly identifies need to escalate/hand off`);
    } else {
      violations.push(`Should escalate to ${trialCase.escalateTo} but did not indicate escalation`);
    }
  }

  return {
    caseId: trialCase.id,
    role: trialCase.role,
    type: trialCase.type,
    passed: violations.length === 0,
    findings,
    violations,
    escalationTarget: trialCase.escalateTo,
  };
}
|
|
161
|
+
|
|
162
|
+
// ── Trial report ──────────────────────────────────────────────────────────────
|
|
163
|
+
|
|
164
|
+
/**
 * Render trial results as a plain-text report.
 *
 * The report opens with a title and a "passed/total" summary, followed by
 * one section per result: status icon with case ID and type, the role, each
 * finding, each violation (prefixed with ✗), and a blank separator line.
 *
 * @param {string} clusterName - Name shown in the report title.
 * @param {TrialResult[]} results - Results to render, in display order.
 * @returns {string} Report lines joined with newlines.
 */
export function formatTrialReport(clusterName, results) {
  const passCount = results.reduce(
    (count, entry) => (entry.passed ? count + 1 : count),
    0
  );

  const header = [
    `\n# Trial Report: ${clusterName}\n`,
    `Results: ${passCount}/${results.length} passed\n`,
  ];

  const sections = results.flatMap((entry) => {
    const statusIcon = entry.passed ? "✓" : "✗";
    return [
      `${statusIcon} ${entry.caseId} (${entry.type})`,
      ` Role: ${entry.role}`,
      ...entry.findings.map((note) => ` ${note}`),
      ...entry.violations.map((issue) => ` ✗ ${issue}`),
      "",
    ];
  });

  return [...header, ...sections].join("\n");
}
|
|
196
|
+
|
|
197
|
+
// ── Pre-built cluster trials ──────────────────────────────────────────────────
|
|
198
|
+
|
|
199
|
+
/**
 * Pre-built trial cases for the product cluster.
 *
 * Each of the three roles gets a gold task plus a rejection task whose
 * expected escalation target is another role in the cluster:
 * Product Strategist → Spec Writer → Roadmap Prioritizer → Product Strategist.
 */
export const PRODUCT_CLUSTER_TRIALS = (() => {
  const sharedContext = "claude-guardian MCP server — tool-use analytics feature";

  const productStrategist = {
    role: "Product Strategist",
    goldTask: {
      prompt: "We want to add tool-use analytics to claude-guardian. Help us decide what's worth building.",
      expected: "Problem framing, scope definition, non-goals, tradeoff analysis",
      mustInclude: ["scope", "tradeoff"],
      mustNotInclude: ["acceptance criteria", "edge case enumeration"],
    },
    rejectionTask: {
      prompt: "Write detailed acceptance criteria for the analytics feature.",
      expected: "Escalate to Spec Writer",
      escalateTo: "Spec Writer",
      mustInclude: [],
      mustNotInclude: [],
    },
  };

  const specWriter = {
    role: "Spec Writer",
    goldTask: {
      prompt: "Product Strategist approved scope: track tool call counts, failure rates, and tool sprawl detection. Write the execution-grade spec.",
      expected: "Acceptance criteria, edge cases, data schema, NFRs",
      mustInclude: ["acceptance criteria"],
      mustNotInclude: [],
    },
    rejectionTask: {
      prompt: "Which of these 4 features should we build first: analytics, crash recovery, disk monitoring, preview health?",
      expected: "Escalate to Roadmap Prioritizer",
      escalateTo: "Roadmap Prioritizer",
      mustInclude: [],
      mustNotInclude: [],
    },
  };

  const roadmapPrioritizer = {
    role: "Roadmap Prioritizer",
    goldTask: {
      prompt: "We have 4 guardian features in the backlog: tool-use analytics, crash recovery, disk monitoring, preview health. Prioritize them by leverage and dependency.",
      expected: "Sequenced list with leverage/risk rationale, dependency analysis",
      // Word stems, so "prioritize"/"priority" and "dependency"/"depends" all match.
      mustInclude: ["prioriti", "depend"],
      mustNotInclude: [],
    },
    rejectionTask: {
      prompt: "What should the analytics feature include? Define the scope.",
      expected: "Escalate to Product Strategist",
      escalateTo: "Product Strategist",
      mustInclude: [],
      mustNotInclude: [],
    },
  };

  return buildClusterTrials(
    "product-cluster",
    [productStrategist, specWriter, roadmapPrioritizer],
    sharedContext
  );
})();
|