role-os 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -0
- package/README.md +7 -2
- package/package.json +1 -1
- package/src/calibration.mjs +292 -0
- package/src/composite.mjs +454 -0
- package/src/decompose.mjs +311 -0
- package/src/packs.mjs +33 -5
- package/src/replan.mjs +404 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,45 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.3.0
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
#### Outcome Calibration (Phase M)
|
|
8
|
+
- Run outcome ledger — append-only JSONL recording pack selection, confidence, overrides, escalations, corrections, completion status
|
|
9
|
+
- `computeCalibration()` — pack usage rates, high-confidence accuracy, operator override rates, per-pack performance
|
|
10
|
+
- `computePackBoosts()` — weight tuning from clean completed runs (+0.5/run, capped at 2.0)
|
|
11
|
+
- `computeConfidenceAdjustment()` — raises threshold when high-confidence is often overridden, lowers when medium is often accepted
|
|
12
|
+
- Auto-generated calibration suggestions when metrics drift
|
|
13
|
+
- Safety constraint: calibration never overrides mismatch guards, conflict rules, escalation honesty, or evidence requirements
|
|
14
|
+
|
|
15
|
+
#### Mixed-Task Decomposition (Phase N)
|
|
16
|
+
- `detectComposite()` — 7 subtask categories (build, bugfix, security, docs, research, launch, treatment) with signal-based detection
|
|
17
|
+
- Structural connector detection ("and then", "after that", "plus", "also")
|
|
18
|
+
- Confidence levels: high (3+ categories or 2+ with connectors), medium, low
|
|
19
|
+
- `decompose()` — generates linked child packets sorted by phase order
|
|
20
|
+
- `createRunPlan()` — dependency-aware parent plan with child tracking
|
|
21
|
+
- Honest fallback: medium/low confidence shows uncertainty warning with `--no-split` override
|
|
22
|
+
|
|
23
|
+
#### Composite Execution (Phase O)
|
|
24
|
+
- `initExecution()` / `advance()` — dependency-driven child execution with artifact passing
|
|
25
|
+
- 7 artifact contracts defining what each category produces and expects
|
|
26
|
+
- Artifact ledger tracking all cross-packet handoffs
|
|
27
|
+
- `blockChild()` / `recoverChild()` / `failChild()` — branch recovery with transitive cascade
|
|
28
|
+
- `invalidateDownstream()` — resets stale children when upstream changes, removes stale artifacts
|
|
29
|
+
- `synthesize()` — truthful parent-level completion report
|
|
30
|
+
- Independent branches continue unaffected when a sibling fails
|
|
31
|
+
|
|
32
|
+
#### Adaptive Replanning (Phase P)
|
|
33
|
+
- 6 structured change event types: scope-change, artifact-changed, new-requirement, review-finding, dependency-discovered, priority-change
|
|
34
|
+
- `analyzeImpact()` — identifies valid/stale children, stale artifacts, whether new children or reorder needed
|
|
35
|
+
- `replan()` — selective replanning: invalidates only affected branches, inserts new children, updates dependencies
|
|
36
|
+
- Plan diff: shows what changed, what stayed valid, what reopened, what was inserted
|
|
37
|
+
- Execution resumes from next valid child after replan — no restart required
|
|
38
|
+
|
|
39
|
+
### Evidence
|
|
40
|
+
- 317 tests, zero failures
|
|
41
|
+
- Calibration, decomposition, composite execution, and replanning each have dedicated test suites
|
|
42
|
+
|
|
3
43
|
## 1.2.0
|
|
4
44
|
|
|
5
45
|
### Added
|
package/README.md
CHANGED
|
@@ -173,7 +173,11 @@ Role OS operates **locally only**. It copies markdown templates and writes packe
|
|
|
173
173
|
| **Evidence** | Role-aware structured evidence in verdicts. Sufficiency checks. 12 evidence kinds. | ✓ Shipped |
|
|
174
174
|
| **Dispatch** | Generates execution manifests for multi-claude. Per-role tool profiles, system prompts, budgets. | ✓ Shipped |
|
|
175
175
|
| **Trials** | Full roster proven: 30/30 gold-task + 5/5 negative trials. 7 pack trials complete. | ✓ Complete |
|
|
176
|
-
| **Team Packs** | 7
|
|
176
|
+
| **Team Packs** | 7 calibrated packs with auto-selection, mismatch guards, and free-routing fallback. | ✓ Shipped |
|
|
177
|
+
| **Outcome calibration** | Records run outcomes, tunes pack/role weights from results, adjusts confidence thresholds. | ✓ Shipped |
|
|
178
|
+
| **Mixed-task decomposition** | Detects composite work, splits into child packets, assigns packs, preserves dependencies. | ✓ Shipped |
|
|
179
|
+
| **Composite execution** | Runs child packets in dependency order with artifact passing, branch recovery, and synthesis. | ✓ Shipped |
|
|
180
|
+
| **Adaptive replanning** | Mid-run scope changes, findings, or new requirements update the plan without restarting. | ✓ Shipped |
|
|
177
181
|
|
|
178
182
|
## Status
|
|
179
183
|
|
|
@@ -181,7 +185,8 @@ Role OS operates **locally only**. It copies markdown templates and writes packe
|
|
|
181
185
|
- v1.0.0: 32 roles, full CLI, proven treatment, multi-repo portability
|
|
182
186
|
- v1.0.2: Role OS lockdown (bootstrap truth fixes, init --force)
|
|
183
187
|
- v1.1.0: 31 roles, full routing spine, conflict detection, escalation, evidence, dispatch, 7 proven team packs. 35 execution trials. 212 tests.
|
|
184
|
-
-
|
|
188
|
+
- v1.2.0: Calibrated packs promoted to default entry. Auto-selection, mismatch detection, alternative suggestion, free-routing fallback. 246 tests.
|
|
189
|
+
- **Current**: Outcome calibration, mixed-task decomposition, composite execution, adaptive replanning. 317 tests.
|
|
185
190
|
|
|
186
191
|
## License
|
|
187
192
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "role-os",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Role OS — a multi-Claude operating system where 31 specialized roles execute work through contracts, conflict detection, escalation, and structured evidence. 7 proven team packs for common task families.",
|
|
5
5
|
"homepage": "https://mcp-tool-shop-org.github.io/role-os/",
|
|
6
6
|
"bugs": {
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Outcome Calibration — Phase M
|
|
3
|
+
*
|
|
4
|
+
* Records outcome signals from real runs and tunes routing/pack
|
|
5
|
+
* selection from results. The learning loop improves selection
|
|
6
|
+
* without overriding hard safety constraints.
|
|
7
|
+
*
|
|
8
|
+
* What it tunes:
|
|
9
|
+
* - Pack suggestion weights (which pack for which signals)
|
|
10
|
+
* - Confidence thresholds (when to suggest vs fall back)
|
|
11
|
+
* - Role scoring boosts (from successful chain patterns)
|
|
12
|
+
*
|
|
13
|
+
* What it NEVER overrides:
|
|
14
|
+
* - Hard mismatch guards
|
|
15
|
+
* - Conflict detection rules
|
|
16
|
+
* - Escalation honesty
|
|
17
|
+
* - Evidence requirements
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync } from "node:fs";
|
|
21
|
+
import { join } from "node:path";
|
|
22
|
+
|
|
23
|
+
// ── Outcome record shape ──────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* @typedef {Object} RunOutcome
|
|
27
|
+
* @property {string} runId - Unique run identifier
|
|
28
|
+
* @property {string} timestamp - ISO timestamp
|
|
29
|
+
* @property {string} packetFile - Path to the packet
|
|
30
|
+
* @property {string} detectedType - feature/integration/identity
|
|
31
|
+
* @property {string|null} suggestedPack - Pack auto-suggested (or null)
|
|
32
|
+
* @property {string} suggestedConfidence - high/medium/low
|
|
33
|
+
* @property {string|null} selectedPack - Pack actually used (or null for free routing)
|
|
34
|
+
* @property {boolean} operatorOverride - Did operator change the suggestion?
|
|
35
|
+
* @property {boolean} mismatchRedirect - Was a mismatch guard triggered?
|
|
36
|
+
* @property {string|null} mismatchFrom - Pack that was rejected
|
|
37
|
+
* @property {string|null} mismatchTo - Pack that was suggested instead
|
|
38
|
+
* @property {number} chainLength - Number of roles in the chain
|
|
39
|
+
* @property {number} escalations - Number of escalation events
|
|
40
|
+
* @property {number} rejectedVerdicts - Number of rejected verdicts
|
|
41
|
+
* @property {number} corrections - Number of operator corrections
|
|
42
|
+
* @property {string} completionStatus - completed/blocked/failed/abandoned
|
|
43
|
+
* @property {string[]} rolesUsed - Roles that participated
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
// ── Ledger ────────────────────────────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
const LEDGER_DIR = ".claude/calibration";
const LEDGER_FILE = "outcome-ledger.jsonl";

/**
 * Append one run outcome to the JSONL outcome ledger.
 *
 * The ledger directory is created on demand. Each call writes exactly one
 * line: the outcome's own fields plus a `recordedAt` ISO timestamp taken at
 * write time (which replaces any `recordedAt` already on the outcome).
 *
 * @param {RunOutcome} outcome - Outcome to persist
 * @param {string} [cwd] - Directory the ledger path is resolved against (default: process.cwd())
 */
export function recordOutcome(outcome, cwd = process.cwd()) {
  const ledgerDir = join(cwd, LEDGER_DIR);
  const ledgerPath = join(ledgerDir, LEDGER_FILE);

  mkdirSync(ledgerDir, { recursive: true });

  const recordedAt = new Date().toISOString();
  const entry = { ...outcome, recordedAt };

  // Flag "a" appends, so previously written entries are never rewritten.
  writeFileSync(ledgerPath, `${JSON.stringify(entry)}\n`, { flag: "a" });
}
|
|
64
|
+
|
|
65
|
+
/**
 * Load every recorded outcome from the JSONL ledger.
 *
 * Reading is best-effort: a missing ledger yields an empty list, and lines
 * that are blank, not valid JSON, or not JSON objects are skipped so a single
 * damaged entry cannot poison downstream calibration math.
 *
 * @param {string} [cwd] - Directory the ledger path is resolved against (default: process.cwd())
 * @returns {RunOutcome[]} Parsed outcomes in ledger (append) order
 * @throws {Error} Any read failure other than the ledger not existing
 */
export function readOutcomes(cwd = process.cwd()) {
  const path = join(cwd, LEDGER_DIR, LEDGER_FILE);

  let raw;
  try {
    // Read directly rather than existsSync-then-read: the file can disappear
    // between the check and the read, which would turn "no ledger" into a crash.
    raw = readFileSync(path, "utf-8");
  } catch (err) {
    // ENOENT: ledger not created yet. ENOTDIR: a path component is a file.
    // Both mean "no ledger here", matching the previous existsSync() result.
    if (err && (err.code === "ENOENT" || err.code === "ENOTDIR")) return [];
    throw err;
  }

  const outcomes = [];
  for (const line of raw.split("\n")) {
    if (!line.trim()) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Deliberate: skip a torn or hand-edited line instead of failing the read.
      continue;
    }

    // Valid JSON is not necessarily a record: `42`, `"text"` or `[1,2]` parse
    // fine but are not outcomes, so only plain objects are returned.
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      outcomes.push(parsed);
    }
  }
  return outcomes;
}
|
|
84
|
+
|
|
85
|
+
// ── Calibration analysis ──────────────────────────────────────────────────────
|
|
86
|
+
|
|
87
|
+
/**
 * Compute calibration metrics from a list of run outcomes.
 *
 * Rates are whole-number percentages (0–100); averages are rounded to one
 * decimal place. `highConfidenceAccuracy` is 100 when there are runs but none
 * carried a high-confidence suggestion, and 0 when there are no runs at all.
 *
 * Outcomes are treated as untrusted data (they typically come from a ledger
 * file): missing or non-numeric counters count as 0 instead of turning the
 * totals into NaN, and pack names are aggregated in a Map so a name such as
 * "constructor" cannot collide with an inherited object property.
 *
 * @param {RunOutcome[]} outcomes
 * @returns {{
 *   totalRuns: number,
 *   packUsageRate: number,
 *   freeRoutingRate: number,
 *   highConfidenceAccuracy: number,
 *   operatorOverrideRate: number,
 *   mismatchRedirectRate: number,
 *   avgEscalations: number,
 *   avgCorrections: number,
 *   completionRate: number,
 *   packPerformance: Record<string, object>,
 *   suggestions: Array<{ type: string, detail: string }>
 * }} Report; `packPerformance` is keyed by pack name, with "free-routing"
 *   collecting runs that used no pack.
 */
export function computeCalibration(outcomes) {
  if (!Array.isArray(outcomes) || outcomes.length === 0) {
    return {
      totalRuns: 0,
      packUsageRate: 0,
      freeRoutingRate: 0,
      highConfidenceAccuracy: 0,
      operatorOverrideRate: 0,
      mismatchRedirectRate: 0,
      avgEscalations: 0,
      avgCorrections: 0,
      completionRate: 0,
      packPerformance: {},
      suggestions: [],
    };
  }

  // A counter that is absent or not a finite number contributes nothing.
  const asCount = (value) => (typeof value === "number" && Number.isFinite(value) ? value : 0);
  const toPercent = (part, whole) => Math.round((part / whole) * 100);
  const toOneDecimal = (value) => Math.round(value * 10) / 10;

  const total = outcomes.length;
  const packUsed = outcomes.filter((o) => o.selectedPack);
  const freeRouted = outcomes.filter((o) => !o.selectedPack);
  const highConf = outcomes.filter((o) => o.suggestedConfidence === "high");
  const highConfCorrect = highConf.filter(
    (o) => o.selectedPack === o.suggestedPack && !o.operatorOverride
  );
  const overrides = outcomes.filter((o) => o.operatorOverride);
  const redirects = outcomes.filter((o) => o.mismatchRedirect);
  const completed = outcomes.filter((o) => o.completionStatus === "completed");

  // Per-pack performance, aggregated in a Map (see header note on key safety).
  const perPack = new Map();
  for (const o of outcomes) {
    const key = String(o.selectedPack || "free-routing");
    let stats = perPack.get(key);
    if (!stats) {
      stats = {
        runs: 0,
        completed: 0,
        escalations: 0,
        corrections: 0,
        avgChainLength: 0,
        totalChainLength: 0,
      };
      perPack.set(key, stats);
    }
    stats.runs++;
    if (o.completionStatus === "completed") stats.completed++;
    stats.escalations += asCount(o.escalations);
    stats.corrections += asCount(o.corrections);
    stats.totalChainLength += asCount(o.chainLength);
  }
  for (const stats of perPack.values()) {
    // runs is always >= 1 here: an entry is only created when a run is counted.
    stats.avgChainLength = toOneDecimal(stats.totalChainLength / stats.runs);
    stats.completionRate = toPercent(stats.completed, stats.runs);
  }
  // Object.fromEntries defines own data properties, so any key is safe to emit.
  const packPerformance = Object.fromEntries(perPack);

  // Generate suggestions
  const suggestions = [];

  // If high-confidence accuracy is below 80% (with at least 3 samples), suggest tightening
  const highConfAccuracy = highConf.length > 0
    ? toPercent(highConfCorrect.length, highConf.length)
    : 100;
  if (highConfAccuracy < 80 && highConf.length >= 3) {
    suggestions.push({
      type: "tighten-confidence",
      detail: `High-confidence accuracy is ${highConfAccuracy}% (${highConfCorrect.length}/${highConf.length}). Consider raising the confidence threshold.`,
    });
  }

  // If operator override rate is above 20% (with at least 5 runs), suggest review
  const overrideRate = toPercent(overrides.length, total);
  if (overrideRate > 20 && total >= 5) {
    suggestions.push({
      type: "review-suggestions",
      detail: `Operator override rate is ${overrideRate}%. Pack suggestions may be misaligned with operator intent.`,
    });
  }

  // If a specific pack averages more than one escalation per run, flag it
  for (const [key, stats] of Object.entries(packPerformance)) {
    if (stats.runs >= 3 && stats.escalations / stats.runs > 1) {
      suggestions.push({
        type: "pack-escalation-concern",
        detail: `Pack "${key}" averages ${(stats.escalations / stats.runs).toFixed(1)} escalations per run. May need retuning.`,
      });
    }
  }

  return {
    totalRuns: total,
    packUsageRate: toPercent(packUsed.length, total),
    freeRoutingRate: toPercent(freeRouted.length, total),
    highConfidenceAccuracy: highConfAccuracy,
    operatorOverrideRate: overrideRate,
    mismatchRedirectRate: toPercent(redirects.length, total),
    avgEscalations: toOneDecimal(
      outcomes.reduce((sum, o) => sum + asCount(o.escalations), 0) / total
    ),
    avgCorrections: toOneDecimal(
      outcomes.reduce((sum, o) => sum + asCount(o.corrections), 0) / total
    ),
    completionRate: toPercent(completed.length, total),
    packPerformance,
    suggestions,
  };
}
|
|
194
|
+
|
|
195
|
+
// ── Weight tuning ─────────────────────────────────────────────────────────────
|
|
196
|
+
|
|
197
|
+
/**
 * Compute per-pack score boosts from clean, successful runs.
 *
 * Rules:
 * - Only runs that used a pack, completed, and had exactly 0 corrections count
 *   (a run with a missing `corrections` field is not considered clean)
 * - Each clean run adds +0.5 to its pack's boost
 * - A pack's boost never exceeds +2.0
 * - Packs with no clean runs are absent from the result
 *
 * Boosts are tallied in a Map so a pack name such as "constructor" cannot
 * collide with an inherited object property and produce NaN.
 *
 * @param {RunOutcome[]} outcomes
 * @returns {Record<string, number>} packName → boost amount
 */
export function computePackBoosts(outcomes) {
  const BOOST_PER_CLEAN_RUN = 0.5;
  const MAX_BOOST = 2.0;

  const tally = new Map();
  if (!Array.isArray(outcomes)) return Object.fromEntries(tally);

  for (const o of outcomes) {
    const isClean =
      o.selectedPack && o.completionStatus === "completed" && o.corrections === 0;
    if (!isClean) continue;

    const pack = String(o.selectedPack);
    const boosted = (tally.get(pack) || 0) + BOOST_PER_CLEAN_RUN;
    tally.set(pack, Math.min(boosted, MAX_BOOST));
  }

  // Object.fromEntries defines own data properties, so any pack name is safe.
  return Object.fromEntries(tally);
}
|
|
223
|
+
|
|
224
|
+
/**
 * Recommend a confidence-threshold adjustment from recorded outcomes.
 *
 * - If more than `maxHighOverrideRate` of high-confidence suggestions were
 *   overridden by the operator, recommend raising the threshold (+1).
 * - Otherwise, if more than `minMediumAcceptRate` of medium-confidence
 *   suggestions were accepted unchanged, recommend lowering it (-1).
 * - Otherwise recommend no change (0).
 *
 * A rule only fires once its confidence bucket holds at least `minSamples`
 * runs. The "raise" rule is evaluated first and wins when both would fire.
 *
 * @param {RunOutcome[]} outcomes
 * @param {object} [options] - Decision thresholds; defaults match the built-in policy
 * @param {number} [options.minSamples=3] - Minimum runs in a bucket before its rule applies
 * @param {number} [options.maxHighOverrideRate=0.3] - Override fraction (0–1) above which to raise
 * @param {number} [options.minMediumAcceptRate=0.7] - Acceptance fraction (0–1) above which to lower
 * @returns {{ adjustment: number, reason: string }}
 */
export function computeConfidenceAdjustment(outcomes, options = {}) {
  const {
    minSamples = 3,
    maxHighOverrideRate = 0.3,
    minMediumAcceptRate = 0.7,
  } = options || {};

  // A missing or non-array ledger is treated as "no data yet", not an error.
  const runs = Array.isArray(outcomes) ? outcomes : [];

  const highConf = runs.filter((o) => o.suggestedConfidence === "high");
  const medConf = runs.filter((o) => o.suggestedConfidence === "medium");

  const highOverridden = highConf.filter((o) => o.operatorOverride).length;
  // "Accepted" means the suggested pack was used and the operator did not intervene.
  const medAccepted = medConf.filter(
    (o) => o.selectedPack === o.suggestedPack && !o.operatorOverride
  ).length;

  if (highConf.length >= minSamples && highOverridden / highConf.length > maxHighOverrideRate) {
    return {
      adjustment: +1,
      reason: `${Math.round(highOverridden / highConf.length * 100)}% of high-confidence suggestions were overridden. Recommend raising keyword threshold from 3 to 4 for "high."`,
    };
  }

  if (medConf.length >= minSamples && medAccepted / medConf.length > minMediumAcceptRate) {
    return {
      adjustment: -1,
      reason: `${Math.round(medAccepted / medConf.length * 100)}% of medium-confidence suggestions were accepted. Recommend lowering keyword threshold from 3 to 2 for "high."`,
    };
  }

  return { adjustment: 0, reason: "Confidence thresholds are well-calibrated." };
}
|
|
255
|
+
|
|
256
|
+
/**
 * Render a calibration report as plain multi-line text.
 *
 * Output is a headline-metrics section, then one line per pack when
 * `report.packPerformance` has entries, then either the list of calibration
 * suggestions or a note that none are needed. The text begins with a newline.
 *
 * @param {object} report - Report with the fields read below (metrics, packPerformance, suggestions)
 * @returns {string}
 */
export function formatCalibrationReport(report) {
  const out = [];

  out.push(`\nOutcome Calibration Report`);
  out.push(`─────────────────────────`);
  out.push(`Total runs: ${report.totalRuns}`);
  out.push(`Pack usage: ${report.packUsageRate}% | Free routing: ${report.freeRoutingRate}%`);
  out.push(`High-confidence accuracy: ${report.highConfidenceAccuracy}%`);
  out.push(`Operator override rate: ${report.operatorOverrideRate}%`);
  out.push(`Mismatch redirect rate: ${report.mismatchRedirectRate}%`);
  out.push(`Avg escalations: ${report.avgEscalations} | Avg corrections: ${report.avgCorrections}`);
  out.push(`Completion rate: ${report.completionRate}%`);

  const packRows = Object.entries(report.packPerformance);
  if (packRows.length > 0) {
    out.push(`\nPer-pack performance:`);
    packRows.forEach(([name, stats]) => {
      out.push(` ${name}: ${stats.runs} runs, ${stats.completionRate}% completion, ${stats.avgChainLength} avg chain, ${stats.escalations} escalations`);
    });
  }

  if (report.suggestions.length === 0) {
    out.push(`\nNo calibration adjustments needed.`);
  } else {
    out.push(`\nCalibration suggestions:`);
    report.suggestions.forEach((suggestion) => {
      out.push(` ! [${suggestion.type}] ${suggestion.detail}`);
    });
  }

  return out.join("\n");
}
|