role-os 2.6.0 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/README.es.md +185 -129
- package/README.fr.md +193 -137
- package/README.hi.md +191 -135
- package/README.it.md +186 -130
- package/README.ja.md +191 -135
- package/README.md +6 -18
- package/README.pt-BR.md +188 -132
- package/README.zh.md +192 -139
- package/bin/roleos.mjs +10 -0
- package/package.json +1 -1
- package/src/specialist/budget-consult.mjs +120 -0
- package/src/specialist/client.mjs +131 -0
- package/src/specialist/dispatch.mjs +237 -0
- package/src/specialist/events.mjs +56 -0
- package/src/specialist/gate.mjs +202 -0
- package/src/specialist/registry.mjs +219 -0
- package/src/specialist/shadow.mjs +122 -0
- package/src/specialist/state.mjs +125 -0
- package/src/specialist-cmd.mjs +378 -0
- package/starter-pack/policy/specialist-tier.md +288 -0
- package/starter-pack/schemas/specialist.md +155 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gate — the per-dispatch routing decision: specialist or Claude.
|
|
3
|
+
*
|
|
4
|
+
* Hides one secret family (Parnas): the routing math. Callers pass in registry state and
|
|
5
|
+
* runtime signals; the gate is a pure function. Fail-open is the default — any uncertainty,
|
|
6
|
+
* any malformed signal, any out-of-band condition routes to Claude.
|
|
7
|
+
*
|
|
8
|
+
* The OvA score is computed per-specialist (Verma & Nalisnick ICML 2022, arXiv:2202.03673);
|
|
9
|
+
* there is NO joint softmax across roles. The default v0.1 classifier is deterministic +
|
|
10
|
+
* embedding-similarity (cosine of input embedding to the version's exam_centroid). A trained
|
|
11
|
+
* classifier is an additive upgrade — drop in a different `scoreFn`.
|
|
12
|
+
*
|
|
13
|
+
* The reject conditions enforced here are R-tier policy gates from
|
|
14
|
+
* `starter-pack/policy/specialist-tier.md`:
|
|
15
|
+
* - shadow_probe_halt — sticky halt dominates everything
|
|
16
|
+
* - no_active_version — registry has no active pointer
|
|
17
|
+
* - uncertified_active_version — Reject 2 safety net (load-time also catches this)
|
|
18
|
+
* - quota_exhausted — Reject 3
|
|
19
|
+
* - ood — Reject 5
|
|
20
|
+
* - score_below_threshold — Reject 4
|
|
21
|
+
* - score_invalid — fail-open on a malformed classifier output
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { resolveActiveVersion } from "./registry.mjs";
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @typedef {object} GateDecision
|
|
28
|
+
* @property {"specialist"|"claude"} route where the dispatch goes
|
|
29
|
+
* @property {string} reason machine-readable reason
|
|
30
|
+
* @property {number} score OvA score (0 when not computed)
|
|
31
|
+
* @property {boolean} ood OOD check result
|
|
32
|
+
* @property {boolean} quotaOk quota check result
|
|
33
|
+
* @property {string} [detail] operator-facing explanation
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* @typedef {object} QuotaState
|
|
38
|
+
* @property {number} used dispatches taken by this specialist in the current window
|
|
39
|
+
* @property {number} window window size (dispatches)
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* @typedef {object} HaltState
|
|
44
|
+
* @property {boolean} halted
|
|
45
|
+
* @property {string} [reason]
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
/**
|
|
49
|
+
* @typedef {object} Classifier
|
|
50
|
+
* @property {(input:any, version:object, entry:object) => number} scoreFn returns OvA score in [0, 1]
|
|
51
|
+
* @property {(input:any, version:object, entry:object) => boolean} oodFn true if input is OOD for this specialist
|
|
52
|
+
*/
|
|
53
|
+
|
|
54
|
+
/**
 * Decide where this dispatch goes: the specialist or Claude.
 *
 * All signals arrive through `params`. Checks run in a fixed order and the first failing
 * check wins, producing a `route: "claude"` decision whose `reason` names the check:
 *   1. `no_registry_entry`          — `registryEntry` is missing
 *   2. `shadow_probe_halt`          — `haltState.halted` is truthy
 *   3. `registry_dangling_pointer`  — `resolveActiveVersion` threw
 *   4. `no_active_version`          — `resolveActiveVersion` returned nothing
 *   5. `uncertified_active_version` — the active version's `certified_level` is "L0"
 *   6. `quota_exhausted`            — `(used + 1) / window` exceeds `workload_quota`
 *   7. `ood_check_threw` / `ood`    — `oodFn` threw, or reported out-of-distribution
 *   8. `score_fn_threw` / `score_invalid` — `scoreFn` threw, or returned something that is
 *      not a finite number in [0, 1]
 *   9. `gate_threshold_invalid`     — the active version's `gate_threshold` is not a finite
 *      number in [0, 1]
 *  10. `score_below_threshold`      — score < gate_threshold
 * Only when every check passes is `route: "specialist"` returned.
 *
 * @param {object} params
 * @param {string} params.role
 * @param {*} params.input
 * @param {object} params.registryEntry  the specialist registry entry for `role`
 * @param {QuotaState} params.quotaState
 * @param {HaltState} params.haltState
 * @param {Classifier} [params.classifier]  defaults to `defaultClassifier`
 * @returns {GateDecision}
 */
export function gate({ role, input, registryEntry, quotaState, haltState, classifier = defaultClassifier }) {
  if (!registryEntry) {
    return failOpen("no_registry_entry", `role "${role}" has no specialist registry entry`);
  }
  if (haltState && haltState.halted) {
    return failOpen("shadow_probe_halt", haltState.reason || "specialist dispatch halted by shadow-probe disagreement");
  }
  let active;
  try {
    active = resolveActiveVersion(registryEntry);
  } catch (err) {
    // `err` may be a non-Error throwable; never let the catch handler itself throw.
    return failOpen("registry_dangling_pointer", err?.message ?? String(err));
  }
  if (!active) {
    return failOpen("no_active_version", `role "${role}" has no active specialist version`);
  }
  if (active.certified_level === "L0") {
    return failOpen("uncertified_active_version", `active version "${active.id}" is L0 (uncertified)`);
  }

  // Quota check (R3 in the policy): if accepting this dispatch would exceed the workload
  // quota, fail open. We compute share-if-added — the cap is "share AFTER this dispatch",
  // so the gate's decision itself is what enforces the cap (no race with later increments).
  // A missing/NaN quota or window makes the comparison false, which also fails open.
  const window = Math.max(1, quotaState?.window ?? 0);
  const used = Math.max(0, quotaState?.used ?? 0);
  const shareIfAdd = (used + 1) / window;
  const quotaOk = shareIfAdd <= registryEntry.workload_quota;
  if (!quotaOk) {
    return failOpen(
      "quota_exhausted",
      `would push specialist share to ${shareIfAdd.toFixed(3)} > quota ${registryEntry.workload_quota}`,
      { quotaOk: false },
    );
  }

  // OOD check (R5).
  let ood = false;
  try {
    ood = !!classifier.oodFn(input, active, registryEntry);
  } catch (err) {
    return failOpen("ood_check_threw", `oodFn threw: ${err?.message ?? String(err)}`);
  }
  if (ood) {
    return failOpen("ood", "input is out-of-distribution for the active specialist", { ood: true });
  }

  // OvA score (R4).
  let score;
  try {
    score = classifier.scoreFn(input, active, registryEntry);
  } catch (err) {
    return failOpen("score_fn_threw", `scoreFn threw: ${err?.message ?? String(err)}`);
  }
  if (typeof score !== "number" || !Number.isFinite(score) || score < 0 || score > 1) {
    return failOpen("score_invalid", `scoreFn returned ${score} (must be a finite number in [0, 1])`);
  }

  // A missing or NaN threshold would make `score < threshold` false and silently route to
  // the specialist. Treat a malformed threshold like any other malformed signal: fail open.
  const threshold = active.gate_threshold;
  if (typeof threshold !== "number" || !Number.isFinite(threshold) || threshold < 0 || threshold > 1) {
    return failOpen(
      "gate_threshold_invalid",
      `active version "${active.id}" has gate_threshold ${threshold} (must be a finite number in [0, 1])`,
      { score },
    );
  }
  if (score < threshold) {
    return failOpen(
      "score_below_threshold",
      `OvA score ${score.toFixed(3)} < gate_threshold ${threshold}`,
      { score },
    );
  }

  return {
    route: "specialist",
    reason: "ok",
    score,
    ood: false,
    quotaOk: true,
    detail: `routed to specialist (score=${score.toFixed(3)}, threshold=${threshold})`,
  };
}
|
|
139
|
+
|
|
140
|
+
/**
 * Build a decision that routes the dispatch to Claude.
 *
 * @param {string} reason     machine-readable reject reason
 * @param {string} detail     operator-facing explanation
 * @param {object} [overrides]  optional `score`, `ood`, `quotaOk` values; any that are
 *                              null/undefined fall back to 0, false and true respectively
 * @returns {GateDecision}
 */
function failOpen(reason, detail, overrides = {}) {
  const { score, ood, quotaOk } = overrides;
  const decision = { route: "claude", reason };
  decision.score = score ?? 0;
  decision.ood = ood ?? false;
  decision.quotaOk = quotaOk ?? true;
  decision.detail = detail;
  return decision;
}
|
|
150
|
+
|
|
151
|
+
// ── Default classifier — deterministic + embedding-similarity ─────────────────────────────────
|
|
152
|
+
|
|
153
|
+
/**
 * Default OvA score: cosine similarity of `input.embedding` to `version.exam_centroid`,
 * mapped from [-1, 1] to [0, 1].
 *
 * Returns 0 when the embedding or the centroid is missing, empty, or the two differ in
 * length. Inputs that do not carry an embedding therefore always score 0 until either
 *   (a) inputs are pre-embedded by the consumer, or
 *   (b) a different `scoreFn` replaces this default.
 *
 * Floating-point rounding can push a cosine marginally outside [-1, 1]; a finite mapped
 * score is clamped to [0, 1] so a perfect match is not reported as an out-of-range score.
 * A non-finite result (e.g. NaN from non-numeric vector elements) is returned unchanged so
 * the caller can still detect it as invalid.
 *
 * @param {*} input  expected to carry an `embedding` array
 * @param {object} version  expected to carry an `exam_centroid` array
 * @returns {number}
 */
export function defaultScoreFn(input, version /*, entry */) {
  const emb = input && Array.isArray(input.embedding) ? input.embedding : null;
  const centroid = Array.isArray(version.exam_centroid) ? version.exam_centroid : null;
  if (!emb || !centroid || emb.length === 0 || emb.length !== centroid.length) return 0;
  const score = (cosineSimilarity(emb, centroid) + 1) / 2;
  if (!Number.isFinite(score)) return score;
  return Math.min(1, Math.max(0, score));
}
|
|
170
|
+
|
|
171
|
+
/**
 * Default OOD check.
 *
 * When the embedding or the centroid is missing, empty, or the two differ in length, the
 * input cannot be measured and is reported as OOD. Otherwise the input is OOD unless its
 * cosine similarity to the centroid is at least `version.ood_floor` (default 0.4 when
 * `ood_floor` is absent, not a number, or NaN).
 *
 * The comparison is written as `!(sim >= floor)` on purpose: a NaN similarity (for example
 * from non-numeric vector elements) must count as OOD rather than slip through because
 * `NaN < floor` is false.
 *
 * @param {*} input  expected to carry an `embedding` array
 * @param {object} version  expected to carry an `exam_centroid` array and optional `ood_floor`
 * @returns {boolean} true when the input should be treated as out-of-distribution
 */
export function defaultOodFn(input, version /*, entry */) {
  const emb = input && Array.isArray(input.embedding) ? input.embedding : null;
  const centroid = Array.isArray(version.exam_centroid) ? version.exam_centroid : null;
  if (!emb || !centroid || emb.length === 0 || emb.length !== centroid.length) return true;
  const configured = version.ood_floor;
  const floor = typeof configured === "number" && !Number.isNaN(configured) ? configured : 0.4;
  const sim = cosineSimilarity(emb, centroid);
  return !(sim >= floor);
}
|
|
188
|
+
|
|
189
|
+
/**
 * The classifier `gate` uses when the caller passes none: `defaultScoreFn` for the OvA
 * score and `defaultOodFn` for the out-of-distribution check.
 */
export const defaultClassifier = { scoreFn: defaultScoreFn, oodFn: defaultOodFn };
|
|
190
|
+
|
|
191
|
+
/**
 * Cosine similarity of two equal-length numeric vectors, in [-1, 1].
 *
 * Returns 0 when either vector has zero norm. The loop runs over `a.length`, so callers
 * must pass vectors of the same length. Non-numeric elements propagate as NaN.
 *
 * The denominator is `sqrt(na) * sqrt(nb)` rather than `sqrt(na * nb)`: the product of the
 * two squared norms can overflow to Infinity or underflow to 0 even when each norm is
 * representable, which would make identical vectors score 0 or Infinity. The result is
 * clamped so rounding cannot push it outside the documented range.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number}
 */
export function cosineSimilarity(a, b) {
  let dot = 0;
  let na = 0;
  let nb = 0;
  for (let i = 0; i < a.length; i++) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    na += x * x;
    nb += y * y;
  }
  if (na === 0 || nb === 0) return 0;
  const sim = dot / (Math.sqrt(na) * Math.sqrt(nb));
  // Math.min/Math.max pass NaN through, so an invalid input stays detectable.
  return Math.min(1, Math.max(-1, sim));
}
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Specialist registry — load + validate `.role-os/specialists.json`.
|
|
3
|
+
*
|
|
4
|
+
* Hides one secret family (Parnas CACM 1972): the on-disk schema and the reject conditions
|
|
5
|
+
* R1-R7 from `starter-pack/schemas/specialist.md`. Callers see a Map<role, entry> and the
|
|
6
|
+
* resolved active version; they never parse the file themselves.
|
|
7
|
+
*
|
|
8
|
+
* Reject 1 (same-family base) and R3/R4 (id collisions / dangling pointer) are correctness
|
|
9
|
+
* invariants — no bypass flag. R2 (uncertified active) is also a load-time reject so a
|
|
10
|
+
* misedited file cannot route to L0.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
|
|
14
|
+
import { dirname } from "node:path";
|
|
15
|
+
|
|
16
|
+
export const REGISTRY_SCHEMA = "roleos-specialist-registry/v1";
|
|
17
|
+
|
|
18
|
+
/** Known Claude-family base-model prefixes — refused by R1 at load. Conservative; widen as needed. */
const CLAUDE_FAMILY_PREFIXES = [
  "claude-",
  "anthropic/",
  "anthropic.",
];

/** Characters that separate a provider/region qualifier from the model name in a model id. */
const MODEL_ID_SEPARATORS = "/:.";

/**
 * Report whether `baseModel` names a Claude-family model.
 *
 * The id is trimmed and lower-cased, then matched against `CLAUDE_FAMILY_PREFIXES` at the
 * start of the string and immediately after every `/`, `:` or `.` separator. Matching
 * after separators is what catches provider- or region-qualified ids such as
 * `us.anthropic.claude-3-5-sonnet` or `openrouter/anthropic/claude-3`, which a plain
 * start-of-string check lets through.
 *
 * @param {*} baseModel
 * @returns {boolean} false for any non-string input
 */
export function isClaudeFamily(baseModel) {
  if (typeof baseModel !== "string") return false;
  const id = baseModel.trim().toLowerCase();
  const prefixAt = (start) => CLAUDE_FAMILY_PREFIXES.some((prefix) => id.startsWith(prefix, start));
  if (prefixAt(0)) return true;
  for (let i = 0; i < id.length; i++) {
    if (MODEL_ID_SEPARATORS.includes(id[i]) && prefixAt(i + 1)) return true;
  }
  return false;
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* @typedef {object} SpecialistVersion
|
|
33
|
+
* @property {string} id
|
|
34
|
+
* @property {string} adapter_id
|
|
35
|
+
* @property {string} base_model
|
|
36
|
+
* @property {number} gate_threshold
|
|
37
|
+
* @property {string} certified_level
|
|
38
|
+
* @property {string} exam_hash
|
|
39
|
+
* @property {number} field_audit_window
|
|
40
|
+
* @property {string} created_at
|
|
41
|
+
* @property {string} [notes]
|
|
42
|
+
* @property {number[]} [exam_centroid] optional pre-computed centroid for embedding-similarity scoring
|
|
43
|
+
*/
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* @typedef {object} SpecialistEntry
|
|
47
|
+
* @property {string} role
|
|
48
|
+
* @property {string} backend_url
|
|
49
|
+
* @property {string} fallback must be "claude" in v0.1
|
|
50
|
+
* @property {number} workload_quota in (0, 1]
|
|
51
|
+
* @property {string|null} active_version id from versions[], or null
|
|
52
|
+
* @property {SpecialistVersion[]} versions
|
|
53
|
+
*/
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* @typedef {object} Registry
|
|
57
|
+
* @property {string} schema
|
|
58
|
+
* @property {SpecialistEntry[]} specialists
|
|
59
|
+
*/
|
|
60
|
+
|
|
61
|
+
/**
 * Read and validate the registry file at `path`.
 *
 * Outcomes:
 *   - file absent: an empty registry, an empty `byRole` map and no errors (having no
 *     specialists deployed is a normal state, not a failure);
 *   - file unreadable or not valid JSON: an empty registry with one
 *     `registry parse error: …` entry in `errors`;
 *   - file parses but `validateRegistry` rejects it: an empty registry with the
 *     validator's errors (a partially valid registry is never exposed);
 *   - file valid: the parsed registry plus a `byRole` map keyed by each entry's `role`.
 *
 * @param {string} path
 * @returns {{ registry: Registry, byRole: Map<string, SpecialistEntry>, errors: string[] }}
 */
export function loadRegistry(path) {
  // Every non-success outcome shares this shape; only the error list differs.
  const rejected = (errors) => ({ registry: emptyRegistry(), byRole: new Map(), errors });

  if (!existsSync(path)) return rejected([]);

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(path, "utf8"));
  } catch (err) {
    return rejected([`registry parse error: ${err.message}`]);
  }

  const verdict = validateRegistry(parsed);
  if (!verdict.ok) return rejected(verdict.errors);

  const byRole = new Map(parsed.specialists.map((entry) => [entry.role, entry]));
  return { registry: parsed, byRole, errors: [] };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Validate `raw` and write it to `path` as pretty-printed JSON with a trailing newline.
 *
 * The parent directory is created if needed. The file is written with a single
 * `writeFileSync` call (no temp-file-and-rename step).
 *
 * @param {string} path
 * @param {Registry} raw
 * @throws {Error} with `code === "REGISTRY_INVALID"` and an `errors` array when
 *   `validateRegistry` rejects `raw`; nothing is written in that case
 */
export function saveRegistry(path, raw) {
  const verdict = validateRegistry(raw);
  if (verdict.ok) {
    mkdirSync(dirname(path), { recursive: true });
    writeFileSync(path, `${JSON.stringify(raw, null, 2)}\n`, "utf8");
    return;
  }
  const failure = new Error(`refusing to save invalid registry: ${verdict.errors.join("; ")}`);
  throw Object.assign(failure, { code: "REGISTRY_INVALID", errors: verdict.errors });
}
|
|
103
|
+
|
|
104
|
+
/**
 * Build a fresh registry with the current schema identifier and no specialists.
 * Each call returns new objects, so callers may mutate the result freely.
 *
 * @returns {Registry}
 */
export function emptyRegistry() {
  const specialists = [];
  return { schema: REGISTRY_SCHEMA, specialists };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Check a parsed registry and collect every problem found.
 *
 * Structure checked here: the top-level `schema` (R7) and `specialists` array; per entry
 * the `role` (non-empty, unique), `backend_url`, `fallback` (must be "claude"),
 * `workload_quota` in (0, 1] (R6) and `versions` array; per version the result of
 * `validateVersion` plus id uniqueness within the role (R3); and finally that
 * `active_version`, when set, is a string naming an existing version (R4) that is not
 * certified at "L0" (R2).
 *
 * Validation stops early only where continuing is impossible: a non-object registry, a
 * non-array `specialists`, a non-object entry, or a non-array `versions`.
 *
 * @param {*} raw  the parsed registry document
 * @returns {{ ok: boolean, errors: string[] }} `ok` is true exactly when `errors` is empty
 */
export function validateRegistry(raw) {
  if (!raw || typeof raw !== "object") {
    return { ok: false, errors: ["registry is not an object"] };
  }

  const errors = [];
  const report = (message) => {
    errors.push(message);
  };

  // R7: schema major version
  if (raw.schema !== REGISTRY_SCHEMA) {
    report(`R7: schema mismatch — got "${raw.schema}", expected "${REGISTRY_SCHEMA}"`);
  }
  if (!Array.isArray(raw.specialists)) {
    report("registry.specialists must be an array");
    return { ok: false, errors };
  }

  const rolesSoFar = new Set();
  for (const [index, entry] of raw.specialists.entries()) {
    const roleSuffix = entry && entry.role ? ` (role="${entry.role}")` : "";
    const tag = `specialists[${index}]${roleSuffix}`;
    if (!entry || typeof entry !== "object") {
      report(`${tag}: not an object`);
      continue;
    }

    const { role, versions } = entry;
    if (typeof role !== "string" || !role) report(`${tag}: role must be a non-empty string`);
    if (role) {
      if (rolesSoFar.has(role)) report(`${tag}: duplicate role`);
      rolesSoFar.add(role);
    }

    const backendUrl = entry.backend_url;
    if (typeof backendUrl !== "string" || !backendUrl) {
      report(`${tag}: backend_url must be a non-empty string`);
    }
    if (entry.fallback !== "claude") {
      report(`${tag}: fallback must be "claude" in v0.1 (got "${entry.fallback}")`);
    }

    // R6
    const quota = entry.workload_quota;
    const quotaInRange = typeof quota === "number" && quota > 0 && quota <= 1;
    if (!quotaInRange) {
      report(`${tag}: R6 — workload_quota must be in (0, 1] (got ${quota})`);
    }

    if (!Array.isArray(versions)) {
      report(`${tag}: versions must be an array`);
      continue;
    }

    // R3: id collisions inside versions[]
    const idsSoFar = new Set();
    for (const [position, version] of versions.entries()) {
      const idSuffix = version && version.id ? ` (id="${version.id}")` : "";
      const vtag = `${tag}.versions[${position}]${idSuffix}`;
      errors.push(...validateVersion(version, vtag));
      if (!version || typeof version.id !== "string") continue;
      if (idsSoFar.has(version.id)) report(`${vtag}: R3 — duplicate version id within role`);
      idsSoFar.add(version.id);
    }

    // R4: active_version must appear in versions[] (or be null)
    const activeId = entry.active_version;
    if (activeId === null || activeId === undefined) continue;
    if (typeof activeId !== "string") {
      report(`${tag}: active_version must be a string or null`);
      continue;
    }
    if (!idsSoFar.has(activeId)) {
      report(`${tag}: R4 — active_version "${activeId}" not found in versions[]`);
      continue;
    }

    // R2: active_version must point to a certified (non-L0) version
    const active = versions.find((candidate) => candidate && candidate.id === activeId);
    if (active && active.certified_level === "L0") {
      report(`${tag}: R2 — active_version "${activeId}" is L0 (uncertified); cannot be active`);
    }
  }

  return { ok: errors.length === 0, errors };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Validate one entry of a specialist's `versions[]` and return its error messages.
 *
 * Checks: `id`, `adapter_id`, `base_model`, `certified_level`, `exam_hash` and
 * `created_at` are non-empty strings; `base_model` is not Claude-family (R1);
 * `gate_threshold` is a number in [0, 1] (R5); `field_audit_window` is a positive finite
 * number; `exam_centroid`, when present, is an array whose elements are all finite numbers.
 *
 * `field_audit_window` is tested with `Number.isFinite` because NaN and Infinity are
 * `typeof "number"` and are not `<= 0`, so a plain type-and-sign check accepts them. The
 * centroid elements are checked because a non-numeric element makes every similarity
 * computed against it NaN.
 *
 * @param {*} v  the candidate version
 * @param {string} tag  prefix identifying the version in error messages
 * @returns {string[]} one message per problem; empty when the version is valid
 */
function validateVersion(v, tag) {
  const errors = [];
  if (!v || typeof v !== "object") return [`${tag}: not an object`];
  if (typeof v.id !== "string" || !v.id) errors.push(`${tag}: id must be a non-empty string`);
  if (typeof v.adapter_id !== "string" || !v.adapter_id) errors.push(`${tag}: adapter_id must be a non-empty string`);
  if (typeof v.base_model !== "string" || !v.base_model) {
    errors.push(`${tag}: base_model must be a non-empty string`);
  } else if (isClaudeFamily(v.base_model)) {
    // R1: same-family base is a correctness regression, not a routing preference.
    errors.push(`${tag}: R1 — base_model "${v.base_model}" is Claude-family; specialists must be cross-family`);
  }
  // R5 (NaN fails both comparisons and is rejected here)
  if (typeof v.gate_threshold !== "number" || !(v.gate_threshold >= 0 && v.gate_threshold <= 1)) {
    errors.push(`${tag}: R5 — gate_threshold must be in [0, 1] (got ${v.gate_threshold})`);
  }
  if (typeof v.certified_level !== "string" || !v.certified_level) {
    errors.push(`${tag}: certified_level must be a non-empty string (e.g. "L0", "L1", …)`);
  }
  if (typeof v.exam_hash !== "string" || !v.exam_hash) errors.push(`${tag}: exam_hash must be a non-empty string`);
  // Number.isFinite is false for non-numbers, NaN and ±Infinity alike.
  if (!Number.isFinite(v.field_audit_window) || v.field_audit_window <= 0) {
    errors.push(`${tag}: field_audit_window must be a positive number`);
  }
  if (typeof v.created_at !== "string" || !v.created_at) errors.push(`${tag}: created_at must be a non-empty ISO-8601 string`);
  if (v.exam_centroid !== undefined) {
    const centroidOk = Array.isArray(v.exam_centroid) && v.exam_centroid.every((x) => Number.isFinite(x));
    if (!centroidOk) {
      errors.push(`${tag}: exam_centroid, if present, must be an array of numbers`);
    }
  }
  return errors;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Look up the version object that `entry.active_version` names.
 *
 * @param {SpecialistEntry} entry
 * @returns {SpecialistVersion|null} null when `entry` is missing or has no active version
 * @throws {Error} with `code === "REGISTRY_DANGLING_POINTER"` when the active id matches
 *   no element of `entry.versions`
 */
export function resolveActiveVersion(entry) {
  const activeId = entry ? entry.active_version : undefined;
  if (!activeId) return null;

  const match = entry.versions.find((candidate) => candidate && candidate.id === activeId);
  if (match) return match;

  const dangling = new Error(`active_version "${activeId}" not found in versions[]`);
  dangling.code = "REGISTRY_DANGLING_POINTER";
  throw dangling;
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shadow-probe — every Kth specialist dispatch also runs the Claude path; the two verdicts
|
|
3
|
+
* are compared and logged. Over a rolling window of N probes, if agreement falls below
|
|
4
|
+
* `1 - τ`, the role is halted (andon).
|
|
5
|
+
*
|
|
6
|
+
* Hides one secret family: the probe schedule + halt math. Callers see `shouldShadowProbe`,
|
|
7
|
+
* `recordProbe`, and `checkHalt`; they don't compute K-th or read JSONL by hand.
|
|
8
|
+
*
|
|
9
|
+
* Defaults (per `policy/specialist-tier.md`):
|
|
10
|
+
* - K (probe-every-N) = 20
|
|
11
|
+
* - N (rolling window) = 50
|
|
12
|
+
* - τ (disagreement threshold) = 0.15 → halt when agreement rate < 0.85
|
|
13
|
+
*
|
|
14
|
+
* These defaults are configurable per-role via the registry entry (future) or globally via
|
|
15
|
+
* env (`ROLEOS_SHADOW_K`, `ROLEOS_SHADOW_N`, `ROLEOS_SHADOW_TAU`).
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { readEvents, appendEvent } from "./events.mjs";
|
|
19
|
+
|
|
20
|
+
/** Default shadow-probe parameters: probe interval K, rolling window N, disagreement threshold TAU. */
export const SHADOW_DEFAULTS = {
  K: 20,
  N: 50,
  TAU: 0.15,
};

/**
 * Decide whether the dispatch we're about to make should also fire a shadow probe.
 *
 * The probe fires once the counter has reached K (`counter >= K`), so a counter that
 * overshoots K still triggers a probe. The function does not modify the counter; the
 * caller maintains it.
 *
 * A `K` that does not convert to a number (for example an unparsable configuration value)
 * falls back to `SHADOW_DEFAULTS.K`. Without that fallback `counter >= NaN` is always
 * false and probing would be silently switched off.
 *
 * @param {number} counter  dispatches since the last shadow probe
 * @param {number} [K]      probe interval
 * @returns {boolean}
 */
export function shouldShadowProbe(counter, K = SHADOW_DEFAULTS.K) {
  const requested = Number(K);
  const interval = Number.isNaN(requested) ? SHADOW_DEFAULTS.K : requested;
  return counter >= interval;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Append one `shadow-probe` event describing a probe outcome.
 *
 * The event's `data` always carries `trace_id` and `agreed` (coerced to a boolean). The
 * two summary fields are included only when they are truthy, so empty summaries leave no
 * key behind in the log.
 *
 * @param {string} eventsPath
 * @param {object} probe
 * @param {string} probe.role
 * @param {string} probe.trace_id
 * @param {boolean} probe.agreed
 * @param {string} [probe.specialist_summary]
 * @param {string} [probe.claude_summary]
 * @param {string} probe.ts  caller-supplied timestamp; this function never reads the clock
 */
export function recordProbe(eventsPath, probe) {
  const data = { trace_id: probe.trace_id, agreed: Boolean(probe.agreed) };
  if (probe.specialist_summary) data.specialist_summary = probe.specialist_summary;
  if (probe.claude_summary) data.claude_summary = probe.claude_summary;

  const event = { kind: "shadow-probe", role: probe.role, ts: probe.ts, data };
  appendEvent(eventsPath, event);
}
|
|
62
|
+
|
|
63
|
+
/**
 * Compute the agreement rate over the last N shadow-probe events for `role`.
 *
 * `shouldHalt` is true only when a full window of N probes exists and the agreement rate
 * is below `1 - tau`; a thinner sample never halts. With no probes the rate is reported
 * as 1. An event counts as agreeing only when `data.agreed` is exactly `true`.
 *
 * Malformed parameters fall back to `SHADOW_DEFAULTS` instead of being used as-is:
 *   - `N` must convert to a finite number >= 1 (fractions are floored). `N = 0` would make
 *     `slice(-0)` return the entire log and treat any sample as a full window; a NaN or
 *     fractional N would make `probes >= N` unreachable.
 *   - `tau` must convert to a number in [0, 1]. A NaN or > 1 value would make
 *     `rate < 1 - tau` unreachable.
 *
 * @param {string} eventsPath
 * @param {string} role
 * @param {number} [N]    rolling window size, in probes
 * @param {number} [tau]  tolerated disagreement fraction
 * @returns {{ probes: number, agreed: number, rate: number, shouldHalt: boolean }}
 */
export function checkHalt(eventsPath, role, N = SHADOW_DEFAULTS.N, tau = SHADOW_DEFAULTS.TAU) {
  const requestedN = Number(N);
  const windowSize = Number.isFinite(requestedN) && requestedN >= 1 ? Math.floor(requestedN) : SHADOW_DEFAULTS.N;
  const requestedTau = Number(tau);
  const tolerance = requestedTau >= 0 && requestedTau <= 1 ? requestedTau : SHADOW_DEFAULTS.TAU;

  const events = readEvents(eventsPath, { role, kind: "shadow-probe" });
  const window = events.slice(-windowSize);
  const probes = window.length;
  const agreed = window.filter((e) => e.data && e.data.agreed === true).length;
  const rate = probes === 0 ? 1 : agreed / probes;
  const shouldHalt = probes >= windowSize && rate < 1 - tolerance;
  return { probes, agreed, rate, shouldHalt };
}
|
|
85
|
+
|
|
86
|
+
/**
 * Compose the operator-facing halt message for a role.
 *
 * The message states the observed agreement percentage, how many probes it was measured
 * over, the required percentage derived from `tau`, and the command that clears the halt.
 * Percentages are rendered with one decimal place.
 *
 * @param {object} params
 * @param {string} params.role
 * @param {number} params.probes  number of probes in the window
 * @param {number} params.rate    agreement rate as a fraction
 * @param {number} params.tau     tolerated disagreement fraction
 * @returns {string}
 */
export function contrastiveHaltMessage({ role, probes, rate, tau }) {
  const asPercent = (fraction) => (fraction * 100).toFixed(1);
  const observed = asPercent(rate);
  const needed = asPercent(1 - tau);
  const parts = [
    `specialist for role "${role}" halted: shadow-probe agreement ${observed}% over the last `,
    `${probes} probes < required ${needed}% (τ=${tau}). The specialist's verdicts have `,
    `drifted from Claude's on the same inputs. Clear with: roleos specialist clear-halt ${role}`,
  ];
  return parts.join("");
}
|
|
100
|
+
|
|
101
|
+
/**
 * Append a `halt` event to the events log.
 *
 * This function only writes the event. It does not touch any state file, so the caller is
 * responsible for recording the halt in state as well.
 *
 * @param {string} eventsPath
 * @param {object} halt
 * @param {string} halt.role
 * @param {string} halt.ts      caller-supplied timestamp
 * @param {string} halt.reason
 * @param {number} halt.probes
 * @param {number} halt.agreed
 * @param {number} halt.rate
 * @param {number} halt.tau
 */
export function appendHaltEvent(eventsPath, { role, ts, reason, probes, agreed, rate, tau }) {
  const data = { reason, probes, agreed, rate, tau };
  const event = { kind: "halt", role, ts, data };
  appendEvent(eventsPath, event);
}
|
|
114
|
+
|
|
115
|
+
/**
 * Append a `clear-halt` event to the events log.
 *
 * @param {string} eventsPath
 * @param {object} clear
 * @param {string} clear.role
 * @param {string} clear.ts        caller-supplied timestamp
 * @param {string} clear.operator  who cleared the halt
 * @param {string} [clear.reason]  recorded as an empty string when falsy
 */
export function appendClearHaltEvent(eventsPath, { role, ts, operator, reason }) {
  const recordedReason = reason ? reason : "";
  const event = { kind: "clear-halt", role, ts, data: { operator, reason: recordedReason } };
  appendEvent(eventsPath, event);
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Specialist runtime state — quota counters + shadow-probe counter + halt state, per role.
|
|
3
|
+
*
|
|
4
|
+
* Hides one secret family (Parnas): the persistence of routing counters. Callers see
|
|
5
|
+
* `get/inc/setHalt/getHalt`; they never touch the on-disk format. The on-disk format is a
|
|
6
|
+
* single JSON file, intentionally simple so an operator can hand-edit it in a pinch.
|
|
7
|
+
*
|
|
8
|
+
* State default path: `<repo>/.role-os/specialist-state.json`. Override with
|
|
9
|
+
* `ROLEOS_SPECIALIST_STATE_PATH`.
|
|
10
|
+
*
|
|
11
|
+
* Quota: a sliding-window counter. We store the last `window` dispatch timestamps so the
|
|
12
|
+
* window is a true rolling window, not aligned to wall clock. (A wall-clock window can be
|
|
13
|
+
* timed against the edge, which the workload-quota anti-collapse argument is meant to
|
|
14
|
+
* prevent.)
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
|
|
18
|
+
import { dirname } from "node:path";
|
|
19
|
+
|
|
20
|
+
export const STATE_SCHEMA = "roleos-specialist-state/v1";
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* @typedef {object} RoleState
|
|
24
|
+
* @property {number[]} dispatch_timestamps sliding window of dispatch unix-ms
|
|
25
|
+
* @property {number} probe_counter count of dispatches since the last shadow probe
|
|
26
|
+
* @property {object|null} halt { reason, since } or null when not halted
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @typedef {object} StateFile
|
|
31
|
+
* @property {string} schema
|
|
32
|
+
* @property {Object<string, RoleState>} roles
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
/**
 * Build a fresh state object with the current schema identifier and no per-role slots.
 * Each call returns new objects.
 *
 * @returns {StateFile}
 */
export function emptyState() {
  const roles = {};
  return { schema: STATE_SCHEMA, roles };
}
|
|
38
|
+
|
|
39
|
+
/**
 * Load the specialist state file at `path`.
 *
 * A missing file yields a fresh empty state. An existing file must parse as JSON, carry
 * the expected `schema`, and have a `roles` value that is a plain (non-null, non-array)
 * object. The null and array cases are rejected explicitly because both are
 * `typeof "object"` and would otherwise be accepted, only to fail later when a role slot
 * is looked up or created.
 *
 * @param {string} path
 * @returns {StateFile}
 * @throws {Error} `code === "STATE_PARSE_ERROR"` when the file cannot be read or parsed
 * @throws {Error} `code === "STATE_SCHEMA_MISMATCH"` when the schema or `roles` shape is wrong
 */
export function loadState(path) {
  if (!existsSync(path)) return emptyState();

  let raw;
  try {
    raw = JSON.parse(readFileSync(path, "utf8"));
  } catch (err) {
    const wrapped = new Error(`state file parse error: ${err.message}`);
    wrapped.code = "STATE_PARSE_ERROR";
    throw wrapped;
  }

  // Refuse to silently accept a mis-shaped state file. Caller decides what to do.
  let problem = null;
  if (!raw || raw.schema !== STATE_SCHEMA) {
    problem = `state file schema mismatch: got "${raw && raw.schema}", expected "${STATE_SCHEMA}"`;
  } else if (raw.roles === null || typeof raw.roles !== "object" || Array.isArray(raw.roles)) {
    problem = 'state file schema mismatch: "roles" must be an object keyed by role';
  }
  if (problem !== null) {
    const err = new Error(problem);
    err.code = "STATE_SCHEMA_MISMATCH";
    throw err;
  }
  return raw;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Write `state` to `path` as pretty-printed JSON followed by a newline, creating the
 * parent directory first if it does not exist. No validation is performed on `state`.
 *
 * @param {string} path
 * @param {StateFile} state
 */
export function saveState(path, state) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  const serialized = `${JSON.stringify(state, null, 2)}\n`;
  writeFileSync(path, serialized, "utf8");
}
|
|
62
|
+
|
|
63
|
+
/**
 * Return the state slot for `role`, creating it when absent.
 *
 * A newly created slot has no dispatch timestamps, a probe counter of 0 and no halt. The
 * slot is stored on `state.roles`, so `state` is mutated when a slot is created.
 *
 * @param {StateFile} state
 * @param {string} role
 * @returns {RoleState}
 */
export function ensureRole(state, role) {
  const existing = state.roles[role];
  if (existing) return existing;

  const fresh = { dispatch_timestamps: [], probe_counter: 0, halt: null };
  state.roles[role] = fresh;
  return fresh;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Record a specialist dispatch for `role`.
 *
 * Appends `nowMs` to the role's `dispatch_timestamps` and, if the list is now longer than
 * `windowSize`, drops the oldest entries so only the newest `windowSize` remain. The
 * window is measured in dispatches, not in time. `state` is modified in place and the same
 * object is returned. The clock is never read here; the caller supplies `nowMs`.
 *
 * @param {StateFile} state
 * @param {string} role
 * @param {number} windowSize  maximum number of timestamps to retain
 * @param {number} nowMs       timestamp of this dispatch
 * @returns {StateFile} the same `state` object
 */
export function recordDispatch(state, role, windowSize, nowMs) {
  const { dispatch_timestamps: stamps } = ensureRole(state, role);
  stamps.push(nowMs);
  if (stamps.length > windowSize) {
    const overflow = stamps.length - windowSize;
    stamps.splice(0, overflow);
  }
  return state;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Build the quota view for `role`.
 *
 * `used` is the number of dispatch timestamps currently stored for the role (0 when the
 * role has no slot); `window` is `windowSize` passed through unchanged. `state` is not
 * modified and no slot is created.
 *
 * NOTE(review): this function only counts stored timestamps, it never expires them —
 * confirm against the caller how `used` is expected to fall once a role reaches its quota.
 *
 * @param {StateFile} state
 * @param {string} role
 * @param {number} windowSize
 * @returns {QuotaState}
 */
export function quotaStateFor(state, role, windowSize) {
  const slot = state.roles[role];
  let used = 0;
  if (slot) {
    used = slot.dispatch_timestamps.length;
  }
  return { used, window: windowSize };
}
|
|
100
|
+
|
|
101
|
+
/**
 * Add one to the role's probe counter, creating the role's slot if needed.
 *
 * @param {StateFile} state  modified in place
 * @param {string} role
 * @returns {number} the counter value after the increment
 */
export function incrementProbeCounter(state, role) {
  const slot = ensureRole(state, role);
  const next = slot.probe_counter + 1;
  slot.probe_counter = next;
  return next;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Set the role's probe counter back to 0, creating the role's slot if needed.
 *
 * @param {StateFile} state  modified in place
 * @param {string} role
 */
export function resetProbeCounter(state, role) {
  ensureRole(state, role).probe_counter = 0;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Read the halt status for `role` without modifying `state`.
 *
 * @param {StateFile} state
 * @param {string} role
 * @returns {{ halted: false } | { halted: true, reason: *, since: * }} `halted: false`
 *   when the role has no slot or no halt recorded; otherwise the recorded reason and
 *   start marker
 */
export function getHalt(state, role) {
  const halt = state.roles[role]?.halt;
  if (!halt) return { halted: false };

  const { reason, since } = halt;
  return { halted: true, reason, since };
}
|
|
117
|
+
|
|
118
|
+
/**
 * Set or clear the halt for `role`, creating the role's slot if needed.
 *
 * A truthy `halt` is stored as a new `{ reason, since }` object (other properties are not
 * copied); a falsy `halt` clears the slot to null.
 *
 * @param {StateFile} state  modified in place
 * @param {string} role
 * @param {{ reason: *, since: * }|null|undefined} halt
 */
export function setHalt(state, role, halt) {
  const slot = ensureRole(state, role);
  slot.halt = halt ? { reason: halt.reason, since: halt.since } : null;
}
|