@ijfw/memory-server 1.5.6 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +390 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +23 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/model-refresh.js +4 -2
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +194 -16
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
// Gate B v2 — Task T7: decision runner + gate. The honest verdict logic and the
|
|
2
|
+
// refuse-to-spend / fresh-seed / pilot-descriptive / guard-before-spend orchestration.
|
|
3
|
+
|
|
4
|
+
import { test } from 'node:test';
|
|
5
|
+
import assert from 'node:assert/strict';
|
|
6
|
+
import {
|
|
7
|
+
decideGateB, confirmatoryBooleans, runGateBDecision, deriveRealArmsCarried,
|
|
8
|
+
} from './gate-b-decision-run.mjs';
|
|
9
|
+
import { buildPreReg } from './prereg.mjs';
|
|
10
|
+
|
|
11
|
+
// Fixture: instrument valid, both VOID rails measured and clear, real authors carried it.
const VALID = {
  instrumentValid: true,
  baselinePasses: false,
  registerEchoPasses: false,
  realArmsCarried: true,
};

// Verdict label decideGateB returns for VALID with the given flags layered on top.
const verdictWith = (flags) => decideGateB({ ...VALID, ...flags }).verdict;

test('decideGateB: instrument invalid ⇒ NULL, no spend', () => {
  const { verdict } = decideGateB({ instrumentValid: false });
  assert.equal(verdict, 'NULL');
});

test('decideGateB: baseline or register-echo PASS ⇒ VOID (rig contaminated)', () => {
  for (const rail of ['baselinePasses', 'registerEchoPasses']) {
    assert.equal(verdictWith({ [rail]: true }), 'VOID');
  }
});

test('decideGateB: real authors did not carry it ⇒ NULL (synthetic cannot license)', () => {
  const verdict = verdictWith({ realArmsCarried: false, derivedPasses: true, oraclePasses: true });
  assert.equal(verdict, 'NULL');
});

test('decideGateB: derived passes ⇒ PASS (product win)', () => {
  assert.equal(verdictWith({ derivedPasses: true }), 'PASS');
});

test('decideGateB: only oracle passes ⇒ PASS_ORACLE → Phase 3 (NOT a cut)', () => {
  const outcome = decideGateB({ ...VALID, derivedPasses: false, oraclePasses: true });
  assert.equal(outcome.verdict, 'PASS_ORACLE');
  assert.equal(outcome.next, 'phase-3-exemplar-lever');
});

test('decideGateB: CUT licensed ONLY by few-shot-oracle NULL', () => {
  // oracle passes → never CUT
  assert.notEqual(verdictWith({ derivedPasses: false, oraclePasses: true }), 'CUT');
  // oracle nulls (and derived nulls) → CUT
  assert.equal(verdictWith({ derivedPasses: false, oraclePasses: false }), 'CUT');
});

test('confirmatoryBooleans: Bonferroni alpha BITES (arm must beat baseline at per-test alpha)', () => {
  const preReg = buildPreReg({}); // perTestAlpha derived=fewShotOracle=0.005
  const baseline = { verdict: { passes: false } };
  // directioned + significant
  const derived = { verdict: { passes: true }, vsBaseline: { beatsBaseline: true } };
  // fails direction/alpha
  const fewShotOracle = { verdict: { passes: true }, vsBaseline: { beatsBaseline: false } };
  const control = {
    registerEchoPasses: false,
    perArm: { baseline, derived, fewShotOracle },
  };
  const booleans = confirmatoryBooleans(control, preReg, { realArmsCarried: true });
  assert.equal(booleans.derivedPasses, true);
  assert.equal(booleans.oraclePasses, false, 'beatsBaseline=false ⇒ does not pass');
});

test('confirmatoryBooleans THROWS if the register-echo VOID rail was not measured', () => {
  const preReg = buildPreReg({});
  // no registerEchoPasses on the control object
  const control = { perArm: { baseline: { verdict: { passes: false } } } };
  const reduce = () => confirmatoryBooleans(control, preReg, { realArmsCarried: true });
  assert.throws(reduce, /register-echo/);
});

test('decideGateB THROWS if a safety rail is undefined (never silent-false)', () => {
  const decideWithoutRails = () => decideGateB({
    instrumentValid: true,
    baselinePasses: false,
    derivedPasses: true,
  });
  assert.throws(decideWithoutRails, /safety rail/);
});

test('deriveRealArmsCarried: synthetic personas cannot license; needs enough real decidable', () => {
  const subject = (id, isReal) => ({ id, headlineEligible: isReal, synthetic: !isReal });
  const real = [subject('r1', true), subject('r2', true)];
  const synth = [subject('s1', false), subject('s2', false)];
  assert.equal(deriveRealArmsCarried(real, ['r1', 'r2'], 2), true);
  assert.equal(deriveRealArmsCarried(synth, ['s1', 's2'], 1), false, 'synthetic forced false');
  assert.equal(deriveRealArmsCarried(real, ['r1'], 2), false, 'not enough real decidable');
});
|
|
78
|
+
|
|
79
|
+
// ---- orchestrator with injected fakes ----
|
|
80
|
+
/**
 * Build the injectable dependency bag for runGateBDecision, with call counters.
 *
 * @param {object} [overrides]
 * @param {object} [overrides.validation] - result returned by `validate()`; defaults to a passing one.
 * @param {(phase: string) => object} [overrides.measure] - per-phase measurement factory; when absent
 *   every phase reports the same all-clear, all-passing measurement.
 * @returns {object} `{ calls, buildPreReg, validate, guard, measure }`, where `calls` counts
 *   guard/measure invocations and logs `{ phase, seed }` per measure call.
 */
function makeDeps(overrides = {}) {
  const calls = { guard: 0, measure: 0, phases: [] };
  const passingValidation = { passes: true, betweenMean: 0.5, withinMean: 0.3 };
  const defaultMeasure = {
    baselinePasses: false,
    registerEchoPasses: false,
    derivedPasses: true,
    oraclePasses: true,
    realArmsCarried: true,
  };

  async function validate() {
    return overrides.validation ?? passingValidation;
  }

  async function guard() {
    calls.guard += 1;
  }

  async function measure({ seed, phase }) {
    calls.measure += 1;
    calls.phases.push({ phase, seed });
    const measured = overrides.measure ? overrides.measure(phase) : defaultMeasure;
    // measured fields are spread last, so an override may replace seed/phase too.
    return { seed, phase, ...measured };
  }

  return {
    calls,
    buildPreReg: (input) => buildPreReg(input),
    validate,
    guard,
    measure,
  };
}
|
|
97
|
+
|
|
98
|
+
test('REFUSES TO SPEND when validation fails (no guard, no measure)', async () => {
  const deps = makeDeps({ validation: { passes: false } });
  const r = await runGateBDecision(deps, {});
  assert.equal(r.spent, false);
  assert.equal(r.verdict.verdict, 'NULL');
  assert.equal(deps.calls.guard, 0);
  assert.equal(deps.calls.measure, 0);
});

test('guard is asserted BEFORE every spend phase (pilot + confirmatory)', async () => {
  // Counts alone cannot prove ordering, so snapshot how many guard calls had
  // already happened each time measure runs.
  const guardsSeenAtMeasure = [];
  const deps = makeDeps({
    measure: () => {
      guardsSeenAtMeasure.push(deps.calls.guard);
      return {
        baselinePasses: false, registerEchoPasses: false, derivedPasses: true, oraclePasses: true, realArmsCarried: true,
      };
    },
  });
  await runGateBDecision(deps, {});
  assert.equal(deps.calls.guard, 2);
  assert.equal(deps.calls.measure, 2);
  // The k-th measure must have been preceded by at least k guard calls.
  guardsSeenAtMeasure.forEach((seen, i) => {
    assert.ok(seen >= i + 1, `measure #${i + 1} ran before its guard`);
  });
});

test('confirmatory uses a FRESH seed distinct from the pilot', async () => {
  const deps = makeDeps();
  const r = await runGateBDecision(deps, {});
  assert.notEqual(r.seeds.pilotSeed, r.seeds.confirmSeed);
  const phases = Object.fromEntries(deps.calls.phases.map((p) => [p.phase, p.seed]));
  assert.notEqual(phases.pilot, phases.confirmatory);
});

test('pilot is DESCRIPTIVE: the verdict comes from confirmatory, not pilot', async () => {
  // pilot says everything passes; confirmatory says everything nulls → verdict must be CUT
  const deps = makeDeps({
    measure: (phase) => (phase === 'pilot'
      ? { baselinePasses: false, registerEchoPasses: false, derivedPasses: true, oraclePasses: true, realArmsCarried: true }
      : { baselinePasses: false, registerEchoPasses: false, derivedPasses: false, oraclePasses: false, realArmsCarried: true }),
  });
  const r = await runGateBDecision(deps, {});
  assert.equal(r.verdict.verdict, 'CUT', 'confirmatory NULL drives the verdict, pilot PASS ignored');
});

test('real-arm NULL ⇒ mission NULL regardless of (synthetic) pilot optimism', async () => {
  const deps = makeDeps({
    measure: () => ({
      baselinePasses: false, registerEchoPasses: false, derivedPasses: true, oraclePasses: true, realArmsCarried: false,
    }),
  });
  const r = await runGateBDecision(deps, {});
  assert.equal(r.verdict.verdict, 'NULL');
});
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
// gate-b-run.mjs — Gate B v2 PRODUCTION runner. The single executable that wires the
|
|
2
|
+
// already-built+green modules into one honest decision pipeline:
|
|
3
|
+
//
|
|
4
|
+
// validateInstrument (HARD GATE, no spend on fail)
|
|
5
|
+
// → buildPreReg + hashPreReg (frozen before any measure)
|
|
6
|
+
// → runGateBDecision(deps): pilot (descriptive) + FRESH-seed confirmatory
|
|
7
|
+
// measure(): runHarness (baseline/derived/fewShotOracle)
|
|
8
|
+
// + a spliced REGISTER-ECHO arm (the VOID rail's live input)
|
|
9
|
+
// + wrongTargetControl (the discriminator)
|
|
10
|
+
// + deriveRealArmsCarried (synthetic can never license the claim)
|
|
11
|
+
// → confirmatoryBooleans → decideGateB → TRUE yes/no verdict
|
|
12
|
+
//
|
|
13
|
+
// HONESTY RAILS (enforced + unit-tested in gate-b-run.test.mjs):
|
|
14
|
+
// * instrument gate BEFORE spend: validation fail ⇒ zero transport calls (runGateBDecision
|
|
15
|
+
// refuses to spend).
|
|
16
|
+
// * pre-reg frozen: hashPreReg once; assertFrozen guards re-registration.
|
|
17
|
+
// * the register-echo arm is RUN + spliced, so control.registerEchoPasses is a measured
|
|
18
|
+
// boolean (the rails in gate-b-decision-run throw on undefined — we feed them, not paper
|
|
19
|
+
// over them).
|
|
20
|
+
// * realArmsCarried via deriveRealArmsCarried on headlineEligible-only personas.
|
|
21
|
+
// * NO metric/judge loosening, NO dropped cases, run-once. A NULL/CUT prints cleanly.
|
|
22
|
+
// * fail-closed: missing key / corpus ⇒ BLOCKED/throw, never a silent empty pass.
|
|
23
|
+
//
|
|
24
|
+
// PRIVACY GUARD (preserved from run-real-corpus-concurrent.mjs): the cloud agent only ever
|
|
25
|
+
// receives style-axis-band briefs + OWN-train exemplars + authored probe prompts — NEVER a
|
|
26
|
+
// foreign author's prose and NEVER the user's held-out TEST text. Foreign authors enter the
|
|
27
|
+
// pipeline as numeric fullStyleVector fingerprints ONLY (they are never passed to runHarness).
|
|
28
|
+
// A closed allowed-set is asserted before every cloud call.
|
|
29
|
+
|
|
30
|
+
import { pathToFileURL } from 'node:url';

import {
  runHarness, buildBriefs, assertBriefNonLeaky, DEFAULT_PROBES,
} from './multi-subject-harness.mjs';
import { wrongTargetControl } from './wrong-target-control.mjs';
import { styleVector, fullStyleVector, fullStyleDistance } from './stylometry.js';
import { loadRealPersonas } from './real-personas.mjs';
import { validateInstrument } from './instrument-validation.mjs';
import {
  buildPreReg, assertFrozen,
} from './prereg.mjs';
import {
  runGateBDecision, confirmatoryBooleans, deriveRealArmsCarried,
} from './gate-b-decision-run.mjs';
import { ingestRedditCorpus } from './corpus-from-reddit.mjs';
|
|
44
|
+
|
|
45
|
+
// Arm name under which the register-echo run is spliced into the harness output:
// appended to the `arms` list, used as the per-persona result key, and reported
// alongside the built-in arms by formatVerdict.
export const ECHO_ARM = 'registerEcho';
|
|
46
|
+
// Default model id for the cloud transport. IJFW_EVAL_MODEL overrides it; because of
// `||`, an empty IJFW_EVAL_MODEL also falls back to the default.
const ANTHROPIC_MODEL = process.env.IJFW_EVAL_MODEL || 'claude-opus-4-8';
|
|
47
|
+
|
|
48
|
+
// ---- register-echo brief (the VOID rail's input) -------------------------------------
|
|
49
|
+
// A register-ONLY echo: describe the TRAIN register bands and explicitly instruct the agent
|
|
50
|
+
// to obey ONLY the register, imitating no specific person. If a register-obeyer PASSES the
|
|
51
|
+
// wrong-target control, the instrument is a register meter ⇒ VOID. The brief carries no
|
|
52
|
+
// exemplar prose, so it is non-leaky by construction (asserted like every non-baseline arm).
|
|
53
|
+
/**
 * Render a register vector as a coarse, human-readable band description.
 *
 * @param {{terseness: number, formality: number, emojiRate: number}} reg
 * @returns {string} `length <band>; tone <band>; <emoji clause>`
 */
function describeBands(reg) {
  // Strictly below lo → first label, strictly above hi → last label, otherwise the middle one
  // (values that fail both comparisons, e.g. undefined/NaN, also land in the middle).
  const pick = (value, [lo, hi], [below, within, above]) => {
    if (value < lo) return below;
    if (value > hi) return above;
    return within;
  };

  const lengthBand = pick(reg.terseness, [0.4, 0.6], ['expansive', 'moderate', 'very terse']);
  const toneBand = pick(reg.formality, [0.15, 0.4], ['casual', 'neutral', 'formal']);
  const emojiClause = reg.emojiRate > 0.08 ? 'uses emoji' : 'no emoji';

  return `length ${lengthBand}; tone ${toneBand}; ${emojiClause}`;
}
|
|
61
|
+
/**
 * Build the register-only control brief for one persona: the register bands of its
 * joined train docs plus an instruction not to imitate any specific person.
 *
 * @param {{trainDocs: string[]}} persona
 * @returns {string} the brief text
 */
export function buildRegisterEchoBrief(persona) {
  const trainText = persona.trainDocs.join('\n');
  const bands = describeBands(styleVector(trainText));
  return [
    `REGISTER-ONLY control. Match ONLY these register bands — ${bands}. `,
    'Do NOT imitate any specific person\'s voice or phrasing.',
  ].join('');
}
|
|
66
|
+
|
|
67
|
+
// Run ONE arm for every persona and aggregate per-subject (mirrors runHarness aggregation):
|
|
68
|
+
// concat a subject's probe outputs → ONE authorship vector. Used for the spliced echo arm.
|
|
69
|
+
/**
 * Run a single arm for every persona, one transport call per probe, strictly in sequence.
 *
 * @param {Array<{id: string}>} personas
 * @param {object} deps
 * @param {(prompt: string) => Promise<string>} deps.transport
 * @param {Iterable<string>} deps.probes - task prompts
 * @param {(persona: object) => string} deps.briefFor - builds the arm's brief for a persona
 * @returns {Promise<object>} map of persona id → `{ vector, outputs }`, where `vector` is the
 *   fullStyleVector of that persona's newline-joined outputs
 */
async function runEchoArm(personas, { transport, probes, briefFor }) {
  const byPersona = {};
  for (const persona of personas) {
    const brief = briefFor(persona);
    // leakFloor 0 as in the original call; checked before any transport call for this persona.
    assertBriefNonLeaky(brief, persona, { leakFloor: 0 });

    const outputs = [];
    for (const task of probes) {
      // eslint-disable-next-line no-await-in-loop
      const reply = await transport(`${brief}\n\nTask: ${task}`);
      outputs.push(String(reply));
    }

    const vector = fullStyleVector(outputs.join('\n'));
    byPersona[persona.id] = { vector, outputs };
  }
  return byPersona;
}
|
|
83
|
+
|
|
84
|
+
// ---- makeMeasure: the injection seam runGateBDecision expects -------------------------
|
|
85
|
+
// deps.measure({ seed, phase, preReg, minMeanMargin }) → the EXACT confirmatory shape
|
|
86
|
+
// confirmatoryBooleans + decideGateB consume:
|
|
87
|
+
// { instrumentValid, baselinePasses, registerEchoPasses, derivedPasses, oraclePasses,
|
|
88
|
+
// realArmsCarried, control, harness } (control/harness attached for reporting).
|
|
89
|
+
//
|
|
90
|
+
// `personas` = REAL headline subjects (headlineEligible:true). `foreigners` = an ADDITIONAL
|
|
91
|
+
// same-register pool whose members serve as nearest-foreigner targets. They are also scored,
|
|
92
|
+
// but they are headlineEligible:false so deriveRealArmsCarried never counts them toward the
|
|
93
|
+
// verdict — they only thicken each subject's same-register foreigner candidate set.
|
|
94
|
+
//
|
|
95
|
+
// ARCHITECTURE NOTE (subject/foreigner conflation). wrongTargetControl draws each subject's
|
|
96
|
+
// foreigners from the SAME pool it scores (spec §4.1 headline design: the same-register peers
|
|
97
|
+
// ARE the foreigners). So the pool = personas ∪ foreigners, run through the harness arms as
|
|
98
|
+
// full subjects. PRIVACY (spec §2.4) is preserved: a persona's OWN train prose only ever
|
|
99
|
+
// appears in that SAME persona's own brief — a foreigner's prose is NEVER injected as another
|
|
100
|
+
// subject's TARGET (targets are always numeric held-out fingerprints). Foreigners are tagged
|
|
101
|
+
// headlineEligible:false on ingest so they cannot license the headline claim.
|
|
102
|
+
/**
 * Build the `measure` dependency that runGateBDecision injects.
 *
 * The scored pool is `personas` followed by `foreigners`; every foreigner is copied with
 * `headlineEligible: false` before it enters the pool.
 *
 * @param {object} cfg
 * @param {(prompt: string) => Promise<string>} cfg.transport - required
 * @param {object[]} cfg.personas - required, non-empty
 * @param {object[]} [cfg.foreigners=[]]
 * @param {string[]} [cfg.probes=DEFAULT_PROBES]
 * @param {number} [cfg.minMeanMargin] - fallback floor when a measure call supplies none
 * @param {number} [cfg.minRealSubjects] - overrides `preReg.minSubjects` when finite
 * @param {object} [cfg.harnessCfg={}] - merged over `{ probes }` and passed to runHarness as `cfg`
 * @param {number} [cfg.registerDelta=0.15]
 * @returns {(args: {seed: *, phase: *, preReg: object, minMeanMargin?: number}) => Promise<object>}
 *   async measure; resolves to the confirmatory booleans plus
 *   `{ seed, phase, control, harness, minMeanMargin }`
 * @throws {Error} when `transport` is not a function or `personas` is not a non-empty array
 */
export function makeMeasure({
  transport, personas, foreigners = [], probes = DEFAULT_PROBES,
  minMeanMargin: defaultFloor, minRealSubjects, harnessCfg = {},
  registerDelta = 0.15,
}) {
  if (typeof transport !== 'function') throw new Error('makeMeasure requires a transport(prompt)=>text');
  if (!Array.isArray(personas) || !personas.length) throw new Error('makeMeasure requires real personas');

  // foreigner-pool members are scored subjects too, but NEVER headline-eligible.
  const foreignSubjects = foreigners.map((f) => ({ ...f, headlineEligible: false }));
  const pool = [...personas, ...foreignSubjects];

  return async function measure({
    seed, phase, preReg, minMeanMargin,
  }) {
    // floor precedence: per-call value → factory default → 0.01.
    const floor = Number.isFinite(minMeanMargin) ? minMeanMargin
      : (Number.isFinite(defaultFloor) ? defaultFloor : 0.01);
    const cfg = { probes, ...harnessCfg };

    // 1) harness arms for every pool member.
    const harness = await runHarness(pool, { transport, probes, cfg });

    // 2) register-echo arm: run it for the same pool and splice it in as one more arm so the
    //    control below sees it alongside the harness arms.
    const echo = await runEchoArm(pool, { transport, probes, briefFor: buildRegisterEchoBrief });
    const arms = [...harness.arms, ECHO_ARM];
    const results = {};
    for (const id of harness.personaIds) {
      results[id] = { ...harness.results[id], [ECHO_ARM]: echo[id] };
    }
    const splicedHarness = { personaIds: harness.personaIds, arms, results };

    // 3) the discriminator, run at the strictest per-test alpha in the pre-registration.
    //    Only finite values count: Math.min() over an empty list is Infinity and a single
    //    non-numeric entry makes it NaN, either of which would silently disable the alpha.
    const alphas = Object.values(preReg?.perTestAlpha ?? {}).filter((a) => Number.isFinite(a));
    const perTestAlpha = alphas.length ? Math.min(...alphas) : 0.01;
    const control = wrongTargetControl(splicedHarness, pool, {
      registerDelta, minMeanMargin: floor, perTestAlpha,
    });

    // 4) realArmsCarried: explicit minRealSubjects → preReg.minSubjects → personas.length.
    const minReal = Number.isFinite(minRealSubjects) ? minRealSubjects
      : (preReg ? preReg.minSubjects : personas.length);
    const realArmsCarried = deriveRealArmsCarried(pool, control.decidableIds, minReal);

    // 5) reduce to the confirmatory booleans.
    const booleans = confirmatoryBooleans(control, preReg, { realArmsCarried });

    return {
      ...booleans, seed, phase, control, harness: splicedHarness, minMeanMargin: floor,
    };
  };
}
|
|
155
|
+
|
|
156
|
+
// ---- the full production runner -------------------------------------------------------
|
|
157
|
+
// runGateBProduction(opts) → { status, spent, verdict, preRegHash, validation, ... }.
|
|
158
|
+
// opts:
|
|
159
|
+
// apiKey — ANTHROPIC_API_KEY (BLOCKED if absent; never hardcoded)
|
|
160
|
+
// corpus — [{id,docs}] REAL subjects (or load via dumpPath below)
|
|
161
|
+
// foreignersCorpus — [{id,docs}] same-register foreigner pool (disjoint)
|
|
162
|
+
// dumpPath — alternative to corpus: a local reddit dump (ingestRedditCorpus)
|
|
163
|
+
// transport — async (prompt)=>text. PROD: a privacy-guarded anthropicCall wrapper
|
|
164
|
+
// (makeCloudTransport). TESTS: a deterministic fake.
|
|
165
|
+
// validateOverride — TEST-ONLY: inject a validation result (skips the real AUC sweep)
|
|
166
|
+
// preRegInput — buildPreReg overrides (seed, minSubjects, floorK, verdictArms, ...)
|
|
167
|
+
// measureCfg — { probes, minMeanMargin, minRealSubjects, harnessCfg, registerDelta }
|
|
168
|
+
/**
 * Run the full Gate B pipeline: resolve the corpus, build personas, wire the
 * validate/guard/measure dependencies and delegate the decision to runGateBDecision.
 *
 * @param {object} [opts]
 * @param {string} [opts.apiKey] - falls back to `process.env.ANTHROPIC_API_KEY`
 * @param {object[]} [opts.corpus] - subject corpus
 * @param {object[]} [opts.foreignersCorpus] - foreigner pool corpus
 * @param {string} [opts.dumpPath] - ingested via ingestRedditCorpus when either corpus is missing
 * @param {object} [opts.ingestCfg] - passed to ingestRedditCorpus
 * @param {(prompt: string) => Promise<string>} [opts.transport] - injected transport; when absent
 *   a guarded cloud transport is built with makeCloudTransport
 * @param {(preReg: object) => *} [opts.validateOverride] - replaces validateInstrument
 * @param {object} [opts.preRegInput] - merged over `{ corpus: 'reddit-single-subreddit' }`
 * @param {object} [opts.personaCfg] - spread last into both loadRealPersonas calls
 * @param {object} [opts.measureCfg] - `{ probes, minMeanMargin, minRealSubjects, harnessCfg, registerDelta }`
 * @param {{calls: number, max: number}} [opts.budget] - shared call budget
 * @param {number} [opts.maxCalls] - budget ceiling used when `opts.budget` is absent
 * @param {string} [opts.model] - model id (default ANTHROPIC_MODEL)
 * @param {Map} [opts.frozenRegistry] - registry handed to assertFrozen
 * @returns {Promise<object>} `{ status: 'BLOCKED', reason }` when the key or a corpus is missing,
 *   otherwise a `{ status: 'DONE', ... }` report assembled from the decision
 */
export async function runGateBProduction(opts = {}) {
  const apiKey = opts.apiKey ?? process.env.ANTHROPIC_API_KEY;
  if (!apiKey) return { status: 'BLOCKED', reason: 'ANTHROPIC_API_KEY not set in env (fail-closed; no run, no spend)' };

  // 1) corpus — explicit arrays win; a dump only fills in whichever one is missing.
  let corpus = opts.corpus;
  let foreignersCorpus = opts.foreignersCorpus;
  if ((!corpus || !foreignersCorpus) && opts.dumpPath) {
    const ing = ingestRedditCorpus(opts.dumpPath, opts.ingestCfg || {});
    corpus = corpus || ing.corpus;
    foreignersCorpus = foreignersCorpus || ing.foreigners;
  }
  if (!Array.isArray(corpus) || !corpus.length) {
    return { status: 'BLOCKED', reason: 'no subject corpus (pass corpus[] or dumpPath); fail-closed' };
  }
  if (!Array.isArray(foreignersCorpus) || !foreignersCorpus.length) {
    return { status: 'BLOCKED', reason: 'no same-register foreigner pool; fail-closed (the wrong-target control is unrunnable)' };
  }

  const model = opts.model || ANTHROPIC_MODEL;

  // 2) personas + foreigners, both loaded with the same seed.
  const preRegInput = { corpus: 'reddit-single-subreddit', ...opts.preRegInput };
  const seedForPersonas = preRegInput.seed ?? 1;
  // Requested subject count, never more than the corpus holds.
  const nAuthors = Math.min(preRegInput.minSubjects ?? corpus.length, corpus.length);
  const personas = loadRealPersonas(corpus, {
    nAuthors,
    seed: seedForPersonas,
    ...opts.personaCfg,
  });
  const foreigners = loadRealPersonas(foreignersCorpus, {
    nAuthors: foreignersCorpus.length,
    seed: seedForPersonas,
    ...opts.personaCfg,
  });

  // 3) deps for runGateBDecision.
  const validate = opts.validateOverride
    ? async (preReg) => opts.validateOverride(preReg)
    : async (preReg) => validateInstrument(corpus, preReg);

  // The guard hook itself only records which phases asked to spend; the allowed-set and
  // budget checks live in the cloud transport built below.
  const guardCalls = [];
  const guard = async ({ phase }) => { guardCalls.push(phase); };

  const measureCfg = opts.measureCfg || {};
  const probes = measureCfg.probes || DEFAULT_PROBES;

  // 3b) transport. An injected transport is used as-is; otherwise build the guarded cloud
  //     transport whose allowed system briefs come from buildAllowedSys over the whole pool
  //     and whose allowed prompts are exactly the probes.
  const poolForGuard = [...personas, ...foreigners];
  const budget = opts.budget || {
    calls: 0,
    // default ceiling: 3× the per-phase estimate for 4 arms.
    max: opts.maxCalls || (estimateCalls({ nArms: 4, nSubjects: poolForGuard.length, nProbes: probes.length }) * 3),
  };
  const transport = opts.transport || makeCloudTransport({
    apiKey,
    model,
    allowedSys: buildAllowedSys(poolForGuard, measureCfg.harnessCfg || {}),
    allowedPr: new Set(probes),
    budget,
  });

  const measure = makeMeasure({
    transport,
    personas,
    foreigners,
    probes,
    minMeanMargin: measureCfg.minMeanMargin,
    minRealSubjects: measureCfg.minRealSubjects,
    harnessCfg: measureCfg.harnessCfg || {},
    registerDelta: measureCfg.registerDelta ?? 0.15,
  });

  // 4) every pre-registration that gets built is passed through assertFrozen.
  const frozenRegistry = opts.frozenRegistry || new Map();
  const deps = {
    buildPreReg: (i) => {
      const pr = buildPreReg(i);
      assertFrozen(frozenRegistry, pr);
      return pr;
    },
    validate,
    guard,
    measure,
  };

  const decision = await runGateBDecision(deps, preRegInput);

  return {
    status: 'DONE',
    model,
    spent: decision.spent,
    verdict: decision.verdict,
    runId: decision.runId,
    preRegHash: decision.preRegHash,
    validation: decision.validation,
    confirmatory: decision.confirmatory,
    pilot: decision.pilot,
    seeds: decision.seeds,
    guardPhases: guardCalls,
    nSubjects: personas.length,
    nForeigners: foreigners.length,
    cloudCalls: budget.calls,
    budgetMax: budget.max,
  };
}
|
|
279
|
+
|
|
280
|
+
// ---- production cloud transport (privacy + budget guarded) ----------------------------
|
|
281
|
+
// makeCloudTransport({ apiKey, model, allowedSys, allowedPr, budget }) → async (prompt)=>text.
|
|
282
|
+
// The harness composes prompt = `${brief}\n\nTask: ${task}`. We split it back into the system
|
|
283
|
+
// brief + the authored task, assert BOTH are in their closed allowed-sets, budget-count, then
|
|
284
|
+
// call Anthropic with the brief as the SYSTEM context (how a host injects a profile).
|
|
285
|
+
/**
 * Build the guarded cloud transport: an async `(prompt) => text` that checks the prompt
 * against the allowed sets and the call budget before POSTing to the Anthropic Messages API.
 *
 * The prompt is split at the LAST `\n\nTask: ` marker into a system brief and a task; a prompt
 * without the marker is treated as a task with an empty brief.
 *
 * @param {object} cfg
 * @param {string} cfg.apiKey - required
 * @param {string} [cfg.model=ANTHROPIC_MODEL]
 * @param {Set<string>} [cfg.allowedSys] - when given, the brief must be a member
 * @param {Set<string>} [cfg.allowedPr] - when given, the task must be a member
 * @param {{calls: number, max: number}} [cfg.budget] - mutated: `calls` is incremented per
 *   accepted call, before the request is sent
 * @param {number} [cfg.maxTokens=1024]
 * @returns {(prompt: string) => Promise<string>} resolves to the concatenated text parts of
 *   the response
 * @throws {Error} at construction when `apiKey` is missing; the returned function rejects on a
 *   non-string prompt, a guard violation, an exhausted budget or a non-OK HTTP status
 */
export function makeCloudTransport({
  apiKey, model = ANTHROPIC_MODEL, allowedSys, allowedPr, budget, maxTokens = 1024,
}) {
  if (!apiKey) throw new Error('makeCloudTransport: ANTHROPIC_API_KEY required');
  // Named `endpoint` rather than `URL` so the global URL constructor is not shadowed.
  const endpoint = 'https://api.anthropic.com/v1/messages';
  const taskMarker = '\n\nTask: ';

  return async function cloudTransport(prompt) {
    // Refuse anything that is not a string before splitting it, so the guards below never
    // run against a non-string brief or task.
    if (typeof prompt !== 'string') throw new TypeError('makeCloudTransport: prompt must be a string');

    // recover (brief, task) from the `${brief}\n\nTask: ${task}` composition.
    const idx = prompt.lastIndexOf(taskMarker);
    const brief = idx >= 0 ? prompt.slice(0, idx) : '';
    const task = idx >= 0 ? prompt.slice(idx + taskMarker.length) : prompt;

    if (allowedSys && !allowedSys.has(brief)) throw new Error('PRIVACY GUARD: system brief not in allowed set — aborting');
    if (allowedPr && !allowedPr.has(task)) throw new Error('PRIVACY GUARD: prompt not in authored set — aborting');
    if (budget) {
      if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
      budget.calls += 1;
    }

    const body = { model, max_tokens: maxTokens, messages: [{ role: 'user', content: task }] };
    if (brief) body.system = brief; // an empty brief is sent with no system field
    const res = await fetch(endpoint, {
      method: 'POST',
      headers: { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01', 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
    if (!res.ok) {
      // Only the first 160 characters of the error body go into the message.
      const t = await res.text().catch(() => '');
      throw new Error(`Anthropic HTTP ${res.status}: ${t.slice(0, 160)}`);
    }
    const j = await res.json();
    return (j.content || []).filter((c) => c.type === 'text').map((c) => c.text).join('');
  };
}
|
|
316
|
+
|
|
317
|
+
// Build the closed allowed-set of every system brief a cloud call may carry, from the
|
|
318
|
+
// personas' OWN briefs (derived + fewShotOracle + register-echo + baseline ''). Foreign
|
|
319
|
+
// prose is NEVER in this set. Used to wire makeCloudTransport for the live run.
|
|
320
|
+
/**
 * Collect every system brief a cloud call is allowed to carry.
 *
 * @param {Iterable<object>} personas
 * @param {object} [cfg={}] - forwarded to buildBriefs
 * @returns {Set<string>} the empty baseline brief plus, per persona, its derived,
 *   fewShotOracle and register-echo briefs
 */
export function buildAllowedSys(personas, cfg = {}) {
  const briefsPerPersona = Array.from(personas, (persona) => {
    const briefs = buildBriefs(persona, cfg);
    return [briefs.derived, briefs.fewShotOracle, buildRegisterEchoBrief(persona)];
  });
  // '' first: the baseline arm sends no brief at all.
  return new Set(['', ...briefsPerPersona.flat()]);
}
|
|
330
|
+
|
|
331
|
+
// Estimate the cloud-call budget: arms × subjects × probes, per spend phase.
|
|
332
|
+
/**
 * Estimate the number of cloud calls for one spend phase.
 *
 * @param {object} sizes
 * @param {number} [sizes.nArms=4]
 * @param {number} sizes.nSubjects
 * @param {number} sizes.nProbes
 * @returns {number} nArms × nSubjects × nProbes
 */
export function estimateCalls({ nArms = 4, nSubjects, nProbes }) {
  const armSubjectPairs = nArms * nSubjects;
  return armSubjectPairs * nProbes;
}
|
|
337
|
+
|
|
338
|
+
// Human-readable verdict report. A NULL/CUT prints just as cleanly as a PASS — the runner
|
|
339
|
+
// NEVER retries to force a pass.
|
|
340
|
+
/**
 * Render a runGateBProduction result as a multi-line, human-readable report.
 *
 * @param {object} r - result object; a `status` of 'BLOCKED' short-circuits to one line
 * @returns {string} newline-joined report; per-arm lines are appended only when
 *   `r.confirmatory.control.perArm` is present
 */
export function formatVerdict(r) {
  if (r.status === 'BLOCKED') return `BLOCKED: ${r.reason}`;

  // Fixed-point rendering, with the literal 'NaN' for anything that is not a finite number.
  const fixed = (value, digits) => (Number.isFinite(value) ? value.toFixed(digits) : 'NaN');

  const v = r.verdict || {};
  const lines = [
    `Gate B verdict: ${v.verdict}`,
    ` reason: ${v.reason || ''}`,
  ];
  // Optional verdict fields are only printed when truthy.
  if (v.claim) lines.push(` claim: ${v.claim}`);
  if (v.ship) lines.push(` ship: ${v.ship}`);
  if (v.next) lines.push(` next: ${v.next}`);
  lines.push(
    ` runId: ${r.runId}`,
    ` preRegHash: ${r.preRegHash}`,
    ` spent: ${r.spent} cloudCalls: ${r.cloudCalls}/${r.budgetMax}`,
    ` subjects: ${r.nSubjects} headline + ${r.nForeigners} same-register foreigners`,
  );

  const perArm = r.confirmatory && r.confirmatory.control && r.confirmatory.control.perArm;
  if (perArm) {
    for (const arm of ['baseline', 'derived', 'fewShotOracle', ECHO_ARM]) {
      const a = perArm[arm];
      if (!a) continue;
      const dz = fixed(a.verdict && a.verdict.dz, 2);
      const passes = a.verdict ? a.verdict.passes : '?';
      lines.push(` [${arm}] mean-margin ${fixed(a.meanMargin, 4)} `
        + `dz ${dz} `
        + `CI99-lo ${fixed(a.ciLower, 4)} pct+ ${fixed(a.pctPositive, 2)} `
        + `passes ${passes}`);
    }
  }
  return lines.join('\n');
}
|
|
369
|
+
|
|
370
|
+
// Test-only handle on two module-private helpers (describeBands, runEchoArm) plus the
// imported fullStyleDistance, which this module does not otherwise use.
export const __test = { describeBands, runEchoArm, fullStyleDistance };
|
|
371
|
+
// Default export: the same public surface as the named exports, bundled into one object.
export default {
  makeMeasure,
  runGateBProduction,
  makeCloudTransport,
  buildRegisterEchoBrief,
  buildAllowedSys,
  estimateCalls,
  formatVerdict,
  ECHO_ARM,
};
|
|
375
|
+
|
|
376
|
+
// ---- CLI entrypoint -------------------------------------------------------------------
|
|
377
|
+
// The exact command an operator runs for the LIVE verdict:
|
|
378
|
+
// ANTHROPIC_API_KEY=… IJFW_GATEB_DUMP=/path/to/subreddit.jsonl \
|
|
379
|
+
// node --experimental-sqlite src/profile/eval/gate-b-run.mjs
|
|
380
|
+
// Optional env: IJFW_EVAL_MODEL (default claude-opus-4-8), IJFW_GATEB_SEED,
|
|
381
|
+
// IJFW_GATEB_NSUBJECTS (default 60), IJFW_GATEB_NPROBES (default 20),
|
|
382
|
+
// IJFW_GATEB_FLOORK (default 0.25), IJFW_GATEB_MAXCALLS.
|
|
383
|
+
// Reads ANTHROPIC_API_KEY from env (BLOCKED if absent — never hardcoded). NO network unless a
|
|
384
|
+
// real key + dump are present; a missing dump / too-few authors fail-closes (throws/BLOCKED).
|
|
385
|
+
// Run only when this module is the script node was started with. The comparison goes through
// pathToFileURL so paths with spaces, non-ASCII characters or Windows drive letters match
// import.meta.url (a hand-built `file://${path}` string does not percent-encode them).
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  const dumpPath = process.env.IJFW_GATEB_DUMP;
  if (!dumpPath) {
    // eslint-disable-next-line no-console
    console.error('BLOCKED: set IJFW_GATEB_DUMP=/path/to/single-subreddit.jsonl (local file; no network fetch)');
    process.exit(1);
  }
  // Unset, non-numeric or zero env values fall back to the defaults.
  const nSubjects = Number(process.env.IJFW_GATEB_NSUBJECTS) || 60;
  const nProbes = Number(process.env.IJFW_GATEB_NPROBES) || 20;
  runGateBProduction({
    dumpPath,
    ingestCfg: { nPersonaAuthors: nSubjects, nForeignAuthors: nSubjects },
    preRegInput: {
      seed: Number(process.env.IJFW_GATEB_SEED) || 1,
      minSubjects: nSubjects,
      floorK: Number(process.env.IJFW_GATEB_FLOORK) || 0.25,
      nProbes,
    },
    measureCfg: {
      probes: DEFAULT_PROBES, // authored, closed set; expand to nProbes-many authored prompts for the real run
    },
    maxCalls: Number(process.env.IJFW_GATEB_MAXCALLS) || undefined,
  }).then((r) => {
    // eslint-disable-next-line no-console
    console.log(formatVerdict(r));
    // a clean NULL/CUT is a SUCCESSFUL run (the honest outcome), exit 0; only BLOCKED is non-zero.
    process.exit(r.status === 'BLOCKED' ? 1 : 0);
  }).catch((e) => {
    // eslint-disable-next-line no-console
    console.error('RUN ERROR (fail-closed):', e.message);
    process.exit(1);
  });
}
|