@ijfw/memory-server 1.5.5 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/bin/ijfw-dashboard +20 -1
  2. package/package.json +4 -3
  3. package/src/audit-roster.js +89 -12
  4. package/src/brain/tiered-llm.js +57 -7
  5. package/src/cross-orchestrator-cli.js +344 -4
  6. package/src/cross-project-search.js +39 -1
  7. package/src/dashboard-server.js +7 -1
  8. package/src/dream/runner.mjs +560 -8
  9. package/src/handlers/brain-handler.js +101 -1
  10. package/src/importers/discover.js +1 -1
  11. package/src/memory/bench-metrics.js +289 -0
  12. package/src/memory/benchmark.js +1 -1
  13. package/src/memory/search.js +53 -1
  14. package/src/orchestrator/plan-checker.js +1 -1
  15. package/src/profile/audit.js +671 -0
  16. package/src/profile/capture.js +871 -0
  17. package/src/profile/derive-dialectic.js +242 -0
  18. package/src/profile/derive-heuristic.js +733 -0
  19. package/src/profile/derive.js +156 -0
  20. package/src/profile/egress.js +306 -0
  21. package/src/profile/eval/build-real-probes.mjs +197 -0
  22. package/src/profile/eval/corpus-from-reddit.mjs +166 -0
  23. package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
  24. package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
  25. package/src/profile/eval/gate-b-behavior.mjs +420 -0
  26. package/src/profile/eval/gate-b-decision-run.mjs +171 -0
  27. package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
  28. package/src/profile/eval/gate-b-run.mjs +417 -0
  29. package/src/profile/eval/gate-b-run.test.mjs +204 -0
  30. package/src/profile/eval/gate-c-capture.mjs +323 -0
  31. package/src/profile/eval/harness.mjs +551 -0
  32. package/src/profile/eval/instrument-validation.mjs +248 -0
  33. package/src/profile/eval/instrument-validation.test.mjs +125 -0
  34. package/src/profile/eval/multi-subject-harness.mjs +106 -0
  35. package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
  36. package/src/profile/eval/personas.test.mjs +83 -0
  37. package/src/profile/eval/plumbing.test.mjs +69 -0
  38. package/src/profile/eval/prereg.mjs +130 -0
  39. package/src/profile/eval/prereg.test.mjs +78 -0
  40. package/src/profile/eval/real-corpus.test.mjs +103 -0
  41. package/src/profile/eval/real-personas.mjs +109 -0
  42. package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
  43. package/src/profile/eval/run-real-corpus.mjs +358 -0
  44. package/src/profile/eval/slug-quality.mjs +464 -0
  45. package/src/profile/eval/stylometry-features.js +85 -0
  46. package/src/profile/eval/stylometry-reference.js +16 -0
  47. package/src/profile/eval/stylometry.js +224 -0
  48. package/src/profile/eval/stylometry.test.mjs +103 -0
  49. package/src/profile/eval/synthetic-personas.js +91 -0
  50. package/src/profile/eval/verifier-features.mjs +170 -0
  51. package/src/profile/eval/verifier-logreg.mjs +74 -0
  52. package/src/profile/eval/verifier-pair.mjs +122 -0
  53. package/src/profile/eval/verifier-reference.mjs +68 -0
  54. package/src/profile/eval/verifier-scorer.mjs +30 -0
  55. package/src/profile/eval/wrong-target-control.mjs +168 -0
  56. package/src/profile/eval/wrong-target-control.test.mjs +124 -0
  57. package/src/profile/exemplar-capture.js +232 -0
  58. package/src/profile/exemplar-retrieve.js +138 -0
  59. package/src/profile/exemplar-store.js +314 -0
  60. package/src/profile/lock.js +64 -0
  61. package/src/profile/merge.js +624 -0
  62. package/src/profile/path-policy.js +213 -0
  63. package/src/profile/precision-stamp.mjs +151 -0
  64. package/src/profile/render-brief.js +717 -0
  65. package/src/profile/schema.js +244 -0
  66. package/src/profile/sensitivity.js +249 -0
  67. package/src/profile/serve.js +345 -0
  68. package/src/profile/store.js +261 -0
  69. package/src/profile/telemetry.js +289 -0
  70. package/src/recovery/checkpoint.js +7 -1
  71. package/src/server.js +185 -14
  72. package/src/.registry-meta-key.pem +0 -3
@@ -0,0 +1,141 @@
1
+ // Gate B v2 — Task T7: decision runner + gate. The honest verdict logic and the
2
+ // refuse-to-spend / fresh-seed / pilot-descriptive / guard-before-spend orchestration.
3
+
4
+ import { test } from 'node:test';
5
+ import assert from 'node:assert/strict';
6
+ import {
7
+ decideGateB, confirmatoryBooleans, runGateBDecision, deriveRealArmsCarried,
8
+ } from './gate-b-decision-run.mjs';
9
+ import { buildPreReg } from './prereg.mjs';
10
+
11
// Baseline input for decideGateB: instrument valid, neither contamination rail tripped,
// real authors carried the effect. Each test overrides only the flags it cares about.
const VALID = {
  instrumentValid: true,
  baselinePasses: false,
  registerEchoPasses: false,
  realArmsCarried: true,
};

test('decideGateB: instrument invalid ⇒ NULL, no spend', () => {
  const { verdict } = decideGateB({ instrumentValid: false });
  assert.equal(verdict, 'NULL');
});

test('decideGateB: baseline or register-echo PASS ⇒ VOID (rig contaminated)', () => {
  const baselineTripped = decideGateB({ ...VALID, baselinePasses: true });
  assert.equal(baselineTripped.verdict, 'VOID');
  const echoTripped = decideGateB({ ...VALID, registerEchoPasses: true });
  assert.equal(echoTripped.verdict, 'VOID');
});

test('decideGateB: real authors did not carry it ⇒ NULL (synthetic cannot license)', () => {
  const outcome = decideGateB({
    ...VALID, realArmsCarried: false, derivedPasses: true, oraclePasses: true,
  });
  assert.equal(outcome.verdict, 'NULL');
});

test('decideGateB: derived passes ⇒ PASS (product win)', () => {
  const outcome = decideGateB({ ...VALID, derivedPasses: true });
  assert.equal(outcome.verdict, 'PASS');
});

test('decideGateB: only oracle passes ⇒ PASS_ORACLE → Phase 3 (NOT a cut)', () => {
  const { verdict, next } = decideGateB({ ...VALID, derivedPasses: false, oraclePasses: true });
  assert.equal(verdict, 'PASS_ORACLE');
  assert.equal(next, 'phase-3-exemplar-lever');
});

test('decideGateB: CUT licensed ONLY by few-shot-oracle NULL', () => {
  // A passing oracle must never produce CUT.
  const oraclePassed = decideGateB({ ...VALID, derivedPasses: false, oraclePasses: true });
  assert.notEqual(oraclePassed.verdict, 'CUT');
  // Both derived and oracle null: this is the only combination that yields CUT.
  const bothNull = decideGateB({ ...VALID, derivedPasses: false, oraclePasses: false });
  assert.equal(bothNull.verdict, 'CUT');
});
42
+
43
test('confirmatoryBooleans: Bonferroni alpha BITES (arm must beat baseline at per-test alpha)', () => {
  const preReg = buildPreReg({}); // perTestAlpha derived=fewShotOracle=0.005
  // Both arms "pass" their own verdict; only `derived` also beats the baseline.
  const derivedArm = { verdict: { passes: true }, vsBaseline: { beatsBaseline: true } };
  const oracleArm = { verdict: { passes: true }, vsBaseline: { beatsBaseline: false } };
  const control = {
    registerEchoPasses: false,
    perArm: {
      baseline: { verdict: { passes: false } },
      derived: derivedArm,
      fewShotOracle: oracleArm,
    },
  };
  const flags = confirmatoryBooleans(control, preReg, { realArmsCarried: true });
  assert.equal(flags.derivedPasses, true);
  assert.equal(flags.oraclePasses, false, 'beatsBaseline=false ⇒ does not pass');
});

test('confirmatoryBooleans THROWS if the register-echo VOID rail was not measured', () => {
  const preReg = buildPreReg({});
  // registerEchoPasses is deliberately absent from this control object.
  const control = { perArm: { baseline: { verdict: { passes: false } } } };
  const attempt = () => confirmatoryBooleans(control, preReg, { realArmsCarried: true });
  assert.throws(attempt, /register-echo/);
});

test('decideGateB THROWS if a safety rail is undefined (never silent-false)', () => {
  const attempt = () => decideGateB({
    instrumentValid: true, baselinePasses: false, derivedPasses: true,
  });
  assert.throws(attempt, /safety rail/);
});

test('deriveRealArmsCarried: synthetic personas cannot license; needs enough real decidable', () => {
  const real = [
    { id: 'r1', headlineEligible: true, synthetic: false },
    { id: 'r2', headlineEligible: true, synthetic: false },
  ];
  const synth = [
    { id: 's1', headlineEligible: false, synthetic: true },
    { id: 's2', headlineEligible: false, synthetic: true },
  ];
  assert.equal(deriveRealArmsCarried(real, ['r1', 'r2'], 2), true);
  assert.equal(deriveRealArmsCarried(synth, ['s1', 's2'], 1), false, 'synthetic forced false');
  assert.equal(deriveRealArmsCarried(real, ['r1'], 2), false, 'not enough real decidable');
});
78
+
79
+ // ---- orchestrator with injected fakes ----
80
/**
 * Build the injected dependency bundle for runGateBDecision, made of recording fakes.
 *
 * @param {object} [overrides]
 * @param {object} [overrides.validation] - result returned by `validate()` instead of the passing default.
 * @param {(phase: string) => object} [overrides.measure] - produces the per-phase measurement
 *   instead of the all-passing default.
 * @returns {object} `{ calls, buildPreReg, validate, guard, measure }`; `calls` counts guard and
 *   measure invocations and logs each measured `{ phase, seed }`.
 */
function makeDeps(overrides = {}) {
  const calls = { guard: 0, measure: 0, phases: [] };
  const passingValidation = { passes: true, betweenMean: 0.5, withinMean: 0.3 };
  const defaultMeasure = {
    baselinePasses: false,
    registerEchoPasses: false,
    derivedPasses: true,
    oraclePasses: true,
    realArmsCarried: true,
  };

  const validate = async () => overrides.validation ?? passingValidation;

  const guard = async () => {
    calls.guard += 1;
  };

  const measure = async ({ seed, phase }) => {
    calls.measure += 1;
    calls.phases.push({ phase, seed });
    const measured = overrides.measure ? overrides.measure(phase) : defaultMeasure;
    // Measured fields are spread last, so they win over seed/phase on a key clash.
    return { seed, phase, ...measured };
  };

  return {
    calls,
    buildPreReg: (input) => buildPreReg(input),
    validate,
    guard,
    measure,
  };
}
97
+
98
test('REFUSES TO SPEND when validation fails (no guard, no measure)', async () => {
  const fakes = makeDeps({ validation: { passes: false } });
  const result = await runGateBDecision(fakes, {});
  assert.equal(result.spent, false);
  assert.equal(result.verdict.verdict, 'NULL');
  const { guard, measure } = fakes.calls;
  assert.equal(guard, 0);
  assert.equal(measure, 0);
});

test('guard is asserted BEFORE every spend phase (pilot + confirmatory)', async () => {
  const fakes = makeDeps();
  await runGateBDecision(fakes, {});
  // One guard call and one measure call per spend phase.
  assert.equal(fakes.calls.guard, 2);
  assert.equal(fakes.calls.measure, 2);
});

test('confirmatory uses a FRESH seed distinct from the pilot', async () => {
  const fakes = makeDeps();
  const result = await runGateBDecision(fakes, {});
  assert.notEqual(result.seeds.pilotSeed, result.seeds.confirmSeed);
  // Cross-check against the seeds the measure fake actually received.
  const seedByPhase = Object.fromEntries(
    fakes.calls.phases.map(({ phase, seed }) => [phase, seed]),
  );
  assert.notEqual(seedByPhase.pilot, seedByPhase.confirmatory);
});

test('pilot is DESCRIPTIVE: the verdict comes from confirmatory, not pilot', async () => {
  // The pilot reports every arm passing and the confirmatory run reports every arm
  // nulling, so the verdict must be CUT.
  const rails = { baselinePasses: false, registerEchoPasses: false };
  const pilotAllPass = {
    ...rails, derivedPasses: true, oraclePasses: true, realArmsCarried: true,
  };
  const confirmAllNull = {
    ...rails, derivedPasses: false, oraclePasses: false, realArmsCarried: true,
  };
  const fakes = makeDeps({
    measure: (phase) => (phase === 'pilot' ? pilotAllPass : confirmAllNull),
  });
  const result = await runGateBDecision(fakes, {});
  assert.equal(result.verdict.verdict, 'CUT', 'confirmatory NULL drives the verdict, pilot PASS ignored');
});

test('real-arm NULL ⇒ mission NULL regardless of (synthetic) pilot optimism', async () => {
  const notCarried = {
    baselinePasses: false,
    registerEchoPasses: false,
    derivedPasses: true,
    oraclePasses: true,
    realArmsCarried: false,
  };
  const fakes = makeDeps({ measure: () => ({ ...notCarried }) });
  const result = await runGateBDecision(fakes, {});
  assert.equal(result.verdict.verdict, 'NULL');
});
@@ -0,0 +1,417 @@
1
+ // gate-b-run.mjs — Gate B v2 PRODUCTION runner. The single executable that wires the
2
+ // already-built+green modules into one honest decision pipeline:
3
+ //
4
+ // validateInstrument (HARD GATE, no spend on fail)
5
+ // → buildPreReg + hashPreReg (frozen before any measure)
6
+ // → runGateBDecision(deps): pilot (descriptive) + FRESH-seed confirmatory
7
+ // measure(): runHarness (baseline/derived/fewShotOracle)
8
+ // + a spliced REGISTER-ECHO arm (the VOID rail's live input)
9
+ // + wrongTargetControl (the discriminator)
10
+ // + deriveRealArmsCarried (synthetic can never license the claim)
11
+ // → confirmatoryBooleans → decideGateB → TRUE yes/no verdict
12
+ //
13
+ // HONESTY RAILS (enforced + unit-tested in gate-b-run.test.mjs):
14
+ // * instrument gate BEFORE spend: validation fail ⇒ zero transport calls (runGateBDecision
15
+ // refuses to spend).
16
+ // * pre-reg frozen: hashPreReg once; assertFrozen guards re-registration.
17
+ // * the register-echo arm is RUN + spliced, so control.registerEchoPasses is a measured
18
+ // boolean (the rails in gate-b-decision-run throw on undefined — we feed them, not paper
19
+ // over them).
20
+ // * realArmsCarried via deriveRealArmsCarried on headlineEligible-only personas.
21
+ // * NO metric/judge loosening, NO dropped cases, run-once. A NULL/CUT prints cleanly.
22
+ // * fail-closed: missing key / corpus ⇒ BLOCKED/throw, never a silent empty pass.
23
+ //
24
+ // PRIVACY GUARD (preserved from run-real-corpus-concurrent.mjs): the cloud agent only ever
25
+ // receives style-axis-band briefs + OWN-train exemplars + authored probe prompts — NEVER a
26
+ // foreign author's prose and NEVER the user's held-out TEST text. Foreign authors enter the
27
+ // pipeline as numeric fullStyleVector fingerprints ONLY (they are never passed to runHarness).
28
+ // A closed allowed-set is asserted before every cloud call.
29
+
30
import { pathToFileURL } from 'node:url';

import {
  runHarness, buildBriefs, assertBriefNonLeaky, DEFAULT_PROBES,
} from './multi-subject-harness.mjs';
import { wrongTargetControl } from './wrong-target-control.mjs';
import { styleVector, fullStyleVector, fullStyleDistance } from './stylometry.js';
import { loadRealPersonas } from './real-personas.mjs';
import { validateInstrument } from './instrument-validation.mjs';
import {
  buildPreReg, assertFrozen,
} from './prereg.mjs';
import {
  runGateBDecision, confirmatoryBooleans, deriveRealArmsCarried,
} from './gate-b-decision-run.mjs';
import { ingestRedditCorpus } from './corpus-from-reddit.mjs';
44
+
45
+ export const ECHO_ARM = 'registerEcho';
46
+ const ANTHROPIC_MODEL = process.env.IJFW_EVAL_MODEL || 'claude-opus-4-8';
47
+
48
+ // ---- register-echo brief (the VOID rail's input) -------------------------------------
49
+ // A register-ONLY echo: describe the TRAIN register bands and explicitly instruct the agent
50
+ // to obey ONLY the register, imitating no specific person. If a register-obeyer PASSES the
51
+ // wrong-target control, the instrument is a register meter ⇒ VOID. The brief carries no
52
+ // exemplar prose, so it is non-leaky by construction (asserted like every non-baseline arm).
53
+ function describeBands(reg) {
54
+ const band = (v, lo, hi, low, mid, high) => (v < lo ? low : v > hi ? high : mid);
55
+ return [
56
+ `length ${band(reg.terseness, 0.4, 0.6, 'expansive', 'moderate', 'very terse')}`,
57
+ `tone ${band(reg.formality, 0.15, 0.4, 'casual', 'neutral', 'formal')}`,
58
+ reg.emojiRate > 0.08 ? 'uses emoji' : 'no emoji',
59
+ ].join('; ');
60
+ }
61
+ export function buildRegisterEchoBrief(persona) {
62
+ const reg = styleVector(persona.trainDocs.join('\n'));
63
+ return `REGISTER-ONLY control. Match ONLY these register bands — ${describeBands(reg)}. `
64
+ + 'Do NOT imitate any specific person\'s voice or phrasing.';
65
+ }
66
+
67
+ // Run ONE arm for every persona and aggregate per-subject (mirrors runHarness aggregation):
68
+ // concat a subject's probe outputs → ONE authorship vector. Used for the spliced echo arm.
69
+ async function runEchoArm(personas, { transport, probes, briefFor }) {
70
+ const out = {};
71
+ for (const p of personas) {
72
+ const brief = briefFor(p);
73
+ assertBriefNonLeaky(brief, p, { leakFloor: 0 }); // echo carries no prose; floor 0 = verbatim-only check
74
+ const outputs = [];
75
+ for (const task of probes) {
76
+ // eslint-disable-next-line no-await-in-loop
77
+ outputs.push(String(await transport(`${brief}\n\nTask: ${task}`)));
78
+ }
79
+ out[p.id] = { vector: fullStyleVector(outputs.join('\n')), outputs };
80
+ }
81
+ return out;
82
+ }
83
+
84
+ // ---- makeMeasure: the injection seam runGateBDecision expects -------------------------
85
+ // deps.measure({ seed, phase, preReg, minMeanMargin }) → the EXACT confirmatory shape
86
+ // confirmatoryBooleans + decideGateB consume:
87
+ // { instrumentValid, baselinePasses, registerEchoPasses, derivedPasses, oraclePasses,
88
+ // realArmsCarried, control, harness } (control/harness attached for reporting).
89
+ //
90
+ // `personas` = REAL headline subjects (headlineEligible:true). `foreigners` = an ADDITIONAL
91
+ // same-register pool whose members serve as nearest-foreigner targets. They are also scored,
92
+ // but they are headlineEligible:false so deriveRealArmsCarried never counts them toward the
93
+ // verdict — they only thicken each subject's same-register foreigner candidate set.
94
+ //
95
+ // ARCHITECTURE NOTE (subject/foreigner conflation). wrongTargetControl draws each subject's
96
+ // foreigners from the SAME pool it scores (spec §4.1 headline design: the same-register peers
97
+ // ARE the foreigners). So the pool = personas ∪ foreigners, run through the harness arms as
98
+ // full subjects. PRIVACY (spec §2.4) is preserved: a persona's OWN train prose only ever
99
+ // appears in that SAME persona's own brief — a foreigner's prose is NEVER injected as another
100
+ // subject's TARGET (targets are always numeric held-out fingerprints). Foreigners are tagged
101
+ // headlineEligible:false on ingest so they cannot license the headline claim.
102
+ export function makeMeasure({
103
+ transport, personas, foreigners = [], probes = DEFAULT_PROBES,
104
+ minMeanMargin: defaultFloor, minRealSubjects, harnessCfg = {},
105
+ registerDelta = 0.15,
106
+ }) {
107
+ if (typeof transport !== 'function') throw new Error('makeMeasure requires a transport(prompt)=>text');
108
+ if (!Array.isArray(personas) || !personas.length) throw new Error('makeMeasure requires real personas');
109
+
110
+ // foreigner-pool members are scored subjects too, but NEVER headline-eligible.
111
+ const foreignSubjects = foreigners.map((f) => ({ ...f, headlineEligible: false }));
112
+ const pool = [...personas, ...foreignSubjects];
113
+
114
+ return async function measure({
115
+ seed, phase, preReg, minMeanMargin,
116
+ }) {
117
+ const floor = Number.isFinite(minMeanMargin) ? minMeanMargin
118
+ : (Number.isFinite(defaultFloor) ? defaultFloor : 0.01);
119
+ const cfg = { probes, ...harnessCfg };
120
+
121
+ // 1) baseline / derived / fewShotOracle — every pool member (each is another's foreigner).
122
+ const harness = await runHarness(pool, { transport, probes, cfg });
123
+
124
+ // 2) REGISTER-ECHO arm (the VOID rail input) — run + splice into the harness output so
125
+ // wrongTargetControl populates registerEchoPasses (never left undefined → rails fire).
126
+ const echo = await runEchoArm(pool, { transport, probes, briefFor: buildRegisterEchoBrief });
127
+ const arms = [...harness.arms, ECHO_ARM];
128
+ const results = {};
129
+ for (const id of harness.personaIds) {
130
+ results[id] = { ...harness.results[id], [ECHO_ARM]: echo[id] };
131
+ }
132
+ const splicedHarness = { personaIds: harness.personaIds, arms, results };
133
+
134
+ // 3) the discriminator. perTestAlpha is the Bonferroni split derived in prereg.
135
+ const perTestAlpha = preReg && preReg.perTestAlpha
136
+ ? Math.min(...Object.values(preReg.perTestAlpha)) : 0.01;
137
+ const control = wrongTargetControl(splicedHarness, pool, {
138
+ registerDelta, minMeanMargin: floor, perTestAlpha,
139
+ });
140
+
141
+ // 4) realArmsCarried — synthetic personas + foreigner-pool members can NEVER license the
142
+ // claim (headlineEligible:false). Decidable HEADLINE subjects must reach the floor.
143
+ const minReal = Number.isFinite(minRealSubjects) ? minRealSubjects
144
+ : (preReg ? preReg.minSubjects : personas.length);
145
+ const realArmsCarried = deriveRealArmsCarried(pool, control.decidableIds, minReal);
146
+
147
+ // 5) reduce to the confirmatory booleans (throws if registerEchoPasses undefined — it isn't).
148
+ const booleans = confirmatoryBooleans(control, preReg, { realArmsCarried });
149
+
150
+ return {
151
+ ...booleans, seed, phase, control, harness: splicedHarness, minMeanMargin: floor,
152
+ };
153
+ };
154
+ }
155
+
156
+ // ---- the full production runner -------------------------------------------------------
157
+ // runGateBProduction(opts) → { status, spent, verdict, preRegHash, validation, ... }.
158
+ // opts:
159
+ // apiKey — ANTHROPIC_API_KEY (BLOCKED if absent; never hardcoded)
160
+ // corpus — [{id,docs}] REAL subjects (or load via dumpPath below)
161
+ // foreignersCorpus — [{id,docs}] same-register foreigner pool (disjoint)
162
+ // dumpPath — alternative to corpus: a local reddit dump (ingestRedditCorpus)
163
+ // transport — async (prompt)=>text. PROD: a privacy-guarded anthropicCall wrapper
164
+ // (makeCloudTransport). TESTS: a deterministic fake.
165
+ // validateOverride — TEST-ONLY: inject a validation result (skips the real AUC sweep)
166
+ // preRegInput — buildPreReg overrides (seed, minSubjects, floorK, verdictArms, ...)
167
+ // measureCfg — { probes, minMeanMargin, minRealSubjects, harnessCfg, registerDelta }
168
+ export async function runGateBProduction(opts = {}) {
169
+ const apiKey = opts.apiKey ?? process.env.ANTHROPIC_API_KEY;
170
+ if (!apiKey) return { status: 'BLOCKED', reason: 'ANTHROPIC_API_KEY not set in env (fail-closed; no run, no spend)' };
171
+
172
+ // 1) corpus — explicit arrays, or ingest a local dump (NO network).
173
+ let corpus = opts.corpus;
174
+ let foreignersCorpus = opts.foreignersCorpus;
175
+ if ((!corpus || !foreignersCorpus) && opts.dumpPath) {
176
+ const ing = ingestRedditCorpus(opts.dumpPath, opts.ingestCfg || {});
177
+ corpus = corpus || ing.corpus;
178
+ foreignersCorpus = foreignersCorpus || ing.foreigners;
179
+ }
180
+ if (!Array.isArray(corpus) || !corpus.length) {
181
+ return { status: 'BLOCKED', reason: 'no subject corpus (pass corpus[] or dumpPath); fail-closed' };
182
+ }
183
+ if (!Array.isArray(foreignersCorpus) || !foreignersCorpus.length) {
184
+ return { status: 'BLOCKED', reason: 'no same-register foreigner pool; fail-closed (the wrong-target control is unrunnable)' };
185
+ }
186
+
187
+ // 2) personas (real, headline-eligible) + foreigner fingerprints.
188
+ const preRegInput = { corpus: 'reddit-single-subreddit', ...opts.preRegInput };
189
+ const seedForPersonas = preRegInput.seed ?? 1;
190
+ const nAuthors = preRegInput.minSubjects ?? Math.min(corpus.length, corpus.length);
191
+ const personas = loadRealPersonas(corpus, {
192
+ nAuthors: Math.min(nAuthors, corpus.length),
193
+ seed: seedForPersonas,
194
+ ...opts.personaCfg,
195
+ });
196
+ const foreigners = loadRealPersonas(foreignersCorpus, {
197
+ nAuthors: foreignersCorpus.length,
198
+ seed: seedForPersonas,
199
+ ...opts.personaCfg,
200
+ });
201
+
202
+ // 3) build the deps for runGateBDecision. validate() is the HARD GATE: on fail,
203
+ // runGateBDecision returns spent:false with NO guard/measure call ⇒ zero transport calls.
204
+ const validate = opts.validateOverride
205
+ ? async (preReg) => opts.validateOverride(preReg)
206
+ : async (preReg) => validateInstrument(corpus, preReg);
207
+
208
+ // the privacy/budget guard wrapper. In production this also enforces the closed allowed-set
209
+ // (built from the harness's own briefs) + a hard call budget. In tests the fake transport
210
+ // is passed directly. We expose a thin guard hook that runGateBDecision calls before spend.
211
+ const guardCalls = [];
212
+ const guard = async ({ phase }) => { guardCalls.push(phase); };
213
+
214
+ const measureCfg = opts.measureCfg || {};
215
+ const probes = measureCfg.probes || DEFAULT_PROBES;
216
+
217
+ // 3b) transport. TESTS inject a deterministic fake. The LIVE run builds the privacy- and
218
+ // budget-guarded cloud transport here: the allowed-set is the closed set of EVERY brief
219
+ // the pool's own personas + foreigner-pool produce (baseline '' + derived + fewShotOracle
220
+ // + register-echo) — foreign prose is never a target, only a fingerprint. The budget is
221
+ // sized from arms × pool × probes × (pilot + confirmatory) with headroom.
222
+ const poolForGuard = [...personas, ...foreigners];
223
+ const budget = opts.budget || {
224
+ calls: 0,
225
+ max: opts.maxCalls || (estimateCalls({ nArms: 4, nSubjects: poolForGuard.length, nProbes: probes.length }) * 3),
226
+ };
227
+ const transport = opts.transport || makeCloudTransport({
228
+ apiKey,
229
+ model: opts.model || ANTHROPIC_MODEL,
230
+ allowedSys: buildAllowedSys(poolForGuard, measureCfg.harnessCfg || {}),
231
+ allowedPr: new Set(probes),
232
+ budget,
233
+ });
234
+
235
+ const measure = makeMeasure({
236
+ transport,
237
+ personas,
238
+ foreigners,
239
+ probes,
240
+ minMeanMargin: measureCfg.minMeanMargin,
241
+ minRealSubjects: measureCfg.minRealSubjects,
242
+ harnessCfg: measureCfg.harnessCfg || {},
243
+ registerDelta: measureCfg.registerDelta ?? 0.15,
244
+ });
245
+
246
+ // 4) freeze the pre-reg ONCE (tamper-evident) before any measure runs.
247
+ const frozenRegistry = opts.frozenRegistry || new Map();
248
+ const deps = {
249
+ buildPreReg: (i) => {
250
+ const pr = buildPreReg(i);
251
+ assertFrozen(frozenRegistry, pr); // run-once: a re-registered runId throws
252
+ return pr;
253
+ },
254
+ validate,
255
+ guard,
256
+ measure,
257
+ };
258
+
259
+ const decision = await runGateBDecision(deps, preRegInput);
260
+
261
+ return {
262
+ status: 'DONE',
263
+ model: opts.model || ANTHROPIC_MODEL,
264
+ spent: decision.spent,
265
+ verdict: decision.verdict,
266
+ runId: decision.runId,
267
+ preRegHash: decision.preRegHash,
268
+ validation: decision.validation,
269
+ confirmatory: decision.confirmatory,
270
+ pilot: decision.pilot,
271
+ seeds: decision.seeds,
272
+ guardPhases: guardCalls,
273
+ nSubjects: personas.length,
274
+ nForeigners: foreigners.length,
275
+ cloudCalls: budget.calls,
276
+ budgetMax: budget.max,
277
+ };
278
+ }
279
+
280
+ // ---- production cloud transport (privacy + budget guarded) ----------------------------
281
+ // makeCloudTransport({ apiKey, model, allowedSys, allowedPr, budget }) → async (prompt)=>text.
282
+ // The harness composes prompt = `${brief}\n\nTask: ${task}`. We split it back into the system
283
+ // brief + the authored task, assert BOTH are in their closed allowed-sets, budget-count, then
284
+ // call Anthropic with the brief as the SYSTEM context (how a host injects a profile).
285
+ export function makeCloudTransport({
286
+ apiKey, model = ANTHROPIC_MODEL, allowedSys, allowedPr, budget, maxTokens = 1024,
287
+ }) {
288
+ if (!apiKey) throw new Error('makeCloudTransport: ANTHROPIC_API_KEY required');
289
+ const URL = 'https://api.anthropic.com/v1/messages';
290
+ return async function cloudTransport(prompt) {
291
+ // recover (brief, task) from the harness's `${brief}\n\nTask: ${task}` composition.
292
+ const idx = prompt.lastIndexOf('\n\nTask: ');
293
+ const brief = idx >= 0 ? prompt.slice(0, idx) : '';
294
+ const task = idx >= 0 ? prompt.slice(idx + '\n\nTask: '.length) : prompt;
295
+ if (allowedSys && !allowedSys.has(brief)) throw new Error('PRIVACY GUARD: system brief not in allowed set — aborting');
296
+ if (allowedPr && !allowedPr.has(task)) throw new Error('PRIVACY GUARD: prompt not in authored set — aborting');
297
+ if (budget) {
298
+ if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
299
+ budget.calls += 1;
300
+ }
301
+ const body = { model, max_tokens: maxTokens, messages: [{ role: 'user', content: task }] };
302
+ if (brief) body.system = brief;
303
+ const res = await fetch(URL, {
304
+ method: 'POST',
305
+ headers: { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01', 'content-type': 'application/json' },
306
+ body: JSON.stringify(body),
307
+ });
308
+ if (!res.ok) {
309
+ const t = await res.text().catch(() => '');
310
+ throw new Error(`Anthropic HTTP ${res.status}: ${t.slice(0, 160)}`);
311
+ }
312
+ const j = await res.json();
313
+ return (j.content || []).filter((c) => c.type === 'text').map((c) => c.text).join('');
314
+ };
315
+ }
316
+
317
+ // Build the closed allowed-set of every system brief a cloud call may carry, from the
318
+ // personas' OWN briefs (derived + fewShotOracle + register-echo + baseline ''). Foreign
319
+ // prose is NEVER in this set. Used to wire makeCloudTransport for the live run.
320
+ export function buildAllowedSys(personas, cfg = {}) {
321
+ const sys = new Set(['']); // baseline
322
+ for (const p of personas) {
323
+ const b = buildBriefs(p, cfg);
324
+ sys.add(b.derived);
325
+ sys.add(b.fewShotOracle);
326
+ sys.add(buildRegisterEchoBrief(p));
327
+ }
328
+ return sys;
329
+ }
330
+
331
+ // Estimate the cloud-call budget: arms × subjects × probes, per spend phase.
332
+ export function estimateCalls({
333
+ nArms = 4, nSubjects, nProbes,
334
+ }) {
335
+ return nArms * nSubjects * nProbes;
336
+ }
337
+
338
+ // Human-readable verdict report. A NULL/CUT prints just as cleanly as a PASS — the runner
339
+ // NEVER retries to force a pass.
340
+ export function formatVerdict(r) {
341
+ if (r.status === 'BLOCKED') return `BLOCKED: ${r.reason}`;
342
+ const v = r.verdict || {};
343
+ const lines = [
344
+ `Gate B verdict: ${v.verdict}`,
345
+ ` reason: ${v.reason || ''}`,
346
+ v.claim ? ` claim: ${v.claim}` : null,
347
+ v.ship ? ` ship: ${v.ship}` : null,
348
+ v.next ? ` next: ${v.next}` : null,
349
+ ` runId: ${r.runId}`,
350
+ ` preRegHash: ${r.preRegHash}`,
351
+ ` spent: ${r.spent} cloudCalls: ${r.cloudCalls}/${r.budgetMax}`,
352
+ ` subjects: ${r.nSubjects} headline + ${r.nForeigners} same-register foreigners`,
353
+ ];
354
+ const conf = r.confirmatory;
355
+ if (conf && conf.control && conf.control.perArm) {
356
+ const pa = conf.control.perArm;
357
+ for (const arm of ['baseline', 'derived', 'fewShotOracle', ECHO_ARM]) {
358
+ const a = pa[arm];
359
+ if (!a) continue;
360
+ const ci = `CI99-lo ${Number.isFinite(a.ciLower) ? a.ciLower.toFixed(4) : 'NaN'}`;
361
+ lines.push(` [${arm}] mean-margin ${Number.isFinite(a.meanMargin) ? a.meanMargin.toFixed(4) : 'NaN'} `
362
+ + `dz ${a.verdict && Number.isFinite(a.verdict.dz) ? a.verdict.dz.toFixed(2) : 'NaN'} `
363
+ + `${ci} pct+ ${Number.isFinite(a.pctPositive) ? a.pctPositive.toFixed(2) : 'NaN'} `
364
+ + `passes ${a.verdict ? a.verdict.passes : '?'}`);
365
+ }
366
+ }
367
+ return lines.filter(Boolean).join('\n');
368
+ }
369
+
370
+ export const __test = { describeBands, runEchoArm, fullStyleDistance };
371
+ export default {
372
+ makeMeasure, runGateBProduction, makeCloudTransport, buildRegisterEchoBrief,
373
+ buildAllowedSys, estimateCalls, formatVerdict, ECHO_ARM,
374
+ };
375
+
376
+ // ---- CLI entrypoint -------------------------------------------------------------------
377
+ // The exact command an operator runs for the LIVE verdict:
378
+ // ANTHROPIC_API_KEY=… IJFW_GATEB_DUMP=/path/to/subreddit.jsonl \
379
+ // node --experimental-sqlite src/profile/eval/gate-b-run.mjs
380
+ // Optional env: IJFW_EVAL_MODEL (default claude-opus-4-8), IJFW_GATEB_SEED,
381
+ // IJFW_GATEB_NSUBJECTS (default 60), IJFW_GATEB_NPROBES (default 20),
382
+ // IJFW_GATEB_FLOORK (default 0.25), IJFW_GATEB_MAXCALLS.
383
+ // Reads ANTHROPIC_API_KEY from env (BLOCKED if absent — never hardcoded). NO network unless a
384
+ // real key + dump are present; a missing dump / too-few authors fail-closes (throws/BLOCKED).
385
+ if (import.meta.url === `file://${process.argv[1]}`) {
386
+ const dumpPath = process.env.IJFW_GATEB_DUMP;
387
+ if (!dumpPath) {
388
+ // eslint-disable-next-line no-console
389
+ console.error('BLOCKED: set IJFW_GATEB_DUMP=/path/to/single-subreddit.jsonl (local file; no network fetch)');
390
+ process.exit(1);
391
+ }
392
+ const nSubjects = Number(process.env.IJFW_GATEB_NSUBJECTS) || 60;
393
+ const nProbes = Number(process.env.IJFW_GATEB_NPROBES) || 20;
394
+ runGateBProduction({
395
+ dumpPath,
396
+ ingestCfg: { nPersonaAuthors: nSubjects, nForeignAuthors: nSubjects },
397
+ preRegInput: {
398
+ seed: Number(process.env.IJFW_GATEB_SEED) || 1,
399
+ minSubjects: nSubjects,
400
+ floorK: Number(process.env.IJFW_GATEB_FLOORK) || 0.25,
401
+ nProbes,
402
+ },
403
+ measureCfg: {
404
+ probes: DEFAULT_PROBES, // authored, closed set; expand to nProbes-many authored prompts for the real run
405
+ },
406
+ maxCalls: Number(process.env.IJFW_GATEB_MAXCALLS) || undefined,
407
+ }).then((r) => {
408
+ // eslint-disable-next-line no-console
409
+ console.log(formatVerdict(r));
410
+ // a clean NULL/CUT is a SUCCESSFUL run (the honest outcome), exit 0; only BLOCKED is non-zero.
411
+ process.exit(r.status === 'BLOCKED' ? 1 : 0);
412
+ }).catch((e) => {
413
+ // eslint-disable-next-line no-console
414
+ console.error('RUN ERROR (fail-closed):', e.message);
415
+ process.exit(1);
416
+ });
417
+ }