@ijfw/memory-server 1.5.5 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +344 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +7 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +185 -14
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* profile/eval/run-real-corpus.mjs — drive Gate C + Gate B against the user's
|
|
3
|
+
* REAL Claude Code corpus with a CLOUD frontier model (Anthropic) as the
|
|
4
|
+
* agent-under-test. Produces an HONEST results blob (JSON) written to the local
|
|
5
|
+
* gitignored scratch dir; the report doc is generated from it separately.
|
|
6
|
+
*
|
|
7
|
+
* ── PRIVACY (enforced, not asserted) ────────────────────────────────────────
|
|
8
|
+
* - The corpus builder already reduces transcripts to counts; the only place
|
|
9
|
+
* raw-ish text survives is feedback `context` snippets, which live ONLY in
|
|
10
|
+
* the local scratch artifacts and are NEVER passed to the cloud transport.
|
|
11
|
+
* - The cloud agent receives ONLY: (a) a system brief that is either '' or the
|
|
12
|
+
* STYLE-ONLY brief (four axis descriptors — NO user text), or an authored
|
|
13
|
+
* oracle style brief; and (b) an authored generic prompt. A guard asserts the
|
|
14
|
+
* injected system/prompt strings are drawn from that closed set before any
|
|
15
|
+
* network call — if anything else appears the run ABORTS.
|
|
16
|
+
* - The full preference-tier brief (shareSensitive) is DELIBERATELY NOT sent:
|
|
17
|
+
* on this corpus its "Observed preference" lines are sentence fragments of the
|
|
18
|
+
* user's real prose (a finding in itself), so transmitting it would violate
|
|
19
|
+
* the raw-text constraint. Gate B's heuristic arm therefore uses the
|
|
20
|
+
* privacy-safe style-only brief — which is also the genuinely portable signal.
|
|
21
|
+
*
|
|
22
|
+
* ── BOUNDED SPEND ───────────────────────────────────────────────────────────
|
|
23
|
+
* Total cloud calls = nProbes * (#agent arms) + (nProbes judge calls). With
|
|
24
|
+
* nProbes=30, 3 agent arms (baseline/heuristic/oracle) + 1 judge pass =>
|
|
25
|
+
* 30*3 + 30 = 120 calls (agent calls capped at 1024 output tokens, judge calls at 4). A hard call-counter aborts the run
|
|
26
|
+
* if it would exceed `maxCalls` (default 200).
|
|
27
|
+
*
|
|
28
|
+
* Node built-ins only (global fetch). No new deps.
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { writeFileSync, mkdirSync } from 'node:fs';
|
|
32
|
+
import { join } from 'node:path';
|
|
33
|
+
import { renderBrief } from '../render-brief.js';
|
|
34
|
+
import { makeProfile } from '../schema.js';
|
|
35
|
+
import { deriveProfileFromSessions } from './gate-c-capture.mjs';
|
|
36
|
+
import { runArm } from './gate-b-behavior.mjs';
|
|
37
|
+
import {
|
|
38
|
+
objectiveStyle, cohenKappa,
|
|
39
|
+
bootstrapCI, mcnemar, expectedCalibrationError,
|
|
40
|
+
} from './harness.mjs';
|
|
41
|
+
import { buildCorpus } from './corpus-from-transcripts.mjs';
|
|
42
|
+
import { buildRealEval } from './build-real-probes.mjs';
|
|
43
|
+
|
|
44
|
+
// Model id used for both the agent-under-test and the judge. IJFW_EVAL_MODEL
// overrides it; an unset or empty variable falls back to the default.
const ANTHROPIC_MODEL = process.env.IJFW_EVAL_MODEL
  ? process.env.IJFW_EVAL_MODEL
  : 'claude-sonnet-4-5';

// Anthropic Messages endpoint that every cloud call in this file POSTs to.
const ANTHROPIC_URL = 'https://api.anthropic.com/v1/messages';
|
|
46
|
+
|
|
47
|
+
/**
 * Build an Anthropic agent transport with a hard call budget + a privacy guard.
 *
 * The returned async function verifies, BEFORE any network call, that the
 * stringified prompt is a member of `allowedPrompts` and the stringified system
 * brief is a member of `allowedSystems`; anything else throws. The guard only
 * gates the request — it does not record the pairs it sees.
 *
 * @param {object} cfg
 * @param {string} cfg.apiKey - sent as the `x-api-key` header.
 * @param {string} cfg.model - model id placed in the request body.
 * @param {Iterable<string>} cfg.allowedSystems - closed set of permitted system briefs.
 * @param {Iterable<string>} cfg.allowedPrompts - closed set of permitted prompts.
 * @param {{calls: number, max: number}} cfg.budget - shared call counter; mutated here.
 * @returns {(req: {prompt?: string, system?: string}) => Promise<{text: string, usage: (object|null)}>}
 *   Rejects with `PRIVACY GUARD: …`, `BUDGET: …` or `Anthropic HTTP <status>: …`.
 */
function makeAnthropicAgent({ apiKey, model, allowedSystems, allowedPrompts, budget }) {
  const allowedSys = new Set(allowedSystems);
  const allowedPr = new Set(allowedPrompts);
  const MAX_TOKENS = 1024; // eval-fixed budget; runArm's 256 is intentionally overridden
  return async ({ prompt, system }) => {
    const sys = String(system || '');
    const pr = String(prompt || '');
    // PRIVACY GUARD — only the closed set of authored prompts + derived
    // style/oracle briefs may ever reach the network. Checked before the
    // budget so a rejected request never consumes a call.
    if (!allowedPr.has(pr)) throw new Error('PRIVACY GUARD: prompt not in authored set — aborting');
    if (!allowedSys.has(sys)) throw new Error('PRIVACY GUARD: system brief not in allowed set — aborting');
    if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    // Counted before the request, so failed attempts are charged too.
    budget.calls += 1;
    const body = {
      model,
      // 1024 tokens so the LENGTH signal can vary between arms: at the runArm
      // default (256) both arms clip at the ceiling and terseness is pinned,
      // masking the effect (verified live). 1024 lets the expansive-brief arm
      // run longer than baseline without either hitting the cap on most probes.
      max_tokens: MAX_TOKENS,
      messages: [{ role: 'user', content: pr }],
    };
    // An empty brief means "no system prompt": omit the field entirely.
    if (sys) body.system = sys;
    const res = await fetch(ANTHROPIC_URL, {
      method: 'POST',
      headers: { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01', 'content-type': 'application/json' },
      body: JSON.stringify(body),
    });
    if (!res.ok) {
      const t = await res.text().catch(() => '');
      throw new Error(`Anthropic HTTP ${res.status}: ${t.slice(0, 120)}`);
    }
    const j = await res.json();
    // Tolerate a null body or a non-array `content` instead of crashing with a
    // TypeError: such a response simply yields empty text.
    const blocks = Array.isArray(j?.content) ? j.content : [];
    const text = blocks
      .filter((c) => c?.type === 'text' && typeof c.text === 'string')
      .map((c) => c.text)
      .join('');
    return { text, usage: j?.usage || null };
  };
}
|
|
89
|
+
|
|
90
|
+
/**
 * Run all judge pairs through Anthropic (async), returning a 0/1 preferA vector.
 * Replicates biasControlledJudge's position-randomization + length-control with
 * an async (network) judge — the harness's sync wrapper cannot await a fetch.
 *
 * One cloud call is made per pair, sequentially, and each is charged to the
 * shared `budget` before it is sent.
 *
 * @param {object} cfg
 * @param {string} cfg.apiKey - sent as the `x-api-key` header.
 * @param {string} cfg.model - judge model id.
 * @param {string} cfg.styleDescription - plain-words style target embedded in the system prompt.
 * @param {Array<{a: string, b: string}>} cfg.pairs - candidates to compare.
 * @param {{calls: number, max: number}} cfg.budget - shared call counter; mutated here.
 * @param {number} cfg.seed - seed for the position-randomization RNG.
 * @returns {Promise<{preferA: number[], details: Array<{aFirst: boolean, out: string, parsed: boolean}>}>}
 *   `preferA[i]` is 1 when the judge preferred `pairs[i].a`. `parsed` is false
 *   when the reply contained neither "1" nor "2"; such replies are still scored
 *   as a vote for the first-shown candidate so the vector stays 0/1.
 * @throws {Error} `BUDGET: …` when the counter is exhausted, `judge HTTP <status>: …` on a non-2xx reply.
 */
async function runJudgePairs({ apiKey, model, styleDescription, pairs, budget, seed }) {
  const { mulberry32 } = await import('./harness.mjs');
  const rng = mulberry32(seed);
  const preferA = [];
  const details = [];

  // Length control: clip a candidate to `target` chars, backing up to the last
  // space when that keeps more than 60% of the target.
  const clip = (s, target) => {
    if (s.length <= target) return s;
    const c = s.slice(0, target);
    const sp = c.lastIndexOf(' ');
    return sp > target * 0.6 ? c.slice(0, sp) : c;
  };

  // The system prompt does not depend on the pair, so build it once.
  const sys = `You are a STYLE judge. Decide which candidate better matches this writing style: ${styleDescription}. `
    + 'Answer with ONLY the single character "1" or "2". No other text.';

  for (const it of pairs) {
    // Exactly one RNG draw per pair, taken first, so the shown order is
    // reproducible from `seed`.
    const aFirst = rng() < 0.5;
    const rawA = String(it.a || '');
    const rawB = String(it.b || '');
    // Clip both to the shorter length; if one is empty fall back to the longer
    // so the non-empty candidate is not cut to nothing.
    const target = Math.min(rawA.length, rawB.length) || Math.max(rawA.length, rawB.length);
    const ctlA = clip(rawA, target);
    const ctlB = clip(rawB, target);
    const firstTxt = aFirst ? ctlA : ctlB;
    const secondTxt = aFirst ? ctlB : ctlA;

    if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    budget.calls += 1;

    const usr = `Candidate 1:\n${firstTxt}\n\n---\n\nCandidate 2:\n${secondTxt}\n\nWhich better matches the target style? Reply 1 or 2.`;
    // eslint-disable-next-line no-await-in-loop
    const res = await fetch(ANTHROPIC_URL, {
      method: 'POST',
      headers: { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01', 'content-type': 'application/json' },
      body: JSON.stringify({ model, max_tokens: 4, system: sys, messages: [{ role: 'user', content: usr }] }),
    });
    if (!res.ok) {
      // Same shape as the agent transport's error: status plus a short body excerpt.
      // eslint-disable-next-line no-await-in-loop
      const t = await res.text().catch(() => '');
      throw new Error(`judge HTTP ${res.status}: ${t.slice(0, 120)}`);
    }
    // eslint-disable-next-line no-await-in-loop
    const j = await res.json();
    const blocks = Array.isArray(j?.content) ? j.content : [];
    const out = blocks.map((c) => (typeof c?.text === 'string' ? c.text : '')).join('').trim();

    // Take the first "1"/"2" anywhere in the reply, so wrappers such as "**2**"
    // or "Candidate 2" are read correctly instead of defaulting to candidate 1.
    const verdict = out.match(/[12]/);
    const parsed = verdict !== null;
    const choseSecond = parsed && verdict[0] === '2';
    // Map the positional verdict back to the A/B identity.
    const prefersA = aFirst ? !choseSecond : choseSecond;
    preferA.push(prefersA ? 1 : 0);
    details.push({ aFirst, out, parsed });
  }
  return { preferA, details };
}
|
|
139
|
+
|
|
140
|
+
/**
 * Continuous per-probe distance between one output and the user's style target,
 * restricted to the BRIEF-CONTROLLABLE dimensions: terseness, formality markers
 * and emoji presence.
 *
 * The harness styleDistance is deliberately NOT used here because its codeBlock
 * presence bit penalizes outputs for a facet the style brief never conveys (see
 * build-real-probes styleTargetFromAxes). The thresholded harness
 * `objectiveAdherence` (which DOES include codeBlock) is still reported as the
 * conservative secondary view.
 *
 * @param {string} out - model output to measure.
 * @param {object} target - style target; reads `terseness`, `formalityMarkers`, `emojiPerChar`.
 * @returns {number} |Δterseness| + |Δformality| + 0.5 when emoji presence differs.
 */
function scopedDistance(out, target) {
  const observed = objectiveStyle(out);
  // Emoji is compared as a presence bit, not as a rate.
  const emojiBit = (style) => (style.emojiPerChar > 0 ? 1 : 0);

  const terseGap = Math.abs((observed.terseness || 0) - (target.terseness || 0));
  const formalGap = Math.abs((observed.formalityMarkers || 0) - (target.formalityMarkers || 0));
  const emojiGap = Math.abs(emojiBit(observed) - emojiBit(target)) * 0.5;

  return terseGap + formalGap + emojiGap;
}
|
|
156
|
+
/**
 * Scoped style distance of every output to the same target, in input order.
 *
 * @param {string[]} outputs - one model output per probe.
 * @param {object} target - style target handed to scopedDistance.
 * @returns {number[]} one distance per output.
 */
function distVec(outputs, target) {
  const distances = [];
  for (const text of outputs) {
    distances.push(scopedDistance(text, target));
  }
  return distances;
}
|
|
159
|
+
|
|
160
|
+
/**
 * main — orchestrate the full real-corpus run.
 *
 * Steps: build the corpus, split it into train/test + probes, run Gate C
 * offline, compare train- vs test-derived style axes, run the three Gate B arms
 * (baseline / style-only heuristic / authored oracle) through the guarded cloud
 * agent, score them, run the judge pass, and write `results.json` to the
 * scratch directory.
 *
 * @param {object} [opts]
 * @param {string} [opts.scratch] - output directory (default `<cwd>/.ijfw/profile-eval-scratch`).
 * @param {number} [opts.nProbes=30] - probes requested from buildRealEval.
 * @param {number} [opts.maxCalls=200] - hard ceiling on cloud calls (agent + judge).
 * @param {number} [opts.minMessages=8] - forwarded to buildCorpus.
 * @param {number} [opts.cap=400] - forwarded to buildCorpus.
 * @param {number} [opts.trainFraction=0.6] - forwarded to buildRealEval.
 * @returns {Promise<object>} `{ status: 'BLOCKED', reason }` when the API key or
 *   corpus is missing, otherwise the full `{ status: 'DONE', … }` results blob.
 *   Errors thrown by the agent arms (guard, budget, HTTP) propagate; judge
 *   errors are captured in `gateB.judge.error`.
 */
export async function runRealCorpus(opts = {}) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) return { status: 'BLOCKED', reason: 'ANTHROPIC_API_KEY not set in env' };

  const scratch = opts.scratch || join(process.cwd(), '.ijfw', 'profile-eval-scratch');
  mkdirSync(scratch, { recursive: true });

  const nProbes = Number.isFinite(opts.nProbes) ? opts.nProbes : 30;
  const budget = { calls: 0, max: Number.isFinite(opts.maxCalls) ? opts.maxCalls : 200 };
  const seed = 7;
  const styleAxes = ['formality', 'energy', 'terseness', 'emoji_use'];

  // 1) REAL corpus.
  const corpus = buildCorpus({ minMessages: opts.minMessages ?? 8, cap: opts.cap ?? 400 });
  if (!corpus.sessions.length) return { status: 'BLOCKED', reason: `empty corpus: ${JSON.stringify(corpus.stats)}` };

  // Persist corpus stats (local scratch only) for audit.
  writeFileSync(join(scratch, 'corpus-stats.json'), JSON.stringify(corpus.stats, null, 2));

  // 2) Split + probes.
  const ev = await buildRealEval(corpus, { trainFraction: opts.trainFraction ?? 0.6, nProbes });
  writeFileSync(join(scratch, 'split.json'), JSON.stringify(ev.split, null, 2));

  // 3) GATE C — capture, held-out (OFFLINE, no cloud). Explicit-probe mode:
  //    train = ev.train, probe = ev.probes (carry the held-out TEST gold).
  const cCorpus = { sessions: ev.train, probes: ev.probes, negativeControl: ev.negativeControl };
  const { runGateC } = await import('./gate-c-capture.mjs');
  const gateC = await runGateC(cCorpus, { bootstrapSeed: 42 });

  // 3b) STYLE-axis capture (the portable signal): TRAIN-derived vs TEST-derived
  //     EMA per axis + |diff|. This is the honest "does it capture you" leg that
  //     actually generalizes (the preference slug leg does not).
  const trainProfile = await deriveProfileFromSessions(ev.train, {});
  const testProfile = await deriveProfileFromSessions(ev.test, {});
  const styleCapture = {};
  for (const ax of styleAxes) {
    const a = trainProfile.global.style[ax];
    const b = testProfile.global.style[ax];
    styleCapture[ax] = {
      train_ema: a ? a.ema : null,
      test_ema: b ? b.ema : null,
      abs_diff: (a && b) ? Math.abs(a.ema - b.ema) : null,
      train_evidence: a ? a.evidence_count : 0,
      test_evidence: b ? b.evidence_count : 0,
      // Coerce to a real boolean: a bare `a && b && …` yields undefined/null
      // when an axis is missing, which JSON.stringify drops or writes as null.
      confirmed_both: Boolean(a && b && a.evidence_count >= 5 && b.evidence_count >= 5),
    };
  }

  // 4) GATE B — behavior A/B with the CLOUD agent. Privacy-safe arms only.
  // NOTE(review): renderBrief is handed process.env; if an environment variable
  // can opt into a richer brief tier, styleBrief would carry it and the guard
  // below would still allow it — confirm against render-brief.js.
  const renderOpts = { env: process.env };
  // heuristic STYLE-ONLY brief = default render (no opt-in => low-tier style axes
  // only, NO preference fragments). This is what is privacy-safe to transmit.
  const styleBrief = renderBrief(trainProfile, renderOpts).text; // style axes only
  const baselineBrief = renderBrief(makeProfile(), renderOpts).text; // ''
  // ORACLE — an authored, transcript-free ceiling describing the user's real
  // fingerprint in plain words (best-case explicit style signal).
  const target = ev.split.styleTarget;
  const terseWord = target.terseness < 0.34 ? 'expansive and detailed' : target.terseness < 0.67 ? 'moderate length' : 'terse';
  const emojiWord = target.emojiPerChar > 0 ? 'occasional emoji are welcome' : 'no emoji';
  const formalWord = target.formalityMarkers < 0.34 ? 'casual' : target.formalityMarkers < 0.67 ? 'balanced/neutral' : 'formal';
  const oracleBrief = `User writing-style profile (observed): responses should be ${terseWord}; tone ${formalWord}; ${emojiWord}.`;

  const allowedSystems = [baselineBrief, styleBrief, oracleBrief];
  const allowedPrompts = ev.probes.map((p) => p.prompt);
  const agent = makeAnthropicAgent({ apiKey, model: ANTHROPIC_MODEL, allowedSystems, allowedPrompts, budget });

  // Run the three arms through the REAL runArm (REAL objectiveAdherence scoring).
  // Sequential on purpose: all arms share one budget counter.
  const baseline = await runArm(agent, baselineBrief, ev.probes);
  const heuristic = await runArm(agent, styleBrief, ev.probes);
  const oracle = await runArm(agent, oracleBrief, ev.probes);

  // Primary headline: paired McNemar on objectiveAdherence (heuristic vs baseline).
  const headline = mcnemar(baseline.adherence, heuristic.adherence);
  const oracleVsBaseline = mcnemar(baseline.adherence, oracle.adherence);

  // Per-arm adherence rate + bootstrap CI (REAL helper).
  const arms = {
    baseline: { adherence: baseline.adherence, ci: bootstrapCI(baseline.adherence, { seed }) },
    heuristic: { adherence: heuristic.adherence, ci: bootstrapCI(heuristic.adherence, { seed: seed + 1 }) },
    oracle: { adherence: oracle.adherence, ci: bootstrapCI(oracle.adherence, { seed: seed + 2 }) },
  };

  // Secondary continuous: per-probe style-distance to the user's target. A
  // closer-than-baseline 0/1 vector feeds a second paired McNemar (more
  // sensitive than the thresholded adherence on a homogeneous corpus).
  const dBase = distVec(baseline.outputs, target);
  const dHeur = distVec(heuristic.outputs, target);
  const dOracle = distVec(oracle.outputs, target);
  const heurCloser = dBase.map((d, i) => (dHeur[i] < d ? 1 : 0));
  const baseCloserThanHeur = dBase.map((d, i) => (d < dHeur[i] ? 1 : 0));
  const distanceMcnemar = mcnemar(baseCloserThanHeur, heurCloser); // before=base-wins, after=heur-wins
  const meanDist = (v) => v.reduce((a, b) => a + b, 0) / (v.length || 1);

  // ECE on confidence — the profile's style axes carry a confidence proxy via
  // Beta mass. We compute a calibration over per-probe (confidence, correct)
  // where confidence = the brief's asserted style strength and correct = the
  // heuristic-arm adherence. This makes the confidence number honest.
  const styleConfidence = (() => {
    // average confirmed-axis "strength" = mean |ema-0.5|*2 over confirmed axes
    let sum = 0;
    let n = 0;
    for (const ax of styleAxes) {
      const a = trainProfile.global.style[ax];
      if (a && a.evidence_count >= 5) {
        sum += Math.min(1, Math.abs(a.ema - 0.5) * 2);
        n += 1;
      }
    }
    return n ? sum / n : 0.5;
  })();
  const ecePairs = heuristic.adherence.map((y) => ({ confidence: styleConfidence, correct: y }));
  const ece = expectedCalibrationError(ecePairs, { nBins: 10 });

  // 5) BIAS-CONTROLLED JUDGE (secondary rater) + κ vs objective. Anthropic judge.
  // Best-effort: a judge failure is recorded, not fatal, so the arm results
  // already paid for are still written.
  let judge = null;
  try {
    const styleDescription = `${terseWord}; tone ${formalWord}; ${emojiWord}`;
    const pairs = ev.probes.map((_, i) => ({ a: heuristic.outputs[i], b: baseline.outputs[i] }));
    const { preferA, details } = await runJudgePairs({
      apiKey, model: ANTHROPIC_MODEL, styleDescription, pairs, budget, seed,
    });
    const objectivePrefersA = ev.probes.map((_, i) => (
      heuristic.adherence[i] === 1 && baseline.adherence[i] === 0 ? 1 : 0
    ));
    judge = {
      preferA,
      objectivePrefersA,
      judgePreferAHeuristicRate: preferA.reduce((a, b) => a + b, 0) / (preferA.length || 1),
      kappa: cohenKappa(preferA, objectivePrefersA),
      sampleDetails: details.slice(0, 3).map((d) => ({ aFirst: d.aFirst, out: d.out })),
    };
  } catch (e) {
    judge = { error: String(e.message || e) };
  }

  const result = {
    status: 'DONE',
    model: ANTHROPIC_MODEL,
    cloudCalls: budget.calls,
    corpusStats: corpus.stats,
    split: ev.split,
    gateC: {
      precision: gateC.precision,
      recall: gateC.recall,
      f1: gateC.f1,
      predicted: gateC.predicted,
      goldCount: gateC.gold.length,
      counts: gateC.counts,
      negativeControl: gateC.negativeControl,
      nTrain: gateC.nTrain,
      nProbe: gateC.nProbe,
      heldOutDisjoint: gateC.heldOut.disjoint,
    },
    styleCapture,
    gateB: {
      arms: {
        baseline: { rate: arms.baseline.ci.point, ci: arms.baseline.ci, adherence: arms.baseline.adherence },
        heuristic_styleonly: { rate: arms.heuristic.ci.point, ci: arms.heuristic.ci, adherence: arms.heuristic.adherence },
        oracle_styleceiling: { rate: arms.oracle.ci.point, ci: arms.oracle.ci, adherence: arms.oracle.adherence },
      },
      headline_mcnemar: headline,
      oracle_vs_baseline_mcnemar: oracleVsBaseline,
      distance: {
        mean_baseline: meanDist(dBase),
        mean_heuristic: meanDist(dHeur),
        mean_oracle: meanDist(dOracle),
        heuristic_closer_count: heurCloser.reduce((a, b) => a + b, 0),
        baseline_closer_count: baseCloserThanHeur.reduce((a, b) => a + b, 0),
        mcnemar: distanceMcnemar,
      },
      ece: { ece: ece.ece, styleConfidence },
      judge,
    },
    privacy: {
      transcripts_sent_to_cloud: false,
      preference_brief_transmitted: false,
      allowed_systems_count: allowedSystems.length,
      note: 'Only style-only/oracle briefs (no user text) + authored prompts reached the cloud; guard-enforced.',
    },
    timestamp: new Date().toISOString(),
  };

  writeFileSync(join(scratch, 'results.json'), JSON.stringify(result, null, 2));
  return result;
}
|
|
344
|
+
|
|
345
|
+
// CLI entry — runs only when this module is the script Node was started with.
// The script path is converted with pathToFileURL rather than glued onto
// "file://": the glued form never equals import.meta.url on Windows drive paths
// or on paths whose URL form is percent-encoded (spaces, non-ASCII characters).
void (async () => {
  const scriptPath = process.argv[1];
  if (!scriptPath) return;
  const { pathToFileURL } = await import('node:url');
  if (import.meta.url !== pathToFileURL(scriptPath).href) return;
  try {
    const r = await runRealCorpus({
      nProbes: Number(process.env.IJFW_EVAL_NPROBES) || 30,
      maxCalls: Number(process.env.IJFW_EVAL_MAXCALLS) || 200,
    });
    // print a compact summary (no raw text)
    const { status, model, cloudCalls } = r;
    console.log(JSON.stringify({ status, model, cloudCalls, reason: r.reason || null }, null, 2));
    if (r.status !== 'DONE') process.exit(1);
  } catch (e) {
    console.error('RUN ERROR:', e.message);
    process.exit(1);
  }
})();

export default { runRealCorpus };
|