@ijfw/memory-server 1.5.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/bin/ijfw-dashboard +20 -1
  2. package/package.json +4 -3
  3. package/src/audit-roster.js +89 -12
  4. package/src/brain/tiered-llm.js +57 -7
  5. package/src/cross-orchestrator-cli.js +344 -4
  6. package/src/cross-project-search.js +39 -1
  7. package/src/dashboard-server.js +7 -1
  8. package/src/dream/runner.mjs +560 -8
  9. package/src/handlers/brain-handler.js +101 -1
  10. package/src/importers/discover.js +1 -1
  11. package/src/memory/bench-metrics.js +289 -0
  12. package/src/memory/benchmark.js +1 -1
  13. package/src/memory/search.js +53 -1
  14. package/src/orchestrator/plan-checker.js +1 -1
  15. package/src/profile/audit.js +671 -0
  16. package/src/profile/capture.js +871 -0
  17. package/src/profile/derive-dialectic.js +242 -0
  18. package/src/profile/derive-heuristic.js +733 -0
  19. package/src/profile/derive.js +156 -0
  20. package/src/profile/egress.js +306 -0
  21. package/src/profile/eval/build-real-probes.mjs +197 -0
  22. package/src/profile/eval/corpus-from-reddit.mjs +166 -0
  23. package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
  24. package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
  25. package/src/profile/eval/gate-b-behavior.mjs +420 -0
  26. package/src/profile/eval/gate-b-decision-run.mjs +171 -0
  27. package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
  28. package/src/profile/eval/gate-b-run.mjs +417 -0
  29. package/src/profile/eval/gate-b-run.test.mjs +204 -0
  30. package/src/profile/eval/gate-c-capture.mjs +323 -0
  31. package/src/profile/eval/harness.mjs +551 -0
  32. package/src/profile/eval/instrument-validation.mjs +248 -0
  33. package/src/profile/eval/instrument-validation.test.mjs +125 -0
  34. package/src/profile/eval/multi-subject-harness.mjs +106 -0
  35. package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
  36. package/src/profile/eval/personas.test.mjs +83 -0
  37. package/src/profile/eval/plumbing.test.mjs +69 -0
  38. package/src/profile/eval/prereg.mjs +130 -0
  39. package/src/profile/eval/prereg.test.mjs +78 -0
  40. package/src/profile/eval/real-corpus.test.mjs +103 -0
  41. package/src/profile/eval/real-personas.mjs +109 -0
  42. package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
  43. package/src/profile/eval/run-real-corpus.mjs +358 -0
  44. package/src/profile/eval/slug-quality.mjs +464 -0
  45. package/src/profile/eval/stylometry-features.js +85 -0
  46. package/src/profile/eval/stylometry-reference.js +16 -0
  47. package/src/profile/eval/stylometry.js +224 -0
  48. package/src/profile/eval/stylometry.test.mjs +103 -0
  49. package/src/profile/eval/synthetic-personas.js +91 -0
  50. package/src/profile/eval/verifier-features.mjs +170 -0
  51. package/src/profile/eval/verifier-logreg.mjs +74 -0
  52. package/src/profile/eval/verifier-pair.mjs +122 -0
  53. package/src/profile/eval/verifier-reference.mjs +68 -0
  54. package/src/profile/eval/verifier-scorer.mjs +30 -0
  55. package/src/profile/eval/wrong-target-control.mjs +168 -0
  56. package/src/profile/eval/wrong-target-control.test.mjs +124 -0
  57. package/src/profile/exemplar-capture.js +232 -0
  58. package/src/profile/exemplar-retrieve.js +138 -0
  59. package/src/profile/exemplar-store.js +314 -0
  60. package/src/profile/lock.js +64 -0
  61. package/src/profile/merge.js +624 -0
  62. package/src/profile/path-policy.js +213 -0
  63. package/src/profile/precision-stamp.mjs +151 -0
  64. package/src/profile/render-brief.js +717 -0
  65. package/src/profile/schema.js +244 -0
  66. package/src/profile/sensitivity.js +249 -0
  67. package/src/profile/serve.js +345 -0
  68. package/src/profile/store.js +261 -0
  69. package/src/profile/telemetry.js +289 -0
  70. package/src/recovery/checkpoint.js +7 -1
  71. package/src/server.js +185 -14
  72. package/src/.registry-meta-key.pem +0 -3
@@ -0,0 +1,358 @@
1
+ /**
2
+ * profile/eval/run-real-corpus.mjs — drive Gate C + Gate B against the user's
3
+ * REAL Claude Code corpus with a CLOUD frontier model (Anthropic) as the
4
+ * agent-under-test. Produces an HONEST results blob (JSON) written to the local
5
+ * gitignored scratch dir; the report doc is generated from it separately.
6
+ *
7
+ * ── PRIVACY (enforced, not asserted) ────────────────────────────────────────
8
+ * - The corpus builder already reduces transcripts to counts; the only place
9
+ * raw-ish text survives is feedback `context` snippets, which live ONLY in
10
+ * the local scratch artifacts and are NEVER passed to the cloud transport.
11
+ * - The cloud agent receives ONLY: (a) a system brief that is either '' or the
12
+ * STYLE-ONLY brief (four axis descriptors — NO user text), or an authored
13
+ * oracle style brief; and (b) an authored generic prompt. A guard asserts the
14
+ * injected system/prompt strings are drawn from that closed set before any
15
+ * network call — if anything else appears the run ABORTS.
16
+ * - The full preference-tier brief (shareSensitive) is DELIBERATELY NOT sent:
17
+ * on this corpus its "Observed preference" lines are sentence fragments of the
18
+ * user's real prose (a finding in itself), so transmitting it would violate
19
+ * the raw-text constraint. Gate B's heuristic arm therefore uses the
20
+ * privacy-safe style-only brief — which is also the genuinely portable signal.
21
+ *
22
+ * ── BOUNDED SPEND ───────────────────────────────────────────────────────────
23
+ * Total cloud calls = nProbes * (#agent arms) + (nProbes judge calls). With
24
+ * nProbes=30, 3 agent arms (baseline/heuristic/oracle) + 1 judge pass =>
25
+ * 30*3 + 30 = 120 calls (agent calls capped at 1024 output tokens, judge calls at 4). A hard call-counter aborts the run
26
+ * if it would exceed `maxCalls` (default 200).
27
+ *
28
+ * Node built-ins only (global fetch). No new deps.
29
+ */
30
+
31
import { writeFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { pathToFileURL } from 'node:url';
import { renderBrief } from '../render-brief.js';
import { makeProfile } from '../schema.js';
import { deriveProfileFromSessions } from './gate-c-capture.mjs';
import { runArm } from './gate-b-behavior.mjs';
import {
  objectiveStyle, cohenKappa,
  bootstrapCI, mcnemar, expectedCalibrationError,
} from './harness.mjs';
import { buildCorpus } from './corpus-from-transcripts.mjs';
import { buildRealEval } from './build-real-probes.mjs';
43
+
44
// Model id used for every cloud call in this module. A non-empty
// IJFW_EVAL_MODEL environment variable overrides the built-in default.
const ANTHROPIC_MODEL = process.env.IJFW_EVAL_MODEL
  ? process.env.IJFW_EVAL_MODEL
  : 'claude-sonnet-4-5';

// Anthropic Messages endpoint that both the agent and the judge POST to.
const ANTHROPIC_URL = 'https://api.anthropic.com/v1/messages';
46
+
47
/**
 * Create the agent transport used for the Gate B arms: an async function that
 * POSTs one user message (plus an optional system brief) to the Anthropic
 * Messages endpoint.
 *
 * Before any network call the transport checks, in this order:
 *   1. the prompt is a member of `allowedPrompts`,
 *   2. the system brief is a member of `allowedSystems`,
 *   3. the shared call budget still has room (`budget.calls < budget.max`).
 * Any failed check throws and nothing is sent. Membership is exact string
 * equality after `String(value || '')` normalisation of the incoming values.
 *
 * @param {object} cfg
 * @param {string} cfg.apiKey - sent as the `x-api-key` header.
 * @param {string} cfg.model - model id placed in the request body.
 * @param {Iterable<string>} cfg.allowedSystems - closed set of permitted system briefs.
 * @param {Iterable<string>} cfg.allowedPrompts - closed set of permitted prompts.
 * @param {{calls: number, max: number}} cfg.budget - shared counter; `calls`
 *   is incremented once per request, before the request is issued.
 * @returns {(req: {prompt: string, system?: string}) => Promise<{text: string, usage: object|null}>}
 *   `text` is the concatenation of the response's `text`-typed content parts.
 * @throws {Error} from the returned function on a guard/budget violation or a
 *   non-OK HTTP status (message carries the first 120 chars of the body).
 */
function makeAnthropicAgent({ apiKey, model, allowedSystems, allowedPrompts, budget }) {
  // eval-fixed budget; runArm's 256 is intentionally overridden
  const OUTPUT_TOKEN_CAP = 1024;
  const systemWhitelist = new Set(allowedSystems);
  const promptWhitelist = new Set(allowedPrompts);

  // PRIVACY GUARD — only the closed set of authored prompts + derived
  // style/oracle briefs may ever reach the network.
  const assertAllowed = (promptText, systemText) => {
    if (!promptWhitelist.has(promptText)) {
      throw new Error('PRIVACY GUARD: prompt not in authored set — aborting');
    }
    if (!systemWhitelist.has(systemText)) {
      throw new Error('PRIVACY GUARD: system brief not in allowed set — aborting');
    }
  };

  return async ({ prompt, system }) => {
    const systemText = String(system || '');
    const promptText = String(prompt || '');
    assertAllowed(promptText, systemText);

    if (budget.calls >= budget.max) {
      throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    }
    budget.calls += 1;

    // 1024 tokens so the LENGTH signal can vary between arms: at the runArm
    // default (256) both arms clip at the ceiling and terseness is pinned,
    // masking the effect. 1024 lets the expansive-brief arm run longer than
    // baseline without either hitting the cap on most probes.
    const payload = {
      model,
      max_tokens: OUTPUT_TOKEN_CAP,
      messages: [{ role: 'user', content: promptText }],
    };
    // An empty brief means "no system prompt": the field is omitted entirely.
    if (systemText) payload.system = systemText;

    const response = await fetch(ANTHROPIC_URL, {
      method: 'POST',
      headers: {
        'x-api-key': apiKey,
        'anthropic-version': '2023-06-01',
        'content-type': 'application/json',
      },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      // Body read is best-effort; an unreadable body degrades to ''.
      const detail = await response.text().catch(() => '');
      throw new Error(`Anthropic HTTP ${response.status}: ${detail.slice(0, 120)}`);
    }

    const parsed = await response.json();
    const textParts = (parsed.content || []).filter((part) => part.type === 'text');
    const text = textParts.map((part) => part.text).join('');
    return { text, usage: parsed.usage || null };
  };
}
89
+
90
/**
 * Run every judge pair through the Anthropic API, one request at a time, and
 * return a 0/1 "judge preferred A" vector.
 *
 * Per pair the function:
 *   - draws one value from a seeded RNG to decide which candidate is shown
 *     first (position randomisation);
 *   - length-controls the two texts by clipping the longer one to the shorter
 *     one's length, backing up to the last space when that space falls in the
 *     final 40% of the clipped text (when one text is empty nothing is clipped);
 *   - charges one call against the shared `budget` and aborts if it is spent;
 *   - asks the model to answer "1" or "2" and maps the answer back to A/B.
 *
 * The file's own comments describe this as an async replica of the harness's
 * `biasControlledJudge`; that helper is not visible here, so the equivalence
 * is presumed — verify against harness.mjs when changing either side.
 *
 * A reply that does not start with "2" counts as a vote for the first
 * candidate (unchanged behaviour). Because that also covers empty or
 * malformed replies, each `details` entry carries `parsed` so such votes can
 * be audited instead of passing silently.
 *
 * @param {object} cfg
 * @param {string} cfg.apiKey - sent as the `x-api-key` header.
 * @param {string} cfg.model - model id for the judge.
 * @param {string} cfg.styleDescription - target style, embedded in the system prompt.
 * @param {Array<{a: string, b: string}>} cfg.pairs - candidate pairs.
 * @param {{calls: number, max: number}} cfg.budget - shared call counter.
 * @param {number} cfg.seed - seed for the position-randomisation RNG.
 * @returns {Promise<{preferA: number[], details: Array<{aFirst: boolean, out: string, parsed: boolean}>}>}
 * @throws {Error} when the budget is exhausted or the API returns a non-OK status.
 */
async function runJudgePairs({ apiKey, model, styleDescription, pairs, budget, seed }) {
  const { mulberry32 } = await import('./harness.mjs');
  const rng = mulberry32(seed);

  // Loop-invariant request pieces, built once.
  const headers = {
    'x-api-key': apiKey,
    'anthropic-version': '2023-06-01',
    'content-type': 'application/json',
  };
  const sys = `You are a STYLE judge. Decide which candidate better matches this writing style: ${styleDescription}. `
    + 'Answer with ONLY the single character "1" or "2". No other text.';

  // Clip `s` to at most `target` chars, preferring a word boundary when one
  // exists in the last 40% of the clipped text.
  const clip = (s, target) => {
    if (s.length <= target) return s;
    const c = s.slice(0, target);
    const sp = c.lastIndexOf(' ');
    return sp > target * 0.6 ? c.slice(0, sp) : c;
  };

  const preferA = [];
  const details = [];
  for (const it of pairs) {
    // Exactly one RNG draw per pair, taken first, so the sequence is reproducible.
    const aFirst = rng() < 0.5;
    const rawA = String(it.a || '');
    const rawB = String(it.b || '');
    // Shorter length wins; if that is 0 fall back to the longer (no clipping).
    const target = Math.min(rawA.length, rawB.length) || Math.max(rawA.length, rawB.length);
    const ctlA = clip(rawA, target);
    const ctlB = clip(rawB, target);
    const firstTxt = aFirst ? ctlA : ctlB;
    const secondTxt = aFirst ? ctlB : ctlA;

    if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    budget.calls += 1;

    const usr = `Candidate 1:\n${firstTxt}\n\n---\n\nCandidate 2:\n${secondTxt}\n\nWhich better matches the target style? Reply 1 or 2.`;
    // Sequential on purpose: the budget check must see every prior call.
    // eslint-disable-next-line no-await-in-loop
    const res = await fetch(ANTHROPIC_URL, {
      method: 'POST',
      headers,
      body: JSON.stringify({ model, max_tokens: 4, system: sys, messages: [{ role: 'user', content: usr }] }),
    });
    if (!res.ok) {
      // Same error shape as the agent transport: status + first 120 body chars.
      // eslint-disable-next-line no-await-in-loop
      const t = await res.text().catch(() => '');
      throw new Error(`judge HTTP ${res.status}: ${t.slice(0, 120)}`);
    }
    // eslint-disable-next-line no-await-in-loop
    const j = await res.json();
    const out = (j.content || []).map((c) => c.text || '').join('').trim();

    const choseSecond = out.startsWith('2');
    // False when the reply was neither "1…" nor "2…" (still scored as "first").
    const parsed = choseSecond || out.startsWith('1');
    const judgePrefersFirst = !choseSecond;
    const prefersA = aFirst ? judgePrefersFirst : !judgePrefersFirst;
    preferA.push(prefersA ? 1 : 0);
    details.push({ aFirst, out, parsed });
  }
  return { preferA, details };
}
139
+
140
/**
 * Per-probe distance between one output and the user's style target, limited
 * to the dimensions a style brief can steer: terseness, formality markers and
 * emoji presence.
 *
 * The result is the sum of
 *   |terseness gap| + |formalityMarkers gap| + 0.5 * |emoji-presence gap|,
 * where emoji presence is collapsed to 0/1 (`emojiPerChar > 0`) and a missing
 * or falsy terseness/formality value is treated as 0.
 *
 * Per the file's own notes, the harness `styleDistance` is deliberately not
 * used because its code-block bit penalises a facet the brief never conveys;
 * the thresholded `objectiveAdherence` remains the conservative secondary view.
 *
 * @param {string} out - agent output, measured with `objectiveStyle`.
 * @param {{terseness?: number, formalityMarkers?: number, emojiPerChar?: number}} target
 * @returns {number} non-negative distance; 0 means identical on all three facets.
 */
function scopedDistance(out, target) {
  const observed = objectiveStyle(out);
  const emojiBit = (style) => (style.emojiPerChar > 0 ? 1 : 0);

  const terseGap = Math.abs((observed.terseness || 0) - (target.terseness || 0));
  const formalGap = Math.abs((observed.formalityMarkers || 0) - (target.formalityMarkers || 0));
  // Emoji presence is binary and weighted at half the other two facets.
  const emojiGap = Math.abs(emojiBit(observed) - emojiBit(target)) * 0.5;

  return terseGap + formalGap + emojiGap;
}
156
/**
 * Score every output against the same style target.
 *
 * @param {string[]} outputs - agent outputs, one per probe.
 * @param {object} target - style target passed through to `scopedDistance`.
 * @returns {number[]} distances, index-aligned with `outputs`.
 */
function distVec(outputs, target) {
  const distances = [];
  for (const text of outputs) {
    distances.push(scopedDistance(text, target));
  }
  return distances;
}
159
+
160
/**
 * Orchestrate the full real-corpus run: build the corpus, split it, run
 * Gate C offline, run the three Gate B arms plus the judge against the cloud
 * model, and write the results blob to the scratch directory.
 *
 * Files written under the scratch dir: `corpus-stats.json`, `split.json`,
 * `results.json`.
 *
 * Failure model:
 *   - missing API key or empty corpus => returns `{ status: 'BLOCKED', reason }`;
 *   - an error thrown by an agent arm (privacy guard, budget, HTTP) propagates;
 *   - an error thrown by the judge pass is captured as `gateB.judge.error`
 *     so the objective results are still written.
 *
 * @param {object} [opts]
 * @param {string} [opts.scratch] - output dir (default `<cwd>/.ijfw/profile-eval-scratch`).
 * @param {number} [opts.nProbes=30] - probe count requested from `buildRealEval`.
 * @param {number} [opts.maxCalls=200] - hard ceiling on cloud calls (agent + judge).
 * @param {number} [opts.minMessages=8] - forwarded to `buildCorpus`.
 * @param {number} [opts.cap=400] - forwarded to `buildCorpus`.
 * @param {number} [opts.trainFraction=0.6] - forwarded to `buildRealEval`.
 * @returns {Promise<object>} the `DONE` results blob, or a `BLOCKED` stub.
 */
export async function runRealCorpus(opts = {}) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) return { status: 'BLOCKED', reason: 'ANTHROPIC_API_KEY not set in env' };

  const scratch = opts.scratch || join(process.cwd(), '.ijfw', 'profile-eval-scratch');
  mkdirSync(scratch, { recursive: true });

  const nProbes = Number.isFinite(opts.nProbes) ? opts.nProbes : 30;
  const budget = { calls: 0, max: Number.isFinite(opts.maxCalls) ? opts.maxCalls : 200 };
  const seed = 7;

  // Style axes inspected on the derived profiles, and the evidence count an
  // axis needs before this run treats it as confirmed.
  const STYLE_AXES = ['formality', 'energy', 'terseness', 'emoji_use'];
  const MIN_EVIDENCE = 5;

  // 1) REAL corpus.
  const corpus = buildCorpus({ minMessages: opts.minMessages ?? 8, cap: opts.cap ?? 400 });
  if (!corpus.sessions.length) return { status: 'BLOCKED', reason: `empty corpus: ${JSON.stringify(corpus.stats)}` };

  // Persist corpus stats (LOCAL ONLY) for audit.
  writeFileSync(join(scratch, 'corpus-stats.json'), JSON.stringify(corpus.stats, null, 2));

  // 2) Split + probes.
  const ev = await buildRealEval(corpus, { trainFraction: opts.trainFraction ?? 0.6, nProbes });
  writeFileSync(join(scratch, 'split.json'), JSON.stringify(ev.split, null, 2));

  // 3) GATE C — capture, held-out (OFFLINE, no cloud). Explicit-probe mode:
  //    train = ev.train, probe = ev.probes (carry the held-out TEST gold).
  const cCorpus = { sessions: ev.train, probes: ev.probes, negativeControl: ev.negativeControl };
  const { runGateC } = await import('./gate-c-capture.mjs');
  const gateC = await runGateC(cCorpus, { bootstrapSeed: 42 });

  // 3b) STYLE-axis capture (the portable signal): TRAIN-derived vs TEST-derived
  //     EMA per axis + |diff|. This is the honest "does it capture you" leg that
  //     actually generalizes (the preference slug leg does not).
  const trainProfile = await deriveProfileFromSessions(ev.train, {});
  const testProfile = await deriveProfileFromSessions(ev.test, {});
  const styleCapture = {};
  for (const ax of STYLE_AXES) {
    const a = trainProfile.global.style[ax];
    const b = testProfile.global.style[ax];
    styleCapture[ax] = {
      train_ema: a ? a.ema : null,
      test_ema: b ? b.ema : null,
      abs_diff: (a && b) ? Math.abs(a.ema - b.ema) : null,
      train_evidence: a ? a.evidence_count : 0,
      test_evidence: b ? b.evidence_count : 0,
      // Coerced so a missing axis yields `false`; the bare `&&` chain would
      // yield undefined/null and the key would be dropped or nulled in JSON.
      confirmed_both: Boolean(
        a && b && a.evidence_count >= MIN_EVIDENCE && b.evidence_count >= MIN_EVIDENCE,
      ),
    };
  }

  // 4) GATE B — behavior A/B with the CLOUD agent. Privacy-safe arms only.
  // NOTE(review): renderBrief is handed process.env and its output goes
  // straight into the guard's allow-list below — confirm no environment flag
  // can opt this render into the preference tier.
  const renderOpts = { env: process.env };
  // heuristic STYLE-ONLY brief = default render (no opt-in => low-tier style axes
  // only, NO preference fragments). This is what is privacy-safe to transmit.
  const styleBrief = renderBrief(trainProfile, renderOpts).text; // style axes only
  const baselineBrief = renderBrief(makeProfile(), renderOpts).text; // ''
  // ORACLE — an authored, transcript-free ceiling describing the user's real
  // fingerprint in plain words (best-case explicit style signal).
  const styleTarget = ev.split.styleTarget;
  const terseWord = styleTarget.terseness < 0.34 ? 'expansive and detailed' : styleTarget.terseness < 0.67 ? 'moderate length' : 'terse';
  const emojiWord = styleTarget.emojiPerChar > 0 ? 'occasional emoji are welcome' : 'no emoji';
  const formalWord = styleTarget.formalityMarkers < 0.34 ? 'casual' : styleTarget.formalityMarkers < 0.67 ? 'balanced/neutral' : 'formal';
  const oracleBrief = `User writing-style profile (observed): responses should be ${terseWord}; tone ${formalWord}; ${emojiWord}.`;

  const allowedSystems = [baselineBrief, styleBrief, oracleBrief];
  const allowedPrompts = ev.probes.map((p) => p.prompt);
  const agent = makeAnthropicAgent({ apiKey, model: ANTHROPIC_MODEL, allowedSystems, allowedPrompts, budget });

  // Run the three arms through the REAL runArm (REAL objectiveAdherence scoring).
  // Sequential so the shared budget counter is checked in a fixed order.
  const baseline = await runArm(agent, baselineBrief, ev.probes);
  const heuristic = await runArm(agent, styleBrief, ev.probes);
  const oracle = await runArm(agent, oracleBrief, ev.probes);

  // Primary headline: paired McNemar on objectiveAdherence (heuristic vs baseline).
  const headline = mcnemar(baseline.adherence, heuristic.adherence);
  const oracleVsBaseline = mcnemar(baseline.adherence, oracle.adherence);

  // Per-arm adherence rate + bootstrap CI (REAL helper). Each arm gets its own seed.
  const arms = {
    baseline: { adherence: baseline.adherence, ci: bootstrapCI(baseline.adherence, { seed }) },
    heuristic: { adherence: heuristic.adherence, ci: bootstrapCI(heuristic.adherence, { seed: seed + 1 }) },
    oracle: { adherence: oracle.adherence, ci: bootstrapCI(oracle.adherence, { seed: seed + 2 }) },
  };

  // Secondary continuous: per-probe style-distance to the user's target. A
  // closer-than-baseline 0/1 vector feeds a second paired McNemar (more
  // sensitive than the thresholded adherence on a homogeneous corpus).
  // Ties count for neither side (both comparisons are strict).
  const dBase = distVec(baseline.outputs, styleTarget);
  const dHeur = distVec(heuristic.outputs, styleTarget);
  const dOracle = distVec(oracle.outputs, styleTarget);
  const heurCloser = dBase.map((d, i) => (dHeur[i] < d ? 1 : 0));
  const baseCloserThanHeur = dBase.map((d, i) => (d < dHeur[i] ? 1 : 0));
  const distanceMcnemar = mcnemar(baseCloserThanHeur, heurCloser); // before=base-wins, after=heur-wins
  const meanDist = (v) => v.reduce((a, b) => a + b, 0) / (v.length || 1);

  // ECE on confidence — the profile's style axes carry a confidence proxy via
  // Beta mass. We compute a calibration over per-probe (confidence, correct)
  // where confidence = the brief's asserted style strength and correct = the
  // heuristic-arm adherence. The same confidence value is used for every probe.
  const styleConfidence = (() => {
    // average confirmed-axis "strength" = mean |ema-0.5|*2 over confirmed axes
    let sum = 0;
    let n = 0;
    for (const ax of STYLE_AXES) {
      const a = trainProfile.global.style[ax];
      if (a && a.evidence_count >= MIN_EVIDENCE) {
        sum += Math.min(1, Math.abs(a.ema - 0.5) * 2);
        n += 1;
      }
    }
    // No confirmed axis => neutral 0.5.
    return n ? sum / n : 0.5;
  })();
  const ecePairs = heuristic.adherence.map((y) => ({ confidence: styleConfidence, correct: y }));
  const ece = expectedCalibrationError(ecePairs, { nBins: 10 });

  // 5) BIAS-CONTROLLED JUDGE (secondary rater) + κ vs objective. Anthropic judge.
  // Best-effort: a judge failure is recorded, not rethrown.
  let judge = null;
  try {
    const styleDescription = `${terseWord}; tone ${formalWord}; ${emojiWord}`;
    const pairs = ev.probes.map((_, i) => ({ a: heuristic.outputs[i], b: baseline.outputs[i] }));
    const { preferA, details } = await runJudgePairs({
      apiKey, model: ANTHROPIC_MODEL, styleDescription, pairs, budget, seed,
    });
    // Objective "prefers heuristic" = heuristic adheres AND baseline does not.
    const objectivePrefersA = ev.probes.map((_, i) => (
      heuristic.adherence[i] === 1 && baseline.adherence[i] === 0 ? 1 : 0
    ));
    judge = {
      preferA,
      objectivePrefersA,
      judgePreferAHeuristicRate: preferA.reduce((a, b) => a + b, 0) / (preferA.length || 1),
      kappa: cohenKappa(preferA, objectivePrefersA),
      sampleDetails: details.slice(0, 3).map((d) => ({ aFirst: d.aFirst, out: d.out })),
    };
  } catch (e) {
    judge = { error: String(e.message || e) };
  }

  const result = {
    status: 'DONE',
    model: ANTHROPIC_MODEL,
    cloudCalls: budget.calls,
    corpusStats: corpus.stats,
    split: ev.split,
    gateC: {
      precision: gateC.precision,
      recall: gateC.recall,
      f1: gateC.f1,
      predicted: gateC.predicted,
      goldCount: gateC.gold.length,
      counts: gateC.counts,
      negativeControl: gateC.negativeControl,
      nTrain: gateC.nTrain,
      nProbe: gateC.nProbe,
      heldOutDisjoint: gateC.heldOut.disjoint,
    },
    styleCapture,
    gateB: {
      arms: {
        baseline: { rate: arms.baseline.ci.point, ci: arms.baseline.ci, adherence: arms.baseline.adherence },
        heuristic_styleonly: { rate: arms.heuristic.ci.point, ci: arms.heuristic.ci, adherence: arms.heuristic.adherence },
        oracle_styleceiling: { rate: arms.oracle.ci.point, ci: arms.oracle.ci, adherence: arms.oracle.adherence },
      },
      headline_mcnemar: headline,
      oracle_vs_baseline_mcnemar: oracleVsBaseline,
      distance: {
        mean_baseline: meanDist(dBase),
        mean_heuristic: meanDist(dHeur),
        mean_oracle: meanDist(dOracle),
        heuristic_closer_count: heurCloser.reduce((a, b) => a + b, 0),
        baseline_closer_count: baseCloserThanHeur.reduce((a, b) => a + b, 0),
        mcnemar: distanceMcnemar,
      },
      ece: { ece: ece.ece, styleConfidence },
      judge,
    },
    privacy: {
      transcripts_sent_to_cloud: false,
      preference_brief_transmitted: false,
      allowed_systems_count: allowedSystems.length,
      note: 'Only style-only/oracle briefs (no user text) + authored prompts reached the cloud; guard-enforced.',
    },
    timestamp: new Date().toISOString(),
  };

  writeFileSync(join(scratch, 'results.json'), JSON.stringify(result, null, 2));
  return result;
}
344
+
345
// CLI entry.

/**
 * Report whether this module is the script Node was launched with.
 *
 * `import.meta.url` is a percent-encoded file URL, while `process.argv[1]` is
 * a plain filesystem path. Gluing `file://` onto the path only matches for
 * POSIX paths with no characters that need encoding; converting the path with
 * `pathToFileURL` makes the comparison hold for spaces, non-ASCII characters
 * and Windows drive paths as well.
 *
 * @param {string} moduleUrl - this module's `import.meta.url`.
 * @param {string|undefined} scriptPath - `process.argv[1]`; may be undefined
 *   (e.g. when code is evaluated without a script file).
 * @returns {boolean} true when both refer to the same file URL.
 */
function isCliEntry(moduleUrl, scriptPath) {
  if (!scriptPath) return false;
  return moduleUrl === pathToFileURL(scriptPath).href;
}

if (isCliEntry(import.meta.url, process.argv[1])) {
  runRealCorpus({
    // Unset, non-numeric or 0 values fall back to the defaults.
    nProbes: Number(process.env.IJFW_EVAL_NPROBES) || 30,
    maxCalls: Number(process.env.IJFW_EVAL_MAXCALLS) || 200,
  }).then((r) => {
    // print a compact summary (no raw text)
    const { status, model, cloudCalls } = r;
    console.log(JSON.stringify({ status, model, cloudCalls, reason: r.reason || null }, null, 2));
    // Anything other than a completed run (e.g. BLOCKED) is a failing exit.
    if (r.status !== 'DONE') process.exit(1);
  }).catch((e) => { console.error('RUN ERROR:', e.message); process.exit(1); });
}

export default { runRealCorpus };