@ijfw/memory-server 1.5.6 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/bin/ijfw-dashboard +20 -1
  2. package/package.json +4 -3
  3. package/src/audit-roster.js +89 -12
  4. package/src/brain/tiered-llm.js +57 -7
  5. package/src/cross-orchestrator-cli.js +344 -4
  6. package/src/cross-project-search.js +39 -1
  7. package/src/dashboard-server.js +7 -1
  8. package/src/dream/runner.mjs +560 -8
  9. package/src/handlers/brain-handler.js +101 -1
  10. package/src/importers/discover.js +1 -1
  11. package/src/memory/bench-metrics.js +289 -0
  12. package/src/memory/benchmark.js +1 -1
  13. package/src/memory/search.js +53 -1
  14. package/src/orchestrator/plan-checker.js +1 -1
  15. package/src/profile/audit.js +671 -0
  16. package/src/profile/capture.js +871 -0
  17. package/src/profile/derive-dialectic.js +242 -0
  18. package/src/profile/derive-heuristic.js +733 -0
  19. package/src/profile/derive.js +156 -0
  20. package/src/profile/egress.js +306 -0
  21. package/src/profile/eval/build-real-probes.mjs +197 -0
  22. package/src/profile/eval/corpus-from-reddit.mjs +166 -0
  23. package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
  24. package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
  25. package/src/profile/eval/gate-b-behavior.mjs +420 -0
  26. package/src/profile/eval/gate-b-decision-run.mjs +171 -0
  27. package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
  28. package/src/profile/eval/gate-b-run.mjs +417 -0
  29. package/src/profile/eval/gate-b-run.test.mjs +204 -0
  30. package/src/profile/eval/gate-c-capture.mjs +323 -0
  31. package/src/profile/eval/harness.mjs +551 -0
  32. package/src/profile/eval/instrument-validation.mjs +248 -0
  33. package/src/profile/eval/instrument-validation.test.mjs +125 -0
  34. package/src/profile/eval/multi-subject-harness.mjs +106 -0
  35. package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
  36. package/src/profile/eval/personas.test.mjs +83 -0
  37. package/src/profile/eval/plumbing.test.mjs +69 -0
  38. package/src/profile/eval/prereg.mjs +130 -0
  39. package/src/profile/eval/prereg.test.mjs +78 -0
  40. package/src/profile/eval/real-corpus.test.mjs +103 -0
  41. package/src/profile/eval/real-personas.mjs +109 -0
  42. package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
  43. package/src/profile/eval/run-real-corpus.mjs +358 -0
  44. package/src/profile/eval/slug-quality.mjs +464 -0
  45. package/src/profile/eval/stylometry-features.js +85 -0
  46. package/src/profile/eval/stylometry-reference.js +16 -0
  47. package/src/profile/eval/stylometry.js +224 -0
  48. package/src/profile/eval/stylometry.test.mjs +103 -0
  49. package/src/profile/eval/synthetic-personas.js +91 -0
  50. package/src/profile/eval/verifier-features.mjs +170 -0
  51. package/src/profile/eval/verifier-logreg.mjs +74 -0
  52. package/src/profile/eval/verifier-pair.mjs +122 -0
  53. package/src/profile/eval/verifier-reference.mjs +68 -0
  54. package/src/profile/eval/verifier-scorer.mjs +30 -0
  55. package/src/profile/eval/wrong-target-control.mjs +168 -0
  56. package/src/profile/eval/wrong-target-control.test.mjs +124 -0
  57. package/src/profile/exemplar-capture.js +232 -0
  58. package/src/profile/exemplar-retrieve.js +138 -0
  59. package/src/profile/exemplar-store.js +314 -0
  60. package/src/profile/lock.js +64 -0
  61. package/src/profile/merge.js +624 -0
  62. package/src/profile/path-policy.js +213 -0
  63. package/src/profile/precision-stamp.mjs +151 -0
  64. package/src/profile/render-brief.js +717 -0
  65. package/src/profile/schema.js +244 -0
  66. package/src/profile/sensitivity.js +249 -0
  67. package/src/profile/serve.js +345 -0
  68. package/src/profile/store.js +261 -0
  69. package/src/profile/telemetry.js +289 -0
  70. package/src/recovery/checkpoint.js +7 -1
  71. package/src/server.js +185 -14
  72. package/src/.registry-meta-key.pem +0 -3
@@ -0,0 +1,407 @@
1
+ /**
2
+ * profile/eval/run-real-corpus-concurrent.mjs — CONCURRENT driver for the
3
+ * real-corpus profile-bus eval (Gate C + Gate B) against the user's OWN Claude
4
+ * Code transcripts with a CLOUD frontier model as the agent-under-test.
5
+ *
6
+ * WHY THIS EXISTS (vs run-real-corpus.mjs):
7
+ * The sibling sequential orchestrator (`run-real-corpus.mjs`) runs every arm and
8
+ * every judge pair as a serial `await` — ~150 blocking round-trips, which makes a
9
+ * fully-synchronous in-turn run impractically slow. This driver uses the same
10
+ * REAL scoring + REAL stats but dispatches the cloud calls in bounded
11
+ * `Promise.all` BATCHES (default 8) so the whole A/B finishes in minutes, in
12
+ * one turn, with no backgrounding.
13
+ *
14
+ * PUSH V2: a FOURTH arm — the DIRECTIVE brief (renderBrief style:'directive') —
15
+ * sits alongside baseline / descriptive / oracle. The four pre-registered
16
+ * paired McNemar contrasts (descriptive-vs-baseline, directive-vs-baseline [the
17
+ * bar], directive-vs-descriptive, oracle-vs-baseline) are reported on both the
18
+ * thresholded adherence metric and the continuous style-distance metric. Gate C
19
+ * reports BOTH the exact-slug and the SEMANTIC (Jaccard) preference metric.
20
+ *
21
+ * NO NEW SCORING MATH. Every statistic comes from the REAL helpers:
22
+ * - derivation: deriveProfileFromSessions (gate-c-capture.mjs) -> REAL derive/merge
23
+ * - brief: renderBrief (render-brief.js) -> the production serving string
24
+ * - adherence: objectiveAdherence (gate-b-behavior.mjs)
25
+ * - stats: mcnemar / bootstrapCI / cohenKappa / expectedCalibrationError /
26
+ * objectiveStyle (harness.mjs, re-exporting bench-metrics.js)
27
+ * - Gate C: runGateC (gate-c-capture.mjs)
28
+ * - corpus+split: buildCorpus / buildRealEval
29
+ * The ONLY thing this file owns is the concurrency loop + the same privacy guard
30
+ * and call-budget that run-real-corpus.mjs uses.
31
+ *
32
+ * PRIVACY (guard-enforced, identical to the sequential driver):
33
+ * The cloud agent only ever receives (a) a system brief drawn from the CLOSED
34
+ * set { '' , descriptive-style-brief , directive-style-brief , authored-oracle-brief } — NONE of which contains
35
+ * raw user prose — and (b) an authored generic prompt from GENERIC_PROMPTS. A
36
+ * guard asserts membership BEFORE every network call; anything else ABORTS the
37
+ * run. The full preference-tier brief (whose "Observed preference" lines are
38
+ * fragments of the user's real prose on this corpus) is DELIBERATELY NOT sent.
39
+ *
40
+ * BOUNDED SPEND: a hard call counter aborts if a call would exceed `maxCalls`.
41
+ *
42
+ * Node built-ins only (global fetch). No new deps. ESM.
43
+ */
44
+
45
import { writeFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { pathToFileURL } from 'node:url';
import { renderBrief } from '../render-brief.js';
import { makeProfile } from '../schema.js';
import { deriveProfileFromSessions, runGateC } from './gate-c-capture.mjs';
import { objectiveAdherence } from './gate-b-behavior.mjs';
import {
  objectiveStyle, cohenKappa, bootstrapCI, mcnemar,
  expectedCalibrationError, mulberry32,
} from './harness.mjs';
import { buildCorpus } from './corpus-from-transcripts.mjs';
import { buildRealEval } from './build-real-probes.mjs';
57
+
58
// Default model id for cloud calls. The IJFW_EVAL_MODEL env var overrides the
// built-in value, and `opts.model` on runRealCorpusConcurrent overrides both.
const ANTHROPIC_MODEL = process.env.IJFW_EVAL_MODEL || 'claude-opus-4-8';
// Endpoint that anthropicCall POSTs every request to.
const ANTHROPIC_URL = 'https://api.anthropic.com/v1/messages';
const AGENT_MAX_TOKENS = 1024; // matches the sequential driver: lets length vary between arms
61
+
62
/**
 * Map `fn` over `items` in bounded-concurrency batches, preserving input order.
 *
 * Each batch is dispatched with `Promise.all`, and the next batch starts only
 * after every call in the current one has resolved. A rejection from any call
 * rejects the whole map (fail-fast), so later batches are never dispatched.
 *
 * @param {Array} items - Values to process.
 * @param {number} batchSize - Maximum calls in flight at once. Fractional
 *   values are floored; `Infinity` means "everything in a single batch".
 * @param {(item: any, index: number) => any} fn - Called with the item and its
 *   index in `items`; may return a promise.
 * @returns {Promise<Array>} Results, positionally aligned with `items`.
 * @throws {RangeError} If `batchSize` is NaN or floors to less than 1. Without
 *   this check a zero or negative size would never advance the loop cursor and
 *   the run would spin forever.
 */
async function mapBatched(items, batchSize, fn) {
  const size = batchSize === Infinity
    ? Math.max(1, items.length)
    : Math.floor(Number(batchSize));
  // `!(size >= 1)` also rejects NaN, which a plain `size < 1` would let through.
  if (!(size >= 1)) {
    throw new RangeError(`mapBatched: batchSize must be a number >= 1 (got ${batchSize})`);
  }

  const out = Array.from({ length: items.length });
  for (let i = 0; i < items.length; i += size) {
    const slice = items.slice(i, i + size);
    // Batches are intentionally sequential; only calls within a batch overlap.
    // eslint-disable-next-line no-await-in-loop
    const res = await Promise.all(slice.map((it, j) => fn(it, i + j)));
    for (let j = 0; j < res.length; j++) out[i + j] = res[j];
  }
  return out;
}
73
+
74
/**
 * Send one single-turn chat request to the Anthropic Messages endpoint and
 * return the reply's text.
 *
 * This function performs no budget or privacy checks of its own; callers are
 * expected to have done both before invoking it.
 *
 * @param {object} args
 * @param {string} args.apiKey - Sent as the `x-api-key` header.
 * @param {string} args.model - Model id placed in the request body.
 * @param {string} [args.system] - System prompt; omitted from the body when falsy.
 * @param {string} args.prompt - Content of the single user message.
 * @param {number} args.maxTokens - Sent as `max_tokens`.
 * @returns {Promise<string>} All `text`-typed content parts of the reply, joined.
 * @throws {Error} On a non-OK HTTP status; the message carries the status code
 *   and the first 160 characters of the response body.
 */
async function anthropicCall({ apiKey, model, system, prompt, maxTokens }) {
  const payload = {
    model,
    max_tokens: maxTokens,
    messages: [{ role: 'user', content: prompt }],
  };
  // A falsy system prompt is left out of the request rather than sent as ''.
  if (system) payload.system = system;

  const headers = {
    'x-api-key': apiKey,
    'anthropic-version': '2023-06-01',
    'content-type': 'application/json',
  };
  const response = await fetch(ANTHROPIC_URL, {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    // The body is best-effort diagnostics only; a failed read degrades to ''.
    const detail = await response.text().catch(() => '');
    throw new Error(`Anthropic HTTP ${response.status}: ${detail.slice(0, 160)}`);
  }

  const parsed = await response.json();
  const textParts = (parsed.content || []).filter((part) => part.type === 'text');
  return textParts.map((part) => part.text).join('');
}
90
+
91
/**
 * Run one experimental arm over every probe, dispatching the cloud calls in
 * bounded-concurrency batches. The brief is passed as the SYSTEM prompt of each
 * call, and each probe's prompt as the user message.
 *
 * Guards, in the order they fire:
 *  - the system brief must be a member of `allowedSys` (checked once, up front);
 *  - each prompt must be a member of `allowedPr` (checked per probe);
 *  - `budget.calls` must be below `budget.max` (checked per probe, then
 *    incremented before the call is issued).
 * Any violation throws, which rejects the whole arm.
 *
 * @param {object} args
 * @param {string} args.apiKey
 * @param {string} args.model
 * @param {string} args.brief - System text for this arm; falsy becomes ''.
 * @param {Array<{prompt: string}>} args.probes
 * @param {Set<string>} args.allowedSys - Permitted system strings.
 * @param {Set<string>} args.allowedPr - Permitted prompt strings.
 * @param {{calls: number, max: number}} args.budget - Shared, mutated call counter.
 * @param {number} args.batchSize - Calls in flight per batch.
 * @returns {Promise<{outputs: string[], adherence: Array}>} Model outputs and
 *   their objectiveAdherence scores, both in probe order.
 */
async function runArmConcurrent({
  apiKey, model, brief, probes, allowedSys, allowedPr, budget, batchSize,
}) {
  const systemText = String(brief || '');
  if (!allowedSys.has(systemText)) {
    throw new Error('PRIVACY GUARD: system brief not in allowed set — aborting');
  }

  const callProbe = async (probe) => {
    const promptText = String(probe.prompt || '');
    if (!allowedPr.has(promptText)) {
      throw new Error('PRIVACY GUARD: prompt not in authored set — aborting');
    }
    // Check-then-increment happens with no await in between, so calls started
    // in the same batch cannot both slip past the cap.
    if (budget.calls >= budget.max) {
      throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    }
    budget.calls += 1;
    return anthropicCall({
      apiKey,
      model,
      system: systemText,
      prompt: promptText,
      maxTokens: AGENT_MAX_TOKENS,
    });
  };

  const outputs = await mapBatched(probes, batchSize, callProbe);

  const adherence = [];
  for (let k = 0; k < outputs.length; k += 1) {
    adherence.push(objectiveAdherence(outputs[k], probes[k]));
  }
  return { outputs, adherence };
}
111
+
112
/**
 * Pairwise style judge with position randomization and length control, run
 * over the network in bounded-concurrency batches.
 *
 * For each pair a seeded coin decides whether candidate `a` or `b` is shown
 * first. All coins are rolled up front, in pair order, so the order in which
 * network calls resolve cannot affect the randomization. When one candidate is
 * longer, it is truncated to the shorter one's length (backing up to the last
 * space when that space falls in the final 40% of the cut). If either
 * candidate is empty, nothing is truncated.
 *
 * Any judge reply that does not start with "2" is treated as a vote for the
 * first-shown candidate.
 *
 * @param {object} args
 * @param {string} args.apiKey
 * @param {string} args.model
 * @param {string} args.styleDescription - Target style, interpolated into the judge's system prompt.
 * @param {Array<{a: string, b: string}>} args.pairs
 * @param {{calls: number, max: number}} args.budget - Shared, mutated call counter.
 * @param {number} args.batchSize
 * @param {number} args.seed - Seed for the position-randomization RNG.
 * @returns {Promise<{preferA: number[], details: Array<{aFirst: boolean, out: string}>}>}
 *   `preferA[i]` is 1 when the judge preferred candidate `a` of pair i (position
 *   flip undone), else 0; `details` keeps the coin and the trimmed raw reply.
 * @throws {Error} When the call budget is exhausted or a cloud call fails.
 */
async function runJudgeConcurrent({
  apiKey, model, styleDescription, pairs, budget, batchSize, seed,
}) {
  const nextRandom = mulberry32(seed);

  // Trim `text` to at most `limit` chars, preferring a word boundary when one
  // sits past 60% of the limit.
  const truncateTo = (text, limit) => {
    if (text.length <= limit) return text;
    const clipped = text.slice(0, limit);
    const lastSpace = clipped.lastIndexOf(' ');
    return lastSpace > limit * 0.6 ? clipped.slice(0, lastSpace) : clipped;
  };

  // One RNG draw per pair, in pair order, before any network traffic.
  const plans = [];
  for (const pair of pairs) {
    const aFirst = nextRandom() < 0.5;
    const textA = String(pair.a || '');
    const textB = String(pair.b || '');
    // Shorter length wins; when that is 0 (an empty candidate) fall back to the
    // longer length so the non-empty candidate is left intact.
    const limit = Math.min(textA.length, textB.length) || Math.max(textA.length, textB.length);
    const shownA = truncateTo(textA, limit);
    const shownB = truncateTo(textB, limit);
    plans.push(aFirst
      ? { aFirst, first: shownA, second: shownB }
      : { aFirst, first: shownB, second: shownA });
  }

  // The system prompt is the same for every pair, so build it once.
  const judgeSystem = `You are a STYLE judge. Decide which candidate better matches this writing style: ${styleDescription}. `
    + 'Answer with ONLY the single character "1" or "2". No other text.';

  const replies = await mapBatched(plans, batchSize, async (plan) => {
    if (budget.calls >= budget.max) throw new Error(`BUDGET: exceeded ${budget.max} cloud calls`);
    budget.calls += 1;
    const judgePrompt = `Candidate 1:\n${plan.first}\n\n---\n\nCandidate 2:\n${plan.second}\n\nWhich better matches the target style? Reply 1 or 2.`;
    const reply = await anthropicCall({
      apiKey, model, system: judgeSystem, prompt: judgePrompt, maxTokens: 4,
    });
    return reply.trim();
  });

  const preferA = [];
  const details = [];
  plans.forEach((plan, k) => {
    const out = replies[k];
    const pickedSecond = out.startsWith('2');
    // `a` wins when it was shown first and the judge picked 1, or when it was
    // shown second and the judge picked 2.
    const prefersA = plan.aFirst !== pickedSecond;
    preferA.push(prefersA ? 1 : 0);
    details.push({ aFirst: plan.aFirst, out });
  });
  return { preferA, details };
}
156
+
157
/**
 * Distance between one output's measured style and the target style, limited
 * to three dimensions: terseness, formality markers, and emoji presence.
 *
 * Terseness and formality contribute their absolute difference (missing or
 * falsy values count as 0). Emoji is compared as present/absent only and
 * contributes 0.5 when the two disagree.
 *
 * @param {string} out - Model output to measure with objectiveStyle.
 * @param {{terseness?: number, formalityMarkers?: number, emojiPerChar?: number}} target
 * @returns {number} Sum of the three per-dimension gaps.
 */
function scopedDistance(out, target) {
  const observed = objectiveStyle(out);
  const usesEmoji = (style) => (style.emojiPerChar > 0 ? 1 : 0);

  const terseGap = Math.abs((observed.terseness || 0) - (target.terseness || 0));
  const formalGap = Math.abs((observed.formalityMarkers || 0) - (target.formalityMarkers || 0));
  const emojiGap = Math.abs(usesEmoji(observed) - usesEmoji(target)) * 0.5;

  return terseGap + formalGap + emojiGap;
}
165
+
166
/**
 * Run the real-corpus evaluation end to end: build the corpus, split it,
 * compute Gate C offline, then run the four Gate B arms (baseline,
 * descriptive, directive, oracle) and the secondary judge against the cloud
 * model, and write the combined result to `<scratch>/results.json`.
 *
 * Preconditions that fail before any cloud call return a
 * `{ status: 'BLOCKED', reason }` object instead of throwing:
 *  - `ANTHROPIC_API_KEY` is not set;
 *  - the corpus has no sessions;
 *  - a profile-derived brief (descriptive or directive) contains one of the
 *    preference-line markers, i.e. it is not style-only.
 * Guard or budget violations raised while the arms are running throw.
 * Failures inside the judge step are caught and reported as `judge.error`.
 *
 * @param {object} [opts]
 * @param {string} [opts.scratch] - Output directory; defaults to
 *   `<cwd>/.ijfw/profile-eval-scratch`. Created if missing.
 * @param {number} [opts.nProbes=24] - Probe count passed to buildRealEval.
 * @param {number} [opts.batchSize=8] - Cloud calls in flight per batch. Values
 *   that are not finite or are below 1 fall back to 8; fractions are floored.
 * @param {number} [opts.maxCalls=150] - Hard cap on cloud calls.
 * @param {string} [opts.model] - Model id; defaults to ANTHROPIC_MODEL.
 * @param {number} [opts.judgeSampleN=30] - Max pairs sent to the judge.
 * @param {number} [opts.minMessages=8] - Passed to buildCorpus.
 * @param {number} [opts.cap=200] - Passed to buildCorpus.
 * @param {number} [opts.trainFraction=0.6] - Passed to buildRealEval.
 * @returns {Promise<object>} Either a BLOCKED object or the full result with
 *   `status: 'DONE'`.
 */
export async function runRealCorpusConcurrent(opts = {}) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) return { status: 'BLOCKED', reason: 'ANTHROPIC_API_KEY not set in env' };

  const scratch = opts.scratch || join(process.cwd(), '.ijfw', 'profile-eval-scratch');
  mkdirSync(scratch, { recursive: true });

  const nProbes = Number.isFinite(opts.nProbes) ? opts.nProbes : 24;
  // A zero or negative batch size would never advance the batching loop, and a
  // fractional one would produce fractional result indices, so anything below
  // 1 falls back to the default and fractions are floored.
  const batchSize = Number.isFinite(opts.batchSize) && opts.batchSize >= 1
    ? Math.floor(opts.batchSize)
    : 8;
  const budget = { calls: 0, max: Number.isFinite(opts.maxCalls) ? opts.maxCalls : 150 };
  const seed = 7;
  const model = opts.model || ANTHROPIC_MODEL;
  // Push v2: a 4th arm (directive brief). The judge is a SECONDARY reliability
  // rater (κ only), so it runs on a bounded subsample to keep total cloud spend
  // under the pre-registered cap. judgeSampleN pairs are judged.
  const judgeSampleN = Number.isFinite(opts.judgeSampleN) ? opts.judgeSampleN : 30;

  // 1) REAL corpus (bounded). Deterministic stratified round-robin (no RNG): the
  //    "seed" is the sorted-dir + chronological ordering, fully reproducible.
  const corpus = buildCorpus({ minMessages: opts.minMessages ?? 8, cap: opts.cap ?? 200 });
  if (!corpus.sessions.length) return { status: 'BLOCKED', reason: `empty corpus: ${JSON.stringify(corpus.stats)}` };
  writeFileSync(join(scratch, 'corpus-stats.json'), JSON.stringify(corpus.stats, null, 2));

  // 2) Time-based split + probes (REAL).
  const ev = await buildRealEval(corpus, { trainFraction: opts.trainFraction ?? 0.6, nProbes });
  writeFileSync(join(scratch, 'split.json'), JSON.stringify(ev.split, null, 2));

  // 3) GATE C — held-out capture (OFFLINE, no cloud).
  const cCorpus = { sessions: ev.train, probes: ev.probes, negativeControl: ev.negativeControl };
  const gateC = await runGateC(cCorpus, { bootstrapSeed: 42 });

  // 3b) STYLE-axis capture: TRAIN-derived vs TEST-derived EMA per axis (+|diff|).
  const trainProfile = await deriveProfileFromSessions(ev.train, {});
  const testProfile = await deriveProfileFromSessions(ev.test, {});
  const styleCapture = {};
  for (const ax of ['formality', 'energy', 'terseness', 'emoji_use']) {
    const a = trainProfile.global.style[ax];
    const b = testProfile.global.style[ax];
    styleCapture[ax] = {
      train_ema: a ? a.ema : null,
      test_ema: b ? b.ema : null,
      abs_diff: (a && b) ? Math.abs(a.ema - b.ema) : null,
      train_evidence: a ? a.evidence_count : 0,
      test_evidence: b ? b.evidence_count : 0,
      confirmed_both: Boolean(a && b && a.evidence_count >= 5 && b.evidence_count >= 5),
    };
  }

  // 4) GATE B — behavior A/B (CLOUD), privacy-safe arms only.
  const renderOpts = { env: process.env };
  const styleBrief = renderBrief(trainProfile, renderOpts).text; // descriptive style axes (no prose)
  // Push v2 CHANGE 1: the DIRECTIVE arm — the SAME derived/gated style content,
  // phrased as actionable guidance. Opt-in mode; default stays descriptive.
  const directiveBrief = renderBrief(trainProfile, { ...renderOpts, style: 'directive' }).text;
  const baselineBrief = renderBrief(makeProfile(), renderOpts).text; // '' by construction
  const st = ev.split.styleTarget;
  const terseWord = st.terseness < 0.34 ? 'expansive and detailed' : st.terseness < 0.67 ? 'moderate length' : 'terse';
  const emojiWord = st.emojiPerChar > 0 ? 'occasional emoji are welcome' : 'no emoji';
  const formalWord = st.formalityMarkers < 0.34 ? 'casual' : st.formalityMarkers < 0.67 ? 'balanced/neutral' : 'formal';
  const oracleBrief = `User writing-style profile (observed): responses should be ${terseWord}; tone ${formalWord}; ${emojiWord}.`;

  // PRIVACY PRE-FLIGHT: the per-call guard only checks membership in the
  // allow-set built below, so by itself it cannot reject a brief that this
  // function rendered. The two profile-derived briefs are therefore screened
  // for preference-line markers BEFORE they are admitted; a hit means the
  // brief is not style-only, and the run stops with zero cloud calls made.
  const PROSE_MARKERS = /Observed preference|Tentative pattern|Honor this preference|Where it fits/;
  if (PROSE_MARKERS.test(styleBrief) || PROSE_MARKERS.test(directiveBrief)) {
    return {
      status: 'BLOCKED',
      reason: 'PRIVACY GUARD: a profile-derived brief contains preference-line markers (not style-only); nothing was sent to the cloud',
    };
  }

  // PRIVACY GUARD allow-set: every system string a cloud call may carry.
  const allowedSys = new Set([baselineBrief, styleBrief, directiveBrief, oracleBrief]);
  const allowedPr = new Set(ev.probes.map((p) => p.prompt));

  const armArgs = { apiKey, model, probes: ev.probes, allowedSys, allowedPr, budget, batchSize };
  const baseline = await runArmConcurrent({ ...armArgs, brief: baselineBrief });
  const heuristic = await runArmConcurrent({ ...armArgs, brief: styleBrief }); // descriptive
  const directive = await runArmConcurrent({ ...armArgs, brief: directiveBrief }); // directive
  const oracle = await runArmConcurrent({ ...armArgs, brief: oracleBrief });

  // Four pre-registered paired McNemar contrasts on objectiveAdherence:
  //   (1) descriptive vs baseline  — replicates the prior headline
  //   (2) directive vs baseline    — THE PUSH-V2 BAR
  //   (3) directive vs descriptive — does directive phrasing beat descriptive?
  //   (4) oracle vs baseline       — the explicit-signal ceiling
  const descriptiveVsBaseline = mcnemar(baseline.adherence, heuristic.adherence);
  const directiveVsBaseline = mcnemar(baseline.adherence, directive.adherence);
  const directiveVsDescriptive = mcnemar(heuristic.adherence, directive.adherence);
  const oracleVsBaseline = mcnemar(baseline.adherence, oracle.adherence);
  const headline = directiveVsBaseline; // the bar this push must clear

  const arms = {
    baseline: bootstrapCI(baseline.adherence, { seed }),
    heuristic: bootstrapCI(heuristic.adherence, { seed: seed + 1 }),
    directive: bootstrapCI(directive.adherence, { seed: seed + 4 }),
    oracle: bootstrapCI(oracle.adherence, { seed: seed + 2 }),
  };

  // Secondary continuous: per-probe style-distance to the user's target -> paired McNemar.
  const target = ev.split.styleTarget;
  const dBase = baseline.outputs.map((o) => scopedDistance(o, target));
  const dHeur = heuristic.outputs.map((o) => scopedDistance(o, target));
  const dDir = directive.outputs.map((o) => scopedDistance(o, target));
  const dOracle = oracle.outputs.map((o) => scopedDistance(o, target));
  const meanDist = (v) => v.reduce((a, b) => a + b, 0) / (v.length || 1);
  // closer-than-baseline 0/1 vectors per arm, fed to paired McNemar.
  const heurCloser = dBase.map((d, i) => (dHeur[i] < d ? 1 : 0));
  const baseCloserH = dBase.map((d, i) => (d < dHeur[i] ? 1 : 0));
  const dirCloser = dBase.map((d, i) => (dDir[i] < d ? 1 : 0));
  const baseCloserD = dBase.map((d, i) => (d < dDir[i] ? 1 : 0));
  // directive-closer-than-descriptive
  const dirCloserThanHeur = dHeur.map((d, i) => (dDir[i] < d ? 1 : 0));
  const heurCloserThanDir = dHeur.map((d, i) => (d < dDir[i] ? 1 : 0));
  const distanceMcnemarHeur = mcnemar(baseCloserH, heurCloser);
  const distanceMcnemarDir = mcnemar(baseCloserD, dirCloser);
  const distanceMcnemarDirVsHeur = mcnemar(heurCloserThanDir, dirCloserThanHeur);

  // Single asserted confidence for ECE: mean style strength (distance of the
  // EMA from the 0.5 midpoint, rescaled to 0..1) over axes with >= 5 evidence;
  // 0.5 when no axis qualifies.
  const styleConfidence = (() => {
    let sum = 0; let n = 0;
    for (const ax of ['formality', 'energy', 'terseness', 'emoji_use']) {
      const a = trainProfile.global.style[ax];
      if (a && a.evidence_count >= 5) { sum += Math.min(1, Math.abs(a.ema - 0.5) * 2); n += 1; }
    }
    return n ? sum / n : 0.5;
  })();
  // ECE on the DIRECTIVE arm (the bar) — asserted style confidence vs directive-arm hit-rate.
  const ece = expectedCalibrationError(
    directive.adherence.map((y) => ({ confidence: styleConfidence, correct: y })),
    { nBins: 10 },
  );
  // Same calibration check against the descriptive arm, reported alongside.
  const eceDescriptive = expectedCalibrationError(
    heuristic.adherence.map((y) => ({ confidence: styleConfidence, correct: y })),
    { nBins: 10 },
  );

  // 5) BIAS-CONTROLLED JUDGE (secondary rater) + κ vs objective (CONCURRENT).
  //    Judges the DIRECTIVE arm (the bar) vs baseline, on a bounded SUBSAMPLE of
  //    pairs (κ only needs a sample; this keeps total cloud spend under the
  //    pre-registered cap). Deterministic first-judgeSampleN probes.
  let judge = null;
  try {
    const styleDescription = `${terseWord}; tone ${formalWord}; ${emojiWord}`;
    const nJudge = Math.min(judgeSampleN, ev.probes.length);
    const idx = Array.from({ length: nJudge }, (_, i) => i);
    const pairs = idx.map((i) => ({ a: directive.outputs[i], b: baseline.outputs[i] }));
    const { preferA, details } = await runJudgeConcurrent({
      apiKey, model, styleDescription, pairs, budget, batchSize, seed,
    });
    const objectivePrefersA = idx.map((i) => (
      directive.adherence[i] === 1 && baseline.adherence[i] === 0 ? 1 : 0
    ));
    judge = {
      arm: 'directive_vs_baseline',
      nJudged: nJudge,
      judgePreferADirectiveRate: preferA.reduce((a, b) => a + b, 0) / (preferA.length || 1),
      kappa: cohenKappa(preferA, objectivePrefersA),
      preferA,
      objectivePrefersA,
      sampleDetails: details.slice(0, 3),
    };
  } catch (e) {
    // The judge is secondary: a failure here (including running out of budget)
    // is recorded in the result rather than discarding the primary arm data.
    judge = { error: String(e.message || e) };
  }

  const result = {
    status: 'DONE',
    model,
    concurrency: batchSize,
    cloudCalls: budget.calls,
    budgetMax: budget.max,
    corpusStats: corpus.stats,
    split: ev.split,
    gateC: {
      // exact-slug metric (the honest floor — kept for continuity with prior run)
      precision: gateC.precision,
      recall: gateC.recall,
      f1: gateC.f1,
      // Push v2 CHANGE 2: SEMANTIC metric (Jaccard >= threshold on content tokens)
      semantic: gateC.semantic,
      predicted: gateC.predicted,
      goldCount: gateC.gold.length,
      counts: gateC.counts,
      negativeControl: gateC.negativeControl,
      nTrain: gateC.nTrain,
      nProbe: gateC.nProbe,
      heldOutDisjoint: gateC.heldOut.disjoint,
    },
    styleCapture,
    gateB: {
      nProbes: ev.probes.length,
      arms: {
        baseline: { rate: arms.baseline.point, ci: arms.baseline, adherence: baseline.adherence },
        descriptive_styleonly: { rate: arms.heuristic.point, ci: arms.heuristic, adherence: heuristic.adherence },
        directive_styleonly: { rate: arms.directive.point, ci: arms.directive, adherence: directive.adherence },
        oracle_styleceiling: { rate: arms.oracle.point, ci: arms.oracle, adherence: oracle.adherence },
      },
      mcnemar: {
        descriptive_vs_baseline: descriptiveVsBaseline,
        directive_vs_baseline: directiveVsBaseline, // THE BAR
        directive_vs_descriptive: directiveVsDescriptive,
        oracle_vs_baseline: oracleVsBaseline,
      },
      headline_mcnemar: headline, // == directive_vs_baseline
      distance: {
        mean_baseline: meanDist(dBase),
        mean_descriptive: meanDist(dHeur),
        mean_directive: meanDist(dDir),
        mean_oracle: meanDist(dOracle),
        descriptive_closer_count: heurCloser.reduce((a, b) => a + b, 0),
        directive_closer_count: dirCloser.reduce((a, b) => a + b, 0),
        mcnemar_descriptive_vs_baseline: distanceMcnemarHeur,
        mcnemar_directive_vs_baseline: distanceMcnemarDir,
        mcnemar_directive_vs_descriptive: distanceMcnemarDirVsHeur,
      },
      ece: { directive: ece.ece, descriptive: eceDescriptive.ece, styleConfidence },
      judge,
    },
    privacy: {
      transcripts_sent_to_cloud: false,
      preference_brief_transmitted: false,
      allowed_systems_count: allowedSys.size,
      // Kept for result-shape continuity; the pre-flight above already
      // guarantees this is true whenever execution reaches this point.
      directive_brief_prose_free: !PROSE_MARKERS.test(directiveBrief),
      note: 'Only baseline/descriptive/directive style-only briefs + oracle (none containing user prose) + authored prompts reached the cloud; guard-enforced before every call.',
    },
    timestamp: new Date().toISOString(),
  };

  writeFileSync(join(scratch, 'results.json'), JSON.stringify(result, null, 2));
  return result;
}
390
+
391
// CLI entry: run only when this module is the script Node was started with.
// argv[1] is converted with pathToFileURL rather than glued onto "file://", so
// the comparison still matches when the path needs percent-encoding (spaces,
// non-ASCII characters) or is a Windows drive-letter path. The argv[1] check
// keeps pathToFileURL from throwing when no script path is present.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  // Each numeric env var falls back to its default when unset, non-numeric, or 0.
  runRealCorpusConcurrent({
    scratch: process.env.IJFW_EVAL_SCRATCH,
    nProbes: Number(process.env.IJFW_EVAL_NPROBES) || 80,
    maxCalls: Number(process.env.IJFW_EVAL_MAXCALLS) || 360,
    batchSize: Number(process.env.IJFW_EVAL_BATCH) || 8,
    cap: Number(process.env.IJFW_EVAL_CAP) || 200,
    judgeSampleN: Number(process.env.IJFW_EVAL_JUDGE_N) || 30,
  }).then((r) => {
    // One-line summary on stderr; the full result is in the file written by the run.
    const { status, model, cloudCalls } = r;
    console.error(JSON.stringify({ status, model, cloudCalls, reason: r.reason || null }));
    if (r.status !== 'DONE') process.exit(1);
  }).catch((e) => { console.error('RUN ERROR:', e.message); process.exit(1); });
}

export default { runRealCorpusConcurrent };