@ijfw/memory-server 1.5.6 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/bin/ijfw-dashboard +20 -1
  2. package/package.json +4 -3
  3. package/src/audit-roster.js +89 -12
  4. package/src/brain/tiered-llm.js +57 -7
  5. package/src/cross-orchestrator-cli.js +390 -4
  6. package/src/cross-project-search.js +39 -1
  7. package/src/dashboard-server.js +23 -1
  8. package/src/dream/runner.mjs +560 -8
  9. package/src/handlers/brain-handler.js +101 -1
  10. package/src/importers/discover.js +1 -1
  11. package/src/memory/bench-metrics.js +289 -0
  12. package/src/memory/benchmark.js +1 -1
  13. package/src/memory/search.js +53 -1
  14. package/src/model-refresh.js +4 -2
  15. package/src/orchestrator/plan-checker.js +1 -1
  16. package/src/profile/audit.js +671 -0
  17. package/src/profile/capture.js +871 -0
  18. package/src/profile/derive-dialectic.js +242 -0
  19. package/src/profile/derive-heuristic.js +733 -0
  20. package/src/profile/derive.js +156 -0
  21. package/src/profile/egress.js +306 -0
  22. package/src/profile/eval/build-real-probes.mjs +197 -0
  23. package/src/profile/eval/corpus-from-reddit.mjs +166 -0
  24. package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
  25. package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
  26. package/src/profile/eval/gate-b-behavior.mjs +420 -0
  27. package/src/profile/eval/gate-b-decision-run.mjs +171 -0
  28. package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
  29. package/src/profile/eval/gate-b-run.mjs +417 -0
  30. package/src/profile/eval/gate-b-run.test.mjs +204 -0
  31. package/src/profile/eval/gate-c-capture.mjs +323 -0
  32. package/src/profile/eval/harness.mjs +551 -0
  33. package/src/profile/eval/instrument-validation.mjs +248 -0
  34. package/src/profile/eval/instrument-validation.test.mjs +125 -0
  35. package/src/profile/eval/multi-subject-harness.mjs +106 -0
  36. package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
  37. package/src/profile/eval/personas.test.mjs +83 -0
  38. package/src/profile/eval/plumbing.test.mjs +69 -0
  39. package/src/profile/eval/prereg.mjs +130 -0
  40. package/src/profile/eval/prereg.test.mjs +78 -0
  41. package/src/profile/eval/real-corpus.test.mjs +103 -0
  42. package/src/profile/eval/real-personas.mjs +109 -0
  43. package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
  44. package/src/profile/eval/run-real-corpus.mjs +358 -0
  45. package/src/profile/eval/slug-quality.mjs +464 -0
  46. package/src/profile/eval/stylometry-features.js +85 -0
  47. package/src/profile/eval/stylometry-reference.js +16 -0
  48. package/src/profile/eval/stylometry.js +224 -0
  49. package/src/profile/eval/stylometry.test.mjs +103 -0
  50. package/src/profile/eval/synthetic-personas.js +91 -0
  51. package/src/profile/eval/verifier-features.mjs +170 -0
  52. package/src/profile/eval/verifier-logreg.mjs +74 -0
  53. package/src/profile/eval/verifier-pair.mjs +122 -0
  54. package/src/profile/eval/verifier-reference.mjs +68 -0
  55. package/src/profile/eval/verifier-scorer.mjs +30 -0
  56. package/src/profile/eval/wrong-target-control.mjs +168 -0
  57. package/src/profile/eval/wrong-target-control.test.mjs +124 -0
  58. package/src/profile/exemplar-capture.js +232 -0
  59. package/src/profile/exemplar-retrieve.js +138 -0
  60. package/src/profile/exemplar-store.js +314 -0
  61. package/src/profile/lock.js +64 -0
  62. package/src/profile/merge.js +624 -0
  63. package/src/profile/path-policy.js +213 -0
  64. package/src/profile/precision-stamp.mjs +151 -0
  65. package/src/profile/render-brief.js +717 -0
  66. package/src/profile/schema.js +244 -0
  67. package/src/profile/sensitivity.js +249 -0
  68. package/src/profile/serve.js +345 -0
  69. package/src/profile/store.js +261 -0
  70. package/src/profile/telemetry.js +289 -0
  71. package/src/recovery/checkpoint.js +7 -1
  72. package/src/server.js +194 -16
  73. package/src/.registry-meta-key.pem +0 -3
@@ -0,0 +1,551 @@
1
+ /**
2
+ * profile/eval/harness.mjs — Cross-system profile bus, PHASE P5 (shared eval infra).
3
+ *
4
+ * THE "PROVE IT" FRONT. This is the rigor stack the two gates (Gate C capture,
5
+ * Gate B behavior) sit on top of. The brand position attacks competitors for the
6
+ * "assert-not-prove" move (the Honcho move): grading an INTERNAL artifact and
7
+ * calling it proof of learning. This harness ports the SAME fact-recall rigor our
8
+ * published memory benchmark uses — held-out splits, paired baselines, bootstrap
9
+ * CIs, paired McNemar, bias-controlled judging, ECE, κ — onto the profile bus.
10
+ *
11
+ * NO STUBS IN THE PIPELINE. Every gate wires the REAL profile modules:
12
+ * - deriveHeuristic / deriveProfile (src/profile/derive*.js) — derivation
13
+ * - applyDelta (src/profile/merge.js) — fold to profile
14
+ * - renderBrief (src/profile/render-brief.js) — the injected brief
15
+ * and the REAL statistics helpers, imported (NOT re-derived) from the lab-study
16
+ * memory benchmark:
17
+ * - bootstrapCI / mcnemar (src/memory/bench-metrics.js)
18
+ * The ONLY env-gated stub-point is the LIVE-LLM agent run in Gate B, and even
19
+ * there the injection point is real (exercised by a deterministic fake transport
20
+ * in tests; a genuine local HTTP transport when run live).
21
+ *
22
+ * Zero deps. ESM. Node built-ins only.
23
+ *
24
+ * Cites (methodology): LaMP time-based split [2304.11406] · PrefEval behavior
25
+ * [2502.09597] · LLM-judge bias [2410.02736] · persona caricature [2402.10811].
26
+ */
27
+
28
+ // REUSE the real lab-study stat helpers — do NOT re-derive the math. These are
29
+ // the same functions the published memory benchmark uses (bench-metrics.js).
30
+ import { bootstrapCI, mcnemar } from '../../memory/bench-metrics.js';
31
+
32
+ // Re-export so the gates import the SAME implementations (single source of truth).
33
+ export { bootstrapCI, mcnemar };
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // Deterministic RNG (mulberry32) — for position randomization + any sampling.
37
+ // Matches the seed discipline the bench harness uses so eval runs are
38
+ // reproducible (a non-deterministic eval cannot be a regression test).
39
+ // ---------------------------------------------------------------------------
40
+
41
/**
 * mulberry32(seed) -> rng, where rng() yields a number in [0, 1).
 *
 * Small seeded PRNG. The seed is coerced to an unsigned 32-bit integer and each
 * call advances one 32-bit state word, so the same seed always replays the same
 * sequence.
 *
 * @param {number} seed any number; only its low 32 bits are used.
 * @returns {() => number} generator closed over its own private state.
 */
export function mulberry32(seed) {
  let state = seed >>> 0;
  return function rng() {
    // Advance the state by the fixed increment, kept in int32 range.
    state = ((state | 0) + 0x6d2b79f5) | 0;
    const mixed = Math.imul(state ^ (state >>> 15), 1 | state);
    const folded = (mixed + Math.imul(mixed ^ (mixed >>> 7), 61 | mixed)) ^ mixed;
    // Reinterpret as unsigned and scale by 2^32 to land in [0, 1).
    return ((folded ^ (folded >>> 14)) >>> 0) / 4294967296;
  };
}
51
+
52
+ // ---------------------------------------------------------------------------
53
+ // Held-out TIME-BASED split (LaMP-style). NEVER test on a session you trained
54
+ // on. The split is by timestamp: train = sessions strictly before `cutoff`,
55
+ // probe = sessions at/after `cutoff`. We also return the disjointness proof so a
56
+ // gate can ASSERT no session id leaks across the boundary before scoring.
57
+ // ---------------------------------------------------------------------------
58
+
59
/**
 * splitByTime(sessions, opts) -> { train, probe, trainIds, probeIds, disjoint }.
 *
 * Held-out, time-ordered split. A copy of `sessions` is sorted by `ts` ascending
 * (the input array is not mutated) and divided into an earlier `train` side and a
 * later `probe` side.
 *
 * @param {Array<{session_id?:string, sessionId?:string, ts?:string}>} sessions
 *   A non-array value is treated as an empty corpus.
 * @param {object} [opts]
 * @param {string|number} [opts.cutoff] ISO string or epoch-ms boundary:
 *   train = ts < cutoff, probe = ts >= cutoff. An unparseable cutoff is ignored
 *   and the fraction split below is used instead.
 * @param {number} [opts.trainFraction] default 0.6. Used only without a usable
 *   cutoff: the first `round(n * trainFraction)` time-sorted sessions go to train
 *   (clamped so both sides are non-empty whenever n >= 2).
 * @param {boolean} [opts.allowUndated] escape hatch for a known-undated test
 *   corpus: disables the degenerate-split guard AND the dated-only clamp on train.
 * @param {number} [opts.maxUndatedFraction] default 0.5 — the share of undated
 *   sessions above which the split is rejected.
 * @returns {{train:Array, probe:Array, trainIds:Set<string>, probeIds:Set<string>, disjoint:boolean}}
 *   `disjoint` is false when a non-empty session id appears on both sides.
 * @throws {Error} when more than `maxUndatedFraction` of the sessions have no
 *   parseable `ts` and `allowUndated` is not set.
 *
 * UNDATED SESSIONS: a session with no parseable ts sorts to the END (treated as
 * most-recent, i.e. probe-side). In cutoff mode that follows from the comparison
 * itself. In fraction mode the cut index is additionally capped at the number of
 * DATED sessions, because otherwise a corpus with e.g. 40% undated sessions and
 * trainFraction 0.8 would silently pull undated sessions into train.
 */
export function splitByTime(sessions = [], opts = {}) {
  const options = opts ?? {};
  const list = Array.isArray(sessions) ? sessions.slice() : [];
  const tsOf = (s) => {
    const t = Date.parse(s && s.ts);
    return Number.isFinite(t) ? t : Number.POSITIVE_INFINITY;
  };
  const datedCount = list.filter((s) => Number.isFinite(tsOf(s))).length;
  const undated = list.length - datedCount;

  // Guard against a silently-degenerate split on a mostly-undated corpus.
  if (list.length > 0 && !options.allowUndated) {
    const maxUndated = Number.isFinite(options.maxUndatedFraction) ? options.maxUndatedFraction : 0.5;
    if (undated / list.length > maxUndated) {
      throw new Error(
        `splitByTime: ${undated}/${list.length} sessions are undated (> ${maxUndated} of the corpus). `
          + 'A time-based held-out split on a mostly-undated corpus is degenerate (all undated sessions '
          + 'sort to probe). Refusing to produce a split that looks valid but is not. '
          + 'Pass { allowUndated: true } only for a known-undated test corpus.',
      );
    }
  }

  // Three-way comparison instead of subtraction: two undated sessions would
  // otherwise yield Infinity - Infinity = NaN from the comparator.
  list.sort((a, b) => {
    const ta = tsOf(a);
    const tb = tsOf(b);
    if (ta === tb) return 0;
    return ta < tb ? -1 : 1;
  });

  let cutoffMs = null;
  if (options.cutoff !== undefined && options.cutoff !== null) {
    const t = typeof options.cutoff === 'number' ? options.cutoff : Date.parse(options.cutoff);
    if (Number.isFinite(t)) cutoffMs = t;
  }

  let train;
  let probe;
  if (cutoffMs !== null) {
    train = list.filter((s) => tsOf(s) < cutoffMs);
    probe = list.filter((s) => tsOf(s) >= cutoffMs);
  } else {
    const frac = Number.isFinite(options.trainFraction) ? options.trainFraction : 0.6;
    let k = Math.max(1, Math.min(list.length - 1, Math.round(list.length * frac)));
    // Keep undated sessions out of train: they sit at the tail of the sorted
    // list, so capping k at the dated count is sufficient.
    if (!options.allowUndated && datedCount > 0) k = Math.min(k, datedCount);
    train = list.slice(0, k);
    probe = list.slice(k);
  }

  const idOf = (s) => String((s && (s.session_id ?? s.sessionId)) ?? '');
  const trainIds = new Set(train.map(idOf).filter(Boolean));
  const probeIds = new Set(probe.map(idOf).filter(Boolean));
  // Disjointness proof: no session id may appear on both sides of the split.
  let disjoint = true;
  for (const id of probeIds) {
    if (trainIds.has(id)) { disjoint = false; break; }
  }
  return { train, probe, trainIds, probeIds, disjoint };
}
130
+
131
+ // ---------------------------------------------------------------------------
132
+ // Precision / recall for a SET-recovery task (Gate C). gold = the set of
133
+ // held-out preference subjects the user actually expressed in the probe window;
134
+ // predicted = the subjects the derived profile asserts. Pure set arithmetic.
135
+ // ---------------------------------------------------------------------------
136
+
137
/**
 * precisionRecall(predicted, gold) -> { precision, recall, f1, tp, fp, fn,
 *   perGoldHit:number[], perPredCorrect:number[] }.
 *
 * Set-recovery scoring. Both inputs are stringified and de-duplicated before
 * comparison, so counts are over DISTINCT labels.
 *
 * @param {Array} predicted labels asserted by the system under test.
 * @param {Array} gold labels that are actually correct.
 * @returns {object} `perPredCorrect` is a 0/1 vector over the distinct
 *   predictions (1 = in gold) and `perGoldHit` a 0/1 vector over the distinct
 *   gold labels (1 = predicted). Precision, recall and f1 are 0 whenever their
 *   denominator would be 0.
 */
export function precisionRecall(predicted = [], gold = []) {
  const predictedSet = new Set((predicted || []).map((label) => String(label)));
  const goldenSet = new Set((gold || []).map((label) => String(label)));

  const perPredCorrect = Array.from(predictedSet, (label) => (goldenSet.has(label) ? 1 : 0));
  const perGoldHit = Array.from(goldenSet, (label) => (predictedSet.has(label) ? 1 : 0));

  // True positives are exactly the predictions flagged correct.
  const tp = perPredCorrect.reduce((sum, bit) => sum + bit, 0);
  const fp = predictedSet.size - tp;
  const fn = goldenSet.size - tp;

  const precision = predictedSet.size > 0 ? tp / predictedSet.size : 0;
  const recall = goldenSet.size > 0 ? tp / goldenSet.size : 0;
  const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;

  return { precision, recall, f1, tp, fp, fn, perGoldHit, perPredCorrect };
}
164
+
165
+ // ---------------------------------------------------------------------------
166
+ // Cohen's κ — inter-rater agreement (two raters, binary or categorical labels).
167
+ // Reported wherever a judge is used, against a second (objective) rater, so the
168
+ // judge's reliability is measured, not assumed (LLM-judge-bias mitigation).
169
+ // ---------------------------------------------------------------------------
170
+
171
/**
 * cohenKappa(raterA, raterB) -> { kappa, degenerate, reason, n }.
 *
 * Chance-corrected agreement between two label sequences, compared position by
 * position over the first `n = min(lengths)` entries using strict equality.
 *
 * @param {Array} raterA labels from the first rater.
 * @param {Array} raterB labels from the second rater.
 * @returns {{kappa:number|null, degenerate:boolean, reason:string|null, n:number}}
 *
 * HONESTY GUARD: κ has no chance baseline when a rater uses a single label, and
 * the naive formula would report a spurious 1.0. In that case — and for empty
 * input — the result is `kappa: null, degenerate: true` plus a `reason` string
 * the caller is expected to surface. A varied, fully agreeing pair still yields
 * kappa = 1.
 */
export function cohenKappa(raterA = [], raterB = []) {
  const n = Math.min(raterA.length, raterB.length);
  if (n === 0) {
    return { kappa: null, degenerate: true, reason: 'degenerate: empty rater sequence (n=0)', n: 0 };
  }

  // One pass: label universe (in first-seen order), per-rater marginals, matches.
  const seenLabels = new Set();
  const countsA = new Map();
  const countsB = new Map();
  let matches = 0;
  for (let i = 0; i < n; i += 1) {
    const fromA = raterA[i];
    const fromB = raterB[i];
    seenLabels.add(fromA);
    seenLabels.add(fromB);
    countsA.set(fromA, (countsA.get(fromA) || 0) + 1);
    countsB.set(fromB, (countsB.get(fromB) || 0) + 1);
    if (fromA === fromB) matches += 1;
  }

  // A rater with one distinct label has zero variance: refuse to emit a number.
  if (countsA.size < 2 || countsB.size < 2) {
    return { kappa: null, degenerate: true, reason: `degenerate: no label variance at n=${n}`, n };
  }

  const observed = matches / n;
  let expected = 0;
  for (const label of seenLabels) {
    expected += ((countsA.get(label) || 0) / n) * ((countsB.get(label) || 0) / n);
  }

  // Defensive: never divide by zero even if the variance guard is bypassed.
  if (expected >= 1) {
    return { kappa: null, degenerate: true, reason: `degenerate: no chance baseline at n=${n}`, n };
  }
  return { kappa: (observed - expected) / (1 - expected), degenerate: false, reason: null, n };
}
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // ECE — Expected Calibration Error on the profile's `confidence` field. Bins
228
+ // (confidence, correctness) pairs and measures |avg-confidence − accuracy| per
229
+ // bin, weighted by bin mass. A well-calibrated profile that says "0.7 confident"
230
+ // is right ~70% of the time. This is what makes `confidence` an honest number
231
+ // instead of decoration.
232
+ // ---------------------------------------------------------------------------
233
+
234
/**
 * expectedCalibrationError(pairs, opts) -> { ece, bins }.
 *
 * Bins (confidence, correctness) pairs into `nBins` equal-width bins over [0, 1]
 * and sums |avg-confidence − accuracy| per bin, weighted by the bin's share of
 * the usable rows.
 *
 * @param {Array<{confidence:number, correct:0|1|boolean}>} pairs
 * @param {object} [opts]
 * @param {number} [opts.nBins] default 10; values below 1 or non-finite fall
 *   back to 10, fractional values are floored.
 * @returns {{ece:number, bins:Array<{lo:number, hi:number, count:number,
 *   avgConf:number, acc:number, gap:number}>}} `ece` is 0 when no row is usable.
 *
 * Rows are SKIPPED (not scored) when the row itself is null/undefined or its
 * confidence is not a finite number in [0, 1]. A confidence counts as a number
 * only if it is a `number` or a non-blank numeric string: `null`, `''` and
 * booleans are treated as missing. Plain `Number()` coercion would turn `null`
 * and `''` into 0 and score a missing confidence as "0% confident", which
 * distorts the calibration figure.
 */
export function expectedCalibrationError(pairs = [], opts = {}) {
  const options = opts ?? {};
  // Require at least one whole bin; 0 < nBins < 1 would floor to zero bins.
  const nBins = Number.isFinite(options.nBins) && options.nBins >= 1 ? Math.floor(options.nBins) : 10;

  const toConfidence = (raw) => {
    if (typeof raw === 'number') return raw;
    if (typeof raw === 'string' && raw.trim() !== '') return Number(raw);
    return Number.NaN; // null / undefined / '' / boolean / object => missing
  };

  const rows = (pairs || [])
    .filter((p) => p !== null && p !== undefined)
    .map((p) => ({ c: toConfidence(p.confidence), y: p.correct ? 1 : 0 }))
    .filter((p) => Number.isFinite(p.c) && p.c >= 0 && p.c <= 1);
  const n = rows.length;

  const bins = Array.from({ length: nBins }, () => ({ count: 0, confSum: 0, correctSum: 0 }));
  for (const r of rows) {
    let idx = Math.floor(r.c * nBins);
    if (idx >= nBins) idx = nBins - 1; // c === 1 lands in the top bin
    const b = bins[idx];
    b.count += 1;
    b.confSum += r.c;
    b.correctSum += r.y;
  }

  let ece = 0;
  const binStats = [];
  for (let i = 0; i < nBins; i++) {
    const b = bins[i];
    const lo = i / nBins;
    const hi = (i + 1) / nBins;
    if (b.count === 0) {
      binStats.push({ lo, hi, count: 0, avgConf: 0, acc: 0, gap: 0 });
      continue;
    }
    const avgConf = b.confSum / b.count;
    const acc = b.correctSum / b.count;
    const gap = Math.abs(avgConf - acc);
    ece += (b.count / n) * gap;
    binStats.push({ lo, hi, count: b.count, avgConf, acc, gap });
  }
  return { ece: n > 0 ? ece : 0, bins: binStats };
}
266
+
267
+ // ---------------------------------------------------------------------------
268
+ // OBJECTIVE style metrics — computed against the user's OWN held-out samples.
269
+ // This is the bias-FREE rater: no LLM, no judge, just measurable properties of
270
+ // the text (length/terseness, emoji density, code-block presence, formality
271
+ // markers). Gate B compares an arm's output to these targets; a closer match =
272
+ // better adherence. This is the second rater κ is computed against.
273
+ // ---------------------------------------------------------------------------
274
+
275
// Code points counted as "emoji": pictographs plus the dingbat / misc-symbol /
// arrow ranges listed here.
const EMOJI_RE = /[\u{1F300}-\u{1FAFF}\u{2600}-\u{27BF}\u{2190}-\u{21FF}\u{2B00}-\u{2BFF}]/gu;
const FORMAL_MARKERS = [
  'therefore', 'however', 'furthermore', 'consequently', 'regards', 'kindly',
  'please', 'thank you', 'would you', 'could you', 'i would', 'shall',
];
const CASUAL_MARKERS = ['gonna', 'wanna', 'yeah', 'yep', 'nope', 'lol', 'btw', 'ok', 'cool', 'hey'];

// Markers are matched as WHOLE words/phrases. Plain substring matching counts
// "they" as "hey", "token"/"look" as "ok" and "shallow" as "shall", which skews
// the formality ratio on ordinary prose.
const compileMarkers = (markers) => markers.map((marker) => {
  const escaped = marker.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(`\\b${escaped}\\b`);
});
const FORMAL_MARKER_RES = compileMarkers(FORMAL_MARKERS);
const CASUAL_MARKER_RES = compileMarkers(CASUAL_MARKERS);

/**
 * objectiveStyle(text) -> { len, emojiPerChar, codeBlock, formalityMarkers,
 *   terseness }. All measurable, no judge.
 *
 * @param {*} text input; falsy values are treated as the empty string.
 * @returns {{len:number, emojiPerChar:number, codeBlock:0|1,
 *   formalityMarkers:number, terseness:number}}
 *   - `len`: length in UTF-16 code units.
 *   - `emojiPerChar`: EMOJI_RE matches divided by `len` (0 for empty text).
 *   - `codeBlock`: 1 if the text has a ``` fence, a newline followed by a
 *     4-space-indented line, or an inline `code` span; else 0.
 *   - `formalityMarkers`: distinct formal markers present / distinct markers
 *     present (formal + casual); 0.5 when no marker is present. Matching is
 *     case-insensitive and whole-word.
 *   - `terseness`: 1 - len / (len + 120), i.e. 1 for empty text and 0.5 at
 *     exactly 120 characters.
 *
 * NOTE(review): the original comment says the 120-char crossover mirrors the
 * heuristic derive; if that module also counts formality markers by substring,
 * align it with the whole-word rule used here — confirm against derive-heuristic.
 */
export function objectiveStyle(text) {
  const s = String(text || '');
  const len = s.length;
  const emojis = (s.match(EMOJI_RE) || []).length;
  const codeBlock = /```|\n {4}\S|`[^`]+`/.test(s) ? 1 : 0;

  const lower = s.toLowerCase();
  const countPresent = (patterns) => patterns.reduce((hits, re) => hits + (re.test(lower) ? 1 : 0), 0);
  const formalHits = countPresent(FORMAL_MARKER_RES);
  const casualHits = countPresent(CASUAL_MARKER_RES);
  const markerTotal = formalHits + casualHits;
  const formalityMarkers = markerTotal > 0 ? formalHits / markerTotal : 0.5;

  // Inverse-length terseness with a 120-char crossover (single-sample text; the
  // caller may average across samples).
  const terseness = 1 - len / (len + 120);

  return {
    len,
    emojiPerChar: len > 0 ? emojis / len : 0,
    codeBlock,
    formalityMarkers,
    terseness,
  };
}
311
+
312
/**
 * styleDistance(a, b) -> number >= 0. Lower = closer.
 *
 * Sum of four gaps between two style records:
 *   - |terseness difference| and |formalityMarkers difference| (missing or
 *     non-numeric values count as 0),
 *   - 0.5 if exactly one side has any emoji (`emojiPerChar > 0`),
 *   - 0.5 if exactly one side has a truthy `codeBlock`.
 * Emoji and code enter as presence bits so a single emoji cannot dominate.
 *
 * @param {object} a style record, e.g. from objectiveStyle().
 * @param {object} b style record to compare against.
 * @returns {number}
 */
export function styleDistance(a, b) {
  const numeric = (value) => Number(value) || 0;
  const presence = (flag) => (flag ? 1 : 0);

  const terseGap = Math.abs(numeric(a.terseness) - numeric(b.terseness));
  const formalGap = Math.abs(numeric(a.formalityMarkers) - numeric(b.formalityMarkers));
  const emojiGap = Math.abs(presence(a.emojiPerChar > 0) - presence(b.emojiPerChar > 0)) * 0.5;
  const codeGap = Math.abs(presence(a.codeBlock) - presence(b.codeBlock)) * 0.5;

  return terseGap + formalGap + emojiGap + codeGap;
}
327
+
328
+ // ---------------------------------------------------------------------------
329
+ // BIAS-CONTROLLED PAIRWISE JUDGE wrapper. Where a judge is needed it must be:
330
+ // - PAIRWISE (A vs B, not absolute scores — absolute scores drift)
331
+ // - POSITION-RANDOMIZED (seeded coin flip on presentation order — kills the
332
+ // first-position bias documented in 2410.02736)
333
+ // - LENGTH-CONTROLLED (both candidates normalized to ~equal length before
334
+ // judging — kills the verbosity bias)
335
+ // - IDENTITY-MASKED (arm labels stripped; the judge sees "Candidate 1/2",
336
+ // never "with-profile"/"baseline" — kills label bias)
337
+ // The judge itself is an injected function so unit tests can drive it
338
+ // deterministically; live runs inject an LLM-backed judge. We then report κ
339
+ // between the judge and the OBJECTIVE style rater.
340
+ // ---------------------------------------------------------------------------
341
+
342
/**
 * lengthControl(text, targetLen) -> string no longer than `targetLen` UTF-16 units.
 *
 * Text at or under the target is returned unchanged. Longer text is cut at
 * `targetLen` and then, if a space exists in the last 40% of the cut, trimmed
 * back to that space so the result ends on a word boundary.
 *
 * @param {*} text candidate text; falsy values are treated as ''.
 * @param {number} targetLen maximum length in UTF-16 code units.
 * @returns {string}
 */
function lengthControl(text, targetLen) {
  const s = String(text || '');
  if (s.length <= targetLen) return s;
  let cut = s.slice(0, targetLen);
  // A cut that lands between the two halves of a surrogate pair (e.g. inside an
  // emoji) would leave a lone high surrogate at the end; drop it.
  const lastUnit = cut.charCodeAt(cut.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut = cut.slice(0, -1);
  // Prefer a word boundary near the target so neither candidate is advantaged
  // by raw verbosity.
  const lastSpace = cut.lastIndexOf(' ');
  return lastSpace > targetLen * 0.6 ? cut.slice(0, lastSpace) : cut;
}
351
+
352
/**
 * biasControlledJudge(items, judgeFn, opts) -> { preferA:number[], details[] }.
 *
 * Runs a pairwise judge over candidate pairs with three bias controls applied
 * per item: seeded position randomization, length control (both candidates are
 * passed through lengthControl with the shorter candidate's length as target),
 * and identity masking (the judge receives only `first` / `second`).
 *
 * @param {Array<{ a:string, b:string }>} items candidate pairs
 *   (A = arm-under-test, B = comparator).
 * @param {Function} judgeFn SYNCHRONOUS ({ first, second, meta }) -> 0 | 1,
 *   the POSITION it prefers (0 = first shown, 1 = second shown). Any return
 *   value other than exactly 1 is read as 0.
 * @param {object} [opts]
 * @param {number} [opts.seed] position-randomization seed (default 1234).
 * @returns {{preferA:number[], details:Array<{aFirst:boolean, judgeChoseSecond:0|1}>}}
 *   `preferA` is a 0/1 vector (1 = judge preferred arm A) with the position
 *   flip UNDONE, so downstream paired tests see a clean A-vs-B signal.
 * @throws {TypeError} if `judgeFn` returns a Promise/thenable. This function is
 *   synchronous and cannot await it; without the check an async (e.g. LLM-backed)
 *   judge would be silently scored as "prefers first" on every item. Resolve
 *   async verdicts first and pass a synchronous lookup instead.
 */
export function biasControlledJudge(items = [], judgeFn, opts = {}) {
  const options = opts ?? {};
  const rng = mulberry32(Number.isFinite(options.seed) ? options.seed : 1234);
  const preferA = [];
  const details = [];
  for (const it of items) {
    const aFirst = rng() < 0.5; // POSITION RANDOMIZATION
    const rawA = String(it.a || '');
    const rawB = String(it.b || '');
    // Target the shorter candidate; if one side is empty, fall back to the longer
    // so the non-empty candidate is not truncated to nothing.
    const target = Math.min(rawA.length, rawB.length) || Math.max(rawA.length, rawB.length);
    const ctlA = lengthControl(rawA, target); // LENGTH CONTROL
    const ctlB = lengthControl(rawB, target);
    const first = aFirst ? ctlA : ctlB;
    const second = aFirst ? ctlB : ctlA;
    // IDENTITY MASK: judge only sees positions, never arm labels.
    const verdict = judgeFn({ first, second, meta: {} });
    if (verdict !== null && (typeof verdict === 'object' || typeof verdict === 'function')
      && typeof verdict.then === 'function') {
      throw new TypeError(
        'biasControlledJudge: judgeFn returned a Promise; the judge must be synchronous and return 0 or 1.',
      );
    }
    const choice = verdict === 1 ? 1 : 0;
    const judgePrefersFirst = choice === 0;
    const prefersA = aFirst ? judgePrefersFirst : !judgePrefersFirst;
    preferA.push(prefersA ? 1 : 0);
    details.push({ aFirst, judgeChoseSecond: choice });
  }
  return { preferA, details };
}
386
+
387
+ // ---------------------------------------------------------------------------
388
+ // Built-in synthetic-but-HELD-OUT fixture. A multi-session corpus with a clear
389
+ // underlying persona, plus a DISJOINT, surface-varied probe set (different
390
+ // phrasings of the same preferences, in a later time window) and a NEGATIVE
391
+ // CONTROL persona (a profile that should NOT match the probes).
392
+ //
393
+ // Structured so a REAL session corpus can be dropped in: same shape
394
+ // (`{ sessions, probes, negativeControl }`), same field names the gates read.
395
+ // ---------------------------------------------------------------------------
396
+
397
/**
 * Build one feedback row: the four given fields plus an empty `context` string.
 * (The original comment describes this as the .session-feedback.jsonl row shape.)
 */
function fb(session_id, ts, kind, phrase) {
  const row = { session_id, ts, kind, phrase };
  row.context = '';
  return row;
}
401
+
402
/**
 * makeHeldOutFixture() -> { sessions, probes, negativeControl }.
 *
 * Takes no arguments and builds a fresh fixture on every call.
 *
 *   sessions: five TRAIN-window sessions (`train-0`..`train-4`, one per day from
 *     2026-06-01 UTC), each carrying the same three persona phrases as
 *     'correction' feedback rows plus a copy of the persona's style metadata.
 *   probes: two PROBE-window sessions (`probe-0`, `probe-1`, dated 2026-06-20 and
 *     2026-06-21 UTC — later than, and with ids disjoint from, the train set).
 *     Each has `goldSubjects`, a `goldStyle` computed by objectiveStyle(), and a
 *     `prompt`.
 *   negativeControl: persona "Bo", whose gold subjects and style differ from the
 *     train persona; used to check a derived profile does not match an unrelated
 *     user.
 */
export function makeHeldOutFixture() {
  // Persona "Ada": terse, uses tabs, prefers TypeScript, no emoji, formal-ish.
  const adaStyleMeta = {
    avg_msg_chars: 38,
    emoji_per_msg: 0,
    code_block_ratio: 0.7,
    formality_markers: 0.55,
    turn_cadence_per_min: 6,
    msg_count: 22,
  };

  // Every train session repeats all three phrases, so each preference is
  // corroborated by more than three sessions (the original notes the brief's
  // floors are evidence >= 3 and confidence > 0.6).
  const adaPhrases = [
    'use tabs not spaces',
    'prefer typescript over javascript',
    'keep responses terse',
  ];

  // TRAIN window: 2026-06-01 .. 2026-06-05.
  const sessions = Array.from({ length: 5 }, (_, dayOffset) => {
    const sid = `train-${dayOffset}`;
    const ts = new Date(Date.UTC(2026, 5, 1 + dayOffset)).toISOString();
    return {
      session_id: sid,
      sessionId: sid,
      host: 'claude',
      ts,
      metadata: { ...adaStyleMeta },
      feedback: adaPhrases.map((phrase) => fb(sid, ts, 'correction', phrase)),
    };
  });

  // PROBE window: 2026-06-20 onward, disjoint ids.
  //
  // HONEST DESIGN (per the original notes): the probes mix exact restatements of
  // train phrases with ONE paraphrase ('avoid spaces indent with tabs'). The
  // original states the heuristic derive keys on the exact phrase slug, so it is
  // expected to miss the paraphrase and score recall below 1.0 — a held-out probe
  // that can fail is what makes a pass meaningful.
  const probeOn = (id, host, dayOfJune, goldSubjects, styleSample, prompt) => ({
    session_id: id,
    sessionId: id,
    host,
    ts: new Date(Date.UTC(2026, 5, dayOfJune)).toISOString(),
    goldSubjects,
    goldStyle: objectiveStyle(styleSample),
    prompt,
  });
  const probes = [
    // exact restatements:
    probeOn(
      'probe-0',
      'gemini',
      20,
      ['use tabs not spaces', 'prefer typescript over javascript'],
      'Use tabs. TypeScript. Short.',
      'How should I format and type this module?',
    ),
    // exact restatement + the one true paraphrase:
    probeOn(
      'probe-1',
      'cursor',
      21,
      ['keep responses terse', 'avoid spaces indent with tabs'],
      'Terse.',
      'Explain this function.',
    ),
  ];

  // NEGATIVE CONTROL: persona "Bo" — verbose, spaces, JS, heavy emoji.
  const negativeControl = {
    name: 'Bo',
    goldSubjects: ['use spaces not tabs', 'prefer plain javascript', 'write detailed explanations'],
    goldStyle: objectiveStyle(
      'Sure! Let me walk you through this in detail, step by step, with lots of context. 😀🎉',
    ),
  };

  return { sessions, probes, negativeControl };
}
484
+
485
+ // ---------------------------------------------------------------------------
486
+ // LIVE-LLM transport resolution for the Gate B agent run. The ONLY env-gated
487
+ // part of the harness. When IJFW_PROFILE_EVAL_LIVE is set AND no transport is
488
+ // injected, we construct a REAL Ollama-compatible local HTTP caller (kept
489
+ // self-contained, same shape as derive.js — never reaches a cloud model). When a
490
+ // transport IS injected (tests), we use it. With neither, live runs refuse
491
+ // rather than silently faking a result.
492
+ // ---------------------------------------------------------------------------
493
+
494
/**
 * makeLocalAgentTransport(url) -> async transport({ prompt, system, maxTokens, model }).
 *
 * The returned function POSTs a non-streaming generate request as JSON to
 * `<url>/api/generate` (one trailing slash on `url` is stripped) and resolves to
 * `{ text, via: 'local' }`, where `text` is the reply's `response` field or ''.
 *
 * Request defaults: model 'llama3', num_predict 256. When `system` is truthy it
 * is prepended to the prompt, separated by a blank line.
 *
 * @param {string} url base URL of the generate endpoint's host.
 * @returns {Function} transport; rejects with `Error` on a non-OK HTTP status.
 */
function makeLocalAgentTransport(url) {
  return async (request) => {
    const { prompt, system, maxTokens, model } = request;
    const endpoint = `${url.replace(/\/$/, '')}/api/generate`;
    const payload = {
      model: model || 'llama3',
      prompt: system ? `${system}\n\n${prompt}` : prompt,
      stream: false,
      options: { num_predict: maxTokens || 256 },
    };
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    if (!response.ok) throw new Error(`profile eval live LLM HTTP ${response.status}`);
    const parsed = await response.json();
    return { text: (parsed && parsed.response) || '', via: 'local' };
  };
}
512
+
513
/**
 * resolveAgentTransport(opts) -> { transport, live }.
 *
 * @param {object} [opts]
 * @param {Function} [opts.agent] injected transport.
 * @param {object} [opts.env] environment map; defaults to process.env.
 * @returns {{transport:Function|null, live:boolean}}
 *
 * Resolution order:
 *   1. `opts.agent` is a function -> it is the transport; `live` mirrors whether
 *      IJFW_PROFILE_EVAL_LIVE is truthy in the environment.
 *   2. IJFW_PROFILE_EVAL_LIVE is truthy and IJFW_PROFILE_LOCAL_URL (or, failing
 *      that, IJFW_BRAIN_LOCAL_URL) is non-blank after trimming -> a transport from
 *      makeLocalAgentTransport(url), `live: true`.
 *   3. Otherwise `{ transport: null, live: false }`; the caller decides whether a
 *      missing transport is an error.
 */
export function resolveAgentTransport(opts = {}) {
  const env = opts.env || process.env;
  const { agent } = opts;

  if (typeof agent === 'function') {
    return { transport: agent, live: Boolean(env.IJFW_PROFILE_EVAL_LIVE) };
  }

  const liveRequested = Boolean(env && env.IJFW_PROFILE_EVAL_LIVE);
  if (!liveRequested) return { transport: null, live: false };

  const localUrl = (env.IJFW_PROFILE_LOCAL_URL || env.IJFW_BRAIN_LOCAL_URL || '').trim();
  if (!localUrl) return { transport: null, live: false };

  return { transport: makeLocalAgentTransport(localUrl), live: true };
}
537
+
538
// Default export: every public helper of this module gathered on one object, for
// callers that prefer a single namespace-style import. The two statistics
// helpers are the imported implementations, re-exposed unchanged.
const harnessApi = {
  bootstrapCI,
  mcnemar,
  mulberry32,
  splitByTime,
  precisionRecall,
  cohenKappa,
  expectedCalibrationError,
  objectiveStyle,
  styleDistance,
  biasControlledJudge,
  makeHeldOutFixture,
  resolveAgentTransport,
};

export default harnessApi;