@ijfw/memory-server 1.5.6 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +344 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +7 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +185 -14
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
// Gate B v2 — PRODUCTION runner wiring. These tests prove the integration seam, not the
|
|
2
|
+
// statistics (those are unit-tested in the module tests). The load-bearing claims:
|
|
3
|
+
//
|
|
4
|
+
// * makeMeasure produces the EXACT confirmatory shape decideGateB/confirmatoryBooleans
|
|
5
|
+
// consume — registerEchoPasses + realArmsCarried are COMPUTED booleans, never absent
|
|
6
|
+
// (the rails THROW on undefined; we prove they don't fire here).
|
|
7
|
+
// * The register-echo arm is actually run + spliced, so the VOID rail is live.
|
|
8
|
+
// * A voice-matching fake transport drives toward PASS; a generic fake → NULL.
|
|
9
|
+
// * INSTRUMENT GATE BEFORE SPEND: when validateInstrument fails, the runner refuses to
|
|
10
|
+
// spend and the transport is NEVER called (zero calls asserted).
|
|
11
|
+
// * Fail-closed: missing API key ⇒ BLOCKED, never a silent empty pass.
|
|
12
|
+
|
|
13
|
+
import { test } from 'node:test';
|
|
14
|
+
import assert from 'node:assert/strict';
|
|
15
|
+
import {
|
|
16
|
+
makeMeasure, runGateBProduction, buildRegisterEchoBrief, ECHO_ARM,
|
|
17
|
+
} from './gate-b-run.mjs';
|
|
18
|
+
import { buildPreReg } from './prereg.mjs';
|
|
19
|
+
import { fullStyleVector } from './stylometry.js';
|
|
20
|
+
import { generatePersonaText } from './synthetic-personas.js';
|
|
21
|
+
|
|
22
|
+
// ---- persona fixtures: real human-ish text, formal register (archetype 0) ----
|
|
23
|
+
/**
 * Build one persona fixture for the Gate B runner tests.
 *
 * Two train documents and one test document are generated with
 * generatePersonaText (archetype 0) from seeds derived from `seed`; the
 * fingerprint is fullStyleVector over the newline-joined test documents.
 *
 * @param {string} id    persona identifier
 * @param {number} seed  base seed; train docs use seed+1 / seed+2, the test doc seed+9001
 * @returns {object} persona record (id, flags, trainDocs, testDocs, token counts, fingerprint)
 */
function persona(id, seed) {
  const archetype = 0;
  const trainDocs = [1, 2].map((offset) => generatePersonaText(archetype, seed + offset, 20));
  const testDocs = [generatePersonaText(archetype, seed + 9001, 16)];
  const fingerprint = fullStyleVector(testDocs.join('\n'));
  return {
    id,
    synthetic: false,
    headlineEligible: true,
    trainDocs,
    testDocs,
    // fixed token counts for the fixture (not measured from the generated text)
    trainTokens: 999,
    testTokens: 999,
    fingerprint,
  };
}
|
|
37
|
+
// 8 same-register subjects + 4 same-register foreigners (so every subject is decidable).
// Ids are s0..s7 / f0..f3; each persona gets its own seed (stride 37 / 53).
const SUBJECTS = [...Array(8).keys()].map((n) => persona(`s${n}`, 1000 + 37 * n));
const FOREIGNERS = [...Array(4).keys()].map((n) => persona(`f${n}`, 7000 + 53 * n));
|
|
40
|
+
|
|
41
|
+
/**
 * A FAITHFUL fake agent. Per prompt, in this priority order:
 *   1. if the prompt contains """-delimited exemplars, echo them back
 *      (delimiters stripped, joined by single spaces);
 *   2. if it contains the REGISTER-ONLY marker, return the fixed generic blob;
 *   3. if it names a known subject via `__voice:<id>__`, return that subject's
 *      train docs joined by spaces (first matching id wins);
 *   4. otherwise return the same fixed generic blob.
 *
 * @param {object} subjectById  map of persona id -> persona (reads `.trainDocs`)
 * @returns {(prompt: string) => string} transport function
 */
function voiceMatchingTransport(subjectById) {
  const genericBlob = () => generatePersonaText(0, 424242, 16);
  return (prompt) => {
    const quoted = prompt.match(/"""([\s\S]*?)"""/g);
    if (quoted !== null) {
      return quoted.map((block) => block.split('"""').join('')).join(' ');
    }
    if (prompt.includes('REGISTER-ONLY')) {
      return genericBlob();
    }
    // derived/baseline: land near OWN voice if the brief names the subject's id, else generic
    const named = Object.entries(subjectById)
      .find(([id]) => prompt.includes(`__voice:${id}__`));
    return named ? named[1].trainDocs.join(' ') : genericBlob();
  };
}
|
|
56
|
+
/**
 * A GENERIC fake agent: ignores the prompt and always returns the same
 * generated blob, so no brief can give it an own-voice advantage.
 *
 * @returns {() => string} transport function
 */
function genericTransport() {
  const reply = () => {
    const blob = generatePersonaText(0, 424242, 16);
    return blob;
  };
  return reply;
}
|
|
60
|
+
/**
 * Wrap a transport so the number of invocations can be asserted (used to prove
 * ZERO spend on the instrument-fail path).
 *
 * The counter lives on the returned function as `.calls` and is incremented
 * BEFORE delegating, so a call that throws is still counted.
 *
 * @param {Function} fn  the transport to wrap (called with the single prompt argument)
 * @returns {Function} wrapper with a numeric `calls` property
 */
function counting(fn) {
  const wrapped = (prompt) => {
    wrapped.calls += 1;
    return fn(prompt);
  };
  wrapped.calls = 0;
  return wrapped;
}
|
|
66
|
+
|
|
67
|
+
// Shared measure configuration for the fixture-scaled tests below.
const TEST_CFG = {
  // fixture-scaled measured floor (real run derives floorK*(between-within))
  minMeanMargin: 0.0005,
  probes: [
    'Write a short note about your week.',
    'React to a teammate proposal.',
  ],
  minRealSubjects: 1,
  // synthetic templated train≈test ⇒ relax the 2nd-tier leak floor
  // (real corpus uses prereg.leakFloor).
  harnessCfg: {
    leakFloor: 0.0001,
  },
};
|
|
74
|
+
|
|
75
|
+
test('buildRegisterEchoBrief: a register-ONLY echo, distinct + honestly labeled', () => {
  const [subject] = SUBJECTS;
  const brief = buildRegisterEchoBrief(subject);
  assert.ok(brief.includes('REGISTER-ONLY'), 'echo brief is explicitly register-only');
  // A register echo is not a voice brief: none of the subject's train documents
  // may appear verbatim inside it.
  const leaksTrainProse = subject.trainDocs.some((doc) => brief.includes(doc));
  assert.ok(!leaksTrainProse);
});
|
|
81
|
+
|
|
82
|
+
test('makeMeasure returns the EXACT confirmatory shape (all rails are computed booleans)', async () => {
  const subjectById = Object.fromEntries(SUBJECTS.map((p) => [p.id, p]));
  const measure = makeMeasure({
    transport: voiceMatchingTransport(subjectById),
    personas: SUBJECTS,
    foreigners: FOREIGNERS,
    ...TEST_CFG,
  });
  const preReg = buildPreReg({ verdictArms: ['derived', 'fewShotOracle'] });
  const m = await measure({
    seed: 1,
    phase: 'confirmatory',
    preReg,
    minMeanMargin: TEST_CFG.minMeanMargin,
  });

  // Every rail must be an actual boolean — never undefined/absent.
  const rails = [
    'instrumentValid',
    'baselinePasses',
    'registerEchoPasses',
    'derivedPasses',
    'oraclePasses',
    'realArmsCarried',
  ];
  rails.forEach((k) => {
    assert.equal(typeof m[k], 'boolean', `${k} must be a computed boolean, got ${typeof m[k]}`);
  });

  // the register-echo arm really ran (rail is live, not silent-false)
  assert.ok(m.control.perArm[ECHO_ARM], 'register-echo arm spliced into the control');
});
|
|
97
|
+
|
|
98
|
+
test('the register-echo arm is spliced so confirmatoryBooleans does NOT throw', async () => {
  // confirmatoryBooleans throws if control.registerEchoPasses === undefined.
  // Prove the measure always defines it, even for a generic transport.
  const measureArgs = {
    transport: genericTransport(),
    personas: SUBJECTS,
    foreigners: FOREIGNERS,
    ...TEST_CFG,
  };
  const measure = makeMeasure(measureArgs);
  const preReg = buildPreReg({});
  const { control } = await measure({
    seed: 1,
    phase: 'confirmatory',
    preReg,
    minMeanMargin: TEST_CFG.minMeanMargin,
  });
  assert.notEqual(control.registerEchoPasses, undefined);
});
|
|
107
|
+
|
|
108
|
+
test('GENERIC transport ⇒ derived does NOT pass the wrong-target control (honest NULL)', async () => {
  // A transport that ignores the brief cannot favour any subject's own voice.
  const measureArgs = {
    transport: genericTransport(),
    personas: SUBJECTS,
    foreigners: FOREIGNERS,
    ...TEST_CFG,
  };
  const measure = makeMeasure(measureArgs);
  const preReg = buildPreReg({});
  const result = await measure({
    seed: 1,
    phase: 'confirmatory',
    preReg,
    minMeanMargin: TEST_CFG.minMeanMargin,
  });
  assert.equal(result.derivedPasses, false, 'generic output is equidistant from same-register targets');
});
|
|
116
|
+
|
|
117
|
+
test('VOICE-MATCHING transport ⇒ fewShotOracle margin beats generic (drives toward PASS)', async () => {
  const subjectById = Object.fromEntries(SUBJECTS.map((p) => [p.id, p]));
  const measure = makeMeasure({
    transport: voiceMatchingTransport(subjectById),
    personas: SUBJECTS,
    foreigners: FOREIGNERS,
    ...TEST_CFG,
  });
  const preReg = buildPreReg({});
  const m = await measure({
    seed: 1,
    phase: 'confirmatory',
    preReg,
    minMeanMargin: TEST_CFG.minMeanMargin,
  });

  // The oracle arm, echoing OWN train voice, lands closer to OWN test than to the nearest
  // same-register foreigner → positive mean margin (the structural PASS direction).
  const { perArm } = m.control;
  const oracleBeatsBaseline = perArm.fewShotOracle.meanMargin > perArm.baseline.meanMargin;
  assert.ok(oracleBeatsBaseline, 'oracle own-voice margin exceeds the no-brief baseline margin');
});
|
|
131
|
+
|
|
132
|
+
// ---- the full runner: instrument gate, refuse-to-spend, fail-closed ----
|
|
133
|
+
|
|
134
|
+
test('INSTRUMENT GATE: validateInstrument fails ⇒ NULL verdict, ZERO transport calls', async () => {
  const transport = counting(genericTransport());
  // Corpus rows carry the persona id plus train docs followed by test docs.
  const toCorpusRow = (p) => ({ id: p.id, docs: [...p.trainDocs, ...p.testDocs] });
  // force the instrument gate to fail regardless of fixture AUC:
  const forcedFail = () => ({
    passes: false,
    failedChecks: ['forced-fail'],
    betweenMean: 0.4,
    withinMean: 0.3,
  });

  const r = await runGateBProduction({
    apiKey: 'sk-test-fake',
    corpus: SUBJECTS.map((p) => toCorpusRow(p)),
    foreignersCorpus: FOREIGNERS.map((p) => toCorpusRow(p)),
    transport,
    personaCfg: { minTrainTokens: 50, minTestTokens: 30 }, // fixture-scaled (prod = 1200/600)
    validateOverride: forcedFail,
  });

  assert.equal(r.spent, false);
  assert.equal(r.verdict.verdict, 'NULL');
  assert.equal(transport.calls, 0, 'NO cloud spend when the instrument gate fails');
});
|
|
149
|
+
|
|
150
|
+
test('FAIL-CLOSED: missing API key ⇒ BLOCKED, never a silent empty pass', async () => {
  // An empty apiKey must surface as an explicit BLOCKED status with a reason
  // that mentions the key.
  const outcome = await runGateBProduction({
    apiKey: '',
    corpus: [],
    foreignersCorpus: [],
    transport: genericTransport(),
  });
  assert.equal(outcome.status, 'BLOCKED');
  assert.match(outcome.reason, /ANTHROPIC_API_KEY|key/i);
});
|
|
155
|
+
|
|
156
|
+
test('runner wires validate→prereg→measure and returns a verdict when the gate passes', async () => {
  const transport = counting(genericTransport());
  // Corpus rows carry the persona id plus train docs followed by test docs.
  const toCorpusRow = (p) => ({ id: p.id, docs: [...p.trainDocs, ...p.testDocs] });

  const r = await runGateBProduction({
    apiKey: 'sk-test-fake',
    corpus: SUBJECTS.map((p) => toCorpusRow(p)),
    foreignersCorpus: FOREIGNERS.map((p) => toCorpusRow(p)),
    transport,
    personaCfg: { minTrainTokens: 50, minTestTokens: 30 }, // fixture-scaled (prod = 1200/600)
    validateOverride: () => ({ passes: true, betweenMean: 0.5, withinMean: 0.3 }),
    preRegInput: { minSubjects: 1, floorK: 0.001 },
    measureCfg: { ...TEST_CFG, minRealSubjects: 1 },
  });

  assert.equal(r.spent, true);
  // Any verdict is acceptable here; this test only proves the wiring produces one.
  const knownVerdicts = ['PASS', 'PASS_ORACLE', 'CUT', 'NULL', 'VOID'];
  assert.ok(knownVerdicts.includes(r.verdict.verdict));
  const hashLooksFrozen = typeof r.preRegHash === 'string' && r.preRegHash.length === 64;
  assert.ok(hashLooksFrozen, 'frozen prereg hash surfaced');
  assert.ok(transport.calls > 0, 'the gate passed, so the transport WAS exercised');
});
|
|
173
|
+
|
|
174
|
+
test('END-TO-END: voice-matching transport never VOIDs (baseline + register-echo NULL)', async () => {
  // The honesty rail: a no-signal arm (baseline / register-echo) must NOT pass the control.
  // With a faithful agent only the OWN-voice arms move; baseline + echo land on register-center.
  const transport = voiceMatchingTransport(Object.fromEntries(SUBJECTS.map((p) => [p.id, p])));
  const r = await runGateBProduction({
    apiKey: 'sk-test-fake',
    corpus: SUBJECTS.map((p) => ({ id: p.id, docs: [...p.trainDocs, ...p.testDocs] })),
    foreignersCorpus: FOREIGNERS.map((p) => ({ id: p.id, docs: [...p.trainDocs, ...p.testDocs] })),
    transport,
    personaCfg: { minTrainTokens: 50, minTestTokens: 30 },
    validateOverride: () => ({ passes: true, betweenMean: 0.5, withinMean: 0.3 }),
    preRegInput: { minSubjects: 4, floorK: 0.001 },
    measureCfg: { ...TEST_CFG, minRealSubjects: 1 },
  });
  assert.notEqual(r.verdict.verdict, 'VOID', 'no-signal arms did not contaminate the control');
  const pa = r.confirmatory.control.perArm;
  assert.equal(pa.baseline.verdict.passes, false, 'baseline NULLs (no own-voice signal)');
  // Look the echo arm up through the exported ECHO_ARM key (the same key the
  // makeMeasure shape test uses) instead of a hard-coded property name, so this
  // assertion cannot drift from the runner if the arm is ever renamed.
  assert.equal(pa[ECHO_ARM].verdict.passes, false, 'register-echo NULLs (register alone cannot win)');
});
|
|
193
|
+
|
|
194
|
+
test('formatVerdict prints a NULL/CUT as cleanly as a PASS (no retry-to-force)', async () => {
  const { formatVerdict } = await import('./gate-b-run.mjs');
  // A finished run whose verdict is NULL, with no per-arm detail.
  const nullRun = {
    status: 'DONE',
    spent: true,
    cloudCalls: 10,
    budgetMax: 100,
    nSubjects: 4,
    nForeigners: 4,
    runId: 'gateb-x',
    preRegHash: 'h'.repeat(64),
    verdict: { verdict: 'NULL', reason: 'real authors did not carry it', ship: 'portability' },
    confirmatory: { control: { perArm: {} } },
  };
  const rendered = formatVerdict(nullRun);
  assert.match(rendered, /Gate B verdict: NULL/);
  assert.match(rendered, /ship:\s+portability/);
});
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* profile/eval/gate-c-capture.mjs — Cross-system profile bus, PHASE P5 (Gate C).
|
|
3
|
+
*
|
|
4
|
+
* GATE C — CAPTURE, HELD-OUT. The honest answer to "does the profile capture the
|
|
5
|
+
* user?" without the circular train==test leakage the slice-1 hook had. We:
|
|
6
|
+
* 1. Derive a profile from TRAIN sessions ONLY (sessions 1..k), through the
|
|
7
|
+
* REAL pipeline: deriveProfile() -> applyDelta() -> a real UserProfile.
|
|
8
|
+
* 2. Evaluate on a DISJOINT, surface-varied PROBE set (LaMP time-based split):
|
|
9
|
+
* probes live in a later time window with DIFFERENT session ids and
|
|
10
|
+
* paraphrased restatements of the same preferences — so recovering them
|
|
11
|
+
* tests GENERALIZATION, not memorization of exact train strings.
|
|
12
|
+
* 3. Report inference PRECISION and RECALL, each with a bootstrap CI from the
|
|
13
|
+
* REAL lab-study helper, plus a NEGATIVE-CONTROL persona (the derived
|
|
14
|
+
* profile must NOT match an unrelated user's prefs — a precision guard).
|
|
15
|
+
*
|
|
16
|
+
* HELD-OUT ENFORCEMENT is not a comment — it is an ASSERTION. Before scoring we
|
|
17
|
+
* verify train-session-ids ∩ probe-session-ids = ∅ (checked directly in runGateC). If a
|
|
18
|
+
* caller hands us a leaky split, runGateC throws rather than reporting an
|
|
19
|
+
* inflated number. That is the whole point of Gate C vs the slice-1 hook.
|
|
20
|
+
*
|
|
21
|
+
* Zero deps. ESM. Wires the REAL derive/merge modules + REAL stats. No stubs.
|
|
22
|
+
*
|
|
23
|
+
* Cites: LaMP time-based split [2304.11406].
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { deriveProfile } from '../derive.js';
|
|
27
|
+
import { applyDelta } from '../merge.js';
|
|
28
|
+
import { makeProfile } from '../schema.js';
|
|
29
|
+
import {
|
|
30
|
+
splitByTime,
|
|
31
|
+
precisionRecall,
|
|
32
|
+
bootstrapCI,
|
|
33
|
+
makeHeldOutFixture,
|
|
34
|
+
} from './harness.mjs';
|
|
35
|
+
|
|
36
|
+
/**
 * Slug a free-text phrase so a probe's gold phrase and a derived inference's
 * `subject` can be compared in the same space.
 *
 * Steps: coerce to string (falsy input becomes ''), lower-case, replace every
 * run of characters other than a-z, 0-9 and whitespace with one space,
 * collapse whitespace runs to a single space, trim, keep the first 80 chars.
 * Truncation happens AFTER trimming, so a result cut at a word boundary can
 * end with a space.
 *
 * NOTE(review): intended to mirror phraseToSubject in derive-heuristic.js,
 * which is not exported and not visible here — keep the two rules in sync.
 *
 * @param {*} phrase  phrase to normalize
 * @returns {string} subject slug, at most 80 characters
 */
function toSubject(phrase) {
  const lowered = String(phrase || '').toLowerCase();
  const alnumOnly = lowered.replace(/[^a-z0-9\s]+/g, ' ');
  const collapsed = alnumOnly.replace(/\s+/g, ' ').trim();
  return collapsed.slice(0, 80);
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Push v2 — SEMANTIC preference match (eval-metric fix).
|
|
54
|
+
*
|
|
55
|
+
* The exact-slug match (set equality on toSubject) reports recall = 0 on the real
|
|
56
|
+
* corpus because the user restates the SAME preference cross-time with DIFFERENT
|
|
57
|
+
* words, so the verbatim sentence-fragment slugs never coincide. That conflates
|
|
58
|
+
* "captured-but-paraphrased" with "genuinely-not-captured". To separate them we
|
|
59
|
+
* add a principled LEXICAL-SEMANTIC match: two subjects count as the same
|
|
60
|
+
* preference if their CONTENT-WORD token sets overlap by Jaccard >= a documented
|
|
61
|
+
* threshold (default 0.5). No embedder is used — bringing the memory-tier
|
|
62
|
+
* embedder onto this path would also drag network/LLM modules across the profile
|
|
63
|
+
* moat — so we use a deterministic, dependency-free lexical-semantic similarity
|
|
64
|
+
* with a pre-registered threshold instead (the task's sanctioned fallback).
|
|
65
|
+
*
|
|
66
|
+
* STOPWORDS are stripped so function words ("the user prefers to") don't inflate
|
|
67
|
+
* overlap; the match is on the substantive tokens that carry the preference.
|
|
68
|
+
*/
|
|
69
|
+
// Words ignored when comparing subjects: function words plus the generic
// preference vocabulary ("user", "prefers", "likes", ...), so overlap is
// measured on the substantive tokens only.
const STOPWORDS = new Set([
  'the', 'a', 'an', 'to', 'of', 'and', 'or', 'for',
  'in', 'on', 'with', 'is', 'are', 'be', 'i', 'you',
  'user', 'prefer', 'prefers', 'preference', 'like', 'likes', 'want', 'wants',
  'use', 'uses', 'using', 'should', 'would', 'do', 'does', 'my',
  'me', 'it', 'this', 'that', 'when', 'always', 'not', 'no',
]);

/**
 * Content-word token SET for an (already-slugged) subject.
 *
 * The subject is split on whitespace; a token is kept when it is at least
 * three characters long and is not a stopword. No case folding or punctuation
 * stripping happens here — callers are expected to pass a slug.
 *
 * @param {*} subject  slugged subject (falsy input yields an empty set)
 * @returns {Set<string>} kept tokens, in first-seen order
 */
function contentTokens(subject) {
  const words = String(subject || '').split(/\s+/);
  const kept = words.filter((word) => word.length >= 3 && !STOPWORDS.has(word));
  return new Set(kept);
}
|
|
84
|
+
|
|
85
|
+
/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| between two token sets.
 *
 * @param {Set<string>} aSet
 * @param {Set<string>} bSet
 * @returns {number} value in [0, 1]; 0 when either set is empty
 */
function jaccard(aSet, bSet) {
  if (!(aSet.size && bSet.size)) return 0;
  const shared = [...aSet].filter((token) => bSet.has(token)).length;
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = aSet.size + bSet.size - shared;
  return shared / unionSize;
}
|
|
92
|
+
|
|
93
|
+
/**
 * SEMANTIC precision/recall of predicted vs gold subjects.
 *
 * Each subject is reduced to its content-token set (contentTokens). A predicted
 * subject is "correct" when its Jaccard similarity with ANY gold subject is
 * >= threshold; a gold subject is "recovered" when ANY predicted subject
 * matches it the same way. This is OR-matching, not a 1:1 assignment, so
 * several near-duplicate predictions can all match one gold subject.
 *
 * The result carries per-unit 0/1 vectors (perPredCorrect, perGoldHit), which
 * runGateC passes to bootstrapCI.
 *
 * @param {Array} [predicted=[]]   predicted subjects (expected already slugged)
 * @param {Array} [gold=[]]        gold subjects (expected already slugged)
 * @param {number} [threshold=0.5] minimum Jaccard similarity for a match
 * @returns {{precision: number, recall: number, f1: number, tp: number, fp: number,
 *   fn: number, perGoldHit: number[], perPredCorrect: number[], threshold: number}}
 *   precision/recall/f1 are 0 when their denominator is 0
 */
export function semanticPrecisionRecall(predicted = [], gold = [], threshold = 0.5) {
  const predSets = (predicted || []).map((p) => contentTokens(p));
  const goldSets = (gold || []).map((g) => contentTokens(g));
  const isMatch = (predSet, goldSet) => jaccard(predSet, goldSet) >= threshold;
  const countOnes = (bits) => bits.reduce((total, bit) => total + bit, 0);

  const perPredCorrect = predSets.map((predSet) => (
    goldSets.some((goldSet) => isMatch(predSet, goldSet)) ? 1 : 0
  ));
  const perGoldHit = goldSets.map((goldSet) => (
    predSets.some((predSet) => isMatch(predSet, goldSet)) ? 1 : 0
  ));

  const tp = countOnes(perPredCorrect);
  const recovered = countOnes(perGoldHit);
  const fp = perPredCorrect.length - tp;
  const fn = perGoldHit.length - recovered;

  const precision = predSets.length ? tp / predSets.length : 0;
  const recall = goldSets.length ? recovered / goldSets.length : 0;
  const denom = precision + recall;
  const f1 = denom > 0 ? (2 * precision * recall) / denom : 0;

  return { precision, recall, f1, tp, fp, fn, perGoldHit, perPredCorrect, threshold };
}
|
|
120
|
+
|
|
121
|
+
/**
 * Derive a profile from a list of train sessions.
 *
 * Starting from makeProfile(), each session is turned into a signals object,
 * passed to deriveProfile(), and the returned delta is folded into the running
 * profile with applyDelta(). Sessions are processed strictly one after another
 * because each merge builds on the previous result.
 *
 * @param {Array} [sessions=[]] train sessions; fields read per session:
 *   metadata, feedback, outcomes, session_id (or sessionId), host, style
 * @param {object} [opts]
 * @param {Function} [opts._localTransport] forwarded to deriveProfile (dialectic arm / ablation lever)
 * @param {object} [opts.env] forwarded to deriveProfile
 * @returns {Promise<object>} the profile after all deltas have been applied
 */
export async function deriveProfileFromSessions(sessions = [], opts = {}) {
  // Map a raw session record onto the signal names deriveProfile receives.
  const toSignals = (session) => ({
    metadata: session.metadata,
    feedback: session.feedback,
    outcomes: session.outcomes,
    sessionId: session.session_id ?? session.sessionId,
    host: session.host,
    // dialectic corroborates over the per-session style array when present:
    style: session.style,
  });

  let merged = makeProfile();
  for (const session of sessions) {
    // Sequential on purpose: the next merge needs the previous profile.
    // eslint-disable-next-line no-await-in-loop
    const delta = await deriveProfile(toSignals(session), {
      env: opts.env,
      _localTransport: opts._localTransport,
    });
    merged = applyDelta(merged, delta);
  }
  return merged;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Collect the preference subjects a profile ASSERTS — the predicted set that
 * runGateC scores against the held-out gold.
 *
 * Reads `profile.global.dialectic` (treated as empty when missing or not an
 * array), keeps entries whose `kind` is 'preference' and that clear the
 * inclusion floor (confidence strictly above minConfidence AND evidence_count
 * at least minEvidence), slugs each subject with toSubject, and de-duplicates
 * while keeping first-seen order.
 *
 * @param {object} profile  profile to read
 * @param {object} [opts]
 * @param {number} [opts.minConfidence=0.6] exclusive lower bound on confidence
 * @param {number} [opts.minEvidence=3]     inclusive lower bound on evidence_count
 * @returns {string[]} unique subject slugs
 */
export function assertedSubjects(profile, opts = {}) {
  const minConf = Number.isFinite(opts.minConfidence) ? opts.minConfidence : 0.6;
  const minEv = Number.isFinite(opts.minEvidence) ? opts.minEvidence : 3;
  const inferences = Array.isArray(profile?.global?.dialectic) ? profile.global.dialectic : [];

  const clearsFloor = (inf) => (
    Number(inf.confidence) > minConf && (Number(inf.evidence_count) || 0) >= minEv
  );
  const slugs = inferences
    .filter((inf) => inf && inf.kind === 'preference')
    .filter(clearsFloor)
    .map((inf) => toSubject(inf.subject));
  return [...new Set(slugs)];
}
|
|
178
|
+
|
|
179
|
+
/**
 * runGateC(corpus, opts) -> Gate C report.
 *
 * Derives a profile from the TRAIN sessions only, then scores the subjects the
 * profile asserts against the gold subjects attached to the held-out PROBE
 * sessions — once by exact slug match and once by the lexical-semantic match.
 *
 * Held-out modes:
 *   (a) `corpus.probes` is a non-empty array: TRAIN = all `corpus.sessions`,
 *       PROBE = `corpus.probes`. Ids come from session_id / sessionId; entries
 *       without an id do not take part in the leakage check.
 *   (b) otherwise `corpus.sessions` is split by splitByTime(cutoff / trainFraction).
 *
 * @param {object} [corpus] { sessions, probes, negativeControl }; when falsy the
 *   makeHeldOutFixture() fixture is used
 * @param {object} [opts]
 * @param {string|number} [opts.cutoff]       forwarded to splitByTime (mode b)
 * @param {number} [opts.trainFraction]       forwarded to splitByTime (mode b)
 * @param {Function} [opts._localTransport]   forwarded to deriveProfileFromSessions
 * @param {object} [opts.env]                 forwarded to deriveProfileFromSessions
 * @param {number} [opts.bootstrapSeed]       default 42; the four CIs use seed, seed+1, seed+2, seed+3
 * @param {number} [opts.semanticThreshold]   default 0.5; Jaccard threshold of the semantic match
 *
 * @returns {Promise<{
 *   heldOut: { disjoint, trainIds, probeIds },
 *   precision, recall, f1,
 *   semantic: { threshold, precision, recall, f1, counts },
 *   predicted, gold, counts: { tp, fp, fn },
 *   negativeControl: { precision, matched, semanticPrecision, semanticMatched },
 *   nTrain, nProbe }>}
 *   `precision` / `recall` (and their semantic counterparts) are whatever
 *   bootstrapCI returns; `nProbe` is the number of distinct probe ids.
 *
 * @throws {Error} if any probe session id also appears in the train id set.
 */
export async function runGateC(corpus, opts = {}) {
  const data = corpus || makeHeldOutFixture();
  const allSessions = Array.isArray(data.sessions) ? data.sessions : [];
  const explicitProbes = Array.isArray(data.probes) ? data.probes : [];

  // Non-empty string ids of a session list (session_id wins over sessionId).
  const idSetOf = (items) => new Set(
    items.map((item) => String(item.session_id ?? item.sessionId ?? '')).filter(Boolean),
  );

  let trainSessions;
  let probeSessions;
  let trainIds;
  let probeIds;
  if (explicitProbes.length === 0) {
    // Mode (b): chronological split of `sessions`.
    const split = splitByTime(allSessions, {
      cutoff: opts.cutoff,
      trainFraction: opts.trainFraction,
    });
    ({
      train: trainSessions, probe: probeSessions, trainIds, probeIds,
    } = split);
  } else {
    // Mode (a): explicit, separately supplied probe set.
    trainSessions = allSessions;
    probeSessions = explicitProbes;
    trainIds = idSetOf(allSessions);
    probeIds = idSetOf(explicitProbes);
  }

  // HELD-OUT ENFORCEMENT: a probe id inside the train id set means a circular
  // evaluation, so refuse to score it.
  const leaked = [...probeIds].some((id) => trainIds.has(id));
  if (leaked) {
    throw new Error(
      'Gate C held-out violation: a probe session id also appears in the train set '
        + '(train/test leakage). Refusing to report a circular capture score.',
    );
  }

  // Derivation only ever sees the train sessions.
  const profile = await deriveProfileFromSessions(trainSessions, {
    env: opts.env,
    _localTransport: opts._localTransport,
  });

  // PREDICTED = subjects the profile asserts; GOLD = union of the probes' gold
  // subjects, slugged into the same space and de-duplicated.
  const predicted = assertedSubjects(profile);
  const gold = [...new Set(
    [...probeSessions].flatMap((probe) => (
      [...(probe.goldSubjects || [])].map((phrase) => toSubject(phrase))
    )),
  )];

  // Exact-slug metric, with bootstrap CIs over its per-unit 0/1 vectors.
  const pr = precisionRecall(predicted, gold);
  const seed = Number.isFinite(opts.bootstrapSeed) ? opts.bootstrapSeed : 42;
  const precision = bootstrapCI(pr.perPredCorrect, { seed });
  const recall = bootstrapCI(pr.perGoldHit, { seed: seed + 1 });

  // Semantic metric (Jaccard >= threshold on content tokens), reported next to
  // the exact one and bootstrapped the same way.
  const semThreshold = Number.isFinite(opts.semanticThreshold) ? opts.semanticThreshold : 0.5;
  const semPr = semanticPrecisionRecall(predicted, gold, semThreshold);
  const semantic = {
    threshold: semThreshold,
    precision: bootstrapCI(semPr.perPredCorrect, { seed: seed + 2 }),
    recall: bootstrapCI(semPr.perGoldHit, { seed: seed + 3 }),
    f1: semPr.f1,
    counts: { tp: semPr.tp, fp: semPr.fp, fn: semPr.fn },
  };

  // NEGATIVE CONTROL: score the SAME predicted set against an unrelated
  // persona's gold, with both matchers. Low values are the precision guard.
  const control = data.negativeControl;
  let negativeControl = { precision: 0, matched: [], semanticPrecision: 0, semanticMatched: [] };
  if (control && Array.isArray(control.goldSubjects)) {
    const ncGoldSlugs = control.goldSubjects.map(toSubject);
    const ncGold = new Set(ncGoldSlugs);
    const matched = predicted.filter((subject) => ncGold.has(subject));
    const ncSem = semanticPrecisionRecall(predicted, ncGoldSlugs, semThreshold);
    negativeControl = {
      precision: predicted.length ? matched.length / predicted.length : 0,
      matched,
      semanticPrecision: ncSem.precision,
      semanticMatched: predicted.filter((_subject, i) => ncSem.perPredCorrect[i] === 1),
    };
  }

  return {
    // `disjoint` is always true here: a leaky split has already thrown above.
    heldOut: { disjoint: true, trainIds: [...trainIds], probeIds: [...probeIds] },
    precision,
    recall,
    f1: pr.f1,
    semantic,
    predicted,
    gold,
    counts: { tp: pr.tp, fp: pr.fp, fn: pr.fn },
    negativeControl,
    nTrain: trainSessions.length,
    nProbe: probeIds.size,
  };
}
|
|
320
|
+
|
|
321
|
+
// Default export: the same four functions that are also exported by name.
export default {
  runGateC,
  deriveProfileFromSessions,
  assertedSubjects,
  semanticPrecisionRecall,
};
|