@ijfw/memory-server 1.5.6 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +344 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +7 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +185 -14
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* profile/eval/plumbing.test.mjs — PHASE P5.1.
|
|
3
|
+
*
|
|
4
|
+
* ┌─────────────────────────────────────────────────────────────────────────┐
|
|
5
|
+
* │ THIS IS A PLUMBING / REGRESSION TEST. IT IS *NOT* PROOF THE SYSTEM │
|
|
6
|
+
* │ "LEARNS YOU." │
|
|
7
|
+
* │ │
|
|
8
|
+
* │ It asserts ONE thing: synthetic signals round-trip through the REAL │
|
|
9
|
+
* │ derivation pipeline (deriveHeuristic / deriveProfile) into a profile │
|
|
10
|
+
* │ delta with the expected shape. That is a wiring check — it proves the │
|
|
11
|
+
* │ pipes connect, nothing more. │
|
|
12
|
+
* │ │
|
|
13
|
+
* │ It deliberately TRAINS AND TESTS ON THE SAME SIGNAL (the circular setup │
|
|
14
|
+
* │ the slice-1 audit flagged as "the Honcho move" — assert-not-prove). That │
|
|
15
|
+
* │ circularity is FINE for a plumbing test and FATAL for a proof-of- │
|
|
16
|
+
* │ learning claim. Do NOT cite this test as evidence the profile captures │
|
|
17
|
+
* │ or changes behavior. │
|
|
18
|
+
* │ │
|
|
19
|
+
* │ The REAL evidence lives in the two held-out gates: │
|
|
20
|
+
* │ - Gate C (gate-c-capture.mjs): held-out capture, precision AND recall. │
|
|
21
|
+
* │ - Gate B (gate-b-behavior.mjs): behavioral A/B, paired McNemar. │
|
|
22
|
+
* │ The word "prove" is reserved for Gate B passing — never for this file. │
|
|
23
|
+
* └─────────────────────────────────────────────────────────────────────────┘
|
|
24
|
+
*
|
|
25
|
+
* Cite (why circular-capture eval is not proof): PrefEval [2502.09597].
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import { test } from 'node:test';
|
|
29
|
+
import assert from 'node:assert/strict';
|
|
30
|
+
|
|
31
|
+
import { deriveHeuristic } from '../derive-heuristic.js';
|
|
32
|
+
import { deriveProfile } from '../derive.js';
|
|
33
|
+
|
|
34
|
+
const SIGNALS = {
|
|
35
|
+
metadata: {
|
|
36
|
+
avg_msg_chars: 38, emoji_per_msg: 0, code_block_ratio: 0.7,
|
|
37
|
+
formality_markers: 0.55, turn_cadence_per_min: 6, msg_count: 20,
|
|
38
|
+
},
|
|
39
|
+
feedback: [
|
|
40
|
+
{ ts: '2026-06-01T00:00:00.000Z', kind: 'correction', phrase: 'use tabs not spaces', context: '' },
|
|
41
|
+
],
|
|
42
|
+
sessionId: 's0',
|
|
43
|
+
host: 'claude',
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
test('[PLUMBING / NOT PROOF] synthetic signals round-trip through the REAL deriveHeuristic into a delta', () => {
|
|
47
|
+
const delta = deriveHeuristic(SIGNALS);
|
|
48
|
+
// Shape-only assertions — this is a wiring check, not a learning claim.
|
|
49
|
+
assert.ok(delta && typeof delta === 'object', 'delta produced');
|
|
50
|
+
assert.ok(delta.style && delta.style.terseness, 'style axis present');
|
|
51
|
+
assert.ok(Array.isArray(delta.inferences) && delta.inferences.length === 1, 'one preference inference');
|
|
52
|
+
assert.equal(delta.inferences[0].kind, 'preference');
|
|
53
|
+
assert.equal(delta.inferences[0].subject, 'use tabs not spaces');
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
test('[PLUMBING / NOT PROOF] deriveProfile orchestrator returns the heuristic floor with zero LLM config', async () => {
|
|
57
|
+
// No local URL, no cloud opt-in -> heuristic-only floor. Round-trip wiring only.
|
|
58
|
+
const delta = await deriveProfile(SIGNALS, { env: {} });
|
|
59
|
+
assert.ok(delta && typeof delta === 'object');
|
|
60
|
+
assert.ok(delta.style && delta.style.terseness, 'heuristic floor style present');
|
|
61
|
+
assert.ok(Array.isArray(delta.inferences) && delta.inferences.length >= 1, 'heuristic floor inference present');
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
test('[PLUMBING / NOT PROOF] this file makes NO held-out or behavioral claim (guard)', () => {
|
|
65
|
+
// A literal guard so a future reader cannot quietly upgrade this plumbing test
|
|
66
|
+
// into a proof: the only thing asserted here is round-trip wiring. Real proof
|
|
67
|
+
// is Gate C (held-out capture) + Gate B (behavioral A/B), in their own files.
|
|
68
|
+
assert.ok(true, 'plumbing only — see gate-c-capture.mjs / gate-b-behavior.mjs for proof');
|
|
69
|
+
});
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
// prereg.mjs — Gate B v2, Task T6. Pre-registration rig. Freezes the analysis plan BEFORE
|
|
2
|
+
// the confirmatory run so nothing can be tuned to pass after seeing results. The hash
|
|
3
|
+
// covers EVERY decision-bearing field (including leakFloor, perTestAlpha, foreignAggregation)
|
|
4
|
+
// so any post-hoc edit is tamper-evident. Split/persona/bootstrap seeds are a pure function
|
|
5
|
+
// of preReg.seed. Per-arm alpha is Bonferroni-split across the verdict-bearing arms.
|
|
6
|
+
|
|
7
|
+
import crypto from 'node:crypto';
|
|
8
|
+
import { bootstrapCI } from '../../memory/bench-metrics.js';
|
|
9
|
+
|
|
10
|
+
// Every field here is frozen and hashed. Editing any of them changes the runId hash.
|
|
11
|
+
/**
 * Default analysis plan for the Gate B pre-registration.
 *
 * Every key of this object is included in the pre-registration hash, so changing
 * any value changes the resulting hash (and the derived runId).
 *
 * `Object.freeze` is shallow, so the nested `verdictArms` array is frozen
 * explicitly; otherwise `PREREG_DEFAULTS.verdictArms.push(...)` would silently
 * rewrite the "frozen" defaults for every later run in the process.
 */
export const PREREG_DEFAULTS = Object.freeze({
  primaryEndpoint: 'own-vs-nearest-same-register-foreigner-margin',
  corpus: 'reddit-single-subreddit',
  registerDelta: 0.15,
  familyAlpha: 0.01, // family-wise; Bonferroni-split across verdictArms
  alpha: 0.02, // bootstrap two-sided @0.02 ⇒ one-sided 99% lower bound
  floorK: 0.25, // measured-scale floor MULTIPLIER: minMeanMargin = floorK*(betweenMean-withinMean)
  leakFloor: 0.02,
  foreignAggregation: 'nearest', // NEVER 'mean'/'centroid'
  nExemplars: 2,
  verdictArms: Object.freeze(['derived', 'fewShotOracle']), // frozen: the outer freeze does not reach nested arrays
  minSubjects: 60,
  minDetectableEffect: 0.02,
  seed: 1,
});
|
|
26
|
+
|
|
27
|
+
// perTestAlpha is derived but ALSO hashed, so a manual post-hoc override is tamper-evident.
|
|
28
|
+
const HASHED_FIELDS = [...Object.keys(PREREG_DEFAULTS), 'perTestAlpha'];
|
|
29
|
+
|
|
30
|
+
// Stable stringify: object keys sorted (key-order independent), array order preserved.
|
|
31
|
+
/**
 * Serialize a value to a canonical string suitable for hashing.
 *
 * Object keys are emitted in sorted order (so insertion order never affects the
 * result); array order is preserved. Values that plain JSON would drop or merge
 * are kept distinct so two different plans cannot share a serialization:
 *   - `undefined` (also functions/symbols, which JSON.stringify maps to undefined)
 *     is written as the bare token `undefined`, including inside arrays and for
 *     array holes;
 *   - NaN / Infinity / -Infinity are written by name instead of collapsing to `null`.
 * For ordinary JSON data the output is unchanged from plain key-sorted JSON.
 *
 * @param {*} o value to serialize
 * @returns {string} canonical serialization (always a string)
 */
function stableStringify(o) {
  if (Array.isArray(o)) {
    // Array.from visits holes as undefined, so sparse arrays serialize explicitly too.
    return `[${Array.from(o, (item) => stableStringify(item)).join(',')}]`;
  }
  if (o && typeof o === 'object') {
    const members = Object.keys(o)
      .sort()
      .map((k) => `${JSON.stringify(k)}:${stableStringify(o[k])}`);
    return `{${members.join(',')}}`;
  }
  // JSON.stringify turns every non-finite number into "null"; keep them distinguishable.
  if (typeof o === 'number' && !Number.isFinite(o)) return String(o);
  const encoded = JSON.stringify(o);
  return encoded === undefined ? 'undefined' : encoded;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Fingerprint a pre-registration.
 *
 * Only the properties named in HASHED_FIELDS are read from `preReg`; they are
 * serialized with stableStringify and digested with SHA-256.
 *
 * @param {object} preReg pre-registration (or any object carrying the hashed fields)
 * @returns {string} lowercase hex SHA-256 digest
 */
export function hashPreReg(preReg) {
  const subset = HASHED_FIELDS.reduce((acc, field) => {
    acc[field] = preReg[field];
    return acc;
  }, {});
  const canonical = stableStringify(subset);
  const hasher = crypto.createHash('sha256');
  hasher.update(canonical);
  return hasher.digest('hex');
}
|
|
44
|
+
|
|
45
|
+
// Bonferroni per-arm alpha — NOT one reused alpha. Returns { arm: familyAlpha/k }.
|
|
46
|
+
/**
 * Split a family-wise alpha evenly across the verdict-bearing arms.
 *
 * @param {number} familyAlpha family-wise error rate to divide
 * @param {Iterable<string>} verdictArms arm names (needs a `length`); an empty
 *   list divides by 1 and yields an empty result
 * @returns {Object<string, number>} map of arm name → familyAlpha / number of arms
 */
export function bonferroniAlpha(familyAlpha, verdictArms) {
  const armCount = verdictArms.length || 1;
  const share = familyAlpha / armCount;
  return [...verdictArms].reduce((perArm, arm) => {
    perArm[arm] = share;
    return perArm;
  }, {});
}
|
|
53
|
+
|
|
54
|
+
// Measured-scale floor: the minimum mean margin that counts as a real effect, expressed
|
|
55
|
+
// in the instrument's OWN units = floorK * (betweenMean − withinMean) from validateInstrument.
|
|
56
|
+
// This REPLACES the blind absolute constant (the prior attempt's failure class). Frozen
|
|
57
|
+
// before any cloud spend (floorK is hashed; the derived value is recorded in the run).
|
|
58
|
+
/**
 * Derive the minimum mean margin that counts as a real effect, in the
 * instrument's own units: floorK × (betweenMean − withinMean).
 *
 * @param {{betweenMean: number, withinMean: number}} validation instrument
 *   validation summary supplying the two means
 * @param {number} floorK positive, finite multiplier
 * @returns {number} floorK × separation
 * @throws {Error} when the separation is not a finite positive number, or when
 *   floorK is not a finite positive number (checked in that order)
 */
export function deriveMinMeanMargin(validation, floorK) {
  const { betweenMean, withinMean } = validation;
  const separation = betweenMean - withinMean;

  const separationOk = Number.isFinite(separation) && separation > 0;
  if (!separationOk) {
    throw new Error(`cannot derive measured-scale floor: invalid instrument separation (between ${betweenMean}, within ${withinMean})`);
  }

  const floorKOk = Number.isFinite(floorK) && floorK > 0;
  if (!floorKOk) {
    throw new Error(`invalid floorK ${floorK}`);
  }

  return floorK * separation;
}
|
|
66
|
+
|
|
67
|
+
// Seeds are a pure function of preReg.seed — same seed ⇒ same splits everywhere.
|
|
68
|
+
/**
 * Derive the split / persona / bootstrap sub-seeds from one master seed.
 *
 * Each sub-seed is the first four bytes (big-endian, unsigned 32-bit) of
 * SHA-256(`${seed}:${tag}`), so the result depends only on `seed`.
 *
 * @param {number|string} seed master seed
 * @returns {{splitSeed: number, personaSeed: number, bootstrapSeed: number}}
 */
export function deriveSeeds(seed) {
  const seedFor = (tag) => crypto
    .createHash('sha256')
    .update(`${seed}:${tag}`)
    .digest()
    .readUInt32BE(0);

  return {
    splitSeed: seedFor('split'),
    personaSeed: seedFor('persona'),
    bootstrapSeed: seedFor('bootstrap'),
  };
}
|
|
72
|
+
|
|
73
|
+
// buildPreReg(input) → frozen config + derived alpha/seeds + runId hash.
|
|
74
|
+
/**
 * Build a frozen pre-registration from optional overrides.
 *
 * Merges `input` over PREREG_DEFAULTS, derives the per-arm alpha and the
 * sub-seeds, hashes the result, and assigns a runId (`input.runId` when given,
 * otherwise `gateb-` + the first 12 hex chars of the hash).
 *
 * `Object.freeze` is shallow, so the nested values created or copied here
 * (`perTestAlpha`, `seeds`, `verdictArms`) are frozen individually. Without that,
 * `preReg.perTestAlpha.derived = 0.05` would succeed after the plan was hashed,
 * leaving the recorded hash describing a plan that is no longer the one in use.
 * `verdictArms` is also copied, so a later mutation of the caller's array cannot
 * leak into the frozen plan. Other nested values supplied through `input` are
 * stored as given.
 *
 * @param {object} [input={}] overrides for any PREREG_DEFAULTS key, plus optional `runId`
 * @returns {Readonly<object>} frozen config + `perTestAlpha` + `seeds` + `hash` + `runId`
 */
export function buildPreReg(input = {}) {
  const cfg = { ...PREREG_DEFAULTS, ...input };
  const perTestAlpha = Object.freeze(bonferroniAlpha(cfg.familyAlpha, cfg.verdictArms));
  const seeds = Object.freeze(deriveSeeds(cfg.seed));
  // Copy-then-freeze keeps array contents (and therefore the hash) identical.
  const verdictArms = Array.isArray(cfg.verdictArms)
    ? Object.freeze([...cfg.verdictArms])
    : cfg.verdictArms;
  const base = { ...cfg, verdictArms, perTestAlpha, seeds };
  const hash = hashPreReg(base);
  return Object.freeze({ ...base, hash, runId: input.runId || `gateb-${hash.slice(0, 12)}` });
}
|
|
82
|
+
|
|
83
|
+
// assertFrozen(registry, preReg): a runId can be registered ONCE. Re-registering (a re-run
|
|
84
|
+
// after an edit) throws — the rig refuses to silently overwrite a frozen plan.
|
|
85
|
+
/**
 * Register a pre-registration's runId exactly once.
 *
 * @param {Map<string, string>} registry runId → hash store (needs has/get/set)
 * @param {{runId: string, hash: string}} preReg pre-registration to freeze
 * @returns {true} when the runId was not yet registered (it is stored with its hash)
 * @throws {Error} when the runId is already present; the message carries the
 *   first 12 characters of the previously stored hash
 */
export function assertFrozen(registry, preReg) {
  const { runId } = preReg;

  if (!registry.has(runId)) {
    registry.set(runId, preReg.hash);
    return true;
  }

  const frozenHash = registry.get(runId);
  throw new Error(`runId ${runId} already frozen (hash ${frozenHash.slice(0, 12)}); start a new run`);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Create a seeded pseudo-random generator.
 *
 * @param {number} seed coerced to an unsigned 32-bit integer
 * @returns {function(): number} generator yielding values in [0, 1); the
 *   sequence is fully determined by `seed`
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function rng() {
    // advance the 32-bit state by the fixed increment (wrapping)
    state = ((state | 0) + 0x6D2B79F5) | 0;
    const mixed = Math.imul(state ^ (state >>> 15), 1 | state);
    const scrambled = (mixed + Math.imul(mixed ^ (mixed >>> 7), 61 | mixed)) ^ mixed;
    // reinterpret as unsigned and scale by 2^32
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
102
|
+
|
|
103
|
+
// simulatePower: Monte Carlo power under a RIGHT-SKEWED margin distribution (margins are
|
|
104
|
+
// not normal — a few subjects carry most of the signal). For each sim, draw nSubjects
|
|
105
|
+
// margins (mean = trueMeanMargin, right-skewed via exponential), then apply the EXACT
|
|
106
|
+
// decision rule (bootstrap one-sided lower bound > 0 AND mean >= minMeanMargin). Power =
|
|
107
|
+
// fraction of sims that PASS. Deterministic by seed.
|
|
108
|
+
/**
 * Monte Carlo power estimate under a right-skewed margin distribution.
 *
 * For each simulation, draws the subject margins as a shifted exponential with
 * mean `trueMeanMargin`, then applies the decision rule: the bootstrap interval's
 * lower bound (`ci.lo` from bootstrapCI at `preReg.alpha`) must be > 0 AND the
 * sample mean must be >= `minMeanMargin`. Power is the fraction of simulations
 * that pass. The draw stream is seeded from `preReg.seeds.bootstrapSeed`; each
 * simulation's bootstrap call is seeded from its simulation index.
 *
 * Inputs are validated up front: previously a missing `trueMeanMargin` produced
 * NaN margins and a silent power of 0, and `sims <= 0` returned NaN, both of
 * which read as legitimate planning numbers.
 *
 * @param {object} preReg pre-registration providing `minSubjects`, `alpha`, `seeds.bootstrapSeed`
 * @param {object} [options]
 * @param {number} options.trueMeanMargin hypothesized true mean margin (required, finite)
 * @param {number} [options.spread=0.05] mean of the exponential component
 * @param {number} [options.nSubjects] subjects per simulation; falls back to `preReg.minSubjects`
 * @param {number} [options.sims=300] number of simulations (positive integer)
 * @param {number} [options.bootIters=300] bootstrap iterations passed to bootstrapCI
 * @param {number} [options.minMeanMargin=0] hypothesized measured-scale floor; 0 lets
 *   the interval leg dominate the planning estimate
 * @returns {number} estimated power in [0, 1]
 * @throws {TypeError} when `trueMeanMargin` is not a finite number
 * @throws {RangeError} when `sims` or the subject count is not a positive integer
 */
export function simulatePower(preReg, {
  trueMeanMargin, spread = 0.05, nSubjects, sims = 300, bootIters = 300, minMeanMargin = 0,
} = {}) {
  if (!Number.isFinite(trueMeanMargin)) {
    throw new TypeError(`simulatePower: trueMeanMargin must be a finite number (got ${trueMeanMargin})`);
  }
  if (!(Number.isInteger(sims) && sims > 0)) {
    throw new RangeError(`simulatePower: sims must be a positive integer (got ${sims})`);
  }
  const n = nSubjects || preReg.minSubjects;
  if (!(Number.isInteger(n) && n > 0)) {
    throw new RangeError(`simulatePower: subject count must be a positive integer (got ${n})`);
  }

  const rng = mulberry32(preReg.seeds.bootstrapSeed ^ 0x9e3779b9);
  let pass = 0;
  for (let s = 0; s < sims; s += 1) {
    // exponential(mean=spread) shifted so the overall mean is trueMeanMargin
    const margins = Array.from({ length: n }, () => {
      const exp = -Math.log(1 - rng()) * spread;
      return trueMeanMargin - spread + exp;
    });
    const mean = margins.reduce((a, b) => a + b, 0) / n;
    const ci = bootstrapCI(margins, { iters: bootIters, alpha: preReg.alpha, seed: (s * 2654435761) >>> 0 });
    // measured-scale floor is run-derived (deriveMinMeanMargin); the planner passes a
    // hypothesized value here.
    if (ci.lo > 0 && mean >= minMeanMargin) pass += 1;
  }
  return pass / sims;
}
|
|
129
|
+
|
|
130
|
+
export const __test = { stableStringify, HASHED_FIELDS };
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
// Gate B v2 — Task T6: pre-registration rig. Tamper-evidence (any frozen field changes the
|
|
2
|
+
// hash), freeze-once (no overwrite), seed-pure derivation, per-arm Bonferroni alpha, and a
|
|
3
|
+
// power simulation that responds to the true effect size.
|
|
4
|
+
|
|
5
|
+
import { test } from 'node:test';
|
|
6
|
+
import assert from 'node:assert/strict';
|
|
7
|
+
import {
|
|
8
|
+
buildPreReg, hashPreReg, bonferroniAlpha, deriveSeeds, assertFrozen, simulatePower, deriveMinMeanMargin,
|
|
9
|
+
} from './prereg.mjs';
|
|
10
|
+
|
|
11
|
+
test('deriveMinMeanMargin = floorK*(between-within); throws on invalid instrument scale', () => {
|
|
12
|
+
assert.ok(Math.abs(deriveMinMeanMargin({ betweenMean: 0.5, withinMean: 0.3 }, 0.25) - 0.05) < 1e-12);
|
|
13
|
+
assert.throws(() => deriveMinMeanMargin({ betweenMean: 0.3, withinMean: 0.5 }, 0.25), /separation/);
|
|
14
|
+
assert.throws(() => deriveMinMeanMargin({ betweenMean: 0.5, withinMean: 0.3 }, 0), /floorK/);
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
test('hashPreReg is stable and key-order independent', () => {
|
|
18
|
+
const a = buildPreReg({ seed: 7 });
|
|
19
|
+
const b = buildPreReg({ seed: 7 });
|
|
20
|
+
assert.equal(a.hash, b.hash);
|
|
21
|
+
assert.equal(a.runId, b.runId);
|
|
22
|
+
// key-order independence: same fields, different insertion order ⇒ same hash
|
|
23
|
+
const o1 = { ...a };
|
|
24
|
+
const o2 = {};
|
|
25
|
+
for (const k of Object.keys(o1).reverse()) o2[k] = o1[k];
|
|
26
|
+
assert.equal(hashPreReg(o1), hashPreReg(o2));
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
test('TAMPER-EVIDENCE: changing any frozen field changes the hash', () => {
|
|
30
|
+
const base = buildPreReg({});
|
|
31
|
+
assert.notEqual(base.hash, buildPreReg({ leakFloor: 0.05 }).hash, 'leakFloor');
|
|
32
|
+
assert.notEqual(base.hash, buildPreReg({ foreignAggregation: 'mean' }).hash, 'foreignAggregation');
|
|
33
|
+
assert.notEqual(base.hash, buildPreReg({ familyAlpha: 0.02 }).hash, 'familyAlpha (⇒ perTestAlpha)');
|
|
34
|
+
assert.notEqual(base.hash, buildPreReg({ registerDelta: 0.2 }).hash, 'registerDelta');
|
|
35
|
+
assert.notEqual(base.hash, buildPreReg({ floorK: 0.4 }).hash, 'floorK (measured-scale floor multiplier)');
|
|
36
|
+
// perTestAlpha itself is hashed: a manual override is caught
|
|
37
|
+
assert.notEqual(hashPreReg(base), hashPreReg({ ...base, perTestAlpha: { derived: 0.001, fewShotOracle: 0.009 } }));
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
test('FREEZE-ONCE: assertFrozen refuses to overwrite an existing runId', () => {
|
|
41
|
+
const registry = new Map();
|
|
42
|
+
const pr = buildPreReg({ seed: 3 });
|
|
43
|
+
assert.equal(assertFrozen(registry, pr), true);
|
|
44
|
+
assert.throws(() => assertFrozen(registry, pr), /already frozen/);
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
test('seeds are a pure function of preReg.seed', () => {
|
|
48
|
+
assert.deepEqual(deriveSeeds(5), deriveSeeds(5));
|
|
49
|
+
assert.notDeepEqual(deriveSeeds(5), deriveSeeds(6));
|
|
50
|
+
const pr = buildPreReg({ seed: 5 });
|
|
51
|
+
assert.deepEqual(pr.seeds, deriveSeeds(5));
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
test('per-arm Bonferroni alpha (familyAlpha/k), not one reused alpha', () => {
|
|
55
|
+
const a = bonferroniAlpha(0.01, ['derived', 'fewShotOracle']);
|
|
56
|
+
assert.equal(a.derived, 0.005);
|
|
57
|
+
assert.equal(a.fewShotOracle, 0.005);
|
|
58
|
+
const pr = buildPreReg({});
|
|
59
|
+
assert.equal(Object.keys(pr.perTestAlpha).length, 2, 'one alpha entry per verdict arm');
|
|
60
|
+
assert.ok(pr.perTestAlpha.derived < pr.familyAlpha, 'per-test alpha is stricter than family alpha');
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
test('buildPreReg returns a frozen object with a runId', () => {
|
|
64
|
+
const pr = buildPreReg({});
|
|
65
|
+
assert.ok(Object.isFrozen(pr));
|
|
66
|
+
assert.match(pr.runId, /^gateb-[0-9a-f]{12}$/);
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
test('simulatePower is in [0,1] and increases with the true effect', () => {
|
|
70
|
+
const pr = buildPreReg({ minSubjects: 60 });
|
|
71
|
+
const opts = { nSubjects: 60, sims: 120, bootIters: 200, spread: 0.05 };
|
|
72
|
+
const lowEffect = simulatePower(pr, { ...opts, trueMeanMargin: 0.0 });
|
|
73
|
+
const highEffect = simulatePower(pr, { ...opts, trueMeanMargin: 0.10 });
|
|
74
|
+
for (const p of [lowEffect, highEffect]) assert.ok(p >= 0 && p <= 1);
|
|
75
|
+
assert.ok(highEffect > lowEffect, `power rises with effect: ${highEffect} > ${lowEffect}`);
|
|
76
|
+
// deterministic
|
|
77
|
+
assert.equal(simulatePower(pr, { ...opts, trueMeanMargin: 0.10 }), highEffect);
|
|
78
|
+
});
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* real-corpus.test.mjs — offline plumbing tests for the REAL-CORPUS eval scripts
|
|
3
|
+
* (corpus-from-transcripts + build-real-probes). These do NOT touch the network
|
|
4
|
+
* or the user's real ~/.claude/projects; they drive a tiny synthetic transcript
|
|
5
|
+
* tree in a tmp dir so the parser/split/probe logic is exercised deterministically.
|
|
6
|
+
*
|
|
7
|
+
* The cloud-driven runner (run-real-corpus.mjs) is intentionally NOT unit-tested
|
|
8
|
+
* here (it requires a live API key + bounded spend); its scoring pieces are the
|
|
9
|
+
* already-tested harness functions, and it is exercised by the operator run.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { test } from 'node:test';
|
|
13
|
+
import assert from 'node:assert/strict';
|
|
14
|
+
import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from 'node:fs';
|
|
15
|
+
import { tmpdir } from 'node:os';
|
|
16
|
+
import { join } from 'node:path';
|
|
17
|
+
import { buildCorpus } from './corpus-from-transcripts.mjs';
|
|
18
|
+
import { buildRealEval } from './build-real-probes.mjs';
|
|
19
|
+
|
|
20
|
+
/** Write a minimal Claude-Code-style transcript .jsonl. */
|
|
21
|
+
/**
 * Write a minimal Claude-Code-style transcript to `<dir>/<sid>.jsonl`.
 *
 * One `type: 'user'` record is emitted per entry of `messages`, timestamped one
 * second apart starting at `baseTs`. Two extra records follow, both stamped at
 * `baseTs`: an `isMeta` record and a record whose content is a `tool_result`
 * part. The tests use them to check that such lines are skipped.
 *
 * @param {string} dir existing directory to write into
 * @param {string} sid session id (also the file's base name)
 * @param {Array<*>} messages message contents, one record each
 * @param {number} baseTs epoch milliseconds of the first message
 */
function writeTranscript(dir, sid, messages, baseTs) {
  const isoAt = (ms) => new Date(ms).toISOString();

  const records = messages.map((content, idx) => ({
    type: 'user',
    sessionId: sid,
    timestamp: isoAt(baseTs + idx * 1000),
    message: { role: 'user', content },
  }));

  // trailing noise: a meta line, then a tool_result line
  records.push(
    { type: 'user', sessionId: sid, isMeta: true, timestamp: isoAt(baseTs), message: { role: 'user', content: 'meta noise' } },
    { type: 'user', sessionId: sid, timestamp: isoAt(baseTs), message: { role: 'user', content: [{ type: 'tool_result', content: 'tool output' }] } },
  );

  const body = records.map((rec) => JSON.stringify(rec)).join('\n');
  writeFileSync(join(dir, `${sid}.jsonl`), `${body}\n`);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Build a throwaway transcript tree in the OS temp dir and return its root.
 *
 * Layout: `<root>/-Users-x-projA/sess-{0..3}.jsonl`. Each of the 4 sessions holds
 * 9 human messages and starts one day after the previous one (from 2026-01-01
 * UTC). The first message of every session contains feedback-style phrasing
 * ("don't do that", "I prefer").
 *
 * @returns {string} path of the created root directory
 */
function makeTree() {
  const root = mkdtempSync(join(tmpdir(), 'ijfw-corpus-test-'));
  const projA = join(root, '-Users-x-projA');
  mkdirSync(projA, { recursive: true });

  const t0 = Date.UTC(2026, 0, 1);
  const buildMessages = () => Array.from({ length: 9 }, (_, i) => (
    i === 0
      ? "No, don't do that. I prefer tabs from now on."
      : `message ${i} with some content here to count chars and words properly`
  ));

  for (let s = 0; s < 4; s += 1) {
    // 86400000 ms = one day between consecutive sessions
    writeTranscript(projA, `sess-${s}`, buildMessages(), t0 + s * 86400000);
  }
  return root;
}
|
|
50
|
+
|
|
51
|
+
// Plumbing check for buildCorpus over the synthetic tree. The temp tree is removed in
// `finally` so repeated runs do not accumulate directories under the OS temp dir.
test('buildCorpus parses transcripts, skips meta+tool_result, derives metadata+feedback', () => {
  const root = makeTree();
  try {
    const { sessions, stats } = buildCorpus({ root, minMessages: 8, cap: 100 });
    assert.equal(sessions.length, 4, 'all 4 sessions kept (>=8 human msgs each)');
    assert.ok(stats.totalFeedbackRows > 0, 'feedback detected from trigger phrases');
    for (const s of sessions) {
      // metadata is the deriveStyle input shape, numbers only (no raw text)
      assert.ok(Number.isFinite(s.metadata.avg_msg_chars));
      assert.ok('emoji_per_msg' in s.metadata && 'code_block_ratio' in s.metadata);
      assert.ok('formality_markers' in s.metadata && 'turn_cadence_per_min' in s.metadata);
      assert.equal(s.host, 'claude-code');
      assert.ok(s.ts, 'session carries a timestamp');
      // msg_count excludes the meta + tool_result lines (9 human msgs only)
      assert.equal(s.msg_count, 9);
    }
  } finally {
    rmSync(root, { recursive: true, force: true });
  }
});
|
|
67
|
+
|
|
68
|
+
// Artifact-filter check for buildCorpus. The temp tree is removed in `finally` so
// repeated runs do not accumulate directories under the OS temp dir.
test('buildCorpus excludes slash-command / system-reminder artifacts from style', () => {
  const root = mkdtempSync(join(tmpdir(), 'ijfw-corpus-art-'));
  try {
    const proj = join(root, '-Users-x-projB');
    mkdirSync(proj, { recursive: true });
    const t0 = Date.UTC(2026, 1, 1);
    const msgs = [];
    for (let i = 0; i < 10; i++) msgs.push(`genuine human message number ${i} about code`);
    msgs.push('<command-name>/foo</command-name>');
    msgs.push('<system-reminder>noise</system-reminder>');
    msgs.push('Caveat: The messages below were generated by the user');
    writeTranscript(proj, 'sx', msgs, t0);
    const { sessions } = buildCorpus({ root, minMessages: 8, cap: 10 });
    assert.equal(sessions.length, 1);
    // 10 genuine + the writeTranscript trailing meta/tool_result skipped; the 3
    // artifact lines must also be skipped -> exactly 10 counted.
    assert.equal(sessions[0].msg_count, 10);
  } finally {
    rmSync(root, { recursive: true, force: true });
  }
});
|
|
85
|
+
|
|
86
|
+
// Split/probe check for buildRealEval. Cleanup runs in `finally`, after the awaited
// call has settled, so repeated runs do not accumulate directories under the OS temp dir.
test('buildRealEval produces a disjoint LaMP time split + transcript-free probes', async () => {
  const root = makeTree();
  try {
    const corpus = buildCorpus({ root, minMessages: 8, cap: 100 });
    const ev = await buildRealEval(corpus, { trainFraction: 0.5, nProbes: 5 });
    assert.equal(ev.split.disjoint, true, 'train/test session ids are disjoint');
    assert.ok(ev.train.length >= 1 && ev.test.length >= 1, 'both windows non-empty');
    // train window does not extend past the start of the test window (time-based)
    assert.ok(Date.parse(ev.split.trainTsMax) <= Date.parse(ev.split.testTsMin));
    assert.equal(ev.probes.length, 5);
    for (const p of ev.probes) {
      assert.ok(typeof p.prompt === 'string' && p.prompt.length > 0);
      assert.ok(p.goldStyle && Number.isFinite(p.goldStyle.terseness));
      // probe prompt must be from the authored generic bank (transcript-free)
      assert.ok(!p.prompt.includes('message '), 'no raw transcript text in probe prompt');
    }
    // negative control carries an inverted target
    assert.ok(ev.negativeControl && Array.isArray(ev.negativeControl.goldSubjects));
  } finally {
    rmSync(root, { recursive: true, force: true });
  }
});
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
// real-personas.mjs — Gate B v2, Task T3 (real-author half). A "persona" is one real
|
|
2
|
+
// author split into DISJOINT train/test document slices. The brief is derived from TRAIN
|
|
3
|
+
// only; the held-out TEST fingerprint is the scoring target (kills the train/test
|
|
4
|
+
// circularity C1). Real authors carry the confirmatory headline (synthetic-personas.js
|
|
5
|
+
// may only downgrade).
|
|
6
|
+
//
|
|
7
|
+
// SELECTION-BIAS GUARD (audit must-fix): persona selection + ordering depend ONLY on the
|
|
8
|
+
// seed and author identity — NEVER on the mutual fullStyleDistance under test. Choosing
|
|
9
|
+
// "well-separated" authors by the scored metric would inflate every downstream result.
|
|
10
|
+
|
|
11
|
+
import { tokenizeWords } from './stylometry-features.js';
|
|
12
|
+
import { fullStyleVector } from './stylometry.js';
|
|
13
|
+
|
|
14
|
+
export const PERSONA_DEFAULTS = Object.freeze({
|
|
15
|
+
minTrainTokens: 1200, minTestTokens: 600, seed: 1,
|
|
16
|
+
});
|
|
17
|
+
|
|
18
|
+
// FNV-1a string hash → uint32. Deterministic, content-independent ordering key.
|
|
19
|
+
/**
 * 32-bit FNV-1a hash of a string, computed over its UTF-16 code units.
 *
 * @param {string} str input text
 * @returns {number} unsigned 32-bit hash (the offset basis 0x811c9dc5 for '')
 */
function fnv1a(str) {
  const FNV_PRIME = 0x01000193;
  let hash = 0x811c9dc5;
  let pos = 0;
  while (pos < str.length) {
    // xor in the code unit, then multiply by the prime (32-bit wrapping)
    hash = Math.imul(hash ^ str.charCodeAt(pos), FNV_PRIME);
    pos += 1;
  }
  return hash >>> 0;
}
|
|
27
|
+
|
|
28
|
+
/** Number of word tokens in `text`, as produced by tokenizeWords. */
function tokenCount(text) {
  const tokens = tokenizeWords(text);
  return tokens.length;
}
|
|
29
|
+
|
|
30
|
+
// Split one author's docs into disjoint train/test slices. Allocates whole documents to
|
|
31
|
+
// TEST until the test-token floor is met, the remainder to TRAIN; both must clear their
|
|
32
|
+
// floors or it THROWS (abort — do not silently shrink a slice below power).
|
|
33
|
+
/**
 * Split one author's documents into disjoint train/test slices.
 *
 * Documents are visited in an order fixed by fnv1a(`seed:authorId:index`), with
 * ties broken by index; document content never influences the order. Whole
 * documents go to TEST until the test-token floor is reached; everything after
 * that point in the order goes to TRAIN.
 *
 * @param {{id: *, docs?: string[]}} author author record
 * @param {object} [opts={}] overrides for PERSONA_DEFAULTS (minTrainTokens, minTestTokens, seed)
 * @returns {{trainDocs: string[], testDocs: string[], trainTokens: number, testTokens: number}}
 * @throws {Error} when the author has fewer than 2 documents, or when either
 *   slice ends up below its token floor
 */
export function splitAuthorDocs(author, opts = {}) {
  const cfg = { ...PERSONA_DEFAULTS, ...opts };
  const docs = author.docs || [];
  if (docs.length < 2) throw new Error(`persona ${author.id}: needs >=2 documents for a disjoint split`);

  // decorate each index with its seeded hash once, sort, then strip the keys
  const order = docs
    .map((_, idx) => ({ idx, key: fnv1a(`${cfg.seed}:${author.id}:${idx}`) }))
    .sort((x, y) => x.key - y.key || x.idx - y.idx)
    .map(({ idx }) => idx);

  // consume a prefix of the order for TEST until the floor is met
  let testTok = 0;
  let cursor = 0;
  while (cursor < order.length && testTok < cfg.minTestTokens) {
    testTok += tokenCount(docs[order[cursor]]);
    cursor += 1;
  }

  // prefix → test, remainder → train: complementary index sets, hence disjoint
  const testDocs = order.slice(0, cursor).map((idx) => docs[idx]);
  const trainDocs = order.slice(cursor).map((idx) => docs[idx]);
  const trainTok = trainDocs.reduce((sum, doc) => sum + tokenCount(doc), 0);

  if (testTok < cfg.minTestTokens) throw new Error(`persona ${author.id}: test slice ${testTok} < ${cfg.minTestTokens} tokens`);
  if (trainTok < cfg.minTrainTokens) throw new Error(`persona ${author.id}: train slice ${trainTok} < ${cfg.minTrainTokens} tokens`);

  return {
    trainDocs,
    testDocs,
    trainTokens: trainTok,
    testTokens: testTok,
  };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Build a real-author persona from one author record.
 *
 * The documents are split with splitAuthorDocs; the persona's `fingerprint` is
 * fullStyleVector of the TEST documents joined by newlines, so it is computed
 * from the held-out slice only.
 *
 * @param {{id: *, docs?: string[]}} author author record
 * @param {object} [opts={}] forwarded to splitAuthorDocs
 * @returns {object} persona: id, synthetic=false, headlineEligible=true, the
 *   train/test docs and token counts, and the held-out fingerprint
 * @throws {Error} whatever splitAuthorDocs throws for an unsplittable author
 */
export function makePersona(author, opts = {}) {
  const {
    trainDocs, testDocs, trainTokens, testTokens,
  } = splitAuthorDocs(author, opts);
  const heldOutText = testDocs.join('\n');

  return {
    id: author.id,
    synthetic: false,
    headlineEligible: true,
    trainDocs,
    testDocs,
    trainTokens,
    testTokens,
    // scoring target: built from the held-out TEST text, not from train
    fingerprint: fullStyleVector(heldOutText),
  };
}
|
|
80
|
+
|
|
81
|
+
// loadRealPersonas(corpus, opts) → up to nAuthors personas. Authors are considered in
|
|
82
|
+
// seed+identity order; too-short authors are SKIPPED (not fatal in selection mode), but
|
|
83
|
+
// if fewer than nAuthors qualify it THROWS — abort-and-ingest-more, never evaluate the
|
|
84
|
+
// downstream gate on an underpowered set.
|
|
85
|
+
/**
 * Select up to `nAuthors` personas from a corpus of author records.
 *
 * Authors are considered in an order fixed by fnv1a(`seed:id`), with ties broken
 * by id, so selection depends only on the seed and author identity. An author
 * for whom makePersona throws is skipped and the reason recorded. If fewer than
 * `nAuthors` qualify, the function throws rather than return a short set.
 *
 * Changes from the previous version:
 *   - the comparator returns 0 for authors with equal ids (it used to return 1
 *     in both directions, which is an inconsistent comparator for Array#sort);
 *   - the thrown error carries the collected reasons on `error.skipped`
 *     (previously only the count survived); the message text is unchanged.
 *
 * @param {Array<{id: *, docs?: string[]}>} corpus author records
 * @param {object} [opts={}] `nAuthors` (defaults to corpus.length) plus PERSONA_DEFAULTS overrides
 * @returns {object[]} exactly `nAuthors` personas, in selection order
 * @throws {Error} when fewer than `nAuthors` authors qualify; `error.skipped`
 *   is an array of `"<id>: <reason>"` strings
 */
export function loadRealPersonas(corpus, opts = {}) {
  const cfg = { nAuthors: corpus.length, ...PERSONA_DEFAULTS, ...opts };
  const ordered = [...corpus].sort((a, b) => {
    const ha = fnv1a(`${cfg.seed}:${a.id}`);
    const hb = fnv1a(`${cfg.seed}:${b.id}`);
    if (ha !== hb) return ha - hb;
    if (a.id === b.id) return 0;
    return a.id < b.id ? -1 : 1;
  });

  const personas = [];
  const skipped = [];
  for (const author of ordered) {
    if (personas.length >= cfg.nAuthors) break;
    try {
      personas.push(makePersona(author, cfg));
    } catch (e) {
      // NOTE(review): this catches every error from makePersona, not only the
      // "too short" ones; the reason is kept so unexpected failures stay visible.
      skipped.push(`${author.id}: ${e.message}`);
    }
  }
  if (personas.length < cfg.nAuthors) {
    const shortfall = new Error(`only ${personas.length} qualifying personas (< ${cfg.nAuthors}); ingest more authors. skipped: ${skipped.length}`);
    shortfall.skipped = skipped;
    throw shortfall;
  }
  return personas;
}
|
|
108
|
+
|
|
109
|
+
export const __test = { fnv1a, tokenCount };
|