@ijfw/memory-server 1.5.6 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +390 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +23 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/model-refresh.js +4 -2
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +194 -16
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
// verifier-pair.mjs — Gate B v3. Turn two authors' chunk-feature sets into ONE fixed
|
|
2
|
+
// pair-summary feature vector for the trained logistic verifier. ZERO deps.
|
|
3
|
+
//
|
|
4
|
+
// For an author pair (A,B) we compute, over all (or sampled) cross-author chunk pairs,
|
|
5
|
+
// the DISTRIBUTION of per-family Burrows-Delta distances (mean |z_a - z_b| over that
|
|
6
|
+
// family's dims — NOT a single channel cosine, which washes out the few idiosyncratic
|
|
7
|
+
// dims). We summarize each family's cross-pair distance distribution with
|
|
8
|
+
// mean/median/min/max + 10/25/75/90 percentiles + fraction-below-cutpoint, and add a
|
|
9
|
+
// within-author consistency term (how tight each author's own chunks cluster). These
|
|
10
|
+
// summaries are the verifier's input — the logreg learns the same/different boundary.
|
|
11
|
+
|
|
12
|
+
import { FAMILIES } from './verifier-features.mjs';
|
|
13
|
+
|
|
14
|
+
/**
 * Burrows-Delta-style distance for ONE chunk pair within ONE family: the mean
 * absolute difference |z_a - z_b| across the family's dimensions.
 *
 * Only the dimensions present in BOTH vectors are compared, so a missing or
 * shorter `zb` neither throws nor injects NaN into the pair summary. For
 * equal-length inputs the result is the plain mean over every dimension.
 *
 * @param {ArrayLike<number>} za z-vector of the first chunk for this family.
 * @param {ArrayLike<number>} zb z-vector of the second chunk for this family.
 * @returns {number} Mean absolute difference, or 0 when there is nothing to compare.
 */
function familyDelta(za, zb) {
  // Shared length: a missing vector counts as empty.
  const n = Math.min(za?.length ?? 0, zb?.length ?? 0);
  if (!n) return 0;
  let s = 0;
  for (let i = 0; i < n; i += 1) s += Math.abs(za[i] - zb[i]);
  return s / n;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Nearest-rank percentile lookup. The input must already be sorted ascending;
 * no interpolation is done — the rank is rounded and clamped to a valid index.
 *
 * @param {number[]} sorted Ascending-sorted values.
 * @param {number} p Percentile in [0, 100]; out-of-range values clamp to the ends.
 * @returns {number} The selected element, or 0 for an empty input.
 */
function percentile(sorted, p) {
  if (!sorted.length) return 0;
  const last = sorted.length - 1;
  const rank = Math.round((p / 100) * last);
  const notBelowStart = Math.max(0, rank);
  const position = Math.min(last, notBelowStart);
  return sorted[position];
}
|
|
29
|
+
|
|
30
|
+
/**
 * Collapse a distance distribution into a fixed 9-slot summary:
 * [mean, median, min, max, p10, p25, p75, p90, fraction <= cutpoint].
 * The input is not mutated; an empty input yields nine zeros.
 *
 * @param {number[]} dists Distances for one family.
 * @param {number} cutpoint Threshold for the "fraction at or below" slot.
 * @returns {number[]} Nine summary statistics in the order above.
 */
function summarize(dists, cutpoint) {
  const count = dists.length;
  if (!count) return [0, 0, 0, 0, 0, 0, 0, 0, 0];

  const ordered = Array.from(dists).sort((x, y) => x - y);

  // Single pass over the sorted copy: running sum plus the at-or-below count.
  let total = 0;
  let atOrBelow = 0;
  for (const d of ordered) {
    total += d;
    if (d <= cutpoint) atOrBelow += 1;
  }

  const at = (p) => percentile(ordered, p);
  return [
    total / count,
    at(50),
    ordered[0],
    ordered[count - 1],
    at(10),
    at(25),
    at(75),
    at(90),
    atOrBelow / count,
  ];
}
|
|
42
|
+
// Number of statistics summarize() returns per family; must stay in sync with
// the array summarize() builds (and its all-zero fallback).
const SUMMARY_LEN = 9; // mean,median,min,max,p10,p25,p75,p90,fracBelow
|
|
43
|
+
|
|
44
|
+
/**
 * Within-author consistency: for each family, the mean familyDelta between
 * pairs of the author's OWN chunks (low = tight idiolect).
 *
 * Chunks without a non-empty vector for a family are ignored for that family.
 * Fewer than two usable chunks gives 0. The pair budget (40 x usable chunks)
 * is checked only before each anchor row, so a started row always finishes.
 *
 * @param {object[]} chunkFeats Per-chunk feature objects keyed by family name.
 * @returns {Object<string, number>} One scalar per family.
 */
function withinConsistency(chunkFeats) {
  const PAIRS_PER_CHUNK = 40; // bound the within-author pair count for speed
  const hasDims = (v) => v && v.length;
  const tightness = {};

  for (const fam of FAMILIES) {
    const vectors = chunkFeats.map((chunk) => chunk[fam]).filter(hasDims);
    const count = vectors.length;

    if (count < 2) {
      tightness[fam] = 0;
    } else {
      const budget = PAIRS_PER_CHUNK * count;
      let total = 0;
      let pairs = 0;
      let anchor = 0;
      while (anchor < count && pairs < budget) {
        for (let other = anchor + 1; other < count; other += 1) {
          total += familyDelta(vectors[anchor], vectors[other]);
          pairs += 1;
        }
        anchor += 1;
      }
      tightness[fam] = pairs ? total / pairs : 0;
    }
  }

  return tightness;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Choose which cross-author chunk pairs (i in A, j in B) to evaluate.
 *
 * If the full nA x nB grid fits within `cap`, every pair is returned in
 * row-major order. Otherwise `cap` DISTINCT pairs are drawn by rejection
 * sampling from `rng`, so a seeded rng gives a reproducible, bounded sample.
 * NOTE: the sampling loop only ends once `cap` distinct pairs are found, so
 * `rng` must be able to reach that many.
 *
 * @param {number} nA Number of chunks for author A.
 * @param {number} nB Number of chunks for author B.
 * @param {number} cap Maximum number of pairs to return.
 * @param {() => number} rng Uniform generator in [0, 1).
 * @returns {Array<[number, number]>} Index pairs [i, j].
 */
function sampleCrossPairs(nA, nB, cap, rng) {
  const pairs = [];
  const total = nA * nB;

  if (!(total <= cap)) {
    // Too many pairs to enumerate: draw distinct grid cells until the cap is hit.
    const taken = new Set();
    while (pairs.length < cap) {
      const row = Math.floor(rng() * nA);
      const col = Math.floor(rng() * nB);
      const cell = row * nB + col;
      if (!taken.has(cell)) {
        taken.add(cell);
        pairs.push([row, col]);
      }
    }
    return pairs;
  }

  for (let row = 0; row < nA; row += 1) {
    for (let col = 0; col < nB; col += 1) {
      pairs.push([row, col]);
    }
  }
  return pairs;
}
|
|
81
|
+
|
|
82
|
+
/**
 * pairFeatures(featsA, featsB, cfg) -> Float64Array.
 *
 * Turns two authors' chunk-feature sets into ONE fixed-length pair-summary
 * vector. For every family in FAMILIES, in order, it appends:
 *   - the 9-value summarize() of the cross-author familyDelta distribution,
 *   - author A's within-consistency, author B's, and their average,
 *   - the separation ratio: cross mean / max(1e-6, average within-consistency).
 * The layout is the same length on every call (FAMILIES.length * 13).
 *
 * A chunk pair contributes to a family only when BOTH chunks carry a usable
 * vector for it. Previously only the A side was checked, so a B chunk missing
 * the family crashed familyDelta and a shorter one produced NaN features.
 *
 * @param {object[]} featsA chunkFeatures objects for author A.
 * @param {object[]} featsB chunkFeatures objects for author B.
 * @param {object} [cfg]
 * @param {number} [cfg.pairCap=400] Max cross pairs evaluated (falsy -> 400).
 * @param {() => number} [cfg.rng=Math.random] Sampling source; pass a seeded
 *   generator for reproducible vectors when pair sampling kicks in.
 * @param {Object<string, number>} [cfg.cutpoints] Per-family cutpoint for the
 *   fraction-below slot; non-finite or missing entries fall back to 2.0.
 * @returns {Float64Array} The pair-summary feature vector.
 */
export function pairFeatures(featsA, featsB, cfg = {}) {
  const cap = cfg.pairCap || 400;
  const rng = cfg.rng || Math.random;
  const cutByFam = cfg.cutpoints || {}; // per-family DEV-learned cutpoint
  const idxs = sampleCrossPairs(featsA.length, featsB.length, cap, rng);

  // collect cross-pair family-delta distributions
  const dists = {};
  for (const fam of FAMILIES) dists[fam] = [];
  for (const [i, j] of idxs) {
    const a = featsA[i];
    const b = featsB[j];
    for (const fam of FAMILIES) {
      const za = a[fam];
      const zb = b[fam];
      // Skip the pair for this family unless B covers every dimension A has;
      // otherwise familyDelta would read past zb (NaN) or dereference undefined.
      if (za && za.length && zb && zb.length >= za.length) {
        dists[fam].push(familyDelta(za, zb));
      }
    }
  }

  const consA = withinConsistency(featsA);
  const consB = withinConsistency(featsB);

  const vec = [];
  for (const fam of FAMILIES) {
    const cut = Number.isFinite(cutByFam[fam]) ? cutByFam[fam] : 2.0;
    vec.push(...summarize(dists[fam], cut));
    // consistency context: each author's own tightness + the ratio cross/within
    const cA = consA[fam];
    const cB = consB[fam];
    // Insertion-order mean, kept separate from summarize()'s sorted-order mean.
    const crossMean = dists[fam].length
      ? dists[fam].reduce((acc, d) => acc + d, 0) / dists[fam].length
      : 0;
    vec.push(cA, cB, (cA + cB) / 2);
    // separation ratio: cross distance vs the average within-author distance;
    // the 1e-6 floor avoids dividing by zero when both authors are degenerate.
    const denom = Math.max(1e-6, (cA + cB) / 2);
    vec.push(crossMean / denom);
  }
  return Float64Array.from(vec);
}
|
|
119
|
+
|
|
120
|
+
// Length of the vector pairFeatures() returns: per family, SUMMARY_LEN summary
// values plus 4 consistency values (cA, cB, their average, separation ratio).
export const PAIR_FEATURE_LEN = FAMILIES.length * (SUMMARY_LEN + 4);
|
|
121
|
+
|
|
122
|
+
// Test-only handle exposing the module-private helpers.
export const __test = { familyDelta, summarize, withinConsistency, sampleCrossPairs };
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// verifier-reference.mjs — Gate B v3, Task B (calibration regen). Build the per-family
|
|
2
|
+
// reference mean/SD from a set of TRAIN-author chunks (DEV, author-disjoint from the
|
|
3
|
+
// validation fold). A percentile SD floor is applied. Dropping the floored dims from the
// scalar/affix families is NOT done (fixed-length vectors are needed) — instead the robust
// SD floor keeps rare dims from blowing up to the z-cap. ZERO deps. Never reads sealed corpora.
|
|
6
|
+
|
|
7
|
+
import { relFreqFunc, relFreqTrigrams, relFreqPunct } from './stylometry-features.js';
|
|
8
|
+
import { FUNCTION_WORDS, TRIGRAM_KEYS, PUNCT_KEYS } from './stylometry-reference.js';
|
|
9
|
+
import { topicRobustScalars, affixCounts, relFreqAffix } from './verifier-features.mjs';
|
|
10
|
+
|
|
11
|
+
/**
 * Column-wise mean and population SD (divide by N) of equal-length rows, with
 * a percentile floor applied to the SDs.
 *
 * The floor is the `floorPct` percentile of the strictly POSITIVE SDs (1e-6
 * when none are positive). Any SD not above the floor is raised to it, so rare
 * near-constant dims don't divide by ~0, hit the z-cap and dominate.
 *
 * @param {Array<ArrayLike<number>>} rows Non-empty list of feature vectors;
 *   the dimensionality is taken from the first row.
 * @param {number} floorPct Percentile in [0, 100]. Out-of-range values clamp
 *   to the nearest end and NaN uses the smallest positive SD, where they used
 *   to index out of range and set every SD to `undefined`.
 * @returns {{ mean: number[], sd: number[] }}
 * @throws {TypeError} When `rows` is empty or missing.
 */
function meanSd(rows, floorPct) {
  if (!rows || !rows.length) {
    throw new TypeError('meanSd: rows must be a non-empty array of feature vectors');
  }
  const d = rows[0].length;
  const mean = Array.from({ length: d }, () => 0);
  const sd = Array.from({ length: d }, () => 0);
  for (const r of rows) for (let j = 0; j < d; j += 1) mean[j] += r[j];
  for (let j = 0; j < d; j += 1) mean[j] /= rows.length;
  for (const r of rows) for (let j = 0; j < d; j += 1) sd[j] += (r[j] - mean[j]) ** 2;
  for (let j = 0; j < d; j += 1) sd[j] = Math.sqrt(sd[j] / rows.length);

  // Percentile SD floor over the positive SDs only.
  const pos = sd.filter((x) => x > 0).sort((a, b) => a - b);
  let floor = 1e-6;
  if (pos.length) {
    const last = pos.length - 1;
    const raw = Math.floor((floorPct / 100) * last);
    const idx = Number.isNaN(raw) ? 0 : Math.min(last, Math.max(0, raw));
    floor = pos[idx];
  }
  // `!(sd > floor)` also catches NaN SDs, not just small ones.
  for (let j = 0; j < d; j += 1) if (!(sd[j] > floor)) sd[j] = floor;
  return { mean, sd };
}
|
|
26
|
+
|
|
27
|
+
/**
 * Select the `k` most frequent affix keys across the training chunks, pooling
 * the prefix and suffix counts that affixCounts() reports for each chunk.
 * Ordering is deterministic: count descending, then key ascending.
 *
 * @param {Iterable<*>} chunks Training chunks, passed as-is to affixCounts().
 * @param {number} k Maximum number of keys to keep.
 * @returns {string[]} The selected affix keys, most frequent first.
 */
function pickAffixKeys(chunks, k) {
  const totals = Object.create(null);
  const tally = (bag) => {
    for (const key in bag) totals[key] = (totals[key] || 0) + bag[key];
  };

  for (const chunk of chunks) {
    const { pre, suf } = affixCounts(chunk);
    tally(pre);
    tally(suf);
  }

  const ranked = Object.entries(totals);
  ranked.sort(([keyA, countA], [keyB, countB]) => {
    const byCount = countB - countA;
    if (byCount) return byCount;
    return keyA < keyB ? -1 : 1;
  });
  return ranked.slice(0, k).map((entry) => entry[0]);
}
|
|
41
|
+
|
|
42
|
+
/**
 * buildReference(trainChunks, opts) -> ref object consumed by chunkFeatures.
 *
 * Computes a floored mean/SD reference (via meanSd) from the training chunks
 * for five families: function words, trigrams, punctuation, topic-robust
 * scalars, and the top-K affixes chosen from the same chunks.
 *
 * @param {Array<*>} trainChunks Training chunks, passed to the feature extractors.
 * @param {object} [opts]
 * @param {number} [opts.floorPct=10] Percentile used for the SD floor.
 * @param {number} [opts.affixK=150] Number of affix keys to keep.
 * @returns {object} Per-family mean/SD arrays, `affixKeys`, and `nTrainChunks`.
 */
export function buildReference(trainChunks, opts = {}) {
  const floorPct = opts.floorPct ?? 10;
  const affixK = opts.affixK ?? 150;
  const affixKeys = pickAffixKeys(trainChunks, affixK);

  // Phase 1: one feature row per chunk, per family.
  const rowsFor = (toRow) => trainChunks.map((chunk) => toRow(chunk));
  const funcRows = rowsFor((c) => relFreqFunc(c, FUNCTION_WORDS));
  const triRows = rowsFor((c) => relFreqTrigrams(c, TRIGRAM_KEYS));
  const punctRows = rowsFor((c) => relFreqPunct(c, PUNCT_KEYS));
  const scalarRows = rowsFor((c) => topicRobustScalars(c));
  const affixRows = rowsFor((c) => relFreqAffix(c, affixKeys));

  // Phase 2: floored mean/SD per family, in the same family order.
  const [func, tri, punct, scalar, affix] = [funcRows, triRows, punctRows, scalarRows, affixRows]
    .map((rows) => meanSd(rows, floorPct));

  return {
    funcMean: func.mean,
    funcSd: func.sd,
    triMean: tri.mean,
    triSd: tri.sd,
    punctMean: punct.mean,
    punctSd: punct.sd,
    scalarMean: scalar.mean,
    scalarSd: scalar.sd,
    affixKeys,
    affixMean: affix.mean,
    affixSd: affix.sd,
    nTrainChunks: trainChunks.length,
  };
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
// verifier-scorer.mjs — Gate B v3. Adapts the trained chunk-pair logistic verifier to
|
|
2
|
+
// the `scorer` interface validateInstrument expects: { vectorize, distance }.
|
|
3
|
+
//
|
|
4
|
+
// UNIT OF PAIRING = one CHUNK per "doc". vectorize(text) -> per-family z-vectors for that
|
|
5
|
+
// chunk (against the DEV-derived reference). distance(a,b) -> 1 - P(same-author) from the
|
|
6
|
+
// trained logreg over the single chunk-pair summary. Training (in the harness) uses the
|
|
7
|
+
// IDENTICAL single-pair summary representation, so train and score are consistent.
|
|
8
|
+
|
|
9
|
+
import { chunkFeatures } from './verifier-features.mjs';
|
|
10
|
+
import { pairFeatures } from './verifier-pair.mjs';
|
|
11
|
+
import { applyStandardizer, predictProba } from './verifier-logreg.mjs';
|
|
12
|
+
|
|
13
|
+
/**
 * Build the pair-summary vector for ONE chunk pair by wrapping each chunk's
 * features in a singleton set and delegating to pairFeatures. `cfg` (including
 * the DEV-learned cutpoints) is forwarded untouched so train and score agree.
 *
 * @param {object} featA chunkFeatures output for the first chunk.
 * @param {object} featB chunkFeatures output for the second chunk.
 * @param {object} [cfg] pairFeatures configuration.
 * @returns {Float64Array} Pair-summary feature vector.
 */
export function chunkPairFeatures(featA, featB, cfg) {
  const onlyA = [featA];
  const onlyB = [featB];
  return pairFeatures(onlyA, onlyB, cfg);
}
|
|
18
|
+
|
|
19
|
+
/**
 * makeScorer(ref, model, standardizer, cfg) -> { vectorize, distance }.
 *
 * Adapts the trained chunk-pair logistic verifier to the scorer interface:
 *   - vectorize(text): chunkFeatures for the text against `ref`.
 *   - distance(a, b): 1 - P(same-author), from the standardized pair features.
 *
 * @param {object} ref Reference statistics handed to chunkFeatures.
 * @param {object} model Trained model handed to predictProba.
 * @param {object} standardizer Parameters handed to applyStandardizer.
 * @param {object} [cfg] Pair-feature configuration (e.g. cutpoints).
 * @returns {{ vectorize: Function, distance: Function }}
 */
export function makeScorer(ref, model, standardizer, cfg = {}) {
  const vectorize = (text) => chunkFeatures(text, ref);

  const distance = (a, b) => {
    const raw = chunkPairFeatures(a, b, cfg);
    const scaled = applyStandardizer(raw, standardizer);
    const pSame = predictProba(scaled, model); // P(same-author)
    // distance: higher = more different (validateInstrument convention)
    return 1 - pSame;
  };

  return { vectorize, distance };
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
// wrong-target-control.mjs — Gate B v2, Task T5. THE discriminator.
|
|
2
|
+
//
|
|
3
|
+
// For each subject P and arm, the margin is:
|
|
4
|
+
// m_P = distance(output, NEAREST same-register foreigner) − distance(output, OWN test)
|
|
5
|
+
// m_P > 0 means the styled output landed closer to P's OWN held-out fingerprint than to
|
|
6
|
+
// the CLOSEST same-register stranger. A generic register-obeyer is ~equidistant from all
|
|
7
|
+
// same-register targets ⇒ m≈0 ⇒ NULL. Only idiosyncratic voice capture wins.
|
|
8
|
+
//
|
|
9
|
+
// Audit must-fixes baked in:
|
|
10
|
+
// * Foreign pool restricted to the SAME register band (δ=0.15), by the register metric
|
|
11
|
+
// only — a register-obeyer cannot false-PASS.
|
|
12
|
+
// * NEAREST foreigner, never a centroid/mean of foreigners (centroid regresses to the
|
|
13
|
+
// population center and manufactures a positive margin).
|
|
14
|
+
// * Undecidable subjects (no same-register foreigner) are EXCLUDED — never fall back to
|
|
15
|
+
// a register-diverse pool.
|
|
16
|
+
// * Bootstrap resamples SUBJECTS (the margins array is length = nDecidableSubjects).
|
|
17
|
+
// * One-sided 99% lower bound = bootstrapCI(margins, {alpha:0.02}).lo.
|
|
18
|
+
// * Measured-scale floor (minMeanMargin) must bite: significant-but-trivial ⇒ NULL.
|
|
19
|
+
// * McNemar vs baseline reads .pValue and applies perTestAlpha locally (ignores the
|
|
20
|
+
// helper's hardcoded .significant=0.05).
|
|
21
|
+
|
|
22
|
+
import { fullStyleDistance, styleDistance } from './stylometry.js';
|
|
23
|
+
import { bootstrapCI, mcnemar } from '../../memory/bench-metrics.js';
|
|
24
|
+
|
|
25
|
+
// Frozen defaults for wrongTargetControl; caller-supplied opts override per key.
export const CONTROL_DEFAULTS = Object.freeze({
  // Register-distance band within which another persona counts as "same register".
  registerDelta: 0.15,
  // Passed to the two-sided bootstrap helper; 0.02 two-sided → one-sided 99% lower bound.
  alpha: 0.02,
  // TEST-ONLY default; the real run derives floorK*(between-within)
  // (prereg.deriveMinMeanMargin).
  minMeanMargin: 0.01,
  // Minimum standardized effect size (mean/sd) — kills "few subjects carry it".
  minDz: 0.5,
  // Bonferroni-adjusted at T7; default single-test 0.01.
  perTestAlpha: 0.01,
  bootstrapIters: 2000,
  seed: 42,
});
|
|
34
|
+
|
|
35
|
+
// Arithmetic mean; NaN for an empty list so "no data" never reads as 0.
function mean(xs) {
  if (!xs.length) return NaN;
  const total = xs.reduce((acc, x) => acc + x, 0);
  return total / xs.length;
}
|
|
36
|
+
// Sample standard deviation (n - 1 denominator); NaN with fewer than two values.
function stdev(xs) {
  const n = xs.length;
  if (n < 2) return NaN;
  const center = mean(xs);
  const sumSq = xs.reduce((acc, x) => acc + (x - center) ** 2, 0);
  return Math.sqrt(sumSq / (n - 1));
}
|
|
41
|
+
|
|
42
|
+
/**
 * Same-register foreigners of P: every OTHER persona whose register vector is
 * within `delta` of P's by styleDistance. Only the register is consulted, so
 * the pool is independent of the authorship distance under test.
 *
 * @param {{ id: *, fingerprint: { register: * } }} p Subject persona.
 * @param {Array<{ id: *, fingerprint: { register: * } }>} personas Candidate pool (may include p).
 * @param {number} delta Maximum register distance (inclusive).
 * @returns {Array<object>} Personas other than p inside the register band.
 */
export function sameRegisterForeigners(p, personas, delta) {
  const ownRegister = p.fingerprint.register;
  const inBand = (q) => {
    if (q.id === p.id) return false;
    return styleDistance(ownRegister, q.fingerprint.register) <= delta;
  };
  return personas.filter(inBand);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Margin of one arm output for subject P against the NEAREST same-register
 * foreigner: (distance to the closest foreigner) − (distance to P's own
 * fingerprint). Positive means the output sits closer to P than to any
 * foreigner. With an empty pool the nearest distance stays Infinity.
 *
 * @param {*} armVector Style vector of the arm's output.
 * @param {{ fingerprint: * }} p Subject persona.
 * @param {Iterable<{ fingerprint: * }>} foreigners Same-register foreign pool.
 * @returns {{ dOwn: number, dNearestForeign: number, margin: number }}
 */
export function subjectMargin(armVector, p, foreigners) {
  const dOwn = fullStyleDistance(armVector, p.fingerprint);

  let dNearestForeign = Infinity;
  for (const { fingerprint } of foreigners) {
    const candidate = fullStyleDistance(armVector, fingerprint);
    dNearestForeign = candidate < dNearestForeign ? candidate : dNearestForeign;
  }

  const margin = dNearestForeign - dOwn;
  return { dOwn, dNearestForeign, margin };
}
|
|
59
|
+
|
|
60
|
+
// Significance at a caller-chosen alpha, decided from the raw p-value
// (NOT mcnemar's hardcoded .significant). Non-finite p-values never count.
export function significantAt(pValue, alpha) {
  if (!Number.isFinite(pValue)) return false;
  return pValue < alpha;
}
|
|
62
|
+
|
|
63
|
+
// Four-decimal rendering for verdict messages; anything non-finite prints as 'NaN'.
function fmt4(x) {
  if (Number.isFinite(x)) return x.toFixed(4);
  return 'NaN';
}
|
|
64
|
+
|
|
65
|
+
/**
 * Pure arm-level control verdict. PASS requires ALL of (spec §4.4 + §4.7):
 *   1. ciLower is finite and > 0              (the effect is real)
 *   2. meanMargin >= cfg.minMeanMargin        (non-trivial in instrument units)
 *   3. pctPositive > 0.5                      (NOT a few subjects carrying it)
 *   4. dz = meanMargin / sd >= minDz          (large relative to spread)
 *   5. signPValue < cfg.perTestAlpha          (the majority is significant)
 * A heavy-tailed margin that clears 1+2 but fails 3/4/5 is a NULL, not a PASS.
 *
 * @param {{ ciLower: number, meanMargin: number, sd: number, pctPositive: number, signPValue: number }} armStats
 * @param {{ minMeanMargin: number, minDz?: number, perTestAlpha: number }} cfg
 *   `minDz` defaults to 0.5 when null/undefined.
 * @returns {{ passes: boolean, reasons: string[], dz: number }} One reason per
 *   failed leg, in leg order.
 */
export function controlVerdict(armStats, cfg) {
  const { ciLower, meanMargin, sd, pctPositive, signPValue } = armStats;
  const minDz = cfg.minDz ?? 0.5;

  // Without a usable positive sd: a positive mean counts as a perfectly
  // consistent effect (Infinity), anything else as no effect (0).
  let dz;
  if (Number.isFinite(sd) && sd > 0) {
    dz = meanMargin / sd;
  } else if (meanMargin > 0) {
    dz = Infinity;
  } else {
    dz = 0;
  }

  const reasons = [];

  const ciOk = Number.isFinite(ciLower) && ciLower > 0;
  if (!ciOk) reasons.push(`CI-lower ${fmt4(ciLower)} not > 0`);

  const floorOk = Number.isFinite(meanMargin) && meanMargin >= cfg.minMeanMargin;
  if (!floorOk) reasons.push(`meanMargin ${fmt4(meanMargin)} < floor ${cfg.minMeanMargin}`);

  const majorityOk = Number.isFinite(pctPositive) && pctPositive > 0.5;
  if (!majorityOk) reasons.push(`pctPositive ${fmt4(pctPositive)} <= 0.5 (minority of subjects)`);

  const effectOk = dz >= minDz;
  if (!effectOk) {
    const shown = Number.isFinite(dz) ? dz.toFixed(3) : dz;
    reasons.push(`dz ${shown} < ${minDz}`);
  }

  const signOk = Number.isFinite(signPValue) && signPValue < cfg.perTestAlpha;
  if (!signOk) reasons.push(`sign-test p ${fmt4(signPValue)} >= perTestAlpha ${cfg.perTestAlpha}`);

  return { passes: reasons.length === 0, reasons, dz };
}
|
|
90
|
+
|
|
91
|
+
/**
 * wrongTargetControl(harnessOut, personas, opts) → per-arm margin stats + verdicts.
 *
 * Steps:
 *   1. Build each persona's same-register foreign pool (register-only). A
 *      persona with an empty pool is undecidable and excluded from every arm.
 *   2. Per arm, per decidable subject: compute the NEAREST-foreigner margin,
 *      record a win (margin > 0) and, as a diagnostic, the register distance
 *      between the subject and that nearest foreigner.
 *   3. Per arm: bootstrap CI over the subject-level margins, and a sign test
 *      via mcnemar(ownLoss, ownWin).
 *   4. For non-baseline arms, when a `baseline` arm exists: McNemar vs the
 *      baseline's win indicator, judged on the raw p-value at perTestAlpha.
 *   5. Attach controlVerdict to every arm.
 *
 * @param {{ arms: string[], results: object }} harnessOut `results[id][arm].vector`
 *   must hold the arm's output vector for every decidable persona id.
 * @param {Array<{ id: *, fingerprint: object }>} personas
 * @param {object} [opts] Overrides merged on top of CONTROL_DEFAULTS.
 * @returns {object} `{ cfg, nSubjects, nDecidable, nUndecidable, decidableIds,
 *   perArm, registerEchoPasses }`.
 */
export function wrongTargetControl(harnessOut, personas, opts = {}) {
  const cfg = { ...CONTROL_DEFAULTS, ...opts };
  const { arms } = harnessOut;
  const personaById = new Map(personas.map((p) => [p.id, p]));

  // foreigners per persona (register-only); decidable = has >=1 same-register foreigner
  const foreignersById = {};
  for (const p of personas) {
    foreignersById[p.id] = sameRegisterForeigners(p, personas, cfg.registerDelta);
  }
  const decidableIds = personas
    .filter((p) => foreignersById[p.id].length > 0)
    .map((p) => p.id);

  const perArm = {};
  for (const arm of arms) {
    const margins = [];
    const ownWin = [];
    const regGaps = [];

    for (const id of decidableIds) {
      const subject = personaById.get(id);
      const pool = foreignersById[id];
      const armVec = harnessOut.results[id][arm].vector;

      const { margin } = subjectMargin(armVec, subject, pool);
      margins.push(margin);
      ownWin.push(margin > 0 ? 1 : 0);

      // diagnostic: register distance to the nearest same-register foreigner
      let nearest = null;
      let nearestDist = Infinity;
      for (const q of pool) {
        const d = fullStyleDistance(armVec, q.fingerprint);
        if (d < nearestDist) {
          nearest = q;
          nearestDist = d;
        }
      }
      if (nearest) {
        regGaps.push(styleDistance(subject.fingerprint.register, nearest.fingerprint.register));
      }
    }

    const ownLoss = margins.map((m) => (m < 0 ? 1 : 0));
    const ci = bootstrapCI(margins, {
      iters: cfg.bootstrapIters,
      alpha: cfg.alpha,
      seed: cfg.seed,
    });
    // zeros-vs-wins sign test: b = #(margin>0), c = #(margin<0); two-sided p on |b−c|.
    const sign = mcnemar(ownLoss, ownWin);

    perArm[arm] = {
      arm,
      nDecidable: margins.length,
      margins,
      ownWin,
      meanMargin: mean(margins),
      sd: stdev(margins),
      pctPositive: mean(ownWin),
      signPValue: sign.pValue,
      signPositives: sign.b,
      signNegatives: sign.c,
      ciLower: ci.lo,
      ciPoint: ci.point,
      registerGap: mean(regGaps),
    };
  }

  // McNemar vs baseline for each verdict arm (own-match win indicator), pValue exposed.
  if (perArm.baseline) {
    for (const arm of arms) {
      if (arm === 'baseline') continue;
      const versus = mcnemar(perArm.baseline.ownWin, perArm[arm].ownWin);
      // mcnemar.pValue is TWO-SIDED (|b−c|), so the direction guard b > c is mandatory:
      // the arm must FLIP MORE subjects to own-match than baseline does, not merely differ.
      const beatsBaseline = significantAt(versus.pValue, cfg.perTestAlpha) && versus.b > versus.c;
      perArm[arm].vsBaseline = {
        b: versus.b,
        c: versus.c,
        pValue: versus.pValue,
        beatsBaseline,
      };
    }
  }

  // arm verdicts
  for (const arm of arms) {
    perArm[arm].verdict = controlVerdict(perArm[arm], cfg);
  }

  // VOID rail input: PASS of a register-only-echo arm means the control is fooled by
  // register alone. undefined (no registerEcho arm present) ⇒ the decision layer THROWS
  // rather than silently treating the missing rail as false.
  const registerEchoPasses = perArm.registerEcho
    ? perArm.registerEcho.verdict.passes
    : undefined;

  return {
    cfg,
    nSubjects: personas.length,
    nDecidable: decidableIds.length,
    nUndecidable: personas.length - decidableIds.length,
    decidableIds,
    perArm,
    registerEchoPasses,
  };
}
|
|
167
|
+
|
|
168
|
+
// Test-only handle exposing the module-private mean() helper.
export const __test = { mean };
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
// Gate B v2 — Task T5: wrong-target control. The discriminator's guards: same-register
|
|
2
|
+
// foreign pool, undecidable exclusion, NEAREST (not centroid) margin, subject-level
|
|
3
|
+
// bootstrap, measured-scale floor, McNemar by p-value. Positive control (own output)
|
|
4
|
+
// PASSES; generic output NULLs.
|
|
5
|
+
|
|
6
|
+
import { test } from 'node:test';
|
|
7
|
+
import assert from 'node:assert/strict';
|
|
8
|
+
import {
|
|
9
|
+
wrongTargetControl, sameRegisterForeigners, subjectMargin, controlVerdict, significantAt,
|
|
10
|
+
} from './wrong-target-control.mjs';
|
|
11
|
+
import { mcnemar } from '../../memory/bench-metrics.js';
|
|
12
|
+
import { generatePersonaText } from './synthetic-personas.js';
|
|
13
|
+
import { fullStyleVector } from './stylometry.js';
|
|
14
|
+
|
|
15
|
+
// Test fixture: a persona is an id plus the full style vector of its text.
function persona(id, text) {
  const fingerprint = fullStyleVector(text);
  return { id, fingerprint };
}
|
|
16
|
+
// Ten formal personas (archetype 0), ids f0..f9 — same register, different authors.
// n=10 so the sign test can clear perTestAlpha=0.01 when every subject is positive
// (the positive control).
const FORMAL = Array.from({ length: 10 }, (_unused, index) => {
  const seed = 17 * (index + 1);
  return persona(`f${index}`, generatePersonaText(0, seed, 16));
});
|
|
19
|
+
// One persona in an unambiguously different register: emoji, contractions,
// exclamations and casual wording differ from formal on several axes, clearing
// the 0.15 band.
const CASUAL = persona(
  'c0',
  "omg yes!! 🚀 lol i love it, it's so cool 😄 you're gonna wanna try it, honestly it's the best, i can't even, no cap!! 💯",
);
|
|
22
|
+
|
|
23
|
+
// Build a minimal harness-output stand-in: results[personaId][arm] = { vector },
// with each vector supplied by vectorFor(persona, arm).
function fakeHarness(arms, personas, vectorFor) {
  const results = {};
  for (const subject of personas) {
    const byArm = {};
    results[subject.id] = byArm;
    for (const arm of arms) {
      byArm[arm] = { vector: vectorFor(subject, arm) };
    }
  }
  const personaIds = personas.map((subject) => subject.id);
  return { arms, personaIds, results };
}
|
|
31
|
+
|
|
32
|
+
test('foreign pool is restricted to the SAME register band (δ=0.15)', () => {
  // Candidates: every formal persona plus the single casual outlier.
  const candidates = [...FORMAL, CASUAL];
  const foreigners = sameRegisterForeigners(FORMAL[0], candidates, 0.15);
  const foreignIds = foreigners.map(({ id }) => id);
  assert.ok(foreignIds.includes('f1'), 'a same-register formal peer is in the pool');
  assert.ok(!foreignIds.includes('c0'), 'the different-register casual author is excluded');
});
|
|
38
|
+
|
|
39
|
+
test('undecidable subjects (no same-register foreigner) are EXCLUDED, no diverse fallback', () => {
  // Two personas in different registers: neither has a same-register foreigner.
  const personas = [FORMAL[0], CASUAL];
  const echoOwn = (p) => p.fingerprint;
  const harness = fakeHarness(['baseline'], personas, echoOwn);
  const { nDecidable, nUndecidable, decidableIds } = wrongTargetControl(harness, personas);
  assert.equal(nDecidable, 0);
  assert.equal(nUndecidable, 2);
  assert.deepEqual(decidableIds, []);
});
|
|
47
|
+
|
|
48
|
+
test('margin uses NEAREST foreigner, not a centroid', () => {
  const [subject, stranger, ...rest] = FORMAL.slice(0, 4);
  const foreigners = [stranger, ...rest];

  // The output IS a foreigner (f1), so the nearest distance is 0 and the margin
  // must come out negative.
  const asStranger = subjectMargin(stranger.fingerprint, subject, foreigners);
  assert.equal(asStranger.dNearestForeign, 0, 'nearest foreigner distance is 0 (output == f1)');
  assert.ok(asStranger.margin < 0, 'matching a stranger yields a NEGATIVE margin (centroid would hide this)');

  // The output IS the subject's own fingerprint, so the margin is positive.
  const asOwn = subjectMargin(subject.fingerprint, subject, foreigners);
  assert.ok(asOwn.margin > 0, 'matching OWN yields a positive margin');
});
|
|
59
|
+
|
|
60
|
+
test('POSITIVE control: own-output arm PASSES (margins positive, CI-lower>0, floor cleared)', () => {
  // Every subject's arm output is its own fingerprint (output == own).
  const echoOwn = (p) => p.fingerprint;
  const harness = fakeHarness(['baseline'], FORMAL, echoOwn);
  const { nDecidable, perArm } = wrongTargetControl(harness, FORMAL);
  const { baseline } = perArm;
  assert.equal(nDecidable, 10);
  assert.equal(baseline.margins.length, 10, 'bootstrap operates on SUBJECTS (=10), not probes');
  assert.equal(baseline.verdict.passes, true);
  assert.ok(baseline.ciLower > 0);
  assert.ok(baseline.pctPositive > 0.5 && baseline.signPValue < 0.01);
});
|
|
69
|
+
|
|
70
|
+
test('NEGATIVE control: generic constant output does NOT pass the control (≈NULL)', () => {
  // One casual, generic vector returned for every subject; matches no formal voice.
  const genericText = generatePersonaText(1, 999, 16);
  const constVec = fullStyleVector(genericText);
  const harness = fakeHarness(['derived'], FORMAL, () => constVec);
  const { perArm } = wrongTargetControl(harness, FORMAL);
  assert.equal(perArm.derived.verdict.passes, false, 'generic output is not closer to OWN than to nearest stranger');
});
|
|
76
|
+
|
|
77
|
+
// Arm statistics that clear every controlVerdict leg under FULL_CFG; the
// leg-specific tests override one field at a time.
const FULL_PASS = {
  ciLower: 0.02,
  meanMargin: 0.05,
  sd: 0.05,
  pctPositive: 0.8,
  signPValue: 0.001,
};
|
|
80
|
+
// Verdict thresholds used by the controlVerdict leg tests.
const FULL_CFG = {
  minMeanMargin: 0.01,
  minDz: 0.5,
  perTestAlpha: 0.01,
};
|
|
81
|
+
|
|
82
|
+
test('controlVerdict PASSES only when ALL legs hold (not vacuously false)', () => {
  const { passes } = controlVerdict(FULL_PASS, FULL_CFG);
  assert.equal(passes, true);
});
|
|
85
|
+
|
|
86
|
+
test('measured-scale floor BITES: significant-but-trivial ⇒ NULL', () => {
  // Mean below the floor, everything else passing.
  const trivialMean = { ...FULL_PASS, meanMargin: 0.004 };
  assert.equal(controlVerdict(trivialMean, FULL_CFG).passes, false);
  // CI lower bound below zero, everything else passing.
  const negativeLower = { ...FULL_PASS, ciLower: -0.01 };
  assert.equal(controlVerdict(negativeLower, FULL_CFG).passes, false);
});
|
|
90
|
+
|
|
91
|
+
test('NEW LEG — effect size dz: a few subjects carrying it (high sd) ⇒ NULL', () => {
  // The mean clears the floor and CI>0, but the huge sd pushes dz below 0.5, so
  // the verdict is NULL (heavy-tail false-PASS killed).
  const heavyTail = { ...FULL_PASS, sd: 0.5 };
  const { passes, reasons } = controlVerdict(heavyTail, FULL_CFG);
  assert.equal(passes, false);
  assert.ok(reasons.some((reason) => /dz/.test(reason)));
});
|
|
97
|
+
|
|
98
|
+
test('NEW LEG — majority: <=50% of subjects positive ⇒ NULL', () => {
  // Exactly half positive is not a majority.
  const evenSplit = { ...FULL_PASS, pctPositive: 0.5 };
  const { passes, reasons } = controlVerdict(evenSplit, FULL_CFG);
  assert.equal(passes, false);
  assert.ok(reasons.some((reason) => /pctPositive/.test(reason)));
});
|
|
103
|
+
|
|
104
|
+
test('NEW LEG — sign-test: p >= perTestAlpha ⇒ NULL (even with mean+CI+majority)', () => {
  // p = 0.03 is above the 0.01 per-test alpha.
  const weakSign = { ...FULL_PASS, signPValue: 0.03 };
  const { passes, reasons } = controlVerdict(weakSign, FULL_CFG);
  assert.equal(passes, false);
  assert.ok(reasons.some((reason) => /sign-test/.test(reason)));
});
|
|
109
|
+
|
|
110
|
+
test('McNemar reads p-value with perTestAlpha — ignores the helper hardcoded .significant(0.05)', () => {
  const SUBJECTS = 20;
  const FLIPS = 7; // 7 flips, 0 losses
  const baseline = new Array(SUBJECTS).fill(0);
  const arm = baseline.map((_zero, index) => (index < FLIPS ? 1 : 0));
  const { pValue } = mcnemar(baseline, arm);
  assert.ok(pValue > 0.01 && pValue < 0.05, `p=${pValue.toFixed(3)} lands between the alphas`);
  assert.equal(significantAt(pValue, 0.01), false, 'not significant at the strict Bonferroni alpha');
  assert.equal(significantAt(pValue, 0.05), true, 'would be "significant" at the lax 0.05 the helper hardcodes');
});
|
|
118
|
+
|
|
119
|
+
test('registerGap diagnostic is emitted for decidable arms', () => {
  const echoOwn = (p) => p.fingerprint;
  const harness = fakeHarness(['baseline'], FORMAL, echoOwn);
  const { registerGap } = wrongTargetControl(harness, FORMAL).perArm.baseline;
  assert.ok(Number.isFinite(registerGap));
  assert.ok(registerGap <= 0.15, 'nearest foreigner really is same-register');
});
|