@ijfw/memory-server 1.5.6 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ijfw-dashboard +20 -1
- package/package.json +4 -3
- package/src/audit-roster.js +89 -12
- package/src/brain/tiered-llm.js +57 -7
- package/src/cross-orchestrator-cli.js +390 -4
- package/src/cross-project-search.js +39 -1
- package/src/dashboard-server.js +23 -1
- package/src/dream/runner.mjs +560 -8
- package/src/handlers/brain-handler.js +101 -1
- package/src/importers/discover.js +1 -1
- package/src/memory/bench-metrics.js +289 -0
- package/src/memory/benchmark.js +1 -1
- package/src/memory/search.js +53 -1
- package/src/model-refresh.js +4 -2
- package/src/orchestrator/plan-checker.js +1 -1
- package/src/profile/audit.js +671 -0
- package/src/profile/capture.js +871 -0
- package/src/profile/derive-dialectic.js +242 -0
- package/src/profile/derive-heuristic.js +733 -0
- package/src/profile/derive.js +156 -0
- package/src/profile/egress.js +306 -0
- package/src/profile/eval/build-real-probes.mjs +197 -0
- package/src/profile/eval/corpus-from-reddit.mjs +166 -0
- package/src/profile/eval/corpus-from-reddit.test.mjs +121 -0
- package/src/profile/eval/corpus-from-transcripts.mjs +264 -0
- package/src/profile/eval/gate-b-behavior.mjs +420 -0
- package/src/profile/eval/gate-b-decision-run.mjs +171 -0
- package/src/profile/eval/gate-b-decision-run.test.mjs +141 -0
- package/src/profile/eval/gate-b-run.mjs +417 -0
- package/src/profile/eval/gate-b-run.test.mjs +204 -0
- package/src/profile/eval/gate-c-capture.mjs +323 -0
- package/src/profile/eval/harness.mjs +551 -0
- package/src/profile/eval/instrument-validation.mjs +248 -0
- package/src/profile/eval/instrument-validation.test.mjs +125 -0
- package/src/profile/eval/multi-subject-harness.mjs +106 -0
- package/src/profile/eval/multi-subject-harness.test.mjs +99 -0
- package/src/profile/eval/personas.test.mjs +83 -0
- package/src/profile/eval/plumbing.test.mjs +69 -0
- package/src/profile/eval/prereg.mjs +130 -0
- package/src/profile/eval/prereg.test.mjs +78 -0
- package/src/profile/eval/real-corpus.test.mjs +103 -0
- package/src/profile/eval/real-personas.mjs +109 -0
- package/src/profile/eval/run-real-corpus-concurrent.mjs +407 -0
- package/src/profile/eval/run-real-corpus.mjs +358 -0
- package/src/profile/eval/slug-quality.mjs +464 -0
- package/src/profile/eval/stylometry-features.js +85 -0
- package/src/profile/eval/stylometry-reference.js +16 -0
- package/src/profile/eval/stylometry.js +224 -0
- package/src/profile/eval/stylometry.test.mjs +103 -0
- package/src/profile/eval/synthetic-personas.js +91 -0
- package/src/profile/eval/verifier-features.mjs +170 -0
- package/src/profile/eval/verifier-logreg.mjs +74 -0
- package/src/profile/eval/verifier-pair.mjs +122 -0
- package/src/profile/eval/verifier-reference.mjs +68 -0
- package/src/profile/eval/verifier-scorer.mjs +30 -0
- package/src/profile/eval/wrong-target-control.mjs +168 -0
- package/src/profile/eval/wrong-target-control.test.mjs +124 -0
- package/src/profile/exemplar-capture.js +232 -0
- package/src/profile/exemplar-retrieve.js +138 -0
- package/src/profile/exemplar-store.js +314 -0
- package/src/profile/lock.js +64 -0
- package/src/profile/merge.js +624 -0
- package/src/profile/path-policy.js +213 -0
- package/src/profile/precision-stamp.mjs +151 -0
- package/src/profile/render-brief.js +717 -0
- package/src/profile/schema.js +244 -0
- package/src/profile/sensitivity.js +249 -0
- package/src/profile/serve.js +345 -0
- package/src/profile/store.js +261 -0
- package/src/profile/telemetry.js +289 -0
- package/src/recovery/checkpoint.js +7 -1
- package/src/server.js +194 -16
- package/src/.registry-meta-key.pem +0 -3
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
// corpus-from-reddit.mjs — Gate B v2 corpus ingest. PURE local transform: takes a RAW
|
|
2
|
+
// single-subreddit dump already on disk and emits the {id,docs} corpus loadRealPersonas
|
|
3
|
+
// wants PLUS a disjoint same-register foreigner pool. NO network is ever touched — the
|
|
4
|
+
// operator fetches/exports the subreddit separately (do not redistribute the corpus).
|
|
5
|
+
//
|
|
6
|
+
// EXPECTED RAW INPUT SCHEMA (single subreddit ⇒ same register by construction):
|
|
7
|
+
// A local file at `dumpPath`, either:
|
|
8
|
+
// (a) JSONL — one JSON object per line, OR
|
|
9
|
+
// (b) JSON — a top-level array of objects, OR { posts:[...] } / { data:[...] }.
|
|
10
|
+
// Each object must carry an author handle + a text body. Field names are flexible:
|
|
11
|
+
// author : `author` | `user` | `username` | `author_fullname` (first non-null wins, in this order)
|
|
12
|
+
// body : `body` | `title`+`selftext` (concatenated) | `text` | `title` (first non-blank wins, in this order)
|
|
13
|
+
// Rows with a deleted/removed/bot/empty author are DROPPED. Single-subreddit input is
|
|
14
|
+
// assumed (the same-register guarantee the wrong-target control relies on); a `subreddit`
|
|
15
|
+
// field, if present, is NOT cross-checked here — keep one subreddit per file.
|
|
16
|
+
//
|
|
17
|
+
// FAIL-CLOSED: an author below minDocsPerAuthor or below minTokensPerAuthor is DROPPED
|
|
18
|
+
// (never padded); if fewer than nPersonaAuthors + nForeignAuthors qualify, ingest THROWS.
|
|
19
|
+
// Never evaluate the downstream gate on an underpowered slice — ingest more, don't loosen.
|
|
20
|
+
|
|
21
|
+
import fs from 'node:fs';
|
|
22
|
+
import { tokenizeWords } from './stylometry-features.js';
|
|
23
|
+
|
|
24
|
+
/**
 * Default ingest configuration. Frozen so callers must override via a merged
 * copy (`{ ...REDDIT_DEFAULTS, ...opts }`) instead of mutating the shared object.
 */
export const REDDIT_DEFAULTS = Object.freeze({
  // Headline confirmatory N.
  nPersonaAuthors: 60,
  // Same-register foreigner pool (disjoint from personas).
  nForeignAuthors: 60,
  // Need >= 2 docs for a disjoint train/test split downstream.
  minDocsPerAuthor: 2,
  // train(1200) + test(600) floors of real-personas, with headroom.
  minTokensPerAuthor: 1800,
  // Drop near-empty bodies before counting docs.
  minBodyChars: 40,
  // Seed for the deterministic author ordering.
  seed: 1,
});
|
|
32
|
+
|
|
33
|
+
// Author handles that never represent a real person. Entries are lower-case
// because lookups are done on the lower-cased handle; '' covers a missing author.
const DELETED_AUTHORS = new Set([
  '[deleted]',
  '[removed]',
  'automoderator',
  'deleted',
  'removed',
  '',
]);
|
|
34
|
+
|
|
35
|
+
/**
 * FNV-1a style hash folded to an unsigned 32-bit integer.
 *
 * Used as a deterministic, content-independent ordering key (the original note
 * says this mirrors real-personas). The hash is taken over UTF-16 code units
 * (`charCodeAt`), not bytes or code points.
 *
 * @param {string} str text to hash
 * @returns {number} integer in [0, 2^32)
 */
function fnv1a(str) {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let hash = OFFSET_BASIS;
  let pos = 0;
  while (pos < str.length) {
    // XOR the code unit in, then multiply with 32-bit wraparound.
    hash = Math.imul(hash ^ str.charCodeAt(pos), PRIME);
    pos += 1;
  }
  return hash >>> 0;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Extract the author handle from one raw dump row.
 *
 * Field precedence: `author`, `user`, `username`, `author_fullname`; the first
 * one that is not null/undefined wins. The value is stringified and trimmed.
 *
 * @param {object|null|undefined} r raw row
 * @returns {string} trimmed handle, or '' when the row carries no author
 */
function pickAuthor(r) {
  // `?.` makes a null/undefined row (e.g. a literal `null` line in a JSONL dump)
  // yield '' instead of throwing a TypeError and aborting the ingest.
  const handle = r?.author ?? r?.user ?? r?.username ?? r?.author_fullname ?? '';
  return String(handle).trim();
}
|
|
48
|
+
/**
 * Extract the text body from one raw dump row.
 *
 * Precedence (first non-blank wins): `body`; then `title` + `selftext` joined
 * by a newline (only when `selftext` is a string); then `text`; then `title`.
 *
 * @param {object|null|undefined} r raw row
 * @returns {string} the body text, or '' when the row has no usable text
 */
function pickBody(r) {
  // Non-object rows (null, numbers, strings) carry no fields to read.
  if (r === null || typeof r !== 'object') return '';

  const nonBlank = (value) => typeof value === 'string' && value.trim() !== '';

  if (nonBlank(r.body)) return r.body;
  if (typeof r.selftext === 'string') {
    // Judge each part on its own: a whitespace-only or non-string title must
    // neither hide a real selftext nor crash on `.trim()`.
    const parts = [r.title, r.selftext].filter(nonBlank);
    if (parts.length > 0) return parts.join('\n').trim();
  }
  if (nonBlank(r.text)) return r.text;
  if (nonBlank(r.title)) return r.title;
  return '';
}
|
|
57
|
+
|
|
58
|
+
/**
 * Parse a local dump file into an array of row objects.
 *
 * Accepted layouts: a JSON array, a JSON object wrapping the rows under
 * `posts` or `data`, or JSONL (one JSON object per line, blank lines allowed).
 * Unparseable JSONL lines are skipped rather than aborting the ingest.
 * Entries that are not plain objects (null, numbers, strings, arrays) are
 * dropped because they cannot carry an author or a body.
 *
 * @param {string} dumpPath path of the dump on local disk
 * @returns {object[]} parsed rows
 * @throws {Error} when the file cannot be read (original error attached as
 *   `cause`), is empty, or yields no rows in JSONL mode
 */
export function parseDump(dumpPath) {
  let raw;
  try {
    raw = fs.readFileSync(dumpPath, 'utf8');
  } catch (e) {
    // Same message as before; the underlying error is chained for debugging.
    throw new Error(`cannot read dump ${dumpPath}: ${e.code || e.message}`, { cause: e });
  }
  const trimmed = raw.trim();
  if (!trimmed) throw new Error(`empty dump ${dumpPath}`);

  const isRow = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);

  // JSON array or wrapper object first (cheap to detect by first non-space char).
  if (trimmed[0] === '[' || trimmed[0] === '{') {
    let parsed = null;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Not one JSON document (typically JSONL) — fall through to line mode.
      parsed = null;
    }
    if (parsed) {
      const list = [parsed, parsed.posts, parsed.data].find((candidate) => Array.isArray(candidate));
      if (list) return list.filter(isRow);
      // A lone object without posts/data falls through and is read as one JSONL row.
    }
  }

  // JSONL: one object per line, tolerant of blank and malformed lines.
  const rows = [];
  for (const line of trimmed.split('\n')) {
    const s = line.trim();
    if (!s) continue;
    let value;
    try {
      value = JSON.parse(s);
    } catch {
      // Skip an unparseable line rather than abort the whole ingest.
      continue;
    }
    if (isRow(value)) rows.push(value);
  }
  if (!rows.length) throw new Error(`no parseable rows in dump ${dumpPath}`);
  return rows;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Group raw rows into `[{ id, docs }]`, keeping only authors that clear the
 * doc-count and token floors.
 *
 * Rows whose author is deleted/bot/empty, and rows whose body is missing or
 * shorter than `minBodyChars`, are dropped before grouping. The result is
 * ordered by author handle so it is stable across runs.
 *
 * @param {object[]} rows raw dump rows
 * @param {object} [cfg] overrides merged over REDDIT_DEFAULTS
 * @returns {{id: string, docs: string[]}[]} qualifying authors, sorted by id
 */
export function groupByAuthor(rows, cfg = REDDIT_DEFAULTS) {
  const { minBodyChars, minDocsPerAuthor, minTokensPerAuthor } = { ...REDDIT_DEFAULTS, ...cfg };

  // Pass 1: bucket usable bodies under their author handle.
  const docsByHandle = new Map();
  for (const record of rows) {
    const handle = pickAuthor(record);
    if (DELETED_AUTHORS.has(handle.toLowerCase())) continue;
    const text = pickBody(record);
    if (!text || text.length < minBodyChars) continue;
    const bucket = docsByHandle.get(handle);
    if (bucket) {
      bucket.push(text);
    } else {
      docsByHandle.set(handle, [text]);
    }
  }

  const countTokens = (docs) => {
    let total = 0;
    for (const doc of docs) total += tokenizeWords(doc).length;
    return total;
  };

  // Pass 2: apply the floors (doc count first, so tokenising is skipped for
  // authors that already fail), then order by handle.
  return [...docsByHandle]
    .filter(([, docs]) => !(docs.length < minDocsPerAuthor) && !(countTokens(docs) < minTokensPerAuthor))
    .map(([id, docs]) => ({ id, docs }))
    .sort((a, b) => {
      if (a.id < b.id) return -1;
      if (a.id > b.id) return 1;
      return 0;
    });
}
|
|
121
|
+
|
|
122
|
+
/**
 * ingestRedditCorpus(dumpPath, opts) → { corpus, foreigners, stats }.
 *
 *   corpus     — `nPersonaAuthors` qualifying authors, each `{ id, docs }`
 *   foreigners — `nForeignAuthors` DISJOINT qualifying authors taken from the
 *                same dump (same file ⇒ same subreddit ⇒ same register)
 *   stats      — row/author counts plus the floors that were applied
 *
 * Selection and partition depend only on `seed` and author identity, never on
 * any style distance (selection-bias guard).
 *
 * @param {string} dumpPath path of the local dump
 * @param {object} [opts] overrides merged over REDDIT_DEFAULTS
 * @returns {{corpus: object[], foreigners: object[], stats: object}}
 * @throws {Error} when an author count is not a non-negative integer, when the
 *   dump cannot be parsed, or when too few authors qualify (fail-closed)
 */
export function ingestRedditCorpus(dumpPath, opts = {}) {
  const cfg = { ...REDDIT_DEFAULTS, ...opts };

  // Fail closed on malformed counts: NaN/undefined would slip past the `<`
  // comparison below and silently produce an empty or underpowered slice.
  for (const key of ['nPersonaAuthors', 'nForeignAuthors']) {
    if (!Number.isInteger(cfg[key]) || cfg[key] < 0) {
      throw new Error(`${key} must be a non-negative integer, got ${String(cfg[key])}`);
    }
  }

  const rows = parseDump(dumpPath);
  const qualifying = groupByAuthor(rows, cfg);

  const need = cfg.nPersonaAuthors + cfg.nForeignAuthors;
  if (qualifying.length < need) {
    throw new Error(
      `too few qualifying authors: ${qualifying.length} < ${need} `
      + `(personas ${cfg.nPersonaAuthors} + foreigners ${cfg.nForeignAuthors}); ingest more — do not underpower`,
    );
  }

  // Seeded deterministic order, independent of content/style distance. Each
  // key is hashed once up front instead of on every comparator call.
  const keyed = qualifying.map((author) => ({ author, key: fnv1a(`${cfg.seed}:${author.id}`) }));
  keyed.sort((a, b) => {
    if (a.key !== b.key) return a.key - b.key;
    // Hash collision: fall back to the handle so the order stays total.
    if (a.author.id < b.author.id) return -1;
    return a.author.id > b.author.id ? 1 : 0;
  });
  const ordered = keyed.map((entry) => entry.author);

  // Adjacent, non-overlapping slices ⇒ personas and foreigners are disjoint.
  const corpus = ordered.slice(0, cfg.nPersonaAuthors);
  const foreigners = ordered.slice(cfg.nPersonaAuthors, need);

  return {
    corpus,
    foreigners,
    stats: {
      totalRows: rows.length,
      qualifyingAuthors: qualifying.length,
      personaAuthors: corpus.length,
      foreignAuthors: foreigners.length,
      minDocsPerAuthor: cfg.minDocsPerAuthor,
      minTokensPerAuthor: cfg.minTokensPerAuthor,
    },
  };
}
|
|
164
|
+
|
|
165
|
+
export const __test = { fnv1a, pickAuthor, pickBody };
|
|
166
|
+
export default { ingestRedditCorpus, groupByAuthor, parseDump, REDDIT_DEFAULTS };
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
// Gate B v2 — corpus-from-reddit ingest loader. Pure transform from a RAW local
|
|
2
|
+
// single-subreddit dump to the {id,docs} corpus loadRealPersonas wants + a disjoint
|
|
3
|
+
// same-register foreigner pool. The guards: fail-closed on too-few-authors / too-short
|
|
4
|
+
// (never silently underpower), author grouping is deterministic, the foreigner pool is
|
|
5
|
+
// disjoint from the persona corpus, and NO network is ever touched.
|
|
6
|
+
|
|
7
|
+
import { test } from 'node:test';
|
|
8
|
+
import assert from 'node:assert/strict';
|
|
9
|
+
import fs from 'node:fs';
|
|
10
|
+
import os from 'node:os';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import {
|
|
13
|
+
groupByAuthor, ingestRedditCorpus, REDDIT_DEFAULTS,
|
|
14
|
+
} from './corpus-from-reddit.mjs';
|
|
15
|
+
|
|
16
|
+
// ---- fixtures: synthetic Reddit-shaped rows (no network) ----
|
|
17
|
+
// One long post per row; many rows per author so each author clears the doc-count + token floors.
|
|
18
|
+
/**
 * Build one synthetic Reddit-shaped row. When `id` is omitted (falsy) a random
 * suffix is generated so rows by the same author still get distinct ids.
 */
function row(author, body, id) {
  const rowId = id || `${author}-${Math.random().toString(36).slice(2)}`;
  return { author, body, id: rowId, subreddit: 'testsub' };
}
|
|
23
|
+
// Filler body long enough that a handful of copies clears the fixture token floor.
const LONG = [
  'This is a reasonably long body of text written by a real person on a forum.',
  'It contains several sentences so that the token floor is comfortably cleared by each document.',
  'People tend to ramble a bit when they post, which is convenient for stylometry.',
  'The quick brown fox jumped over the lazy dog while the cat watched from a distance, unimpressed.',
].join(' ');

/** Build `nDocs` rows for one author, each with a distinct body and a stable id `<author>-<i>`. */
function makeAuthorRows(author, nDocs) {
  const rows = [];
  for (let i = 0; i < nDocs; i += 1) {
    rows.push(row(author, `${LONG} (post ${i} by ${author})`, `${author}-${i}`));
  }
  return rows;
}
|
|
31
|
+
|
|
32
|
+
/** Write `rows` as JSONL (one JSON object per line) into a fresh temp dir; returns the file path. */
function writeJsonl(rows) {
  const scratch = fs.mkdtempSync(path.join(os.tmpdir(), 'reddit-fix-'));
  const target = path.join(scratch, 'dump.jsonl');
  const lines = [];
  for (const entry of rows) lines.push(JSON.stringify(entry));
  fs.writeFileSync(target, lines.join('\n'));
  return target;
}
|
|
38
|
+
/** Write `rows` as a single JSON array document into a fresh temp dir; returns the file path. */
function writeJson(rows) {
  const scratch = fs.mkdtempSync(path.join(os.tmpdir(), 'reddit-fix-'));
  const target = path.join(scratch, 'dump.json');
  const payload = JSON.stringify(rows);
  fs.writeFileSync(target, payload);
  return target;
}
|
|
44
|
+
|
|
45
|
+
/** Fixture: authors u0..u9 with 6 long docs each — comfortably over the floors. */
function tenAuthors() {
  return Array.from({ length: 10 }, (_, a) => makeAuthorRows(`u${a}`, 6)).flat();
}

// Fixtures use small docs, so pass a fixture-scaled token floor (production default = 1800).
const FIX = { minTokensPerAuthor: 200 };
|
|
53
|
+
|
|
54
|
+
// Small helper: project a list of authors down to their ids.
const idsOf = (authors) => authors.map((author) => author.id);

test('groupByAuthor: groups rows into {id,docs}, drops deleted/bot/empty authors', () => {
  const rows = [
    row('alice', 'hello world one'),
    row('alice', 'hello world two'),
    row('[deleted]', 'should be dropped'),
    row('AutoModerator', 'bot post'),
    row('', 'empty author'),
    row('bob', 'a post by bob'),
  ];
  // Zeroed token floor + single-doc floor isolate the AUTHOR-dropping behavior under test.
  const floors = { minTokensPerAuthor: 0, minDocsPerAuthor: 1, minBodyChars: 1 };
  const grouped = groupByAuthor(rows, floors);
  assert.deepEqual(idsOf(grouped).sort(), ['alice', 'bob']);
  const alice = grouped.find((g) => g.id === 'alice');
  assert.equal(alice.docs.length, 2);
});

test('ingest emits the {id,docs} corpus shape loadRealPersonas consumes', () => {
  const dump = writeJsonl(tenAuthors());
  const { corpus } = ingestRedditCorpus(dump, { nPersonaAuthors: 4, nForeignAuthors: 4, ...FIX });
  assert.ok(Array.isArray(corpus));
  assert.equal(corpus.length, 4);
  corpus.forEach((author) => {
    assert.ok(typeof author.id === 'string' && author.id.length > 0);
    assert.ok(Array.isArray(author.docs) && author.docs.length >= REDDIT_DEFAULTS.minDocsPerAuthor);
    assert.ok(author.docs.every((d) => typeof d === 'string'));
  });
});

test('foreigner pool is DISJOINT from the persona corpus', () => {
  const dump = writeJsonl(tenAuthors());
  const { corpus, foreigners } = ingestRedditCorpus(dump, { nPersonaAuthors: 4, nForeignAuthors: 4, ...FIX });
  const personaIds = new Set(idsOf(corpus));
  const foreignIds = new Set(idsOf(foreigners));
  foreignIds.forEach((id) => {
    assert.ok(!personaIds.has(id), `foreigner ${id} must not be a persona`);
  });
  assert.equal(foreigners.length, 4);
});

test('accepts a JSON-array dump as well as JSONL', () => {
  const dump = writeJson(tenAuthors());
  const { corpus } = ingestRedditCorpus(dump, { nPersonaAuthors: 3, nForeignAuthors: 3, ...FIX });
  assert.equal(corpus.length, 3);
});

test('FAIL-CLOSED: too few qualifying authors THROWS (never silently underpowers)', () => {
  const dump = writeJsonl([...makeAuthorRows('only1', 6), ...makeAuthorRows('only2', 6)]);
  const ingest = () => ingestRedditCorpus(dump, { nPersonaAuthors: 4, nForeignAuthors: 4, ...FIX });
  assert.throws(ingest, /qualifying authors|too few/i);
});

test('FAIL-CLOSED: authors below the doc-count floor are dropped, not padded', () => {
  // u_short has only 1 doc (< minDocsPerAuthor) → must not appear in either pool.
  const dump = writeJsonl([...tenAuthors(), row('u_short', LONG)]);
  const { corpus, foreigners } = ingestRedditCorpus(dump, { nPersonaAuthors: 5, nForeignAuthors: 5, ...FIX });
  const selected = new Set(idsOf([...corpus, ...foreigners]));
  assert.ok(!selected.has('u_short'), 'under-floor author excluded');
});

test('FAIL-CLOSED: a missing dump file THROWS', () => {
  const ingest = () => ingestRedditCorpus('/no/such/dump.jsonl', {});
  assert.throws(ingest, /ENOENT|not found|read/i);
});

test('selection is deterministic for a fixed seed (same authors, same order)', () => {
  const dump = writeJsonl(tenAuthors());
  const opts = { nPersonaAuthors: 4, nForeignAuthors: 3, seed: 7, ...FIX };
  const first = ingestRedditCorpus(dump, opts);
  const second = ingestRedditCorpus(dump, opts);
  assert.deepEqual(idsOf(first.corpus), idsOf(second.corpus));
  assert.deepEqual(idsOf(first.foreigners), idsOf(second.foreigners));
});
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* profile/eval/corpus-from-transcripts.mjs — REAL-CORPUS builder for the
|
|
3
|
+
* profile-bus eval (v1.6.0). Turns the user's OWN Claude Code transcripts into
|
|
4
|
+
* the `{ sessions, feedback }` shape the REAL capture/derive pipeline consumes,
|
|
5
|
+
* so Gate B/C can be run against genuine "learns YOU" data instead of a synthetic
|
|
6
|
+
* fixture.
|
|
7
|
+
*
|
|
8
|
+
* ── PRIVACY (the load-bearing invariant) ───────────────────────────────────
|
|
9
|
+
* Raw transcript TEXT never leaves this process and is never persisted as a
|
|
10
|
+
* style record. Each user message is reduced to COUNTS via the REAL
|
|
11
|
+
* `extractMessageMetadata` (capture.js) and the string is discarded. The ONLY
|
|
12
|
+
* place raw-ish text survives is a `.session-feedback.jsonl` row's `context`
|
|
13
|
+
* snippet (the feedback detector's 120-char window) — and that file is written
|
|
14
|
+
* ONLY to the local, gitignored scratch dir and is NEVER sent to any cloud API
|
|
15
|
+
* (the cloud only ever sees the DERIVED brief = slugs, plus authored prompts).
|
|
16
|
+
*
|
|
17
|
+
* ── METHOD ─────────────────────────────────────────────────────────────────
|
|
18
|
+
* 1. Walk ~/.claude/projects/<projectDir>/<uuid>.jsonl.
|
|
19
|
+
* 2. Keep lines: type==='user' AND message.content is a STRING AND NOT isMeta
|
|
20
|
+
* AND not a slash-command/hook/system artifact (those are not the user's
|
|
21
|
+
* own authored prose — counting them would poison the style fingerprint).
|
|
22
|
+
* 3. Group by sessionId. For each session: fold per-message metadata into a
|
|
23
|
+
* counts-only accumulator (mirrors capture.js flushSession math EXACTLY) and
|
|
24
|
+
* run the REAL `detectFeedback` over each human message → feedback rows.
|
|
25
|
+
* 4. Emit one session record per session with: a `metadata` block in the exact
|
|
26
|
+
* `toDeriveMeta`/deriveStyle input shape, a `feedback[]` array in the
|
|
27
|
+
* .session-feedback.jsonl shape, a session `ts` (first human-message time),
|
|
28
|
+
* and `host: 'claude-code'`.
|
|
29
|
+
*
|
|
30
|
+
* Zero deps. Node built-ins only. No network. No LLM.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import { readFileSync, readdirSync, statSync } from 'node:fs';
|
|
34
|
+
import { join } from 'node:path';
|
|
35
|
+
import { homedir } from 'node:os';
|
|
36
|
+
import { extractMessageMetadata } from '../capture.js';
|
|
37
|
+
import { detectFeedback } from '../../feedback-detector.js';
|
|
38
|
+
|
|
39
|
+
/**
 * Default corpus root: `<home>/.claude/projects`, i.e. the user's Claude Code
 * transcript store.
 *
 * @returns {string} path under the current user's home directory
 */
export function defaultProjectsRoot() {
  const home = homedir();
  return join(home, '.claude', 'projects');
}
|
|
43
|
+
|
|
44
|
+
/**
 * Prefixes marking a user-role line as a machine artifact rather than authored
 * prose: slash-command expansions, hook-injected context, system reminders,
 * tool echoes and interrupt markers. Counting these would corrupt the style
 * signal.
 */
const ARTIFACT_PREFIXES = [
  '<command-name>',
  '<command-message>',
  '<command-args>',
  '<local-command-',
  '<system-reminder',
  '<user-memory-input>',
  'Caveat:',
  '[Request interrupted',
  '<bash-',
  '<task-',
  '<post-tool-use',
  '<pre-tool-use',
];

/**
 * Decide whether a message is "authored human prose".
 *
 * @param {*} text candidate message content
 * @returns {boolean} true only for a non-blank string that starts with none of
 *   the artifact prefixes and does not look like a tag-wrapped block
 */
function isAuthoredHumanText(text) {
  if (typeof text !== 'string') return false;
  const body = text.trim();
  if (body === '') return false;
  if (ARTIFACT_PREFIXES.some((prefix) => body.startsWith(prefix))) return false;
  // Anything that opens with a tag and contains a closing tag is treated as
  // machine context, even when the tag is not in the prefix list.
  const looksTagWrapped = body.startsWith('<') && body.includes('</');
  return !looksTagWrapped;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Create an empty counts-only accumulator for one session. The original note
 * says the math built on it matches capture.js flushSession.
 *
 * @param {string} sessionId id stored as `session_id`
 * @returns {object} zeroed counters, null timestamps and a fresh feedback list
 */
function freshAcc(sessionId) {
  const acc = { session_id: sessionId };
  // Epoch-ms bounds of the session's human messages; null until one is seen.
  acc.first_ts = null;
  acc.last_ts = null;
  // Running totals over human messages.
  acc.msg_count = 0;
  acc.total_chars = 0;
  acc.total_emojis = 0;
  // Number of messages flagged for code / formality markers.
  acc.code_msgs = 0;
  acc.formality_msgs = 0;
  // Detected feedback rows, in encounter order.
  acc.feedback = [];
  return acc;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Coerce `x` to a number and clamp it into [0, 1]. Values that are not finite
 * after coercion (NaN, ±Infinity) map to 0.
 *
 * @param {*} x value to clamp
 * @returns {number} number in [0, 1]
 */
function clamp01(x) {
  const value = Number(x);
  if (!Number.isFinite(value)) return 0;
  if (value < 0) return 0;
  if (value > 1) return 1;
  return value;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Fold one accumulator into the eval session record.
 *
 * Derives per-message averages (chars, emojis), clamped code/formality ratios
 * and a turn cadence, then emits them under the deriveStyle input names
 * (`emoji_per_msg`, `turn_cadence_per_min`). The original note says this
 * mirrors capture.js flushSession plus toDeriveMeta.
 *
 * @param {object} acc accumulator produced by freshAcc and filled by the parser
 * @param {string} host host label copied onto the record
 * @returns {object|null} session record, or null when the session has no messages
 */
function accToSession(acc, host) {
  const count = acc.msg_count;
  if (count <= 0) return null;

  // Round to a fixed number of decimals: factor 100 → 2 dp, 1000 → 3 dp.
  const roundTo = (value, factor) => Math.round(value * factor) / factor;

  // Mean seconds between consecutive human messages; stays 0 when there are
  // fewer than two messages or no usable, increasing timestamps.
  const hasSpan = count > 1
    && Number.isFinite(acc.first_ts)
    && Number.isFinite(acc.last_ts)
    && acc.last_ts > acc.first_ts;
  const cadenceSeconds = hasSpan ? ((acc.last_ts - acc.first_ts) / 1000) / (count - 1) : 0;

  const metadata = {
    avg_msg_chars: roundTo(acc.total_chars / count, 100),
    emoji_per_msg: roundTo(acc.total_emojis / count, 1000),
    code_block_ratio: roundTo(clamp01(acc.code_msgs / count), 1000),
    formality_markers: roundTo(clamp01(acc.formality_msgs / count), 1000),
    turn_cadence_per_min: cadenceSeconds > 0 ? roundTo(60 / cadenceSeconds, 1000) : 0,
  };

  // Session timestamp = first human message, as ISO-8601; null when unknown.
  let tsIso = null;
  if (Number.isFinite(acc.first_ts)) tsIso = new Date(acc.first_ts).toISOString();

  // Every feedback row is stamped with the session id and session timestamp,
  // and only kind/phrase/context are carried over.
  const feedback = acc.feedback.map(({ kind, phrase, context }) => ({
    session_id: acc.session_id,
    ts: tsIso,
    kind,
    phrase,
    context,
  }));

  return {
    session_id: acc.session_id,
    sessionId: acc.session_id,
    host,
    ts: tsIso,
    metadata,
    feedback,
    msg_count: count,
  };
}
|
|
126
|
+
|
|
127
|
+
/**
 * Fold one transcript .jsonl file into `accumulators` (a Map keyed by session
 * id). Nothing is returned; the map is mutated in place. Lines are grouped by
 * their own sessionId, so a file holding more than one session is handled.
 *
 * A line contributes only when it is valid JSON with type 'user', has a
 * `message`, is not flagged `isMeta`, passes isAuthoredHumanText and carries a
 * session id. An unreadable file is skipped silently (best-effort scan).
 *
 * @param {string} file path of the transcript
 * @param {Map<string, object>} accumulators per-session accumulators, mutated
 * @param {string} [_host] unused
 * @returns {void}
 */
function parseTranscript(file, accumulators, _host) {
  let text;
  try {
    text = readFileSync(file, 'utf8');
  } catch {
    return;
  }

  for (const rawLine of text.split('\n')) {
    if (rawLine.trim() === '') continue;

    let entry;
    try {
      entry = JSON.parse(rawLine);
    } catch {
      continue;
    }

    const isUserTurn = entry && entry.type === 'user' && entry.message;
    if (!isUserTurn || entry.isMeta === true) continue;

    const content = entry.message.content;
    // Rejects non-string content (e.g. structured blocks) and machine artifacts.
    if (!isAuthoredHumanText(content)) continue;

    const sessionKey = String(entry.sessionId || entry.session_id || '').trim();
    if (!sessionKey) continue;

    let acc = accumulators.get(sessionKey);
    if (!acc) {
      acc = freshAcc(sessionKey);
      accumulators.set(sessionKey, acc);
    }

    // METADATA ONLY — keep counts, never the message string.
    const { chars, emojis, hasCode, formalityHits } = extractMessageMetadata(content);
    acc.msg_count += 1;
    acc.total_chars += chars;
    acc.total_emojis += emojis;
    if (hasCode) acc.code_msgs += 1;
    if (formalityHits > 0) acc.formality_msgs += 1;

    // Track the earliest and latest parseable timestamps for the session.
    const stamp = Date.parse(entry.timestamp);
    if (Number.isFinite(stamp)) {
      if (acc.first_ts === null || stamp < acc.first_ts) acc.first_ts = stamp;
      if (acc.last_ts === null || stamp > acc.last_ts) acc.last_ts = stamp;
    }

    // Feedback detection over the human message.
    for (const hit of detectFeedback(content)) {
      acc.feedback.push(hit);
    }
  }
}
|
|
169
|
+
|
|
170
|
+
/**
 * buildCorpus(opts) -> { sessions, stats }.
 *
 * Scans every `*.jsonl` transcript under each entry of the projects root,
 * folds them into per-session records, and returns a capped, chronologically
 * sorted sample.
 *
 * @param {object} [opts]
 * @param {string} [opts.root]        projects root (default ~/.claude/projects)
 * @param {number} [opts.minMessages] keep sessions with >= this many human msgs (default 8)
 * @param {number} [opts.cap]         cap on total kept sessions (default 400)
 * @param {number} [opts.maxFiles]    safety cap on files scanned (default Infinity)
 * @returns {{sessions: object[], stats: object}} when the root cannot be read,
 *   `sessions` is empty and `stats` holds only an `error` string
 *
 * Stratified sampling: sessions are taken round-robin across project dirs
 * (oldest first within each dir) until `cap` is reached, so a few huge projects
 * don't dominate the profile. Ordering is deterministic (sorted names, then ts).
 */
export function buildCorpus(opts = {}) {
  const host = 'claude-code';
  const root = opts.root || defaultProjectsRoot();
  const minMessages = Number.isFinite(opts.minMessages) ? opts.minMessages : 8;
  const cap = Number.isFinite(opts.cap) ? opts.cap : 400;
  const fileLimit = opts.maxFiles ?? Infinity;
  const byTimestamp = (a, b) => Date.parse(a.ts) - Date.parse(b.ts);

  let dirs;
  try {
    dirs = readdirSync(root).filter((name) => !name.startsWith('.'));
  } catch {
    return { sessions: [], stats: { error: `cannot read ${root}` } };
  }
  dirs.sort();

  let filesScanned = 0;
  let sessionsSeen = 0;

  // Phase 1: per project dir, collect the sessions that clear `minMessages`.
  const perDirSessions = new Map();
  for (const dirName of dirs) {
    const dirPath = join(root, dirName);
    let transcripts;
    try {
      transcripts = readdirSync(dirPath).filter((name) => name.endsWith('.jsonl'));
    } catch {
      // Not a readable directory (e.g. a stray file in the root) — skip it.
      continue;
    }
    transcripts.sort();

    const accumulators = new Map();
    for (const name of transcripts) {
      if (filesScanned >= fileLimit) break;
      const full = join(dirPath, name);
      let isRegularFile = false;
      try {
        isRegularFile = statSync(full).isFile();
      } catch {
        isRegularFile = false;
      }
      if (!isRegularFile) continue;
      parseTranscript(full, accumulators, host);
      filesScanned += 1;
    }

    const kept = [];
    for (const acc of accumulators.values()) {
      sessionsSeen += 1;
      if (acc.msg_count < minMessages) continue;
      const sess = accToSession(acc, host);
      // Sessions without a usable timestamp cannot be placed in the time split.
      if (sess && sess.ts) kept.push(sess);
    }
    // Chronological within the dir so the round-robin below is deterministic.
    kept.sort(byTimestamp);
    if (kept.length) perDirSessions.set(dirName, kept);
  }

  // Phase 2: stratified cap — round r takes the r-th session of every dir that
  // still has one, stopping once `cap` is reached or every dir is exhausted.
  const dirKeys = [...perDirSessions.keys()].sort();
  const sessions = [];
  for (let round = 0, progressed = true; progressed && sessions.length < cap; round += 1) {
    progressed = false;
    for (const key of dirKeys) {
      if (sessions.length >= cap) break;
      const list = perDirSessions.get(key);
      if (round < list.length) {
        sessions.push(list[round]);
        progressed = true;
      }
    }
  }

  // Final chronological sort (the eval split is time-based).
  sessions.sort(byTimestamp);

  let sessionsWithFeedback = 0;
  let totalFeedbackRows = 0;
  for (const s of sessions) {
    if (s.feedback.length > 0) sessionsWithFeedback += 1;
    totalFeedbackRows += s.feedback.length;
  }

  const oldest = sessions.length ? sessions[0].ts : null;
  const newest = sessions.length ? sessions[sessions.length - 1].ts : null;

  return {
    sessions,
    stats: {
      projectDirs: dirs.length,
      filesScanned,
      sessionsSeen,
      sessionsKept: sessions.length,
      minMessages,
      cap,
      sessionsWithFeedback,
      totalFeedbackRows,
      tsMin: oldest,
      tsMax: newest,
    },
  };
}
|
|
263
|
+
|
|
264
|
+
export default { buildCorpus, defaultProjectsRoot };
|