@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* generate-paper-tables.js — Level 3 paper manifest table generation + prose validation
|
|
4
|
+
*
|
|
5
|
+
* Reads config/paper-manifest.json, queries the DB, and:
|
|
6
|
+
* 1. Generates Table 2 (Evaluation Sample Summary) markdown
|
|
7
|
+
* 2. Generates Appendix D (Reproducibility Run IDs) markdown
|
|
8
|
+
* 3. Validates all prose N-count references in paper-full.md
|
|
9
|
+
* 4. Reports any discrepancies
|
|
10
|
+
*
|
|
11
|
+
* Usage:
|
|
12
|
+
* node scripts/generate-paper-tables.js # validate only
|
|
13
|
+
* node scripts/generate-paper-tables.js --generate # output generated tables
|
|
14
|
+
* node scripts/generate-paper-tables.js --diff # show diffs against paper
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { readFileSync, existsSync } from 'fs';
|
|
18
|
+
import { join, dirname } from 'path';
|
|
19
|
+
import { fileURLToPath } from 'url';
|
|
20
|
+
import Database from 'better-sqlite3';
|
|
21
|
+
|
|
22
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
23
|
+
const ROOT = join(__dirname, '..');
|
|
24
|
+
|
|
25
|
+
const MANIFEST_PATH = join(ROOT, 'config', 'paper-manifest.json');
|
|
26
|
+
const DB_PATH = join(ROOT, 'data', 'evaluations.db');
|
|
27
|
+
const PAPER_PATH = join(ROOT, 'docs', 'research', 'paper-full.md');
|
|
28
|
+
|
|
29
|
+
const doGenerate = process.argv.includes('--generate');
|
|
30
|
+
const doDiff = process.argv.includes('--diff');
|
|
31
|
+
|
|
32
|
+
// ── Helpers ─────────────────────────────────────────────────────────────────
|
|
33
|
+
|
|
34
|
+
function commaNum(n) {
|
|
35
|
+
return n.toLocaleString('en-US');
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
function numToWord(n) {
|
|
39
|
+
const words = {
|
|
40
|
+
1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
|
|
41
|
+
6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
|
|
42
|
+
20: 'twenty', 21: 'twenty-one', 22: 'twenty-two', 23: 'twenty-three',
|
|
43
|
+
24: 'twenty-four', 25: 'twenty-five', 26: 'twenty-six',
|
|
44
|
+
27: 'twenty-seven', 28: 'twenty-eight', 29: 'twenty-nine',
|
|
45
|
+
30: 'thirty', 31: 'thirty-one',
|
|
46
|
+
};
|
|
47
|
+
return words[n] || String(n);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// ── Main ────────────────────────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
function main() {
|
|
53
|
+
if (!existsSync(MANIFEST_PATH)) {
|
|
54
|
+
console.error(`Manifest not found: ${MANIFEST_PATH}`);
|
|
55
|
+
process.exit(1);
|
|
56
|
+
}
|
|
57
|
+
if (!existsSync(DB_PATH)) {
|
|
58
|
+
console.error(`Database not found: ${DB_PATH}`);
|
|
59
|
+
process.exit(1);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const manifest = JSON.parse(readFileSync(MANIFEST_PATH, 'utf8'));
|
|
63
|
+
const db = new Database(DB_PATH, { readonly: true });
|
|
64
|
+
|
|
65
|
+
// ── Query actual data ───────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
const evalData = [];
|
|
68
|
+
let totalAttempts = 0;
|
|
69
|
+
let totalScored = 0;
|
|
70
|
+
|
|
71
|
+
for (const eval_ of manifest.key_evaluations) {
|
|
72
|
+
const runIds = eval_.run_ids;
|
|
73
|
+
const judgePattern = eval_.primary_judge_pattern;
|
|
74
|
+
const placeholders = runIds.map(() => '?').join(',');
|
|
75
|
+
|
|
76
|
+
let scored;
|
|
77
|
+
if (eval_.unit === 'learner turn') {
|
|
78
|
+
const row = db.prepare(`
|
|
79
|
+
SELECT COUNT(*) as total,
|
|
80
|
+
SUM(CASE WHEN learner_overall_score IS NOT NULL THEN 1 ELSE 0 END) as scored
|
|
81
|
+
FROM evaluation_results
|
|
82
|
+
WHERE run_id IN (${placeholders}) AND judge_model LIKE ?
|
|
83
|
+
`).get(...runIds, judgePattern);
|
|
84
|
+
scored = row?.scored ?? 0;
|
|
85
|
+
} else {
|
|
86
|
+
const row = db.prepare(`
|
|
87
|
+
SELECT COUNT(*) as total,
|
|
88
|
+
SUM(CASE WHEN overall_score IS NOT NULL THEN 1 ELSE 0 END) as scored
|
|
89
|
+
FROM evaluation_results
|
|
90
|
+
WHERE run_id IN (${placeholders}) AND judge_model LIKE ?
|
|
91
|
+
`).get(...runIds, judgePattern);
|
|
92
|
+
scored = row?.scored ?? 0;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
evalData.push({
|
|
96
|
+
...eval_,
|
|
97
|
+
actual_scored: scored,
|
|
98
|
+
run_id_display: runIds.join(', '),
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
totalAttempts += eval_.expected_attempts;
|
|
102
|
+
totalScored += scored;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// ── Generate Table 2 ───────────────────────────────────────────────────
|
|
106
|
+
|
|
107
|
+
if (doGenerate || doDiff) {
|
|
108
|
+
console.log('═══ Generated Table 2: Evaluation Sample Summary ═══\n');
|
|
109
|
+
|
|
110
|
+
const lines = [
|
|
111
|
+
'| Evaluation | Run ID | Section | Total Attempts | Scored | Unit |',
|
|
112
|
+
'|------------|--------|---------|----------------|--------|------|',
|
|
113
|
+
];
|
|
114
|
+
|
|
115
|
+
for (const e of evalData) {
|
|
116
|
+
const label = e.label.replace(/×/g, '$\\times$');
|
|
117
|
+
lines.push(
|
|
118
|
+
`| ${label} | ${e.run_id_display} | ${e.section} | ${e.expected_attempts} | ${e.actual_scored} | ${e.unit} |`
|
|
119
|
+
);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
lines.push(
|
|
123
|
+
`| **Paper totals** | — | — | **${commaNum(totalAttempts)}** | **${commaNum(totalScored)}** | — |`
|
|
124
|
+
);
|
|
125
|
+
|
|
126
|
+
console.log(lines.join('\n'));
|
|
127
|
+
console.log();
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// ── Generate Appendix D ─────────────────────────────────────────────────
|
|
131
|
+
|
|
132
|
+
if (doGenerate || doDiff) {
|
|
133
|
+
console.log('═══ Generated Appendix D: Reproducibility and Key Evaluation Run IDs ═══\n');
|
|
134
|
+
|
|
135
|
+
const uniqueEvals = manifest.key_evaluations;
|
|
136
|
+
const uniqueRunIds = [...new Set(uniqueEvals.flatMap(e => e.run_ids))];
|
|
137
|
+
|
|
138
|
+
// Find duplicate run IDs (same run used for multiple evaluations)
|
|
139
|
+
const runIdCounts = {};
|
|
140
|
+
for (const e of uniqueEvals) {
|
|
141
|
+
for (const rid of e.run_ids) {
|
|
142
|
+
runIdCounts[rid] = (runIdCounts[rid] || 0) + 1;
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
const duplicateRuns = Object.entries(runIdCounts)
|
|
146
|
+
.filter(([, count]) => count > 1)
|
|
147
|
+
.map(([rid]) => rid);
|
|
148
|
+
|
|
149
|
+
// Multi-ID evals
|
|
150
|
+
const multiIdEvals = uniqueEvals.filter(e => e.run_ids.length > 1);
|
|
151
|
+
|
|
152
|
+
const notes = [];
|
|
153
|
+
if (duplicateRuns.length > 0) {
|
|
154
|
+
notes.push(`${duplicateRuns.join(', ')} serves both ${
|
|
155
|
+
uniqueEvals.filter(e => e.run_ids.some(r => duplicateRuns.includes(r))).map(e => e.label.toLowerCase()).join(' and ')
|
|
156
|
+
}`);
|
|
157
|
+
}
|
|
158
|
+
if (multiIdEvals.length > 0) {
|
|
159
|
+
notes.push(`${multiIdEvals[0].run_ids.join(' and ')} are combined as one ${multiIdEvals[0].label.toLowerCase()}`);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
console.log(`The ${numToWord(uniqueEvals.length)} key evaluations are listed below${notes.length > 0 ? ` (${notes.join('; ')})` : ''}:\n`);
|
|
163
|
+
|
|
164
|
+
const dLines = [
|
|
165
|
+
'| Finding | Run ID | Section |',
|
|
166
|
+
'|---------|--------|---------|',
|
|
167
|
+
];
|
|
168
|
+
for (const e of evalData) {
|
|
169
|
+
dLines.push(`| ${e.label} | ${e.run_id_display} | ${e.section} |`);
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
console.log(dLines.join('\n'));
|
|
173
|
+
console.log();
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// ── Prose validation ──────────────────────────────────────────────────
|
|
177
|
+
|
|
178
|
+
console.log('═══ Prose N-Count Validation ═══\n');
|
|
179
|
+
|
|
180
|
+
if (!existsSync(PAPER_PATH)) {
|
|
181
|
+
console.log(' Paper not found, skipping prose validation');
|
|
182
|
+
return;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
const paper = readFileSync(PAPER_PATH, 'utf8');
|
|
186
|
+
const lines = paper.split('\n');
|
|
187
|
+
let issues = 0;
|
|
188
|
+
|
|
189
|
+
const expectedScored = commaNum(totalScored);
|
|
190
|
+
const expectedAttempts = commaNum(totalAttempts);
|
|
191
|
+
|
|
192
|
+
// Check: paper-total N references (only those with comma-separated thousands)
|
|
193
|
+
// Per-evaluation N values (N=262, N=88, etc.) are intentionally excluded.
|
|
194
|
+
// Revision history (Appendix E) is excluded as it describes past states.
|
|
195
|
+
const appendixEStart = paper.indexOf('## Appendix E');
|
|
196
|
+
const mainBody = appendixEStart > 0 ? paper.substring(0, appendixEStart) : paper;
|
|
197
|
+
|
|
198
|
+
const nPattern = /N[=≈]\s*([\d,]+)\s*(?:primary\s+)?scored/g;
|
|
199
|
+
let match;
|
|
200
|
+
while ((match = nPattern.exec(mainBody)) !== null) {
|
|
201
|
+
const found = match[1];
|
|
202
|
+
// Only check values with commas (>= 1,000) — these are paper totals
|
|
203
|
+
if (found.includes(',') && found !== expectedScored) {
|
|
204
|
+
const lineNum = mainBody.substring(0, match.index).split('\n').length;
|
|
205
|
+
console.log(` ✗ Line ${lineNum}: found "N=${found} scored", expected "N=${expectedScored} scored"`);
|
|
206
|
+
issues++;
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Check: manifest prose_n_references patterns all appear
|
|
211
|
+
if (manifest.prose_n_references) {
|
|
212
|
+
for (const ref of manifest.prose_n_references) {
|
|
213
|
+
if (!mainBody.includes(ref.pattern)) {
|
|
214
|
+
console.log(` ✗ Expected pattern "${ref.pattern}" not found in ${ref.location}`);
|
|
215
|
+
issues++;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Check: no stale N values (in main body only, not revision history)
|
|
221
|
+
const staleValues = ['3,047', '3,112', '3,130', '2,906'];
|
|
222
|
+
for (const stale of staleValues) {
|
|
223
|
+
const staleRe = new RegExp(stale.replace(',', ','), 'g');
|
|
224
|
+
let m;
|
|
225
|
+
while ((m = staleRe.exec(mainBody)) !== null) {
|
|
226
|
+
const lineNum = mainBody.substring(0, m.index).split('\n').length;
|
|
227
|
+
console.log(` ✗ Line ${lineNum}: stale N value "${stale}" found`);
|
|
228
|
+
issues++;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Check: Table 2 totals row
|
|
233
|
+
const totalsPattern = new RegExp(
|
|
234
|
+
`\\*\\*${expectedAttempts.replace(/,/g, ',')}\\*\\*.*\\*\\*${expectedScored.replace(/,/g, ',')}\\*\\*`
|
|
235
|
+
);
|
|
236
|
+
if (!totalsPattern.test(paper)) {
|
|
237
|
+
console.log(` ✗ Table 2 totals row doesn't match expected ${expectedAttempts}/${expectedScored}`);
|
|
238
|
+
issues++;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Check: evaluation count in prose
|
|
242
|
+
const countWord = numToWord(manifest.totals.evaluations);
|
|
243
|
+
const countPattern = new RegExp(`${countWord} key evaluations`, 'g');
|
|
244
|
+
const countMatches = paper.match(countPattern) || [];
|
|
245
|
+
if (countMatches.length === 0) {
|
|
246
|
+
console.log(` ✗ "${countWord} key evaluations" not found in paper`);
|
|
247
|
+
issues++;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Check: judge accounting
|
|
251
|
+
const opusWord = numToWord(manifest.totals.opus_primary_count);
|
|
252
|
+
const opusCapWord = opusWord.charAt(0).toUpperCase() + opusWord.slice(1);
|
|
253
|
+
if (!paper.includes(`${opusCapWord} of the ${countWord}`)) {
|
|
254
|
+
console.log(` ✗ Judge accounting: expected "${opusCapWord} of the ${countWord}" not found`);
|
|
255
|
+
issues++;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
// Check: each run ID appears in paper
|
|
259
|
+
const allRunIds = manifest.key_evaluations.flatMap(e => e.run_ids);
|
|
260
|
+
const uniqueRunIds = [...new Set(allRunIds)];
|
|
261
|
+
for (const runId of uniqueRunIds) {
|
|
262
|
+
if (!paper.includes(runId)) {
|
|
263
|
+
console.log(` ✗ Run ID ${runId} not found in paper`);
|
|
264
|
+
issues++;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
// Check: per-row scored counts match Table 2
|
|
269
|
+
for (const e of evalData) {
|
|
270
|
+
if (e.actual_scored !== e.expected_scored) {
|
|
271
|
+
console.log(` ✗ ${e.label}: DB scored=${e.actual_scored}, manifest expected=${e.expected_scored}`);
|
|
272
|
+
issues++;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Check the row appears in paper with correct scored count
|
|
276
|
+
const rowPattern = new RegExp(
|
|
277
|
+
`${e.run_ids[0].replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}.*\\|.*\\|.*${e.expected_scored}`
|
|
278
|
+
);
|
|
279
|
+
if (!rowPattern.test(paper)) {
|
|
280
|
+
console.log(` ⚠ ${e.label}: scored count ${e.expected_scored} may not appear in Table 2 row`);
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
// ── Summary ─────────────────────────────────────────────────────────────
|
|
285
|
+
|
|
286
|
+
if (issues === 0) {
|
|
287
|
+
console.log(` ✓ All prose N-counts consistent (${expectedScored} scored, ${expectedAttempts} attempts)`);
|
|
288
|
+
console.log(` ✓ All ${uniqueRunIds.length} run IDs present in paper`);
|
|
289
|
+
console.log(` ✓ Judge accounting correct (${opusCapWord} of ${countWord} Opus-primary)`);
|
|
290
|
+
console.log('\n ALL PASSED ✓');
|
|
291
|
+
} else {
|
|
292
|
+
console.log(`\n ${issues} issue(s) found`);
|
|
293
|
+
process.exit(1);
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
db.close();
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
main();
|
|
@@ -416,7 +416,7 @@ function loadData(db, cells, sampleSize) {
|
|
|
416
416
|
WHERE success = 1
|
|
417
417
|
AND overall_score IS NOT NULL
|
|
418
418
|
AND suggestions IS NOT NULL
|
|
419
|
-
AND judge_model
|
|
419
|
+
AND judge_model LIKE 'claude-opus-%'
|
|
420
420
|
AND profile_name IN (${placeholders})
|
|
421
421
|
`;
|
|
422
422
|
|
|
@@ -459,7 +459,7 @@ function printCostEstimate(db) {
|
|
|
459
459
|
const factorialCount = db.prepare(`
|
|
460
460
|
SELECT COUNT(*) as n FROM evaluation_results
|
|
461
461
|
WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
|
|
462
|
-
AND judge_model
|
|
462
|
+
AND judge_model LIKE 'claude-opus-%'
|
|
463
463
|
AND (profile_name LIKE 'cell_1_%' OR profile_name LIKE 'cell_2_%'
|
|
464
464
|
OR profile_name LIKE 'cell_3_%' OR profile_name LIKE 'cell_4_%'
|
|
465
465
|
OR profile_name LIKE 'cell_5_%' OR profile_name LIKE 'cell_6_%'
|
|
@@ -469,7 +469,7 @@ function printCostEstimate(db) {
|
|
|
469
469
|
const allCount = db.prepare(`
|
|
470
470
|
SELECT COUNT(*) as n FROM evaluation_results
|
|
471
471
|
WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
|
|
472
|
-
AND judge_model
|
|
472
|
+
AND judge_model LIKE 'claude-opus-%'
|
|
473
473
|
`).get().n;
|
|
474
474
|
|
|
475
475
|
// Estimated tokens per call
|