@machinespirits/eval 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/apa.csl +2133 -0
  19. package/docs/research/build.sh +98 -0
  20. package/docs/research/figures/figure1.png +0 -0
  21. package/docs/research/figures/figure10.png +0 -0
  22. package/docs/research/figures/figure11.png +0 -0
  23. package/docs/research/figures/figure2.png +0 -0
  24. package/docs/research/figures/figure3.png +0 -0
  25. package/docs/research/figures/figure4.png +0 -0
  26. package/docs/research/figures/figure5.png +0 -0
  27. package/docs/research/figures/figure6.png +0 -0
  28. package/docs/research/figures/figure7.png +0 -0
  29. package/docs/research/figures/figure8.png +0 -0
  30. package/docs/research/figures/figure9.png +0 -0
  31. package/docs/research/header.tex +25 -0
  32. package/docs/research/paper-full.md +2565 -0
  33. package/docs/research/paper-short.md +436 -0
  34. package/docs/research/references.bib +1143 -0
  35. package/docs/research/slides-header.tex +188 -0
  36. package/docs/research/slides-pptx.md +363 -0
  37. package/docs/research/slides.md +531 -0
  38. package/docs/research/style-reference-pptx.py +199 -0
  39. package/package.json +5 -5
  40. package/scripts/analyze-eval-results.js +69 -17
  41. package/scripts/analyze-mechanism-traces.js +763 -0
  42. package/scripts/analyze-modulation-learning.js +498 -0
  43. package/scripts/analyze-prosthesis.js +144 -0
  44. package/scripts/analyze-run.js +264 -79
  45. package/scripts/assess-transcripts.js +853 -0
  46. package/scripts/browse-transcripts.js +854 -0
  47. package/scripts/check-parse-failures.js +73 -0
  48. package/scripts/code-dialectical-modulation.js +1320 -0
  49. package/scripts/download-data.sh +55 -0
  50. package/scripts/eval-cli.js +106 -18
  51. package/scripts/generate-paper-figures.js +663 -0
  52. package/scripts/generate-paper-figures.py +577 -76
  53. package/scripts/generate-paper-tables.js +299 -0
  54. package/scripts/qualitative-analysis-ai.js +3 -3
  55. package/scripts/render-sequence-diagram.js +694 -0
  56. package/scripts/test-latency.js +210 -0
  57. package/scripts/test-rate-limit.js +95 -0
  58. package/scripts/test-token-budget.js +332 -0
  59. package/scripts/validate-paper-manifest.js +670 -0
  60. package/services/__tests__/evalConfigLoader.test.js +2 -2
  61. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  62. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  63. package/services/evaluationRunner.js +975 -98
  64. package/services/evaluationStore.js +12 -4
  65. package/services/learnerTutorInteractionEngine.js +27 -2
  66. package/services/mockProvider.js +133 -0
  67. package/services/promptRewriter.js +1471 -5
  68. package/services/rubricEvaluator.js +55 -2
  69. package/services/transcriptFormatter.js +675 -0
  70. package/config/machinespirits-eval.code-workspace +0 -11
  71. package/docs/EVALUATION-VARIABLES.md +0 -589
  72. package/docs/REPLICATION-PLAN.md +0 -577
  73. package/scripts/analyze-run.mjs +0 -282
  74. package/scripts/compare-runs.js +0 -44
  75. package/scripts/compare-suggestions.js +0 -80
  76. package/scripts/dig-into-run.js +0 -158
  77. package/scripts/show-failed-suggestions.js +0 -64
  78. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,670 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * validate-paper-manifest.js — Paper manifest and consistency validation
4
+ *
5
+ * Level 1: Reads config/paper-manifest.json, queries the DB, and reports any
6
+ * discrepancies between expected and actual values.
7
+ *
8
+ * Level 2 (--deep): Paper-internal consistency checks that don't require the DB.
9
+ * Catches N-count drift, broken cross-references, orphaned run IDs, and
10
+ * Table 2 structural issues.
11
+ *
12
+ * Usage: node scripts/validate-paper-manifest.js [--fix-status] [--deep]
13
+ * --fix-status Mark stalled "running" runs as completed
14
+ * --deep Run deep paper-internal consistency checks (passes A–E)
15
+ */
16
+
17
+ import { readFileSync, existsSync } from 'fs';
18
+ import { join, dirname } from 'path';
19
+ import { fileURLToPath } from 'url';
20
+ import Database from 'better-sqlite3';
21
+
22
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent directory of the folder that holds this script.
const ROOT = join(__dirname, '..');

// Build a path from segments relative to ROOT.
const fromRoot = (...segments) => join(ROOT, ...segments);

// Files the validator reads.
const MANIFEST_PATH = fromRoot('config', 'paper-manifest.json');
const DB_PATH = fromRoot('data', 'evaluations.db');
const PAPER_PATH = fromRoot('docs', 'research', 'paper-full.md');

// A flag counts as set when it appears anywhere in the process arguments.
const hasFlag = (flag) => process.argv.includes(flag);
const fixStatus = hasFlag('--fix-status');
const deepMode = hasFlag('--deep');
31
+
32
+ // ── Helpers ─────────────────────────────────────────────────────────────────
33
+
34
// Running tallies of check outcomes; printed in the final summary.
let passCount = 0;
let warnCount = 0;
let failCount = 0;

// Print one result line: a leading space, the marker, then the message.
function emitResult(marker, msg) {
  console.log(` ${marker} ${msg}`);
}

/** Count a passing check and print it with a ✓ marker. */
function pass(msg) {
  passCount += 1;
  emitResult('✓', msg);
}

/** Count a warning and print it with a ⚠ marker. */
function warn(msg) {
  warnCount += 1;
  emitResult('⚠', msg);
}

/** Count a failed check and print it with a ✗ marker. */
function fail(msg) {
  failCount += 1;
  emitResult('✗', msg);
}
52
+
53
// Spelled-out numbers (1–40) as they may appear in prose, mapped to their values.
const WORD_TO_NUM = {
  one: 1,
  two: 2,
  three: 3,
  four: 4,
  five: 5,
  six: 6,
  seven: 7,
  eight: 8,
  nine: 9,
  ten: 10,
  eleven: 11,
  twelve: 12,
  thirteen: 13,
  fourteen: 14,
  fifteen: 15,
  sixteen: 16,
  seventeen: 17,
  eighteen: 18,
  nineteen: 19,
  twenty: 20,
  'twenty-one': 21,
  'twenty-two': 22,
  'twenty-three': 23,
  'twenty-four': 24,
  'twenty-five': 25,
  'twenty-six': 26,
  'twenty-seven': 27,
  'twenty-eight': 28,
  'twenty-nine': 29,
  thirty: 30,
  'thirty-one': 31,
  'thirty-two': 32,
  'thirty-three': 33,
  'thirty-four': 34,
  'thirty-five': 35,
  'thirty-six': 36,
  'thirty-seven': 37,
  'thirty-eight': 38,
  'thirty-nine': 39,
  forty: 40,
};

/**
 * Reverse lookup in WORD_TO_NUM.
 * @param {number} n Value to spell out.
 * @returns {string|undefined} The word for n, or undefined when n is not in the table.
 */
function numToWord(n) {
  for (const [word, value] of Object.entries(WORD_TO_NUM)) {
    if (value === n) {
      return word;
    }
  }
  return undefined;
}
69
+
70
/**
 * Split paper into main body (for validation) and revision history (excluded).
 * Appendix E is revision history — references there are editorial notes, not claims.
 *
 * The split happens at the first line that starts with "## Appendix E".
 *
 * @param {string} paper Full paper markdown.
 * @returns {{ body: string, revisionHistory: string }} Text before the heading and
 *   text from the heading onward; revisionHistory is '' when no such heading exists.
 */
function splitPaper(paper) {
  const appendixEMatch = /^## Appendix E/m.exec(paper);
  if (!appendixEMatch) {
    return { body: paper, revisionHistory: '' };
  }
  // Use the position of the line-anchored match itself. Searching again for the
  // matched text would also find it mid-line (e.g. inside "### Appendix E ...").
  const idx = appendixEMatch.index;
  return { body: paper.slice(0, idx), revisionHistory: paper.slice(idx) };
}
82
+
83
+ // ── Level 1: Manifest ↔ DB ─────────────────────────────────────────────────
84
+
85
/**
 * Level 1: check the manifest against the evaluation database and the paper.
 *
 * For each entry in manifest.key_evaluations this counts scored rows in
 * evaluation_results, checks every run's status in evaluation_runs (updating it
 * to 'completed' when fixStatus is set), then validates the manifest totals and
 * judge accounting. If the paper file exists it also checks the totals and run
 * IDs quoted there. Outcomes are reported through pass/warn/fail.
 *
 * @param {object} manifest Parsed paper manifest (key_evaluations, totals).
 * @param {object} db Database handle exposing prepare(sql).get(...)/.run(...).
 */
function runLevel1(manifest, db) {
  // Pin the locale: the patterns below look for counts written like "3,292",
  // which must not change with the locale of the machine running the script.
  const formatCount = (n) => n.toLocaleString('en-US');

  console.log('\n── Level 1: Manifest ↔ DB ─────────────────────────────────');

  // 1. Validate each key evaluation against DB
  console.log('\n ── Per-Run Validation ──');
  let computedAttempts = 0;
  let computedScored = 0;

  for (const eval_ of manifest.key_evaluations) {
    const label = eval_.label;
    const runIds = eval_.run_ids;
    const isLearnerUnit = eval_.unit === 'learner turn';

    const placeholders = runIds.map(() => '?').join(',');
    const profileFilter = eval_.profile_filter;
    const profileClause = profileFilter ? ' AND profile_name LIKE ?' : '';
    const params = [...runIds, eval_.primary_judge_pattern, ...(profileFilter ? [profileFilter] : [])];

    // Learner-turn evaluations are scored in a different column. The column name
    // comes from this fixed pair of literals, never from external input.
    const scoreColumn = isLearnerUnit ? 'learner_overall_score' : 'overall_score';
    const row = db.prepare(`
      SELECT
        COUNT(*) as total,
        SUM(CASE WHEN ${scoreColumn} IS NOT NULL THEN 1 ELSE 0 END) as scored
      FROM evaluation_results
      WHERE run_id IN (${placeholders})
        AND judge_model LIKE ?${profileClause}
    `).get(...params);

    // SUM() yields NULL when no rows match, hence the fallback to 0.
    const actualTotal = row?.total ?? 0;
    const actualScored = row?.scored ?? 0;

    for (const runId of runIds) {
      const runRow = db.prepare('SELECT status FROM evaluation_runs WHERE id = ?').get(runId);
      if (!runRow) {
        fail(`${label}: run ${runId} not found in evaluation_runs`);
      } else if (runRow.status !== 'completed') {
        if (fixStatus) {
          db.prepare("UPDATE evaluation_runs SET status = 'completed', completed_at = datetime('now') WHERE id = ?").run(runId);
          warn(`${label}: ${runId} was '${runRow.status}', fixed to 'completed'`);
        } else {
          fail(`${label}: ${runId} status='${runRow.status}' (expected 'completed'). Use --fix-status to fix.`);
        }
      }
    }

    if (isLearnerUnit) {
      if (actualScored !== eval_.expected_scored) {
        fail(`${label}: learner scored=${actualScored}, expected=${eval_.expected_scored}`);
      } else {
        pass(`${label}: ${actualScored} learner-scored`);
      }
    } else if (actualScored !== eval_.expected_scored) {
      fail(`${label}: scored=${actualScored}, expected=${eval_.expected_scored}`);
    } else if (actualTotal < eval_.expected_scored) {
      fail(`${label}: total=${actualTotal} < scored=${eval_.expected_scored}`);
    } else {
      pass(`${label}: ${actualScored} scored (${actualTotal} total)`);
    }

    // Totals are the sum of the manifest's own per-row expectations.
    computedAttempts += eval_.expected_attempts;
    computedScored += eval_.expected_scored;
  }

  // 2. Validate totals
  console.log('\n ── Totals ──');
  if (computedScored !== manifest.totals.expected_scored) {
    fail(`Scored total: sum of rows=${computedScored}, manifest=${manifest.totals.expected_scored}`);
  } else {
    pass(`Scored total: ${computedScored}`);
  }
  if (computedAttempts !== manifest.totals.expected_attempts) {
    fail(`Attempts total: sum of rows=${computedAttempts}, manifest=${manifest.totals.expected_attempts}`);
  } else {
    pass(`Attempts total: ${computedAttempts}`);
  }
  if (manifest.key_evaluations.length !== manifest.totals.evaluations) {
    fail(`Evaluation count: ${manifest.key_evaluations.length} rows, manifest says ${manifest.totals.evaluations}`);
  } else {
    pass(`Evaluation count: ${manifest.key_evaluations.length}`);
  }

  // 3. Validate judge accounting
  console.log('\n ── Judge Accounting ──');
  let opusCount = 0;
  let sonnetCount = 0;
  const sonnetRuns = [];
  for (const eval_ of manifest.key_evaluations) {
    // Any judge pattern that does not mention "sonnet" is counted as Opus.
    if (eval_.primary_judge_pattern.includes('sonnet')) {
      sonnetCount++;
      sonnetRuns.push(...eval_.run_ids);
    } else {
      opusCount++;
    }
  }
  if (opusCount !== manifest.totals.opus_primary_count) {
    fail(`Opus primary: counted ${opusCount}, manifest says ${manifest.totals.opus_primary_count}`);
  } else {
    pass(`Opus primary: ${opusCount}`);
  }
  if (sonnetCount !== manifest.totals.sonnet_primary_count) {
    fail(`Sonnet primary: counted ${sonnetCount}, manifest says ${manifest.totals.sonnet_primary_count}`);
  } else {
    pass(`Sonnet primary: ${sonnetCount} (${sonnetRuns.join(', ')})`);
  }

  // 4. Validate paper prose references
  console.log('\n ── Paper Prose ──');
  if (!existsSync(PAPER_PATH)) {
    warn(`Paper not found at ${PAPER_PATH}, skipping prose validation`);
    return;
  }

  const paper = readFileSync(PAPER_PATH, 'utf8');
  const { body } = splitPaper(paper);
  const expectedScored = formatCount(manifest.totals.expected_scored);
  const expectedAttempts = formatCount(manifest.totals.expected_attempts);

  const scoredPattern = new RegExp(`N[=≈]\\s*${expectedScored}`, 'g');
  const scoredMatches = paper.match(scoredPattern) || [];
  if (scoredMatches.length >= 4) {
    pass(`N=${expectedScored} appears ${scoredMatches.length} times in paper`);
  } else {
    warn(`N=${expectedScored} appears only ${scoredMatches.length} times (expected ≥4)`);
  }

  // Totals listed here are treated as stale. The current manifest total is
  // excluded so it is never reported as stale against itself.
  const staleTotals = ['3,047', '3,112', '2,906', '3,292', '3,347']
    .filter((total) => total !== expectedScored);
  for (const total of staleTotals) {
    const pat = new RegExp(`N[=≈]\\s*${total}`, 'g');
    const matches = body.match(pat) || [];
    if (matches.length > 0) {
      fail(`Stale N value found in body: ${pat.source} appears ${matches.length} times`);
    }
  }

  // The totals row is recognised by two bold numbers on one line: attempts, then scored.
  const totalsRowPattern = new RegExp(
    `\\*\\*${expectedAttempts}\\*\\*.*\\*\\*${expectedScored}\\*\\*`
  );
  if (totalsRowPattern.test(paper)) {
    pass(`Table 2 totals row matches: ${expectedAttempts}/${expectedScored}`);
  } else {
    fail(`Table 2 totals row doesn't match expected ${expectedAttempts}/${expectedScored}`);
  }

  const uniqueRunIds = [...new Set(manifest.key_evaluations.flatMap((e) => e.run_ids))];
  let missingFromPaper = 0;
  for (const runId of uniqueRunIds) {
    if (!paper.includes(runId)) {
      fail(`Run ${runId} not found in paper`);
      missingFromPaper++;
    }
  }
  if (missingFromPaper === 0) {
    pass(`All ${uniqueRunIds.length} unique run IDs found in paper`);
  }
}
260
+
261
+ // ── Level 2: Deep Paper-Internal Checks ─────────────────────────────────────
262
+
263
/**
 * Level 2: run the paper-internal consistency passes (A–E).
 *
 * Reads the paper once, splits off the body via splitPaper, and hands each pass
 * the text it works on. Records a failure and returns early when the paper file
 * does not exist.
 *
 * @param {object} manifest Parsed paper manifest, forwarded to passes A, B and E.
 */
function runDeepChecks(manifest) {
  console.log('\n── Level 2: Deep Paper Checks ─────────────────────────────');

  if (!existsSync(PAPER_PATH)) {
    fail(`Paper not found at ${PAPER_PATH}`);
    return;
  }

  const paper = readFileSync(PAPER_PATH, 'utf8');
  const { body } = splitPaper(paper);
  const paperLines = paper.split('\n');

  // ── Pass A: Parse Table 2 from Markdown ──
  passA(paper, manifest);

  // ── Pass B: Paper-Wide N-Count Consistency ──
  passB(body, manifest);

  // ── Pass C: Table Cross-References ──
  passC(body, paper);

  // ── Pass D: Section Cross-References ──
  passD(body, paperLines);

  // ── Pass E: Run ID Audit ──
  passE(body, paper, manifest);
}
291
+
292
/**
 * Pass A: Parse Table 2 from the markdown and verify internal consistency.
 * Sums each row's attempts/scored and checks against the stated totals row,
 * then cross-checks the scored sum and the row count against the manifest.
 *
 * Expected column order: Evaluation | Run ID | Section | Attempts | Scored | Unit.
 *
 * @param {string} paper Full paper markdown.
 * @param {object} manifest Parsed manifest; totals.expected_scored and
 *   totals.evaluations are read.
 */
function passA(paper, manifest) {
  console.log('\n ── Pass A: Table 2 Structure ──');

  // Fixed locale so reported numbers do not depend on the host's settings.
  const formatCount = (n) => n.toLocaleString('en-US');
  const lines = paper.split('\n');

  // Find Table 2 header
  const headerIdx = lines.findIndex((l) => /^\*\*Table 2:/.test(l.trim()));
  if (headerIdx === -1) {
    fail('Table 2 header not found');
    return;
  }

  // Find the table column header row (starts with | Evaluation) within the
  // first few lines after the caption.
  const searchEnd = Math.min(headerIdx + 5, lines.length);
  let tableStart = -1;
  for (let i = headerIdx; i < searchEnd; i++) {
    if (/^\|\s*Evaluation\s*\|/.test(lines[i])) {
      tableStart = i;
      break;
    }
  }
  if (tableStart === -1) {
    fail('Table 2 column headers not found');
    return;
  }

  // Parse data rows (skip header and separator); the table ends at the first
  // line that does not start with a pipe.
  const dataRows = [];
  let totalsRow = null;
  for (let i = tableStart + 2; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line.startsWith('|')) break;

    // Drop only the empty pieces produced by the outer pipes. Interior empty
    // cells are kept so the remaining columns stay in position.
    const cells = line.split('|').map((c) => c.trim());
    cells.shift();
    if (cells[cells.length - 1] === '') cells.pop();
    if (cells.length < 5) continue;

    // Check if this is the totals row (its numbers are bold, so strip '*' too)
    if (/Paper totals/i.test(cells[0])) {
      totalsRow = {
        attempts: Number.parseInt(cells[3].replace(/[*,]/g, ''), 10),
        scored: Number.parseInt(cells[4].replace(/[*,]/g, ''), 10),
      };
      continue;
    }

    const label = cells[0].replace(/\$[^$]+\$/g, '×'); // normalize LaTeX
    const runId = cells[1].trim();
    const section = cells[2].trim();
    const attempts = Number.parseInt(cells[3].replace(/,/g, ''), 10);
    const scored = Number.parseInt(cells[4].replace(/,/g, ''), 10);
    const unit = cells[5] || '';

    // Rows without numeric attempts/scored are not data rows.
    if (Number.isNaN(attempts) || Number.isNaN(scored)) continue;

    dataRows.push({ label, runId, section, attempts, scored, unit, lineNum: i + 1 });
  }

  if (dataRows.length === 0) {
    fail('No data rows parsed from Table 2');
    return;
  }

  const computedAttempts = dataRows.reduce((s, r) => s + r.attempts, 0);
  const computedScored = dataRows.reduce((s, r) => s + r.scored, 0);

  pass(`${dataRows.length} rows parsed, computed: ${formatCount(computedAttempts)} attempts / ${formatCount(computedScored)} scored`);

  if (totalsRow) {
    if (totalsRow.attempts !== computedAttempts) {
      fail(`Totals row attempts=${formatCount(totalsRow.attempts)}, computed=${formatCount(computedAttempts)}`);
    } else if (totalsRow.scored !== computedScored) {
      fail(`Totals row scored=${formatCount(totalsRow.scored)}, computed=${formatCount(computedScored)}`);
    } else {
      pass(`Totals row matches computed sums`);
    }
  } else {
    warn('No totals row found in Table 2');
  }

  // Cross-check with manifest
  if (computedScored !== manifest.totals.expected_scored) {
    fail(`Table 2 scored (${computedScored}) ≠ manifest expected_scored (${manifest.totals.expected_scored})`);
  } else {
    pass(`Table 2 scored matches manifest (${computedScored})`);
  }
  if (dataRows.length !== manifest.totals.evaluations) {
    fail(`Table 2 rows (${dataRows.length}) ≠ manifest evaluations (${manifest.totals.evaluations})`);
  } else {
    pass(`Table 2 row count matches manifest (${dataRows.length})`);
  }
}
385
+
386
/**
 * Pass B: Find all large N-count patterns in the body and check consistency.
 * Also checks spelled-out evaluation counts (e.g., "thirty-six").
 *
 * @param {string} body Paper text to scan.
 * @param {object} manifest Parsed manifest; totals.expected_scored,
 *   totals.expected_attempts and totals.evaluations are read.
 */
function passB(body, manifest) {
  console.log('\n ── Pass B: N-Count Consistency ──');

  // Fixed locale: the pattern below must look for "3,292" whatever the host locale is.
  const formatCount = (n) => n.toLocaleString('en-US');

  const expectedScored = manifest.totals.expected_scored;
  const expectedAttempts = manifest.totals.expected_attempts;
  const evalCount = manifest.totals.evaluations;
  const scoredText = formatCount(expectedScored);

  // Check that the paper total N appears consistently in the body
  const totalPattern = new RegExp(`N[=≈]\\s*${scoredText}`, 'g');
  const totalMatches = body.match(totalPattern) || [];
  if (totalMatches.length >= 4) {
    pass(`Paper total N=${scoredText} appears ${totalMatches.length} times in body`);
  } else {
    warn(`Paper total N=${scoredText} appears only ${totalMatches.length} times (expected ≥4)`);
  }

  // Check for stale N values that look like old paper totals (within ±200 of current total)
  // These are the most dangerous drift: someone changed the total but missed a reference
  const staleCandidate = [];
  for (const found of body.matchAll(/N[=≈]\s*([\d,]+)/g)) {
    const num = Number.parseInt(found[1].replace(/,/g, ''), 10);
    // Only values in the 2500–5000 band are considered at all; smaller Ns are
    // not candidates for a paper total.
    const inTotalBand = num >= 2500 && num <= 5000;
    const isCurrentValue = num === expectedScored || num === expectedAttempts;
    if (inTotalBand && !isCurrentValue && Math.abs(num - expectedScored) <= 200) {
      staleCandidate.push({ value: num, context: found[0] });
    }
  }

  if (staleCandidate.length > 0) {
    for (const { value, context } of staleCandidate) {
      fail(`Possible stale total: ${formatCount(value)} ("${context}") — close to but ≠ paper total ${scoredText}`);
    }
  } else {
    pass(`No stale N values near paper total detected`);
  }

  // Check spelled-out evaluation count
  const evalWord = numToWord(evalCount);
  if (!evalWord) return;

  // The lookbehind stops a short word from matching the tail of a longer one,
  // e.g. "six" inside "thirty-six evaluations".
  const wordPattern = new RegExp(`(?<![\\w-])${evalWord}\\s+(key\\s+)?evaluations`, 'gi');
  const wordMatches = body.match(wordPattern) || [];
  if (wordMatches.length > 0) {
    pass(`Evaluation count "${evalWord}" appears ${wordMatches.length} times`);
  } else {
    warn(`Spelled-out evaluation count "${evalWord}" not found in body`);
  }

  // Check for mismatched spelled-out counts (hyphenated forms from twenty-one up)
  const allEvalWordPattern = /(twenty|thirty|forty)-(\w+)\s+(key\s+)?evaluations/gi;
  for (const found of body.matchAll(allEvalWordPattern)) {
    const foundWord = found[0].replace(/\s+(key\s+)?evaluations/i, '').toLowerCase();
    const foundNum = WORD_TO_NUM[foundWord];
    if (foundNum && foundNum !== evalCount) {
      fail(`Stale evaluation count: "${foundWord}" (=${foundNum}) in body, expected "${evalWord}" (=${evalCount})`);
    }
  }
}
454
+
455
/**
 * Pass C: Parse all table headers and cross-references in the body.
 * Flags references to non-existent tables.
 *
 * @param {string} body Paper text whose "Table N" mentions are checked.
 * @param {string} fullPaper Full paper markdown, scanned for "**Table N:" headers.
 */
function passC(body, fullPaper) {
  console.log('\n ── Pass C: Table References ──');

  // Parse **Table N:** or **Table Nb:** headers from full paper
  const definedTables = new Set();
  for (const header of fullPaper.matchAll(/\*\*Table (\d+[a-z]?)(?::|\.)/g)) {
    definedTables.add(header[1]);
  }

  // Parse Table N references in body prose
  const references = new Map(); // tableId → count
  for (const ref of body.matchAll(/Table (\d+[a-z]?)\b/g)) {
    const id = ref[1];

    // Isolate the line holding this mention. lastIndexOf/indexOf return -1 on
    // the first/last line, so clamp to the start and end of the text.
    const lineStart = body.lastIndexOf('\n', ref.index) + 1;
    const newlineAt = body.indexOf('\n', ref.index + ref[0].length);
    const lineEnd = newlineAt === -1 ? body.length : newlineAt;
    const line = body.slice(lineStart, lineEnd);

    // Skip every mention on a line that contains a table header definition
    if (/\*\*Table \d+[a-z]?[:.]/i.test(line)) continue;

    references.set(id, (references.get(id) || 0) + 1);
  }

  const totalRefs = [...references.values()].reduce((s, c) => s + c, 0);
  pass(`${definedTables.size} tables defined, ${totalRefs} references in body`);

  let brokenRefs = 0;
  for (const [id, count] of references) {
    if (!definedTables.has(id)) {
      fail(`Table ${id} referenced ${count} time(s) but not defined`);
      brokenRefs++;
    }
  }
  if (brokenRefs === 0) {
    pass('All table references resolve to defined tables');
  }
}
497
+
498
/**
 * Pass D: Parse section headers and cross-references in the body.
 * Flags references to non-existent sections.
 *
 * @param {string} body Paper text whose "Section N" / "Section N.N" mentions are checked.
 * @param {string[]} paperLines Lines of the paper, scanned for headings.
 */
function passD(body, paperLines) {
  console.log('\n ── Pass D: Section References ──');

  // Section headers look like "## N." or "### N.N"; appendices like "## Appendix X".
  const sectionPattern = /^#{2,3}\s+(\d+(?:\.\d+)?)\b/;
  const appendixPattern = /^## Appendix ([A-Z])/;
  const definedSections = new Set();
  for (const line of paperLines) {
    const section = line.match(sectionPattern);
    if (section) {
      definedSections.add(section[1]);
    }
    // Appendices are included in the defined-section count reported below.
    const appendix = line.match(appendixPattern);
    if (appendix) {
      definedSections.add(`Appendix ${appendix[1]}`);
    }
  }

  // Parse "Section N.N" references in body
  const references = new Map();
  for (const ref of body.matchAll(/Section (\d+(?:\.\d+)?)\b/g)) {
    const id = ref[1];
    references.set(id, (references.get(id) || 0) + 1);
  }

  const totalRefs = [...references.values()].reduce((s, c) => s + c, 0);
  pass(`${definedSections.size} sections defined, ${totalRefs} "Section X.Y" references`);

  let brokenRefs = 0;
  for (const [id, count] of references) {
    // "Section 6" resolves through a "## 6. ..." heading, which registers "6".
    // "Section 6.3" needs its own "### 6.3 ..." heading; a defined parent
    // section is not enough.
    if (!definedSections.has(id)) {
      fail(`Section ${id} referenced ${count} time(s) but not defined`);
      brokenRefs++;
    }
  }
  if (brokenRefs === 0) {
    pass('All section references resolve to defined sections');
  }
}
553
+
554
/**
 * Pass E: Audit run IDs — every run ID in prose should be in Table 2 (or
 * Appendix D), and every Table 2 run ID should appear somewhere else in the paper.
 *
 * @param {string} body Paper text scanned for run IDs.
 * @param {string} fullPaper Full paper markdown (Table 2 and Appendix D are located here).
 * @param {object} manifest Accepted for signature parity with the other passes; not read.
 */
function passE(body, fullPaper, manifest) {
  console.log('\n ── Pass E: Run ID Audit ──');

  // NOTE(review): only IDs dated 2026 match this pattern — confirm that runs
  // from other years are never cited in the paper.
  const runIdPattern = /eval-2026-\d{2}-\d{2}-[a-f0-9]{8}/g;
  const lines = fullPaper.split('\n');

  // Find Table 2 boundaries
  const headerIdx = lines.findIndex((l) => /^\*\*Table 2:/.test(l.trim()));
  if (headerIdx === -1) {
    warn('Table 2 not found for run ID audit');
    return;
  }

  // Parse Table 2 run IDs. The first lines after the caption may be blank or
  // header rows; after that, the table ends at the first non-pipe line.
  const table2RunIds = new Set();
  for (let i = headerIdx; i < lines.length; i++) {
    const line = lines[i];
    if (i > headerIdx + 2 && !line.trim().startsWith('|')) break;
    for (const id of line.match(runIdPattern) ?? []) {
      table2RunIds.add(id);
    }
  }

  // Extract all run IDs from body
  const bodyRunIds = new Set(body.match(runIdPattern) ?? []);

  pass(`${bodyRunIds.size} run IDs in body, ${table2RunIds.size} in Table 2`);

  // Appendix D text, up to Appendix E when present. Without an Appendix D there
  // is nothing that could vouch for an ID missing from Table 2, so this stays empty.
  const appendixDIdx = fullPaper.indexOf('## Appendix D');
  const appendixEIdx = fullPaper.indexOf('## Appendix E');
  const appendixD = appendixDIdx === -1
    ? ''
    : fullPaper.slice(appendixDIdx, appendixEIdx !== -1 ? appendixEIdx : undefined);

  // Check: every body run ID should be in Table 2 or Appendix D
  let orphaned = 0;
  for (const runId of bodyRunIds) {
    if (!table2RunIds.has(runId) && !appendixD.includes(runId)) {
      warn(`Run ID ${runId} in body but not in Table 2 or Appendix D`);
      orphaned++;
    }
  }
  if (orphaned === 0) {
    pass('All body run IDs found in Table 2 or Appendix D');
  }

  // Check: every Table 2 run ID should appear somewhere in the paper body or appendices
  let unreferenced = 0;
  for (const runId of table2RunIds) {
    // One occurrence is the Table 2 row itself; anything beyond that is a reference.
    const fullCount = fullPaper.split(runId).length - 1;
    if (fullCount <= 1) {
      warn(`Run ID ${runId} appears only in Table 2, nowhere else`);
      unreferenced++;
    }
  }
  if (unreferenced === 0) {
    pass('All Table 2 run IDs referenced elsewhere in paper');
  }
}
627
+
628
+ // ── Main ────────────────────────────────────────────────────────────────────
629
+
630
/**
 * Entry point: load the manifest, run Level 1 when the database file exists,
 * run the deep checks when --deep was given, then print a summary.
 * Exits with status 1 when the manifest is missing or unreadable, or when any
 * check failed.
 */
function main() {
  console.log('═══ Paper Consistency Validation ═══');
  console.log('═'.repeat(50));

  if (!existsSync(MANIFEST_PATH)) {
    fail(`Manifest not found: ${MANIFEST_PATH}`);
    return process.exit(1);
  }

  let manifest;
  try {
    manifest = JSON.parse(readFileSync(MANIFEST_PATH, 'utf8'));
  } catch (err) {
    // Report a malformed manifest like any other failed check instead of crashing.
    fail(`Manifest could not be parsed: ${MANIFEST_PATH} (${err.message})`);
    return process.exit(1);
  }

  console.log(`\nManifest v${manifest.version} (${manifest.generated})`);
  // Fixed locale so the printed count does not depend on the host's settings.
  console.log(`Expected: ${manifest.totals.evaluations} evaluations, ${manifest.totals.expected_scored.toLocaleString('en-US')} scored`);

  // Level 1: Manifest ↔ DB (runs whenever the database file is present)
  if (existsSync(DB_PATH)) {
    // Opened writable only when --fix-status may need to update run rows.
    const db = new Database(DB_PATH, { readonly: !fixStatus });
    try {
      runLevel1(manifest, db);
    } finally {
      // Release the handle even if a query throws.
      db.close();
    }
  } else {
    warn(`Database not found at ${DB_PATH}, skipping Level 1`);
  }

  // Level 2: Deep paper checks (only with --deep)
  if (deepMode) {
    runDeepChecks(manifest);
  }

  // ── Summary ─────────────────────────────────────────────────────────────
  console.log('\n' + '═'.repeat(50));
  console.log(`Summary: ${passCount} pass, ${warnCount} warn, ${failCount} fail`);
  if (failCount > 0) {
    console.log('\nFAILED — fix the issues above before building the paper.');
    process.exit(1);
  } else if (warnCount > 0) {
    console.log('\nPASSED with warnings.');
  } else {
    console.log('\nALL PASSED ✓');
  }
}
669
+
670
+ main();
@@ -166,7 +166,7 @@ describe('resolveModel (string format)', () => {
166
166
  it('resolves "anthropic.opus"', () => {
167
167
  const r = resolveModel('anthropic.opus');
168
168
  assert.strictEqual(r.provider, 'anthropic');
169
- assert.strictEqual(r.model, 'claude-opus-4-5');
169
+ assert.strictEqual(r.model, 'claude-opus-4-6');
170
170
  });
171
171
 
172
172
  it('resolves "openai.mini"', () => {
@@ -184,7 +184,7 @@ describe('resolveModel (string format)', () => {
184
184
  it('resolves "openrouter.sonnet" to openrouter model ID', () => {
185
185
  const r = resolveModel('openrouter.sonnet');
186
186
  assert.strictEqual(r.provider, 'openrouter');
187
- assert.strictEqual(r.model, 'anthropic/claude-sonnet-4.5');
187
+ assert.strictEqual(r.model, 'anthropic/claude-sonnet-4.6');
188
188
  });
189
189
 
190
190
  it('resolves "openrouter.nemotron"', () => {