@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,299 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * generate-paper-tables.js — Level 3 paper manifest table generation + prose validation
4
+ *
5
+ * Reads config/paper-manifest.json, queries the DB, and:
6
+ * 1. Generates Table 2 (Evaluation Sample Summary) markdown
7
+ * 2. Generates Appendix D (Reproducibility Run IDs) markdown
8
+ * 3. Validates all prose N-count references in paper-full.md
9
+ * 4. Reports any discrepancies
10
+ *
11
+ * Usage:
12
+ * node scripts/generate-paper-tables.js # validate only
13
+ * node scripts/generate-paper-tables.js --generate # output generated tables
14
+ * node scripts/generate-paper-tables.js --diff # show diffs against paper
15
+ */
16
+
17
+ import { readFileSync, existsSync } from 'fs';
18
+ import { join, dirname } from 'path';
19
+ import { fileURLToPath } from 'url';
20
+ import Database from 'better-sqlite3';
21
+
22
+ const __dirname = dirname(fileURLToPath(import.meta.url));
23
+ const ROOT = join(__dirname, '..');
24
+
25
+ const MANIFEST_PATH = join(ROOT, 'config', 'paper-manifest.json');
26
+ const DB_PATH = join(ROOT, 'data', 'evaluations.db');
27
+ const PAPER_PATH = join(ROOT, 'docs', 'research', 'paper-full.md');
28
+
29
+ const doGenerate = process.argv.includes('--generate');
30
+ const doDiff = process.argv.includes('--diff');
31
+
32
+ // ── Helpers ─────────────────────────────────────────────────────────────────
33
+
34
+ function commaNum(n) {
35
+ return n.toLocaleString('en-US');
36
+ }
37
+
38
+ function numToWord(n) {
39
+ const words = {
40
+ 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
41
+ 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
42
+ 20: 'twenty', 21: 'twenty-one', 22: 'twenty-two', 23: 'twenty-three',
43
+ 24: 'twenty-four', 25: 'twenty-five', 26: 'twenty-six',
44
+ 27: 'twenty-seven', 28: 'twenty-eight', 29: 'twenty-nine',
45
+ 30: 'thirty', 31: 'thirty-one',
46
+ };
47
+ return words[n] || String(n);
48
+ }
49
+
50
+ // ── Main ────────────────────────────────────────────────────────────────────
51
+
52
+ function main() {
53
+ if (!existsSync(MANIFEST_PATH)) {
54
+ console.error(`Manifest not found: ${MANIFEST_PATH}`);
55
+ process.exit(1);
56
+ }
57
+ if (!existsSync(DB_PATH)) {
58
+ console.error(`Database not found: ${DB_PATH}`);
59
+ process.exit(1);
60
+ }
61
+
62
+ const manifest = JSON.parse(readFileSync(MANIFEST_PATH, 'utf8'));
63
+ const db = new Database(DB_PATH, { readonly: true });
64
+
65
+ // ── Query actual data ───────────────────────────────────────────────────
66
+
67
+ const evalData = [];
68
+ let totalAttempts = 0;
69
+ let totalScored = 0;
70
+
71
+ for (const eval_ of manifest.key_evaluations) {
72
+ const runIds = eval_.run_ids;
73
+ const judgePattern = eval_.primary_judge_pattern;
74
+ const placeholders = runIds.map(() => '?').join(',');
75
+
76
+ let scored;
77
+ if (eval_.unit === 'learner turn') {
78
+ const row = db.prepare(`
79
+ SELECT COUNT(*) as total,
80
+ SUM(CASE WHEN learner_overall_score IS NOT NULL THEN 1 ELSE 0 END) as scored
81
+ FROM evaluation_results
82
+ WHERE run_id IN (${placeholders}) AND judge_model LIKE ?
83
+ `).get(...runIds, judgePattern);
84
+ scored = row?.scored ?? 0;
85
+ } else {
86
+ const row = db.prepare(`
87
+ SELECT COUNT(*) as total,
88
+ SUM(CASE WHEN overall_score IS NOT NULL THEN 1 ELSE 0 END) as scored
89
+ FROM evaluation_results
90
+ WHERE run_id IN (${placeholders}) AND judge_model LIKE ?
91
+ `).get(...runIds, judgePattern);
92
+ scored = row?.scored ?? 0;
93
+ }
94
+
95
+ evalData.push({
96
+ ...eval_,
97
+ actual_scored: scored,
98
+ run_id_display: runIds.join(', '),
99
+ });
100
+
101
+ totalAttempts += eval_.expected_attempts;
102
+ totalScored += scored;
103
+ }
104
+
105
+ // ── Generate Table 2 ───────────────────────────────────────────────────
106
+
107
+ if (doGenerate || doDiff) {
108
+ console.log('═══ Generated Table 2: Evaluation Sample Summary ═══\n');
109
+
110
+ const lines = [
111
+ '| Evaluation | Run ID | Section | Total Attempts | Scored | Unit |',
112
+ '|------------|--------|---------|----------------|--------|------|',
113
+ ];
114
+
115
+ for (const e of evalData) {
116
+ const label = e.label.replace(/×/g, '$\\times$');
117
+ lines.push(
118
+ `| ${label} | ${e.run_id_display} | ${e.section} | ${e.expected_attempts} | ${e.actual_scored} | ${e.unit} |`
119
+ );
120
+ }
121
+
122
+ lines.push(
123
+ `| **Paper totals** | — | — | **${commaNum(totalAttempts)}** | **${commaNum(totalScored)}** | — |`
124
+ );
125
+
126
+ console.log(lines.join('\n'));
127
+ console.log();
128
+ }
129
+
130
+ // ── Generate Appendix D ─────────────────────────────────────────────────
131
+
132
+ if (doGenerate || doDiff) {
133
+ console.log('═══ Generated Appendix D: Reproducibility and Key Evaluation Run IDs ═══\n');
134
+
135
+ const uniqueEvals = manifest.key_evaluations;
136
+ const uniqueRunIds = [...new Set(uniqueEvals.flatMap(e => e.run_ids))];
137
+
138
+ // Find duplicate run IDs (same run used for multiple evaluations)
139
+ const runIdCounts = {};
140
+ for (const e of uniqueEvals) {
141
+ for (const rid of e.run_ids) {
142
+ runIdCounts[rid] = (runIdCounts[rid] || 0) + 1;
143
+ }
144
+ }
145
+ const duplicateRuns = Object.entries(runIdCounts)
146
+ .filter(([, count]) => count > 1)
147
+ .map(([rid]) => rid);
148
+
149
+ // Multi-ID evals
150
+ const multiIdEvals = uniqueEvals.filter(e => e.run_ids.length > 1);
151
+
152
+ const notes = [];
153
+ if (duplicateRuns.length > 0) {
154
+ notes.push(`${duplicateRuns.join(', ')} serves both ${
155
+ uniqueEvals.filter(e => e.run_ids.some(r => duplicateRuns.includes(r))).map(e => e.label.toLowerCase()).join(' and ')
156
+ }`);
157
+ }
158
+ if (multiIdEvals.length > 0) {
159
+ notes.push(`${multiIdEvals[0].run_ids.join(' and ')} are combined as one ${multiIdEvals[0].label.toLowerCase()}`);
160
+ }
161
+
162
+ console.log(`The ${numToWord(uniqueEvals.length)} key evaluations are listed below${notes.length > 0 ? ` (${notes.join('; ')})` : ''}:\n`);
163
+
164
+ const dLines = [
165
+ '| Finding | Run ID | Section |',
166
+ '|---------|--------|---------|',
167
+ ];
168
+ for (const e of evalData) {
169
+ dLines.push(`| ${e.label} | ${e.run_id_display} | ${e.section} |`);
170
+ }
171
+
172
+ console.log(dLines.join('\n'));
173
+ console.log();
174
+ }
175
+
176
+ // ── Prose validation ──────────────────────────────────────────────────
177
+
178
+ console.log('═══ Prose N-Count Validation ═══\n');
179
+
180
+ if (!existsSync(PAPER_PATH)) {
181
+ console.log(' Paper not found, skipping prose validation');
182
+ return;
183
+ }
184
+
185
+ const paper = readFileSync(PAPER_PATH, 'utf8');
186
+ const lines = paper.split('\n');
187
+ let issues = 0;
188
+
189
+ const expectedScored = commaNum(totalScored);
190
+ const expectedAttempts = commaNum(totalAttempts);
191
+
192
+ // Check: paper-total N references (only those with comma-separated thousands)
193
+ // Per-evaluation N values (N=262, N=88, etc.) are intentionally excluded.
194
+ // Revision history (Appendix E) is excluded as it describes past states.
195
+ const appendixEStart = paper.indexOf('## Appendix E');
196
+ const mainBody = appendixEStart > 0 ? paper.substring(0, appendixEStart) : paper;
197
+
198
+ const nPattern = /N[=≈]\s*([\d,]+)\s*(?:primary\s+)?scored/g;
199
+ let match;
200
+ while ((match = nPattern.exec(mainBody)) !== null) {
201
+ const found = match[1];
202
+ // Only check values with commas (>= 1,000) — these are paper totals
203
+ if (found.includes(',') && found !== expectedScored) {
204
+ const lineNum = mainBody.substring(0, match.index).split('\n').length;
205
+ console.log(` ✗ Line ${lineNum}: found "N=${found} scored", expected "N=${expectedScored} scored"`);
206
+ issues++;
207
+ }
208
+ }
209
+
210
+ // Check: manifest prose_n_references patterns all appear
211
+ if (manifest.prose_n_references) {
212
+ for (const ref of manifest.prose_n_references) {
213
+ if (!mainBody.includes(ref.pattern)) {
214
+ console.log(` ✗ Expected pattern "${ref.pattern}" not found in ${ref.location}`);
215
+ issues++;
216
+ }
217
+ }
218
+ }
219
+
220
+ // Check: no stale N values (in main body only, not revision history)
221
+ const staleValues = ['3,047', '3,112', '3,130', '2,906'];
222
+ for (const stale of staleValues) {
223
+ const staleRe = new RegExp(stale.replace(',', ','), 'g');
224
+ let m;
225
+ while ((m = staleRe.exec(mainBody)) !== null) {
226
+ const lineNum = mainBody.substring(0, m.index).split('\n').length;
227
+ console.log(` ✗ Line ${lineNum}: stale N value "${stale}" found`);
228
+ issues++;
229
+ }
230
+ }
231
+
232
+ // Check: Table 2 totals row
233
+ const totalsPattern = new RegExp(
234
+ `\\*\\*${expectedAttempts.replace(/,/g, ',')}\\*\\*.*\\*\\*${expectedScored.replace(/,/g, ',')}\\*\\*`
235
+ );
236
+ if (!totalsPattern.test(paper)) {
237
+ console.log(` ✗ Table 2 totals row doesn't match expected ${expectedAttempts}/${expectedScored}`);
238
+ issues++;
239
+ }
240
+
241
+ // Check: evaluation count in prose
242
+ const countWord = numToWord(manifest.totals.evaluations);
243
+ const countPattern = new RegExp(`${countWord} key evaluations`, 'g');
244
+ const countMatches = paper.match(countPattern) || [];
245
+ if (countMatches.length === 0) {
246
+ console.log(` ✗ "${countWord} key evaluations" not found in paper`);
247
+ issues++;
248
+ }
249
+
250
+ // Check: judge accounting
251
+ const opusWord = numToWord(manifest.totals.opus_primary_count);
252
+ const opusCapWord = opusWord.charAt(0).toUpperCase() + opusWord.slice(1);
253
+ if (!paper.includes(`${opusCapWord} of the ${countWord}`)) {
254
+ console.log(` ✗ Judge accounting: expected "${opusCapWord} of the ${countWord}" not found`);
255
+ issues++;
256
+ }
257
+
258
+ // Check: each run ID appears in paper
259
+ const allRunIds = manifest.key_evaluations.flatMap(e => e.run_ids);
260
+ const uniqueRunIds = [...new Set(allRunIds)];
261
+ for (const runId of uniqueRunIds) {
262
+ if (!paper.includes(runId)) {
263
+ console.log(` ✗ Run ID ${runId} not found in paper`);
264
+ issues++;
265
+ }
266
+ }
267
+
268
+ // Check: per-row scored counts match Table 2
269
+ for (const e of evalData) {
270
+ if (e.actual_scored !== e.expected_scored) {
271
+ console.log(` ✗ ${e.label}: DB scored=${e.actual_scored}, manifest expected=${e.expected_scored}`);
272
+ issues++;
273
+ }
274
+
275
+ // Check the row appears in paper with correct scored count
276
+ const rowPattern = new RegExp(
277
+ `${e.run_ids[0].replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}.*\\|.*\\|.*${e.expected_scored}`
278
+ );
279
+ if (!rowPattern.test(paper)) {
280
+ console.log(` ⚠ ${e.label}: scored count ${e.expected_scored} may not appear in Table 2 row`);
281
+ }
282
+ }
283
+
284
+ // ── Summary ─────────────────────────────────────────────────────────────
285
+
286
+ if (issues === 0) {
287
+ console.log(` ✓ All prose N-counts consistent (${expectedScored} scored, ${expectedAttempts} attempts)`);
288
+ console.log(` ✓ All ${uniqueRunIds.length} run IDs present in paper`);
289
+ console.log(` ✓ Judge accounting correct (${opusCapWord} of ${countWord} Opus-primary)`);
290
+ console.log('\n ALL PASSED ✓');
291
+ } else {
292
+ console.log(`\n ${issues} issue(s) found`);
293
+ process.exit(1);
294
+ }
295
+
296
+ db.close();
297
+ }
298
+
299
+ main();
@@ -416,7 +416,7 @@ function loadData(db, cells, sampleSize) {
416
416
  WHERE success = 1
417
417
  AND overall_score IS NOT NULL
418
418
  AND suggestions IS NOT NULL
419
- AND judge_model IN ('claude-code', 'claude-code/opus')
419
+ AND judge_model LIKE 'claude-opus-%'
420
420
  AND profile_name IN (${placeholders})
421
421
  `;
422
422
 
@@ -459,7 +459,7 @@ function printCostEstimate(db) {
459
459
  const factorialCount = db.prepare(`
460
460
  SELECT COUNT(*) as n FROM evaluation_results
461
461
  WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
462
- AND judge_model IN ('claude-code', 'claude-code/opus')
462
+ AND judge_model LIKE 'claude-opus-%'
463
463
  AND (profile_name LIKE 'cell_1_%' OR profile_name LIKE 'cell_2_%'
464
464
  OR profile_name LIKE 'cell_3_%' OR profile_name LIKE 'cell_4_%'
465
465
  OR profile_name LIKE 'cell_5_%' OR profile_name LIKE 'cell_6_%'
@@ -469,7 +469,7 @@ function printCostEstimate(db) {
469
469
  const allCount = db.prepare(`
470
470
  SELECT COUNT(*) as n FROM evaluation_results
471
471
  WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
472
- AND judge_model IN ('claude-code', 'claude-code/opus')
472
+ AND judge_model LIKE 'claude-opus-%'
473
473
  `).get().n;
474
474
 
475
475
  // Estimated tokens per call