@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -1,282 +0,0 @@
1
- /**
2
- * Detailed statistical analysis of an evaluation run.
3
- * Usage: node scripts/analyze-run.mjs [run_id]
4
- */
5
- import Database from 'better-sqlite3';
6
- const db = new Database('data/evaluations.db');
7
-
8
- const RUN_ID = process.argv[2] || db.prepare(
9
- 'SELECT run_id FROM evaluation_results ORDER BY created_at DESC LIMIT 1'
10
- ).get()?.run_id;
11
-
12
- if (!RUN_ID) { console.error('No run found'); process.exit(1); }
13
- console.log(`Analyzing run: ${RUN_ID}\n`);
14
-
15
- // ============================================================
16
- // Helper functions
17
- // ============================================================
18
/**
 * Sample standard deviation (Bessel-corrected: divides by n - 1).
 *
 * @param {number[]} values - Numeric observations.
 * @returns {number} The sample standard deviation, or 0 when fewer than two
 *   values are supplied.
 */
function std(values) {
  const count = values.length;
  if (count < 2) {
    return 0;
  }

  let total = 0;
  for (const value of values) {
    total += value;
  }
  const average = total / count;

  let squaredDeviations = 0;
  for (const value of values) {
    squaredDeviations += (value - average) ** 2;
  }

  return Math.sqrt(squaredDeviations / (count - 1));
}
24
-
25
/**
 * Cohen's d effect size for two independent samples, using the pooled sample
 * standard deviation as the denominator.
 *
 * The pooled variance is (SS1 + SS2) / (n1 + n2 - 2), where SSk is the sum of
 * squared deviations of group k from its own mean (equivalent to
 * (nk - 1) * sk^2 with sk the sample standard deviation).
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {number} (mean1 - mean2) / pooledSD. Returns 0 whenever the effect
 *   size is undefined: either group is empty, there are no degrees of freedom
 *   (n1 + n2 <= 2), or the pooled standard deviation is 0.
 */
function cohensD(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  const degreesOfFreedom = n1 + n2 - 2;

  // Without this guard the means and/or pooled variance become NaN (0 / 0),
  // which slips past the `pooled === 0` check below and propagates to callers.
  if (n1 === 0 || n2 === 0 || degreesOfFreedom <= 0) {
    return 0;
  }

  const meanOf = (xs) => xs.reduce((a, b) => a + b, 0) / xs.length;
  const sumSquaresOf = (xs, m) => xs.reduce((acc, x) => acc + (x - m) ** 2, 0);

  const m1 = meanOf(group1);
  const m2 = meanOf(group2);
  const pooled = Math.sqrt(
    (sumSquaresOf(group1, m1) + sumSquaresOf(group2, m2)) / degreesOfFreedom,
  );

  return pooled === 0 ? 0 : (m1 - m2) / pooled;
}
33
-
34
/**
 * Percentile of a numeric sample using linear interpolation between the two
 * closest ranks (index = p / 100 * (n - 1) into the ascending-sorted values).
 *
 * @param {number[]} values - Numeric observations; the array is not mutated.
 * @param {number} p - Percentile in the range 0–100. Values outside the range
 *   are clamped to the nearest bound.
 * @returns {number} The interpolated percentile, or NaN when `values` is empty.
 */
function percentile(values, p) {
  // An empty sample has no percentile; NaN keeps numeric formatting such as
  // `.toFixed()` working in callers instead of throwing on `undefined`.
  if (values.length === 0) {
    return NaN;
  }

  const sorted = [...values].sort((a, b) => a - b);
  // Clamp so an out-of-range p cannot index past either end of the array.
  const clampedP = Math.min(100, Math.max(0, p));
  const idx = (clampedP / 100) * (sorted.length - 1);
  const lo = Math.floor(idx);
  const hi = Math.ceil(idx);

  if (lo === hi) {
    return sorted[lo];
  }
  return sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
}
41
-
42
- // ============================================================
43
- // 1. Summary statistics
44
- // ============================================================
45
- const allScores = db.prepare(`
46
- SELECT overall_score FROM evaluation_results
47
- WHERE run_id = ? AND overall_score IS NOT NULL
48
- `).all(RUN_ID).map(r => r.overall_score);
49
-
50
- const mean = allScores.reduce((a, b) => a + b, 0) / allScores.length;
51
- const sd = std(allScores);
52
- const median = percentile(allScores, 50);
53
- const q1 = percentile(allScores, 25);
54
- const q3 = percentile(allScores, 75);
55
-
56
- console.log('=== DESCRIPTIVE STATISTICS ===');
57
- console.log(`N = ${allScores.length}`);
58
- console.log(`Mean: ${mean.toFixed(1)} (SD: ${sd.toFixed(1)})`);
59
- console.log(`Median: ${median.toFixed(1)} (IQR: ${q1.toFixed(1)} – ${q3.toFixed(1)})`);
60
- console.log(`Range: ${Math.min(...allScores).toFixed(1)} – ${Math.max(...allScores).toFixed(1)}`);
61
-
62
- // ============================================================
63
- // 2. Per-model statistics
64
- // ============================================================
65
- console.log('\n=== PER-MODEL STATISTICS ===');
66
- const models = db.prepare(`
67
- SELECT DISTINCT model FROM evaluation_results
68
- WHERE run_id = ? AND overall_score IS NOT NULL
69
- `).all(RUN_ID).map(r => r.model);
70
-
71
- const modelData = {};
72
- for (const m of models) {
73
- const scores = db.prepare(`
74
- SELECT overall_score FROM evaluation_results
75
- WHERE run_id = ? AND model = ? AND overall_score IS NOT NULL
76
- `).all(RUN_ID, m).map(r => r.overall_score);
77
- modelData[m] = scores;
78
- const mn = scores.reduce((a, b) => a + b, 0) / scores.length;
79
- const s = std(scores);
80
- console.log(`${m}: M=${mn.toFixed(1)}, SD=${s.toFixed(1)}, N=${scores.length}, Range=[${Math.min(...scores).toFixed(1)}, ${Math.max(...scores).toFixed(1)}]`);
81
- }
82
-
83
- // ============================================================
84
- // 3. Pairwise effect sizes between models
85
- // ============================================================
86
console.log('\n=== PAIRWISE EFFECT SIZES (Cohen\'s d) ===');
// Only models with at least three scored results take part in the comparison.
const modelNames = Object.keys(modelData).filter(m => modelData[m].length >= 3);
for (let i = 0; i < modelNames.length; i++) {
  for (let j = i + 1; j < modelNames.length; j++) {
    const d = cohensD(modelData[modelNames[i]], modelData[modelNames[j]]);
    // d is signed (first model minus second), so the size label must be based
    // on its magnitude; otherwise every negative effect reads "negligible".
    const magnitude = Math.abs(d);
    let label = 'negligible';
    if (magnitude > 0.8) {
      label = 'large';
    } else if (magnitude > 0.5) {
      label = 'medium';
    } else if (magnitude > 0.2) {
      label = 'small';
    }
    console.log(`${modelNames[i]} vs ${modelNames[j]}: d=${d.toFixed(2)} (${label})`);
  }
}
95
-
96
- // ============================================================
97
- // 4. Per-dimension statistics
98
- // ============================================================
99
- console.log('\n=== DIMENSION STATISTICS ===');
100
- const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
101
- const dimCols = dims.map(d => `score_${d}`);
102
-
103
- for (const dim of dims) {
104
- const col = `score_${dim}`;
105
- const vals = db.prepare(`
106
- SELECT ${col} as v FROM evaluation_results
107
- WHERE run_id = ? AND ${col} IS NOT NULL
108
- `).all(RUN_ID).map(r => r.v);
109
- if (vals.length === 0) continue;
110
- const mn = vals.reduce((a, b) => a + b, 0) / vals.length;
111
- const s = std(vals);
112
- console.log(`${dim.padEnd(20)} M=${mn.toFixed(2)}, SD=${s.toFixed(2)}, N=${vals.length}`);
113
- }
114
-
115
- // ============================================================
116
- // 5. Per-dimension per-model
117
- // ============================================================
118
- console.log('\n=== DIMENSION × MODEL BREAKDOWN ===');
119
- const header = 'Model'.padEnd(25) + dims.map(d => d.substring(0, 8).padStart(9)).join('');
120
- console.log(header);
121
- for (const m of modelNames) {
122
- let line = m.padEnd(25);
123
- for (const dim of dims) {
124
- const col = `score_${dim}`;
125
- const val = db.prepare(`
126
- SELECT AVG(${col}) as v FROM evaluation_results
127
- WHERE run_id = ? AND model = ? AND ${col} IS NOT NULL
128
- `).get(RUN_ID, m);
129
- line += (val?.v?.toFixed(2) || 'N/A').padStart(9);
130
- }
131
- console.log(line);
132
- }
133
-
134
- // ============================================================
135
- // 6. Scenario difficulty ranking
136
- // ============================================================
137
- console.log('\n=== SCENARIO DIFFICULTY RANKING (hardest → easiest) ===');
138
- const scenarioStats = db.prepare(`
139
- SELECT scenario_id,
140
- AVG(overall_score) as mean,
141
- COUNT(*) as n
142
- FROM evaluation_results
143
- WHERE run_id = ? AND overall_score IS NOT NULL
144
- GROUP BY scenario_id
145
- ORDER BY mean ASC
146
- `).all(RUN_ID);
147
-
148
- for (const s of scenarioStats) {
149
- const scores = db.prepare(`
150
- SELECT overall_score FROM evaluation_results
151
- WHERE run_id = ? AND scenario_id = ? AND overall_score IS NOT NULL
152
- `).all(RUN_ID, s.scenario_id).map(r => r.overall_score);
153
- const s_sd = std(scores);
154
- const bar = '█'.repeat(Math.round(s.mean / 5));
155
- console.log(`${s.scenario_id.padEnd(40)} ${s.mean.toFixed(1).padStart(5)} (SD=${s_sd.toFixed(1).padStart(5)}) ${bar}`);
156
- }
157
-
158
- // ============================================================
159
- // 7. Inter-model agreement (scenario-level correlation)
160
- // ============================================================
161
- console.log('\n=== INTER-MODEL AGREEMENT ===');
162
- const scenarios = db.prepare(`
163
- SELECT DISTINCT scenario_id FROM evaluation_results
164
- WHERE run_id = ? AND overall_score IS NOT NULL
165
- `).all(RUN_ID).map(r => r.scenario_id);
166
-
167
- // Check if models rank scenarios similarly
168
/**
 * Fractional ("average") ranks: tied values share the mean of the 1-based
 * positions they occupy, so ties are not given an arbitrary order.
 *
 * @param {number[]} values - Values to rank.
 * @returns {number[]} Rank of each value, aligned with the input order.
 */
function averageRanks(values) {
  const order = values
    .map((value, index) => ({ value, index }))
    .sort((a, b) => a.value - b.value);
  const ranks = new Array(values.length);
  let start = 0;
  while (start < order.length) {
    let end = start;
    while (end + 1 < order.length && order[end + 1].value === order[start].value) {
      end++;
    }
    const sharedRank = (start + end) / 2 + 1;
    for (let k = start; k <= end; k++) {
      ranks[order[k].index] = sharedRank;
    }
    start = end + 1;
  }
  return ranks;
}

/**
 * Pearson correlation of two equal-length numeric arrays.
 *
 * @param {number[]} xs - First series.
 * @param {number[]} ys - Second series.
 * @returns {number} Correlation in [-1, 1], or 0 when either series has zero
 *   variance (correlation undefined).
 */
function pearsonCorrelation(xs, ys) {
  const n = xs.length;
  const meanX = xs.reduce((a, b) => a + b, 0) / n;
  const meanY = ys.reduce((a, b) => a + b, 0) / n;
  let sxy = 0;
  let sxx = 0;
  let syy = 0;
  for (let k = 0; k < n; k++) {
    const dx = xs[k] - meanX;
    const dy = ys[k] - meanY;
    sxy += dx * dy;
    sxx += dx * dx;
    syy += dy * dy;
  }
  const denom = Math.sqrt(sxx * syy);
  return denom === 0 ? 0 : sxy / denom;
}

// Prepared once and reused for every model/scenario lookup.
const scenarioScoreStmt = db.prepare(`SELECT overall_score FROM evaluation_results WHERE run_id = ? AND model = ? AND scenario_id = ? AND overall_score IS NOT NULL`);

for (let i = 0; i < modelNames.length; i++) {
  for (let j = i + 1; j < modelNames.length; j++) {
    const pairs = [];
    for (const s of scenarios) {
      // NOTE: .get() yields only the first matching row, so repeated results
      // for the same model/scenario are not averaged here.
      const s1 = scenarioScoreStmt.get(RUN_ID, modelNames[i], s);
      const s2 = scenarioScoreStmt.get(RUN_ID, modelNames[j], s);
      if (s1 && s2) pairs.push([s1.overall_score, s2.overall_score]);
    }
    if (pairs.length >= 3) {
      // Spearman rank correlation, computed as the Pearson correlation of
      // average ranks. Without ties this equals 1 - 6Σd²/(n(n²-1)); with ties
      // that shortcut formula is not valid.
      const n = pairs.length;
      const ranksA = averageRanks(pairs.map(([a]) => a));
      const ranksB = averageRanks(pairs.map(([, b]) => b));
      const rho = pearsonCorrelation(ranksA, ranksB);
      const agreement = rho > 0.7 ? 'strong' : rho > 0.4 ? 'moderate' : rho > 0 ? 'weak' : 'none';
      console.log(`${modelNames[i]} vs ${modelNames[j]}: Spearman ρ=${rho.toFixed(2)} (${agreement} agreement, N=${n})`);
    }
  }
}
191
-
192
- // ============================================================
193
- // 8. Base vs Recognition score analysis
194
- // ============================================================
195
- console.log('\n=== BASE vs RECOGNITION SCORE ANALYSIS ===');
196
- const dualRows = db.prepare(`
197
- SELECT model, base_score, recognition_score, overall_score
198
- FROM evaluation_results
199
- WHERE run_id = ? AND base_score IS NOT NULL AND recognition_score IS NOT NULL
200
- `).all(RUN_ID);
201
-
202
if (dualRows.length > 0) {
  // Arithmetic mean of a non-empty numeric array.
  const average = (xs) => xs.reduce((a, b) => a + b, 0) / xs.length;

  const bases = dualRows.map(r => r.base_score);
  const recogs = dualRows.map(r => r.recognition_score);
  // NOTE(review): the dualRows query filters base_score and recognition_score
  // for NULL but not overall_score — confirm NULL overall scores cannot occur.
  const overalls = dualRows.map(r => r.overall_score);

  console.log(`N (with both scores): ${dualRows.length}`);
  console.log(`Base: M=${average(bases).toFixed(1)}, SD=${std(bases).toFixed(1)}`);
  console.log(`Recognition: M=${average(recogs).toFixed(1)}, SD=${std(recogs).toFixed(1)}`);
  console.log(`Overall: M=${average(overalls).toFixed(1)}, SD=${std(overalls).toFixed(1)}`);

  const gap = cohensD(bases, recogs);
  // gap is signed (base minus recognition); label by magnitude so that a
  // large negative gap is not reported as "small".
  const gapMagnitude = Math.abs(gap);
  const gapLabel = gapMagnitude > 0.8 ? 'large' : gapMagnitude > 0.5 ? 'medium' : 'small';
  console.log(`Base vs Recognition gap: d=${gap.toFixed(2)} (${gapLabel})`);

  // Per-model breakdown
  console.log('\nPer-model dual scores:');
  for (const m of modelNames) {
    const mRows = dualRows.filter(r => r.model === m);
    if (mRows.length === 0) continue;
    const baseMean = average(mRows.map(r => r.base_score));
    const recogMean = average(mRows.map(r => r.recognition_score));
    console.log(` ${m}: Base=${baseMean.toFixed(1)}, Recog=${recogMean.toFixed(1)}, Gap=${(baseMean - recogMean).toFixed(1)}, N=${mRows.length}`);
  }
} else {
  console.log('No results with both base_score and recognition_score');
}
227
-
228
- // ============================================================
229
- // 9. Variance decomposition (eta-squared)
230
- // ============================================================
231
- console.log('\n=== VARIANCE DECOMPOSITION ===');
232
- // How much variance is explained by model vs scenario?
233
- const grandMean = mean;
234
- const SSTotal = allScores.reduce((sum, s) => sum + (s - grandMean) ** 2, 0);
235
-
236
- // SS between models
237
- let SSModel = 0;
238
- for (const m of modelNames) {
239
- const mScores = modelData[m];
240
- const mMean = mScores.reduce((a, b) => a + b, 0) / mScores.length;
241
- SSModel += mScores.length * (mMean - grandMean) ** 2;
242
- }
243
-
244
- // SS between scenarios
245
- let SSScenario = 0;
246
- for (const s of scenarioStats) {
247
- const sMean = s.mean;
248
- SSScenario += s.n * (sMean - grandMean) ** 2;
249
- }
250
-
251
- const etaModel = SSModel / SSTotal;
252
- const etaScenario = SSScenario / SSTotal;
253
- const etaResidual = 1 - etaModel - etaScenario;
254
-
255
- console.log(`Total SS: ${SSTotal.toFixed(1)}`);
256
- console.log(`Model effect (η²): ${(etaModel * 100).toFixed(1)}% — ${etaModel < 0.01 ? 'negligible' : etaModel < 0.06 ? 'small' : etaModel < 0.14 ? 'medium' : 'large'}`);
257
- console.log(`Scenario effect (η²): ${(etaScenario * 100).toFixed(1)}% — ${etaScenario < 0.01 ? 'negligible' : etaScenario < 0.06 ? 'small' : etaScenario < 0.14 ? 'medium' : 'large'}`);
258
- console.log(`Residual: ${(etaResidual * 100).toFixed(1)}%`);
259
-
260
- // ============================================================
261
- // 10. High-variance scenarios (discriminating power)
262
- // ============================================================
263
- console.log('\n=== SCENARIO DISCRIMINATING POWER (cross-model variance) ===');
264
- const scenarioVariance = [];
265
- for (const s of scenarios) {
266
- const scores = db.prepare(`
267
- SELECT overall_score FROM evaluation_results
268
- WHERE run_id = ? AND scenario_id = ? AND overall_score IS NOT NULL
269
- `).all(RUN_ID, s).map(r => r.overall_score);
270
- if (scores.length >= 2) {
271
- const sv = std(scores);
272
- scenarioVariance.push({ id: s, sd: sv, range: Math.max(...scores) - Math.min(...scores) });
273
- }
274
- }
275
- scenarioVariance.sort((a, b) => b.sd - a.sd);
276
- console.log('Scenario'.padEnd(40), 'SD'.padStart(6), 'Range'.padStart(7));
277
- for (const s of scenarioVariance) {
278
- console.log(s.id.padEnd(40), s.sd.toFixed(1).padStart(6), s.range.toFixed(1).padStart(7));
279
- }
280
-
281
- console.log('\n=== ANALYSIS COMPLETE ===');
282
- db.close();
@@ -1,44 +0,0 @@
1
- #!/usr/bin/env node
2
- import fs from 'fs';
3
-
4
/**
 * Collects the overall scores of successful tests for one scenario from a
 * run's JSONL progress log, grouped by profile name.
 *
 * Only events with `eventType === 'test_complete'`, `success === true`, a
 * non-null `overallScore` and a matching `scenarioId` are counted.
 *
 * @param {string} runId - Run identifier; the log is read from
 *   `./logs/eval-progress/<runId>.jsonl`.
 * @param {string} scenario - Scenario id to match.
 * @returns {Object<string, number[]>} Scores keyed by `profileName`, in log order.
 * @throws {Error} If the log file cannot be read.
 */
function getScores(runId, scenario) {
  const logPath = `./logs/eval-progress/${runId}.jsonl`;
  const lines = fs.readFileSync(logPath, 'utf8').split('\n');

  const byProfile = {};
  lines.forEach((line, index) => {
    if (!line.trim()) return;

    // One malformed or truncated line should not abort the whole comparison,
    // so it is reported and skipped.
    let event;
    try {
      event = JSON.parse(line);
    } catch (err) {
      console.warn(`Skipping malformed line ${index + 1} in ${logPath}: ${err.message}`);
      return;
    }
    // A literal `null` (or other non-object) line is valid JSON but not an event.
    if (event === null || typeof event !== 'object') return;

    const isMatch =
      event.eventType === 'test_complete' &&
      event.success === true &&
      event.overallScore != null &&
      event.scenarioId === scenario;
    if (!isMatch) return;

    if (!byProfile[event.profileName]) byProfile[event.profileName] = [];
    byProfile[event.profileName].push(event.overallScore);
  });
  return byProfile;
}
22
-
23
- const run1 = process.argv[2] || 'eval-2026-02-03-f5d4dd93';
24
- const run2 = process.argv[3] || 'eval-2026-02-03-b391d999';
25
-
26
- const scenarios = ['mood_frustration_to_breakthrough', 'misconception_correction_flow', 'mutual_transformation_journey'];
27
-
28
- for (const scenario of scenarios) {
29
- console.log(`=== ${scenario} ===`);
30
- const s1 = getScores(run1, scenario);
31
- const s2 = getScores(run2, scenario);
32
-
33
- const profiles = ['cell_1_base_single_unified', 'cell_3_base_multi_unified', 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified'];
34
-
35
- for (const p of profiles) {
36
- const scores1 = s1[p] || [];
37
- const scores2 = s2[p] || [];
38
- const avg1 = scores1.length > 0 ? (scores1.reduce((a, b) => a + b, 0) / scores1.length).toFixed(1) : 'N/A';
39
- const avg2 = scores2.length > 0 ? (scores2.reduce((a, b) => a + b, 0) / scores2.length).toFixed(1) : 'N/A';
40
- const shortP = p.replace('cell_', '').replace('_single_unified', '').replace('_multi_unified', '_m');
41
- console.log(` ${shortP.padEnd(12)} ${run1.slice(-8)}=${avg1.padStart(5)} (${scores1.join(',').padEnd(20)}) ${run2.slice(-8)}=${avg2.padStart(5)} (${scores2.join(',')})`);
42
- }
43
- console.log('');
44
- }
@@ -1,80 +0,0 @@
1
- #!/usr/bin/env node
2
- import fs from 'fs';
3
- import path from 'path';
4
-
5
- const dir = 'logs/tutor-dialogues';
6
- const files = fs.readdirSync(dir).filter(f => f.endsWith('.json'));
7
-
8
- // Get recent files from Feb 3
9
- const recentFiles = files.filter(f => {
10
- const stat = fs.statSync(path.join(dir, f));
11
- const date = new Date(stat.mtime);
12
- return date >= new Date('2026-02-03');
13
- });
14
-
15
- console.log(`Analyzing ${recentFiles.length} dialogue files from Feb 3+\n`);
16
-
17
- // Collect examples by profile and scenario
18
- const examples = { budget: [], recognition: [] };
19
-
20
- for (const f of recentFiles) {
21
- try {
22
- const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
23
- const profile = d.profileName;
24
- const scenario = d.scenario?.scenarioId || d.scenarioId || 'unknown';
25
-
26
- if (profile !== 'budget' && profile !== 'recognition') continue;
27
-
28
- // Only look at struggling_learner and concept_confusion
29
- if (!scenario.includes('struggling') && !scenario.includes('confusion')) continue;
30
-
31
- const suggestions = d.suggestions || [];
32
- const firstSuggestion = suggestions[0];
33
- if (!firstSuggestion) continue;
34
-
35
- const title = firstSuggestion.title || '';
36
- const message = firstSuggestion.message || '';
37
- const fullText = (title + ' ' + message).toLowerCase();
38
-
39
- const hasReview = fullText.includes('review');
40
- const hasForbidden = ['next lecture', 'move on to', 'continue with'].some(fb => fullText.includes(fb));
41
-
42
- examples[profile].push({
43
- file: f,
44
- scenario,
45
- title,
46
- messagePreview: message.substring(0, 150),
47
- hasReview,
48
- hasForbidden,
49
- passed: hasReview && !hasForbidden
50
- });
51
- } catch (e) {}
52
- }
53
-
54
- // Show failing examples for each profile
55
// Print pass/fail counts and a few sample suggestions for each profile.
for (const profile of ['budget', 'recognition']) {
  const data = examples[profile];
  const passing = data.filter(x => x.passed);
  const failing = data.filter(x => !x.passed);
  // With no matching dialogues the ratio is 0 / 0; report "N/A" rather than "NaN%".
  const passRate = data.length > 0
    ? `${(passing.length / data.length * 100).toFixed(1)}%`
    : 'N/A';

  console.log(`=== ${profile.toUpperCase()} ===`);
  console.log(`Total: ${data.length}, Passing: ${passing.length}, Failing: ${failing.length}`);
  console.log(`Pass rate: ${passRate}\n`);

  console.log('FAILING examples (missing review or has forbidden):');
  for (const ex of failing.slice(0, 5)) {
    console.log(` Scenario: ${ex.scenario}`);
    console.log(` Title: "${ex.title}"`);
    console.log(` Has review: ${ex.hasReview}, Has forbidden: ${ex.hasForbidden}`);
    console.log(` Preview: ${ex.messagePreview}...`);
    console.log(` File: ${ex.file}\n`);
  }

  console.log('PASSING examples:');
  for (const ex of passing.slice(0, 3)) {
    console.log(` Scenario: ${ex.scenario}`);
    console.log(` Title: "${ex.title}"`);
    console.log('');
  }
  console.log('');
}
@@ -1,158 +0,0 @@
1
- #!/usr/bin/env node
2
- import fs from 'fs';
3
- import path from 'path';
4
-
5
- const runId = process.argv[2] || 'eval-2026-02-03-f5d4dd93';
6
- const logPath = `./logs/eval-progress/${runId}.jsonl`;
7
-
8
- const lines = fs.readFileSync(logPath, 'utf8').split('\n').filter(l => l.trim());
9
- const events = lines.map(l => JSON.parse(l));
10
- const tests = events.filter(e => e.eventType === 'test_complete' && e.success && e.overallScore != null);
11
-
12
- // Group by profile and scenario
13
- const byProfileScenario = {};
14
- for (const t of tests) {
15
- const key = t.profileName + '|' + t.scenarioId;
16
- if (!byProfileScenario[key]) byProfileScenario[key] = [];
17
- byProfileScenario[key].push(t.overallScore);
18
- }
19
-
20
- const scenarios = [...new Set(tests.map(t => t.scenarioId))];
21
-
22
- // Compare cell_2 vs cell_6 (both single+psycho, but base vs recog)
23
- console.log('=== cell_2 (base+single+psycho) vs cell_6 (recog+single+psycho) ===');
24
- console.log('Scenario'.padEnd(35), 'cell_2', 'cell_6', 'Delta');
25
- console.log('-'.repeat(60));
26
- let total2 = 0, total6 = 0, count = 0;
27
- for (const s of scenarios) {
28
- const c2 = byProfileScenario['cell_2_base_single_psycho|' + s] || [];
29
- const c6 = byProfileScenario['cell_6_recog_single_psycho|' + s] || [];
30
- const avg2 = c2.length ? (c2.reduce((a,b)=>a+b,0)/c2.length) : null;
31
- const avg6 = c6.length ? (c6.reduce((a,b)=>a+b,0)/c6.length) : null;
32
- const delta = (avg2 !== null && avg6 !== null) ? (avg2 - avg6) : null;
33
- console.log(
34
- s.padEnd(35),
35
- (avg2 !== null ? avg2.toFixed(1) : 'N/A').padStart(5),
36
- (avg6 !== null ? avg6.toFixed(1) : 'N/A').padStart(5),
37
- (delta !== null ? (delta > 0 ? '+' : '') + delta.toFixed(1) : '-').padStart(6)
38
- );
39
- if (avg2 !== null && avg6 !== null) {
40
- total2 += avg2;
41
- total6 += avg6;
42
- count++;
43
- }
44
- }
45
- console.log('-'.repeat(60));
46
- console.log('Average'.padEnd(35), (total2/count).toFixed(1).padStart(5), (total6/count).toFixed(1).padStart(5), ((total2-total6)/count > 0 ? '+' : '') + ((total2-total6)/count).toFixed(1).padStart(5));
47
-
48
- // Now compare all base vs all recognition
49
- console.log('\n=== All BASE profiles vs All RECOGNITION profiles ===');
50
- const baseProfiles = ['cell_1_base_single_unified', 'cell_2_base_single_psycho', 'cell_3_base_multi_unified', 'cell_4_base_multi_psycho'];
51
- const recogProfiles = ['cell_5_recog_single_unified', 'cell_6_recog_single_psycho', 'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho'];
52
-
53
- console.log('Scenario'.padEnd(35), 'Base', 'Recog', 'Delta');
54
- console.log('-'.repeat(60));
55
- let totalBase = 0, totalRecog = 0, countScen = 0;
56
- for (const s of scenarios) {
57
- let baseScores = [], recogScores = [];
58
- for (const p of baseProfiles) {
59
- baseScores.push(...(byProfileScenario[p + '|' + s] || []));
60
- }
61
- for (const p of recogProfiles) {
62
- recogScores.push(...(byProfileScenario[p + '|' + s] || []));
63
- }
64
- const avgBase = baseScores.length ? baseScores.reduce((a,b)=>a+b,0)/baseScores.length : null;
65
- const avgRecog = recogScores.length ? recogScores.reduce((a,b)=>a+b,0)/recogScores.length : null;
66
- const delta = (avgBase !== null && avgRecog !== null) ? (avgRecog - avgBase) : null;
67
- console.log(
68
- s.padEnd(35),
69
- (avgBase !== null ? avgBase.toFixed(1) : 'N/A').padStart(5),
70
- (avgRecog !== null ? avgRecog.toFixed(1) : 'N/A').padStart(5),
71
- (delta !== null ? (delta > 0 ? '+' : '') + delta.toFixed(1) : '-').padStart(6)
72
- );
73
- if (avgBase !== null && avgRecog !== null) {
74
- totalBase += avgBase;
75
- totalRecog += avgRecog;
76
- countScen++;
77
- }
78
- }
79
- console.log('-'.repeat(60));
80
- console.log('Average'.padEnd(35), (totalBase/countScen).toFixed(1).padStart(5), (totalRecog/countScen).toFixed(1).padStart(5), ((totalRecog-totalBase)/countScen > 0 ? '+' : '') + ((totalRecog-totalBase)/countScen).toFixed(1).padStart(5));
81
-
82
- // Check raw scores for cell_2 to see variance
83
- console.log('\n=== cell_2 raw scores (to check variance) ===');
84
- for (const s of scenarios) {
85
- const scores = byProfileScenario['cell_2_base_single_psycho|' + s] || [];
86
- if (scores.length > 0) {
87
- console.log(s.padEnd(35), scores.join(', '));
88
- }
89
- }
90
-
91
- // Check which scenarios cell_2 wins vs loses
92
- console.log('\n=== Where cell_2 beats cell_6 vs where it loses ===');
93
- let wins = [], losses = [], ties = [];
94
- for (const s of scenarios) {
95
- const c2 = byProfileScenario['cell_2_base_single_psycho|' + s] || [];
96
- const c6 = byProfileScenario['cell_6_recog_single_psycho|' + s] || [];
97
- if (c2.length && c6.length) {
98
- const avg2 = c2.reduce((a,b)=>a+b,0)/c2.length;
99
- const avg6 = c6.reduce((a,b)=>a+b,0)/c6.length;
100
- const delta = avg2 - avg6;
101
- if (delta > 5) wins.push({ scenario: s, delta });
102
- else if (delta < -5) losses.push({ scenario: s, delta });
103
- else ties.push({ scenario: s, delta });
104
- }
105
- }
106
- console.log('cell_2 WINS (>5 pts):');
107
- wins.sort((a,b) => b.delta - a.delta).forEach(w => console.log(` ${w.scenario}: +${w.delta.toFixed(1)}`));
108
- console.log('cell_2 LOSES (<-5 pts):');
109
- losses.sort((a,b) => a.delta - b.delta).forEach(l => console.log(` ${l.scenario}: ${l.delta.toFixed(1)}`));
110
- console.log('TIES (±5 pts):');
111
- ties.forEach(t => console.log(` ${t.scenario}: ${t.delta > 0 ? '+' : ''}${t.delta.toFixed(1)}`));
112
-
113
- // Analyze dialogue files for validation patterns
114
- console.log('\n=== Dialogue Validation Analysis ===');
115
-
116
- const dir = 'logs/tutor-dialogues';
117
- const files = fs.readdirSync(dir).filter(f => f.startsWith('dialogue-177008') || f.startsWith('dialogue-177009'));
118
-
119
- const forbidden = ['next lecture', 'move on to', 'continue with'];
120
-
121
- let results = { budget: [], recognition: [] };
122
-
123
- for (const f of files) {
124
- try {
125
- const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
126
- const s = d.suggestions?.[0];
127
- if (s == null) continue;
128
-
129
- const userText = ((s.title || '') + ' ' + (s.message || '')).toLowerCase();
130
- const fullText = JSON.stringify(d.suggestions).toLowerCase();
131
-
132
- const hasReview = fullText.includes('review');
133
- const hasForbidden = forbidden.some(fb => userText.includes(fb));
134
-
135
- const profile = d.profileName === 'budget' ? 'budget' : d.profileName === 'recognition' ? 'recognition' : null;
136
- if (profile) {
137
- results[profile].push({
138
- hasReview,
139
- hasForbidden,
140
- score: (hasReview ? 50 : 0) + (hasForbidden ? 0 : 50)
141
- });
142
- }
143
- } catch (e) {}
144
- }
145
-
146
- for (const profile of ['budget', 'recognition']) {
147
- const r = results[profile];
148
- if (r.length === 0) continue;
149
- const avgScore = r.reduce((a, b) => a + b.score, 0) / r.length;
150
- const reviewPass = r.filter(x => x.hasReview).length;
151
- const forbiddenPass = r.filter(x => x.hasForbidden === false).length;
152
- console.log(profile + ':');
153
- console.log(' Samples:', r.length);
154
- console.log(' Review present (required):', reviewPass, '(' + (reviewPass/r.length*100).toFixed(1) + '%)');
155
- console.log(' Forbidden absent:', forbiddenPass, '(' + (forbiddenPass/r.length*100).toFixed(1) + '%)');
156
- console.log(' Avg validation score:', avgScore.toFixed(1));
157
- console.log('');
158
- }
@@ -1,64 +0,0 @@
1
- #!/usr/bin/env node
2
- import fs from 'fs';
3
- import path from 'path';
4
-
5
- const dir = 'logs/tutor-dialogues';
6
- const files = fs.readdirSync(dir).filter(f => f.endsWith('.json'));
7
-
8
- // Get recent files
9
- const recentFiles = files.filter(f => {
10
- const stat = fs.statSync(path.join(dir, f));
11
- return new Date(stat.mtime) >= new Date('2026-02-03');
12
- });
13
-
14
- console.log(`Scanning ${recentFiles.length} dialogue files...\n`);
15
-
16
- // Find recognition profile dialogues
17
- let recognitionExamples = [];
18
-
19
- for (const f of recentFiles) {
20
- try {
21
- const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
22
- if (d.profileName !== 'recognition') continue;
23
-
24
- const suggestions = d.suggestions || [];
25
- if (suggestions.length === 0) continue;
26
-
27
- const first = suggestions[0];
28
- const text = ((first.title || '') + ' ' + (first.message || '')).toLowerCase();
29
- const hasReview = text.includes('review');
30
-
31
- // Store suggestion content
32
- recognitionExamples.push({
33
- file: f,
34
- title: first.title || '',
35
- message: first.message || '',
36
- hasReview,
37
- text
38
- });
39
- } catch (e) {}
40
- }
41
-
42
- // Show examples without "review"
43
- const failingExamples = recognitionExamples.filter(e => !e.hasReview);
44
- const passingExamples = recognitionExamples.filter(e => e.hasReview);
45
-
46
- console.log(`Recognition profile: ${recognitionExamples.length} total dialogues`);
47
- console.log(` With "review": ${passingExamples.length}`);
48
- console.log(` Without "review": ${failingExamples.length}\n`);
49
-
50
- console.log('=== FAILING EXAMPLES (no "review" in text) ===\n');
51
- for (const ex of failingExamples.slice(0, 6)) {
52
- console.log(`File: ${ex.file}`);
53
- console.log(`Title: "${ex.title}"`);
54
- console.log(`Message: ${ex.message.substring(0, 300)}...`);
55
- console.log('---\n');
56
- }
57
-
58
- console.log('=== PASSING EXAMPLES (has "review") ===\n');
59
- for (const ex of passingExamples.slice(0, 3)) {
60
- console.log(`File: ${ex.file}`);
61
- console.log(`Title: "${ex.title}"`);
62
- console.log(`Message: ${ex.message.substring(0, 200)}...`);
63
- console.log('---\n');
64
- }
File without changes