@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +14 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
  110. package/types.ts +0 -165
  111. package/utils/haptics.ts +0 -45
@@ -0,0 +1,282 @@
1
+ /**
2
+ * Detailed statistical analysis of an evaluation run.
3
+ * Usage: node scripts/analyze-run.mjs [run_id]
4
+ */
5
+ import Database from 'better-sqlite3';
6
/**
 * Open the evaluation database, refusing to create it when it is missing.
 *
 * Without `fileMustExist`, running the script from the wrong directory would
 * create a new, empty database file and then fail on the first query.
 *
 * @param {string} dbPath - Path to the SQLite file, relative to the cwd.
 * @returns {Database} An open better-sqlite3 handle.
 */
function openEvaluationDb(dbPath) {
  try {
    return new Database(dbPath, { fileMustExist: true });
  } catch (err) {
    console.error(`Could not open ${dbPath}: ${err.message}`);
    process.exit(1);
  }
}

const db = openEvaluationDb('data/evaluations.db');

// Use the run id from the command line, otherwise the most recently created run.
const RUN_ID = process.argv[2] || db.prepare(
  'SELECT run_id FROM evaluation_results ORDER BY created_at DESC LIMIT 1'
).get()?.run_id;

if (!RUN_ID) {
  console.error('No run found');
  db.close();
  process.exit(1);
}
console.log(`Analyzing run: ${RUN_ID}\n`);
14
+
15
+ // ============================================================
16
+ // Helper functions
17
+ // ============================================================
18
/**
 * Sample standard deviation (n - 1 denominator).
 *
 * @param {number[]} values - Observations.
 * @returns {number} The standard deviation, or 0 when fewer than two values are given.
 */
function std(values) {
  const count = values.length;
  if (count < 2) {
    return 0;
  }

  let total = 0;
  for (const value of values) {
    total += value;
  }
  const average = total / count;

  let squaredDeviations = 0;
  for (const value of values) {
    squaredDeviations += (value - average) ** 2;
  }

  return Math.sqrt(squaredDeviations / (count - 1));
}
24
+
25
/**
 * Cohen's d effect size between two independent groups, using the pooled
 * sample standard deviation.
 *
 * @param {number[]} group1 - Scores of the first group.
 * @param {number[]} group2 - Scores of the second group.
 * @returns {number} (mean1 - mean2) / pooled SD. Returns 0 when the effect
 *   size is undefined: an empty group, fewer than three observations in
 *   total, or zero pooled variance.
 */
function cohensD(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  const degreesOfFreedom = n1 + n2 - 2;

  // An empty group has no mean, and with df <= 0 the pooled variance is 0/0.
  // Both cases used to leak NaN past the `pooled === 0` guard below.
  if (n1 === 0 || n2 === 0 || degreesOfFreedom <= 0) return 0;

  const m1 = group1.reduce((a, b) => a + b, 0) / n1;
  const m2 = group2.reduce((a, b) => a + b, 0) / n2;
  const s1 = std(group1);
  const s2 = std(group2);
  const pooled = Math.sqrt(((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / degreesOfFreedom);
  return pooled === 0 ? 0 : (m1 - m2) / pooled;
}
33
+
34
/**
 * Percentile of a list of numbers using linear interpolation between the
 * two closest ranks.
 *
 * @param {number[]} values - Observations; the array is not modified.
 * @param {number} p - Percentile in the range 0-100. Values outside that
 *   range are clamped to it.
 * @returns {number} The interpolated percentile, or NaN for an empty input.
 */
function percentile(values, p) {
  // An empty input has no percentile; say so explicitly instead of indexing
  // outside the array.
  if (values.length === 0) return NaN;

  const sorted = [...values].sort((a, b) => a - b);
  // Clamp so p < 0 or p > 100 cannot index past either end of the array.
  const clamped = Math.min(100, Math.max(0, p));
  const idx = (clamped / 100) * (sorted.length - 1);
  const lo = Math.floor(idx);
  const hi = Math.ceil(idx);
  return lo === hi ? sorted[lo] : sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
}
41
+
42
+ // ============================================================
43
+ // 1. Summary statistics
44
+ // ============================================================
45
// Overall scores for the run; rows without a score are excluded by the query.
const allScores = db.prepare(`
  SELECT overall_score FROM evaluation_results
  WHERE run_id = ? AND overall_score IS NOT NULL
`).all(RUN_ID).map(r => r.overall_score);

// With no scored rows every statistic below would be NaN or Infinity, so stop
// with a clear message instead of printing a meaningless report.
if (allScores.length === 0) {
  console.error(`No scored results found for run: ${RUN_ID}`);
  db.close();
  process.exit(1);
}

const mean = allScores.reduce((a, b) => a + b, 0) / allScores.length;
const sd = std(allScores);
const median = percentile(allScores, 50);
const q1 = percentile(allScores, 25);
const q3 = percentile(allScores, 75);
// reduce() instead of Math.min(...allScores): spreading a very large array
// into call arguments can exceed the engine's argument-count limit.
const minScore = allScores.reduce((lo, v) => Math.min(lo, v), Infinity);
const maxScore = allScores.reduce((hi, v) => Math.max(hi, v), -Infinity);

console.log('=== DESCRIPTIVE STATISTICS ===');
console.log(`N = ${allScores.length}`);
console.log(`Mean: ${mean.toFixed(1)} (SD: ${sd.toFixed(1)})`);
console.log(`Median: ${median.toFixed(1)} (IQR: ${q1.toFixed(1)} – ${q3.toFixed(1)})`);
console.log(`Range: ${minScore.toFixed(1)} – ${maxScore.toFixed(1)}`);
61
+
62
+ // ============================================================
63
+ // 2. Per-model statistics
64
+ // ============================================================
65
console.log('\n=== PER-MODEL STATISTICS ===');
const models = db.prepare(`
  SELECT DISTINCT model FROM evaluation_results
  WHERE run_id = ? AND overall_score IS NOT NULL
`).all(RUN_ID).map(r => r.model);

// Scores keyed by model name; later sections read this map.
const modelData = {};
models.forEach((modelName) => {
  const rows = db.prepare(`
    SELECT overall_score FROM evaluation_results
    WHERE run_id = ? AND model = ? AND overall_score IS NOT NULL
  `).all(RUN_ID, modelName);
  const modelScores = rows.map((row) => row.overall_score);
  modelData[modelName] = modelScores;

  const modelMean = modelScores.reduce((a, b) => a + b, 0) / modelScores.length;
  const modelSd = std(modelScores);
  const lowest = Math.min(...modelScores).toFixed(1);
  const highest = Math.max(...modelScores).toFixed(1);
  console.log(`${modelName}: M=${modelMean.toFixed(1)}, SD=${modelSd.toFixed(1)}, N=${modelScores.length}, Range=[${lowest}, ${highest}]`);
});
82
+
83
+ // ============================================================
84
+ // 3. Pairwise effect sizes between models
85
+ // ============================================================
86
console.log('\n=== PAIRWISE EFFECT SIZES (Cohen\'s d) ===');
// Only models with at least three scores are compared.
const modelNames = Object.keys(modelData).filter(m => modelData[m].length >= 3);
for (let i = 0; i < modelNames.length; i++) {
  for (let j = i + 1; j < modelNames.length; j++) {
    const d = cohensD(modelData[modelNames[i]], modelData[modelNames[j]]);
    // The sign of d only says which model scored higher; the size label must
    // use the magnitude, otherwise a large negative effect reads "negligible".
    const magnitude = Math.abs(d);
    const label = magnitude > 0.8 ? 'large' : magnitude > 0.5 ? 'medium' : magnitude > 0.2 ? 'small' : 'negligible';
    console.log(`${modelNames[i]} vs ${modelNames[j]}: d=${d.toFixed(2)} (${label})`);
  }
}
95
+
96
+ // ============================================================
97
+ // 4. Per-dimension statistics
98
+ // ============================================================
99
console.log('\n=== DIMENSION STATISTICS ===');
// Rubric dimensions; each one is stored in a `score_<dimension>` column.
const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];

for (const dim of dims) {
  // Column names come from the fixed list above, never from user input, so
  // interpolating them into the SQL text is safe.
  const col = `score_${dim}`;
  const vals = db.prepare(`
    SELECT ${col} as v FROM evaluation_results
    WHERE run_id = ? AND ${col} IS NOT NULL
  `).all(RUN_ID).map(r => r.v);
  // Skip dimensions that have no scores in this run.
  if (vals.length === 0) continue;
  const mn = vals.reduce((a, b) => a + b, 0) / vals.length;
  const s = std(vals);
  console.log(`${dim.padEnd(20)} M=${mn.toFixed(2)}, SD=${s.toFixed(2)}, N=${vals.length}`);
}
114
+
115
+ // ============================================================
116
+ // 5. Per-dimension per-model
117
+ // ============================================================
118
+ console.log('\n=== DIMENSION × MODEL BREAKDOWN ===');
119
// One row per model, one column per dimension (headers cut to 8 characters).
const header = 'Model'.padEnd(25) + dims.map(d => d.substring(0, 8).padStart(9)).join('');
console.log(header);
modelNames.forEach((modelName) => {
  const cells = dims.map((dim) => {
    const col = `score_${dim}`;
    const row = db.prepare(`
      SELECT AVG(${col}) as v FROM evaluation_results
      WHERE run_id = ? AND model = ? AND ${col} IS NOT NULL
    `).get(RUN_ID, modelName);
    // AVG over zero rows is NULL, which is shown as N/A.
    return (row?.v?.toFixed(2) || 'N/A').padStart(9);
  });
  console.log(modelName.padEnd(25) + cells.join(''));
});
133
+
134
+ // ============================================================
135
+ // 6. Scenario difficulty ranking
136
+ // ============================================================
137
console.log('\n=== SCENARIO DIFFICULTY RANKING (hardest → easiest) ===');
// Mean score and row count per scenario, lowest mean first.
const scenarioStats = db.prepare(`
  SELECT scenario_id,
    AVG(overall_score) as mean,
    COUNT(*) as n
  FROM evaluation_results
  WHERE run_id = ? AND overall_score IS NOT NULL
  GROUP BY scenario_id
  ORDER BY mean ASC
`).all(RUN_ID);

scenarioStats.forEach(({ scenario_id: scenarioId, mean: scenarioMean }) => {
  const rows = db.prepare(`
    SELECT overall_score FROM evaluation_results
    WHERE run_id = ? AND scenario_id = ? AND overall_score IS NOT NULL
  `).all(RUN_ID, scenarioId);
  const spread = std(rows.map((row) => row.overall_score));
  // One block character per five points of mean score.
  const bar = '█'.repeat(Math.round(scenarioMean / 5));
  const nameText = scenarioId.padEnd(40);
  const meanText = scenarioMean.toFixed(1).padStart(5);
  const sdText = spread.toFixed(1).padStart(5);
  console.log(`${nameText} ${meanText} (SD=${sdText}) ${bar}`);
});
157
+
158
+ // ============================================================
159
+ // 7. Inter-model agreement (scenario-level correlation)
160
+ // ============================================================
161
console.log('\n=== INTER-MODEL AGREEMENT ===');
const scenarios = db.prepare(`
  SELECT DISTINCT scenario_id FROM evaluation_results
  WHERE run_id = ? AND overall_score IS NOT NULL
`).all(RUN_ID).map(r => r.scenario_id);

/**
 * Assign 1-based ranks to values; tied values share the average of the ranks
 * they span.
 *
 * @param {number[]} values
 * @returns {number[]} Ranks, in the same order as `values`.
 */
function averageRanks(values) {
  const order = values
    .map((value, index) => ({ value, index }))
    .sort((x, y) => x.value - y.value);
  const ranks = new Array(values.length);
  let start = 0;
  while (start < order.length) {
    let end = start;
    while (end + 1 < order.length && order[end + 1].value === order[start].value) end++;
    // Positions start..end (0-based) hold equal values; give each the mean rank.
    const shared = (start + end) / 2 + 1;
    for (let k = start; k <= end; k++) ranks[order[k].index] = shared;
    start = end + 1;
  }
  return ranks;
}

/**
 * Spearman rank correlation, computed as the Pearson correlation of the
 * average ranks so that ties are handled correctly. The 1 - 6Σd²/(n(n²-1))
 * shortcut is only valid when there are no ties.
 *
 * @param {number[]} xs
 * @param {number[]} ys - Same length as `xs`.
 * @returns {number} rho in [-1, 1], or NaN when either side has no rank variation.
 */
function spearmanRho(xs, ys) {
  const rankX = averageRanks(xs);
  const rankY = averageRanks(ys);
  const n = rankX.length;
  const meanRank = (n + 1) / 2; // average ranks always sum to n(n + 1) / 2
  let covariance = 0;
  let varianceX = 0;
  let varianceY = 0;
  for (let k = 0; k < n; k++) {
    const dx = rankX[k] - meanRank;
    const dy = rankY[k] - meanRank;
    covariance += dx * dy;
    varianceX += dx * dx;
    varianceY += dy * dy;
  }
  const denominator = Math.sqrt(varianceX * varianceY);
  return denominator === 0 ? NaN : covariance / denominator;
}

// NOTE(review): .get() returns only the first matching row. If a run stores
// several results per model/scenario, the others are ignored here — confirm
// whether an average would be more appropriate.
const scoreForModelScenario = db.prepare(
  `SELECT overall_score FROM evaluation_results WHERE run_id = ? AND model = ? AND scenario_id = ? AND overall_score IS NOT NULL`
);

// Check if models rank scenarios similarly
for (let i = 0; i < modelNames.length; i++) {
  for (let j = i + 1; j < modelNames.length; j++) {
    const pairs = [];
    for (const s of scenarios) {
      const s1 = scoreForModelScenario.get(RUN_ID, modelNames[i], s);
      const s2 = scoreForModelScenario.get(RUN_ID, modelNames[j], s);
      if (s1 && s2) pairs.push([s1.overall_score, s2.overall_score]);
    }
    // A rank correlation needs at least three shared scenarios.
    if (pairs.length >= 3) {
      const n = pairs.length;
      const rho = spearmanRho(pairs.map(([a]) => a), pairs.map(([, b]) => b));
      if (Number.isNaN(rho)) {
        console.log(`${modelNames[i]} vs ${modelNames[j]}: Spearman ρ undefined (no rank variation, N=${n})`);
        continue;
      }
      const agreement = rho > 0.7 ? 'strong' : rho > 0.4 ? 'moderate' : rho > 0 ? 'weak' : 'none';
      console.log(`${modelNames[i]} vs ${modelNames[j]}: Spearman ρ=${rho.toFixed(2)} (${agreement} agreement, N=${n})`);
    }
  }
}
191
+
192
+ // ============================================================
193
+ // 8. Base vs Recognition score analysis
194
+ // ============================================================
195
console.log('\n=== BASE vs RECOGNITION SCORE ANALYSIS ===');
// Rows that carry both a base and a recognition score.
const dualRows = db.prepare(`
  SELECT model, base_score, recognition_score, overall_score
  FROM evaluation_results
  WHERE run_id = ? AND base_score IS NOT NULL AND recognition_score IS NOT NULL
`).all(RUN_ID);

if (dualRows.length > 0) {
  const average = (xs) => xs.reduce((a, b) => a + b, 0) / xs.length;
  const bases = dualRows.map(r => r.base_score);
  const recogs = dualRows.map(r => r.recognition_score);
  // The query does not constrain overall_score, so drop NULLs here; otherwise
  // they would be summed as 0 and drag the mean down.
  const overalls = dualRows.map(r => r.overall_score).filter(v => v != null);

  console.log(`N (with both scores): ${dualRows.length}`);
  console.log(`Base: M=${average(bases).toFixed(1)}, SD=${std(bases).toFixed(1)}`);
  console.log(`Recognition: M=${average(recogs).toFixed(1)}, SD=${std(recogs).toFixed(1)}`);
  if (overalls.length > 0) {
    console.log(`Overall: M=${average(overalls).toFixed(1)}, SD=${std(overalls).toFixed(1)}`);
  }

  const gap = cohensD(bases, recogs);
  // Label by magnitude: a negative gap (recognition above base) can be just
  // as large as a positive one.
  const gapSize = Math.abs(gap);
  const gapLabel = gapSize > 0.8 ? 'large' : gapSize > 0.5 ? 'medium' : gapSize > 0.2 ? 'small' : 'negligible';
  console.log(`Base vs Recognition gap: d=${gap.toFixed(2)} (${gapLabel})`);

  // Per-model breakdown
  console.log('\nPer-model dual scores:');
  for (const m of modelNames) {
    const mRows = dualRows.filter(r => r.model === m);
    if (mRows.length === 0) continue;
    const baseMean = average(mRows.map(r => r.base_score));
    const recogMean = average(mRows.map(r => r.recognition_score));
    console.log(` ${m}: Base=${baseMean.toFixed(1)}, Recog=${recogMean.toFixed(1)}, Gap=${(baseMean - recogMean).toFixed(1)}, N=${mRows.length}`);
  }
} else {
  console.log('No results with both base_score and recognition_score');
}
227
+
228
+ // ============================================================
229
+ // 9. Variance decomposition (eta-squared)
230
+ // ============================================================
231
console.log('\n=== VARIANCE DECOMPOSITION ===');
// How much variance is explained by model vs scenario?
const grandMean = mean;
const SSTotal = allScores.reduce((sum, s) => sum + (s - grandMean) ** 2, 0);

// SS between models. Iterate every model in modelData, not only those with
// >= 3 scores: SSTotal covers all scores, so leaving small groups out would
// understate the model effect.
let SSModel = 0;
for (const mScores of Object.values(modelData)) {
  if (mScores.length === 0) continue;
  const mMean = mScores.reduce((a, b) => a + b, 0) / mScores.length;
  SSModel += mScores.length * (mMean - grandMean) ** 2;
}

// SS between scenarios
let SSScenario = 0;
for (const s of scenarioStats) {
  const sMean = s.mean;
  SSScenario += s.n * (sMean - grandMean) ** 2;
}

// When every score is identical there is no variance to explain; report 0%
// rather than dividing by zero.
const etaModel = SSTotal === 0 ? 0 : SSModel / SSTotal;
const etaScenario = SSTotal === 0 ? 0 : SSScenario / SSTotal;
// NOTE(review): the two effects are computed marginally. In an unbalanced
// design they can overlap, so this residual is an approximation and may even
// come out negative.
const etaResidual = SSTotal === 0 ? 0 : 1 - etaModel - etaScenario;

console.log(`Total SS: ${SSTotal.toFixed(1)}`);
console.log(`Model effect (η²): ${(etaModel * 100).toFixed(1)}% — ${etaModel < 0.01 ? 'negligible' : etaModel < 0.06 ? 'small' : etaModel < 0.14 ? 'medium' : 'large'}`);
console.log(`Scenario effect (η²): ${(etaScenario * 100).toFixed(1)}% — ${etaScenario < 0.01 ? 'negligible' : etaScenario < 0.06 ? 'small' : etaScenario < 0.14 ? 'medium' : 'large'}`);
console.log(`Residual: ${(etaResidual * 100).toFixed(1)}%`);
259
+
260
+ // ============================================================
261
+ // 10. High-variance scenarios (discriminating power)
262
+ // ============================================================
263
console.log('\n=== SCENARIO DISCRIMINATING POWER (cross-model variance) ===');
// Spread of scores within each scenario; scenarios with a single score are skipped.
const scenarioVariance = scenarios
  .map((scenarioId) => ({
    scenarioId,
    values: db.prepare(`
      SELECT overall_score FROM evaluation_results
      WHERE run_id = ? AND scenario_id = ? AND overall_score IS NOT NULL
    `).all(RUN_ID, scenarioId).map((row) => row.overall_score),
  }))
  .filter(({ values }) => values.length >= 2)
  .map(({ scenarioId, values }) => ({
    id: scenarioId,
    sd: std(values),
    range: Math.max(...values) - Math.min(...values),
  }));

// Widest spread first.
scenarioVariance.sort((left, right) => right.sd - left.sd);
console.log('Scenario'.padEnd(40), 'SD'.padStart(6), 'Range'.padStart(7));
scenarioVariance.forEach((entry) => {
  console.log(entry.id.padEnd(40), entry.sd.toFixed(1).padStart(6), entry.range.toFixed(1).padStart(7));
});

console.log('\n=== ANALYSIS COMPLETE ===');
db.close();
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+
4
// Run id from the command line, falling back to the run this script was
// originally written for.
const runId = process.argv[2] || 'eval-2026-02-03-f5d4dd93';
const logPath = `./logs/eval-progress/${runId}.jsonl`;

if (!fs.existsSync(logPath)) {
  console.error(`Progress log not found: ${logPath}`);
  process.exit(1);
}

// One JSON event per non-blank line.
const lines = fs.readFileSync(logPath, 'utf8').split('\n').filter(l => l.trim());
let skippedLines = 0;
const events = lines.flatMap((l) => {
  try {
    return [JSON.parse(l)];
  } catch {
    // A truncated or corrupt line should not abort the whole analysis;
    // it is counted and reported below.
    skippedLines++;
    return [];
  }
});
if (skippedLines > 0) {
  console.warn(`Skipped ${skippedLines} unparseable line(s) in ${logPath}`);
}
9
+
10
+ // Find test_complete events for struggling_learner
11
const strugglingTests = events.filter(e =>
  e.eventType === 'test_complete' &&
  e.scenarioId === 'struggling_learner' &&
  e.success
);

console.log(`=== STRUGGLING_LEARNER Scenario Analysis ===`);
console.log(`Total test completions: ${strugglingTests.length}\n`);

// Group by profile
const byProfile = {};
for (const t of strugglingTests) {
  // A completion without a score would be averaged as 0, so leave it out.
  if (t.overallScore == null) continue;
  if (!byProfile[t.profileName]) byProfile[t.profileName] = [];
  byProfile[t.profileName].push(t.overallScore);
}

console.log('Scores by profile:');
// Sort by profile name.
const strugglingEntries = Object.entries(byProfile)
  .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
for (const [profile, scores] of strugglingEntries) {
  const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
  console.log(` ${profile}: ${scores.join(', ')} (avg: ${avg.toFixed(1)})`);
}
32
+
33
+ // Now look at concept_confusion
34
const confusionTests = events.filter(e =>
  e.eventType === 'test_complete' &&
  e.scenarioId === 'concept_confusion' &&
  e.success
);

console.log(`\n=== CONCEPT_CONFUSION Scenario Analysis ===`);
console.log(`Total test completions: ${confusionTests.length}\n`);

// Group by profile
const byProfile2 = {};
for (const t of confusionTests) {
  // A completion without a score would be averaged as 0, so leave it out.
  if (t.overallScore == null) continue;
  if (!byProfile2[t.profileName]) byProfile2[t.profileName] = [];
  byProfile2[t.profileName].push(t.overallScore);
}

console.log('Scores by profile:');
// Sort by profile name.
const confusionEntries = Object.entries(byProfile2)
  .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
for (const [profile, scores] of confusionEntries) {
  const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
  console.log(` ${profile}: ${scores.join(', ')} (avg: ${avg.toFixed(1)})`);
}
54
+
55
+ // Now let's look at the validation rules for these scenarios
56
console.log('\n=== Validation Rules ===');
console.log('Loading validation rules from scenarios...');

// Read the scenario file to get the validation rules
const scenarioPath = './config/scenarios.yaml';
try {
  const scenarioYaml = fs.readFileSync(scenarioPath, 'utf8');
  // Regex extraction of the struggling_learner required/forbidden lists;
  // this only matches inline `[a, b]` lists and is not a YAML parse.
  const strugglingMatch = scenarioYaml.match(/struggling_learner:[\s\S]*?validation:[\s\S]*?required_elements:[\s\S]*?\[(.*?)\][\s\S]*?forbidden_elements:[\s\S]*?\[(.*?)\]/);
  if (strugglingMatch) {
    console.log('struggling_learner validation rules:');
    console.log(' required:', strugglingMatch[1]);
    console.log(' forbidden:', strugglingMatch[2]);
  } else {
    console.log(`No struggling_learner validation rules found in ${scenarioPath}`);
  }
} catch (err) {
  // Best effort: the report continues without the rules, but say why.
  console.log(`Could not load ${scenarioPath}: ${err.message}`);
}
73
+
74
+ // Now analyze all scenarios to find the pattern
75
console.log('\n=== Score Distribution Analysis ===');
// Successful, scored test completions across all scenarios.
const allTests = events.filter(e => e.eventType === 'test_complete' && e.success && e.overallScore != null);

// Compare scores between base and recognition profiles
const baseProfiles = ['cell_1_base_single_unified', 'cell_2_base_single_psycho', 'cell_3_base_multi_unified', 'cell_4_base_multi_psycho'];
const recogProfiles = ['cell_5_recog_single_unified', 'cell_6_recog_single_psycho', 'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho'];

// Count 50s and 100s for each profile type
let baseFifties = 0, baseHundreds = 0, baseTotal = 0;
let recogFifties = 0, recogHundreds = 0, recogTotal = 0;

for (const t of allTests) {
  if (baseProfiles.includes(t.profileName)) {
    baseTotal++;
    if (t.overallScore === 50) baseFifties++;
    else if (t.overallScore === 100) baseHundreds++;
  } else if (recogProfiles.includes(t.profileName)) {
    recogTotal++;
    if (t.overallScore === 50) recogFifties++;
    else if (t.overallScore === 100) recogHundreds++;
  }
}

// Share of a group as a percentage; "n/a" when the group is empty, so the
// report never prints "NaN%".
const formatShare = (count, total) =>
  (total === 0 ? 'n/a' : `${(count / total * 100).toFixed(1)}%`);

console.log('Base profiles:');
console.log(` Total tests: ${baseTotal}`);
console.log(` Score 100 (full pass): ${baseHundreds} (${formatShare(baseHundreds, baseTotal)})`);
console.log(` Score 50 (partial): ${baseFifties} (${formatShare(baseFifties, baseTotal)})`);

console.log('\nRecognition profiles:');
console.log(` Total tests: ${recogTotal}`);
console.log(` Score 100 (full pass): ${recogHundreds} (${formatShare(recogHundreds, recogTotal)})`);
console.log(` Score 50 (partial): ${recogFifties} (${formatShare(recogFifties, recogTotal)})`);
107
+
108
+ // Now by scenario - which have validation rules that recognition fails more often?
109
console.log('\n=== Per-Scenario Validation Pass Rates ===');
const scenarios = [...new Set(allTests.map(t => t.scenarioId))];

// Fraction of tests scoring exactly 100, or null when the group is empty
// (0/0 would otherwise give NaN and corrupt the sort below).
const passRate = (tests) =>
  (tests.length === 0 ? null : tests.filter(t => t.overallScore === 100).length / tests.length);

const results = [];
for (const s of scenarios) {
  const baseTests = allTests.filter(t => baseProfiles.includes(t.profileName) && t.scenarioId === s);
  const recogTests = allTests.filter(t => recogProfiles.includes(t.profileName) && t.scenarioId === s);

  const basePassRate = passRate(baseTests);
  const recogPassRate = passRate(recogTests);

  results.push({
    scenario: s,
    basePassRate,
    recogPassRate,
    // Positive delta: base profiles pass more often than recognition profiles.
    delta: basePassRate === null || recogPassRate === null ? null : basePassRate - recogPassRate,
    baseTests: baseTests.length,
    recogTests: recogTests.length
  });
}

// Largest base-over-recognition gap first; scenarios missing one group go last.
results.sort((a, b) => {
  if (a.delta === null) return b.delta === null ? 0 : 1;
  if (b.delta === null) return -1;
  return b.delta - a.delta;
});

const formatPercent = (rate) => (rate === null ? 'N/A' : `${(rate * 100).toFixed(0)}%`);
const formatDelta = (delta) =>
  (delta === null ? 'N/A' : `${delta > 0 ? '+' : ''}${(delta * 100).toFixed(0)}%`);

console.log('Scenario'.padEnd(45), 'Base%', 'Recog%', 'Delta');
console.log('-'.repeat(70));
for (const r of results) {
  console.log(
    String(r.scenario).padEnd(45),
    formatPercent(r.basePassRate).padStart(5),
    formatPercent(r.recogPassRate).padStart(6),
    formatDelta(r.delta).padStart(7)
  );
}
@@ -0,0 +1,17 @@
1
+ import * as store from '../services/evaluationStore.js';
2
// Run id from the command line, with a fixed default run.
const runId = process.argv[2] || 'eval-2026-02-03-c8d32121';
const results = store.getResults(runId);

// A result is failed when it is flagged unsuccessful or has an error message,
// and succeeded only when it is flagged successful with no error message.
const failed = results.filter((result) => result.success === false || result.errorMessage);
const succeeded = results.filter((result) => result.success === true && !result.errorMessage);

console.log('Total results:', results.length);
console.log('Succeeded:', succeeded.length);
console.log('Failed:', failed.length);

// Number of failures per distinct error message.
const errorCounts = failed.reduce((counts, { errorMessage }) => {
  const key = errorMessage || 'no error message';
  counts[key] = (counts[key] || 0) + 1;
  return counts;
}, {});

console.log('\nFailure breakdown:');
// Most frequent first; messages are cut to 100 characters.
Object.entries(errorCounts)
  .sort(([, countA], [, countB]) => countB - countA)
  .forEach(([message, count]) => {
    console.log(` ${count}x: ${message.substring(0, 100)}`);
  });