@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+
4
/**
 * Collect one scenario's overall scores from a run's progress log, grouped by profile.
 *
 * Reads `./logs/eval-progress/<runId>.jsonl` (one JSON event per line) and keeps
 * only events with `eventType === 'test_complete'`, `success === true`, a
 * non-null `overallScore`, and a `scenarioId` equal to `scenario`.
 *
 * Lines that are not valid JSON (for example a partially written last line) are
 * skipped and reported once on stderr instead of aborting the whole comparison.
 *
 * @param {string} runId - Run identifier; used as the log file's base name.
 * @param {string} scenario - Scenario id to select.
 * @returns {Object<string, number[]>} `overallScore` values keyed by `profileName`, in log order.
 * @throws {Error} If the log file cannot be read (e.g. it does not exist).
 */
function getScores(runId, scenario) {
  const logPath = `./logs/eval-progress/${runId}.jsonl`;
  const rawLines = fs.readFileSync(logPath, 'utf8').split('\n');

  const byProfile = {};
  let skipped = 0;

  for (const rawLine of rawLines) {
    if (!rawLine.trim()) continue;

    let event;
    try {
      event = JSON.parse(rawLine);
    } catch {
      skipped += 1;
      continue;
    }

    // `null` and bare primitives are valid JSON but are not event objects.
    if (event === null || typeof event !== 'object') continue;

    const isMatch =
      event.eventType === 'test_complete' &&
      event.success === true &&
      event.overallScore != null &&
      event.scenarioId === scenario;
    if (!isMatch) continue;

    // Own-property check so a profile named e.g. "constructor" is not mistaken
    // for an existing bucket inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(byProfile, event.profileName)) {
      byProfile[event.profileName] = [];
    }
    byProfile[event.profileName].push(event.overallScore);
  }

  if (skipped > 0) {
    console.warn(`${logPath}: skipped ${skipped} line(s) that are not valid JSON`);
  }
  return byProfile;
}
22
+
23
// Run ids come from the first two CLI arguments; each falls back to a fixed default.
const run1 = process.argv[2] || 'eval-2026-02-03-f5d4dd93';
const run2 = process.argv[3] || 'eval-2026-02-03-b391d999';

const scenarios = [
  'mood_frustration_to_breakthrough',
  'misconception_correction_flow',
  'mutual_transformation_journey',
];

/** Mean of `values` with one decimal, or 'N/A' when the list is empty. */
const describeMean = (values) => {
  if (values.length === 0) return 'N/A';
  const sum = values.reduce((acc, v) => acc + v, 0);
  return (sum / values.length).toFixed(1);
};

// One section per scenario, one row per profile, both runs side by side.
scenarios.forEach((scenario) => {
  console.log(`=== ${scenario} ===`);
  const firstRunScores = getScores(run1, scenario);
  const secondRunScores = getScores(run2, scenario);

  const profiles = [
    'cell_1_base_single_unified',
    'cell_3_base_multi_unified',
    'cell_5_recog_single_unified',
    'cell_7_recog_multi_unified',
  ];

  profiles.forEach((profileName) => {
    const left = firstRunScores[profileName] || [];
    const right = secondRunScores[profileName] || [];

    // Shorten the profile name for the row label.
    const label = profileName
      .replace('cell_', '')
      .replace('_single_unified', '')
      .replace('_multi_unified', '_m');

    const leftCell = `${run1.slice(-8)}=${describeMean(left).padStart(5)} (${left.join(',').padEnd(20)})`;
    const rightCell = `${run2.slice(-8)}=${describeMean(right).padStart(5)} (${right.join(',')})`;
    console.log(` ${label.padEnd(12)} ${leftCell} ${rightCell}`);
  });

  console.log('');
});
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
const dir = 'logs/tutor-dialogues';
const files = fs.readdirSync(dir).filter((name) => name.endsWith('.json'));

// Keep only dialogue files whose modification time is on or after the
// '2026-02-03' cutoff.
const recentFiles = files.filter((name) => {
  const { mtime } = fs.statSync(path.join(dir, name));
  const modifiedAt = new Date(mtime);
  const cutoff = new Date('2026-02-03');
  return modifiedAt >= cutoff;
});

console.log(`Analyzing ${recentFiles.length} dialogue files from Feb 3+\n`);
16
+
17
// Collect the first suggestion of every relevant dialogue, grouped by profile.
// Each entry records whether the suggestion mentions "review" and whether it
// contains one of the forbidden phrases; it "passes" when it has the former
// and none of the latter.
const examples = { budget: [], recognition: [] };

for (const f of recentFiles) {
  try {
    const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
    const profile = d.profileName;
    const scenario = d.scenario?.scenarioId || d.scenarioId || 'unknown';

    if (profile !== 'budget' && profile !== 'recognition') continue;

    // Only look at scenarios whose id mentions "struggling" or "confusion"
    if (!scenario.includes('struggling') && !scenario.includes('confusion')) continue;

    const suggestions = d.suggestions || [];
    const firstSuggestion = suggestions[0];
    if (!firstSuggestion) continue;

    const title = firstSuggestion.title || '';
    const message = firstSuggestion.message || '';
    const fullText = (title + ' ' + message).toLowerCase();

    const hasReview = fullText.includes('review');
    const hasForbidden = ['next lecture', 'move on to', 'continue with'].some(fb => fullText.includes(fb));

    examples[profile].push({
      file: f,
      scenario,
      title,
      messagePreview: message.substring(0, 150),
      hasReview,
      hasForbidden,
      passed: hasReview && !hasForbidden
    });
  } catch (err) {
    // Best-effort scan: one unreadable or malformed file must not abort the
    // analysis, but report it so a shrinking sample is not silent.
    console.warn(`Skipping ${f}: ${err?.message ?? err}`);
  }
}
53
+
54
// Per-profile summary followed by up to five failing and three passing examples.
for (const profile of ['budget', 'recognition']) {
  const data = examples[profile];
  const passing = data.filter(x => x.passed);
  const failing = data.filter(x => !x.passed);

  // With no samples the ratio is 0/0; print "N/A" rather than "NaN%".
  const passRate = data.length > 0
    ? `${(passing.length / data.length * 100).toFixed(1)}%`
    : 'N/A';

  console.log(`=== ${profile.toUpperCase()} ===`);
  console.log(`Total: ${data.length}, Passing: ${passing.length}, Failing: ${failing.length}`);
  console.log(`Pass rate: ${passRate}\n`);

  console.log('FAILING examples (missing review or has forbidden):');
  for (const ex of failing.slice(0, 5)) {
    console.log(` Scenario: ${ex.scenario}`);
    console.log(` Title: "${ex.title}"`);
    console.log(` Has review: ${ex.hasReview}, Has forbidden: ${ex.hasForbidden}`);
    console.log(` Preview: ${ex.messagePreview}...`);
    console.log(` File: ${ex.file}\n`);
  }

  console.log('PASSING examples:');
  for (const ex of passing.slice(0, 3)) {
    console.log(` Scenario: ${ex.scenario}`);
    console.log(` Title: "${ex.title}"`);
    console.log('');
  }
  console.log('');
}
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Compare transformation metrics between base and recognition profiles
4
+ */
5
+ import fs from 'fs';
6
+ import path from 'path';
7
+
8
const logsDir = './logs/tutor-dialogues/';

// Take the JSON file names in descending default sort order and analyse only
// the first 20 of them.
const jsonNames = fs.readdirSync(logsDir).filter((name) => name.endsWith('.json'));
jsonNames.sort();
jsonNames.reverse();
const files = jsonNames.slice(0, 20);
14
+
15
const baseProfiles = [];
const recogProfiles = [];

// Flatten each multi-turn dialogue that carries a transformation analysis into
// one metrics entry, and file it under "recognition" or "base" by profile name.
for (const fileName of files) {
  try {
    const dialogue = JSON.parse(fs.readFileSync(path.join(logsDir, fileName)));
    const analysis = dialogue.transformationAnalysis;
    if (!analysis || !dialogue.isMultiTurn) continue;

    const progression = analysis.turnProgression || {};
    const traceReport = analysis.dialogueTraceReport;
    const bilateral = traceReport?.bilateralMetrics || {};
    const superego = traceReport?.superegoMetrics || {};
    const { profileName } = dialogue;

    const entry = {
      file: fileName,
      profile: profileName,
      tutorAdaptIdx: progression.adaptationIndex,
      learnerGrowthIdx: progression.learnerGrowthIndex,
      bilateralIdx: progression.bilateralTransformationIndex,
      tutorSignals: bilateral.tutorTransformationCount,
      learnerSignals: bilateral.learnerTransformationCount,
      balance: bilateral.bilateralBalance,
      quality: traceReport?.overallAssessment?.transformationQuality,
      superegoIncorp: superego.incorporationRate,
      isMutual: bilateral.isMutualTransformation,
    };

    // Any profile whose name contains "recog" counts as a recognition profile;
    // everything else (including a missing name) is treated as base.
    const isRecognition =
      profileName === 'recognition' || (profileName && profileName.includes('recog'));
    const bucket = isRecognition ? recogProfiles : baseProfiles;
    bucket.push(entry);
  } catch {
    // Skip invalid files
  }
}
51
+
52
/**
 * Arithmetic mean of one numeric field across a list of entries.
 *
 * Only finite numbers contribute. Missing values (`undefined`/`null`), `NaN`,
 * infinities and non-numeric values such as strings are ignored, so a single
 * bad value cannot turn the result into `NaN` or a concatenated string.
 *
 * @param {Object[]} arr - Entries to aggregate.
 * @param {string} key - Name of the field to average.
 * @returns {number|null} The mean, or `null` when no entry has a usable value.
 */
function avg(arr, key) {
  const vals = arr
    .map((entry) => entry[key])
    .filter((v) => typeof v === 'number' && Number.isFinite(v));
  if (vals.length === 0) return null;
  return vals.reduce((sum, v) => sum + v, 0) / vals.length;
}
56
+
57
/**
 * Format a metric for display with three decimals.
 *
 * @param {number|null|undefined} v - Value to format.
 * @returns {string} `v.toFixed(3)` for a finite number; 'N/A' for anything
 *   else (missing, `NaN`, infinite or non-numeric), so the report never prints
 *   "NaN" and never throws on an unexpected type.
 */
function fmt(v) {
  return typeof v === 'number' && Number.isFinite(v) ? v.toFixed(3) : 'N/A';
}
60
+
61
// Report banner.
[
  '╔══════════════════════════════════════════════════════════════╗',
  '║ BILATERAL TRANSFORMATION METRICS COMPARISON ║',
  '╠══════════════════════════════════════════════════════════════╣',
  '',
].forEach((line) => console.log(line));

// Label / entry-field pairs printed for both groups, in display order.
const SHARED_METRIC_ROWS = [
  [' Tutor adaptation index: ', 'tutorAdaptIdx'],
  [' Learner growth index: ', 'learnerGrowthIdx'],
  [' Bilateral transformation: ', 'bilateralIdx'],
  [' Avg tutor signals: ', 'tutorSignals'],
  [' Avg learner signals: ', 'learnerSignals'],
  [' Avg transformation quality: ', 'quality'],
];

/**
 * Print the sample count and, when the group is non-empty, its averaged
 * metrics followed by the share of entries flagged `isMutual`.
 *
 * @param {Object[]} group - Metric entries of one profile group.
 * @param {Array<[string, string]>} extraRows - Additional label/field rows
 *   printed after the shared ones.
 */
function printGroupMetrics(group, extraRows) {
  console.log('Sample count:', group.length);
  if (group.length === 0) return;

  console.log('');
  for (const [label, field] of SHARED_METRIC_ROWS.concat(extraRows)) {
    console.log(label, fmt(avg(group, field)));
  }

  const mutualCount = group.filter((entry) => entry.isMutual).length;
  console.log(' Mutual transformation rate: ',
    (mutualCount / group.length * 100).toFixed(0) + '%');
}

console.log('=== BASE PROFILES (budget/single) ===');
printGroupMetrics(baseProfiles, []);

console.log('');
console.log('=== RECOGNITION PROFILES (ego-superego) ===');
printGroupMetrics(recogProfiles, [[' Superego incorporation: ', 'superegoIncorp']]);
95
+
96
console.log('');
console.log('╠══════════════════════════════════════════════════════════════╣');
console.log('║ COMPARISON ║');
console.log('╠══════════════════════════════════════════════════════════════╣');

// Deltas are recognition minus base, and only meaningful when both groups have samples.
if (baseProfiles.length > 0 && recogProfiles.length > 0) {
  const baseTutor = avg(baseProfiles, 'tutorAdaptIdx');
  const recogTutor = avg(recogProfiles, 'tutorAdaptIdx');
  const baseQuality = avg(baseProfiles, 'quality');
  const recogQuality = avg(recogProfiles, 'quality');

  console.log('');
  // Compare against null explicitly: an average of exactly 0 is a real value
  // and must still produce a delta, as the quality delta below already does.
  console.log(' Tutor adaptation delta: ',
    baseTutor !== null && recogTutor !== null ?
      ((recogTutor - baseTutor) * 100).toFixed(1) + '% points' : 'N/A');
  console.log(' Quality delta: ',
    baseQuality !== null && recogQuality !== null ?
      ((recogQuality - baseQuality)).toFixed(1) + ' points' : 'N/A');
}

console.log('');
console.log('╚══════════════════════════════════════════════════════════════╝');
@@ -0,0 +1,158 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
// Run to inspect: first CLI argument, otherwise a fixed default run id.
const runId = process.argv[2] || 'eval-2026-02-03-f5d4dd93';
const logPath = `./logs/eval-progress/${runId}.jsonl`;

const lines = fs.readFileSync(logPath, 'utf8').split('\n').filter(l => l.trim());

// Parse line by line so that one malformed line (for example a partially
// written last line) is skipped and reported instead of aborting the analysis.
const events = [];
let skippedLines = 0;
for (const line of lines) {
  try {
    events.push(JSON.parse(line));
  } catch {
    skippedLines += 1;
  }
}
if (skippedLines > 0) {
  console.warn(`${logPath}: skipped ${skippedLines} line(s) that are not valid JSON`);
}

// Successful, scored test completions only. The object check guards against
// lines such as `null`, which are valid JSON but not events.
const tests = events.filter(e =>
  e !== null && typeof e === 'object' &&
  e.eventType === 'test_complete' && e.success && e.overallScore != null);

// Group by profile and scenario; keys have the form "<profileName>|<scenarioId>".
const byProfileScenario = {};
for (const t of tests) {
  const key = t.profileName + '|' + t.scenarioId;
  if (!byProfileScenario[key]) byProfileScenario[key] = [];
  byProfileScenario[key].push(t.overallScore);
}

// Distinct scenario ids in first-seen order.
const scenarios = [...new Set(tests.map(t => t.scenarioId))];
21
+
22
// Compare cell_2 vs cell_6 (both single+psycho, but base vs recog).
// Delta in this table is cell_2 minus cell_6.
console.log('=== cell_2 (base+single+psycho) vs cell_6 (recog+single+psycho) ===');
console.log('Scenario'.padEnd(35), 'cell_2', 'cell_6', 'Delta');
console.log('-'.repeat(60));
let total2 = 0;
let total6 = 0;
let count = 0;
for (const s of scenarios) {
  const c2 = byProfileScenario['cell_2_base_single_psycho|' + s] || [];
  const c6 = byProfileScenario['cell_6_recog_single_psycho|' + s] || [];
  const avg2 = c2.length ? (c2.reduce((a, b) => a + b, 0) / c2.length) : null;
  const avg6 = c6.length ? (c6.reduce((a, b) => a + b, 0) / c6.length) : null;
  const delta = (avg2 !== null && avg6 !== null) ? (avg2 - avg6) : null;
  console.log(
    s.padEnd(35),
    (avg2 !== null ? avg2.toFixed(1) : 'N/A').padStart(5),
    (avg6 !== null ? avg6.toFixed(1) : 'N/A').padStart(5),
    (delta !== null ? (delta > 0 ? '+' : '') + delta.toFixed(1) : '-').padStart(6)
  );
  // Only scenarios scored by both cells feed the summary row.
  if (avg2 !== null && avg6 !== null) {
    total2 += avg2;
    total6 += avg6;
    count++;
  }
}
console.log('-'.repeat(60));
if (count > 0) {
  const meanDelta = (total2 - total6) / count;
  console.log(
    'Average'.padEnd(35),
    (total2 / count).toFixed(1).padStart(5),
    (total6 / count).toFixed(1).padStart(5),
    // Attach the sign before padding so it stays next to the number, as in the rows above.
    ((meanDelta > 0 ? '+' : '') + meanDelta.toFixed(1)).padStart(6)
  );
} else {
  // No scenario has scores for both cells; avoid printing NaN from 0/0.
  console.log('Average'.padEnd(35), 'N/A'.padStart(5), 'N/A'.padStart(5), '-'.padStart(6));
}
47
+
48
// Now compare all base vs all recognition profiles, pooling the scores of the
// four cells on each side per scenario. Delta in this table is recognition minus base.
console.log('\n=== All BASE profiles vs All RECOGNITION profiles ===');
const baseProfiles = ['cell_1_base_single_unified', 'cell_2_base_single_psycho', 'cell_3_base_multi_unified', 'cell_4_base_multi_psycho'];
const recogProfiles = ['cell_5_recog_single_unified', 'cell_6_recog_single_psycho', 'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho'];

console.log('Scenario'.padEnd(35), 'Base', 'Recog', 'Delta');
console.log('-'.repeat(60));
let totalBase = 0;
let totalRecog = 0;
let countScen = 0;
for (const s of scenarios) {
  const baseScores = baseProfiles.flatMap(p => byProfileScenario[p + '|' + s] || []);
  const recogScores = recogProfiles.flatMap(p => byProfileScenario[p + '|' + s] || []);
  const avgBase = baseScores.length ? baseScores.reduce((a, b) => a + b, 0) / baseScores.length : null;
  const avgRecog = recogScores.length ? recogScores.reduce((a, b) => a + b, 0) / recogScores.length : null;
  const delta = (avgBase !== null && avgRecog !== null) ? (avgRecog - avgBase) : null;
  console.log(
    s.padEnd(35),
    (avgBase !== null ? avgBase.toFixed(1) : 'N/A').padStart(5),
    (avgRecog !== null ? avgRecog.toFixed(1) : 'N/A').padStart(5),
    (delta !== null ? (delta > 0 ? '+' : '') + delta.toFixed(1) : '-').padStart(6)
  );
  // Only scenarios with scores on both sides feed the summary row.
  if (avgBase !== null && avgRecog !== null) {
    totalBase += avgBase;
    totalRecog += avgRecog;
    countScen++;
  }
}
console.log('-'.repeat(60));
if (countScen > 0) {
  const meanDelta = (totalRecog - totalBase) / countScen;
  console.log(
    'Average'.padEnd(35),
    (totalBase / countScen).toFixed(1).padStart(5),
    (totalRecog / countScen).toFixed(1).padStart(5),
    // Attach the sign before padding so it stays next to the number, as in the rows above.
    ((meanDelta > 0 ? '+' : '') + meanDelta.toFixed(1)).padStart(6)
  );
} else {
  // No scenario has scores on both sides; avoid printing NaN from 0/0.
  console.log('Average'.padEnd(35), 'N/A'.padStart(5), 'N/A'.padStart(5), '-'.padStart(6));
}
81
+
82
// Dump the individual cell_2 scores per scenario so their spread can be inspected.
console.log('\n=== cell_2 raw scores (to check variance) ===');
scenarios.forEach((scenarioId) => {
  const rawScores = byProfileScenario['cell_2_base_single_psycho|' + scenarioId] || [];
  if (rawScores.length === 0) return;
  console.log(scenarioId.padEnd(35), rawScores.join(', '));
});
90
+
91
// Classify each scenario scored by both cells: cell_2 "wins" when its mean is
// more than 5 points above cell_6, "loses" when more than 5 below, else a tie.
console.log('\n=== Where cell_2 beats cell_6 vs where it loses ===');
const wins = [];
const losses = [];
const ties = [];
const meanScore = (list) => list.reduce((a, b) => a + b, 0) / list.length;

for (const scenarioId of scenarios) {
  const cell2 = byProfileScenario['cell_2_base_single_psycho|' + scenarioId] || [];
  const cell6 = byProfileScenario['cell_6_recog_single_psycho|' + scenarioId] || [];
  if (!cell2.length || !cell6.length) continue;

  const delta = meanScore(cell2) - meanScore(cell6);
  let bucket = ties;
  if (delta > 5) {
    bucket = wins;
  } else if (delta < -5) {
    bucket = losses;
  }
  bucket.push({ scenario: scenarioId, delta });
}

// Largest margins first in both lists; ties keep scenario order.
console.log('cell_2 WINS (>5 pts):');
wins.sort((a, b) => b.delta - a.delta);
for (const win of wins) {
  console.log(` ${win.scenario}: +${win.delta.toFixed(1)}`);
}

console.log('cell_2 LOSES (<-5 pts):');
losses.sort((a, b) => a.delta - b.delta);
for (const loss of losses) {
  console.log(` ${loss.scenario}: ${loss.delta.toFixed(1)}`);
}

console.log('TIES (±5 pts):');
for (const tie of ties) {
  const sign = tie.delta > 0 ? '+' : '';
  console.log(` ${tie.scenario}: ${sign}${tie.delta.toFixed(1)}`);
}
112
+
113
// Analyze dialogue files for validation patterns
console.log('\n=== Dialogue Validation Analysis ===');

const dir = 'logs/tutor-dialogues';
// Only dialogue files whose names start with one of these two prefixes are inspected.
const files = fs.readdirSync(dir).filter(f => f.startsWith('dialogue-177008') || f.startsWith('dialogue-177009'));

const forbidden = ['next lecture', 'move on to', 'continue with'];

const results = { budget: [], recognition: [] };

for (const f of files) {
  try {
    const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
    const s = d.suggestions?.[0];
    if (s == null) continue;

    const userText = ((s.title || '') + ' ' + (s.message || '')).toLowerCase();
    const fullText = JSON.stringify(d.suggestions).toLowerCase();

    // NOTE(review): "review" is searched across the serialized suggestions array,
    // while forbidden phrases are searched only in the first suggestion's
    // title and message - confirm this asymmetry is intended.
    const hasReview = fullText.includes('review');
    const hasForbidden = forbidden.some(fb => userText.includes(fb));

    const profile = d.profileName === 'budget' ? 'budget' : d.profileName === 'recognition' ? 'recognition' : null;
    if (profile) {
      results[profile].push({
        hasReview,
        hasForbidden,
        // 50 points for mentioning "review" plus 50 for avoiding forbidden phrases.
        score: (hasReview ? 50 : 0) + (hasForbidden ? 0 : 50)
      });
    }
  } catch (err) {
    // Best-effort scan: keep going, but report the file instead of dropping it silently.
    console.warn(`Skipping ${f}: ${err?.message ?? err}`);
  }
}
145
+
146
// Print sample count, pass counts with percentages, and the mean validation
// score for each profile that has at least one sample.
['budget', 'recognition'].forEach((profile) => {
  const samples = results[profile];
  const total = samples.length;
  if (total === 0) return;

  let scoreSum = 0;
  let reviewPass = 0;
  let forbiddenPass = 0;
  for (const sample of samples) {
    scoreSum += sample.score;
    if (sample.hasReview) reviewPass += 1;
    if (sample.hasForbidden === false) forbiddenPass += 1;
  }
  const avgScore = scoreSum / total;

  console.log(profile + ':');
  console.log(' Samples:', total);
  console.log(' Review present (required):', reviewPass, '(' + (reviewPass / total * 100).toFixed(1) + '%)');
  console.log(' Forbidden absent:', forbiddenPass, '(' + (forbiddenPass / total * 100).toFixed(1) + '%)');
  console.log(' Avg validation score:', avgScore.toFixed(1));
  console.log('');
});