@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,401 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Inter-Judge Reliability Analysis
4
+ *
5
+ * Calculates agreement metrics between AI judges that scored the SAME responses.
6
+ *
7
+ * IMPORTANT: This requires paired data where identical responses were scored by
8
+ * multiple judges. Generate this by rejudging an existing run:
9
+ *
10
+ * node scripts/eval-cli.js rejudge <runId> --judge openrouter/anthropic/claude-sonnet-4.5
11
+ * node scripts/eval-cli.js rejudge <runId> --judge openrouter/moonshotai/kimi-k2.5
12
+ *
13
+ * The script matches responses by their `suggestions` content (MD5 hash) to find
14
+ * cases where the exact same tutor output was scored by different judges.
15
+ *
16
+ * Reports:
17
+ * - Pearson correlation (linear agreement)
18
+ * - Spearman rank correlation (ordinal agreement)
19
+ * - Mean absolute difference (calibration)
20
+ * - Per-dimension agreement
21
+ *
22
+ * Usage:
23
+ * node scripts/analyze-judge-reliability.js # All data
24
+ * node scripts/analyze-judge-reliability.js --run <runId> # Specific run
25
+ * node scripts/analyze-judge-reliability.js --verbose # Show disagreements
26
+ */
27
+
28
+ import Database from 'better-sqlite3';
29
+ import path from 'path';
30
+ import { fileURLToPath } from 'url';
31
+
32
// Resolve paths relative to this script (ES modules have no builtin __dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DB_PATH = path.join(__dirname, '..', 'data', 'evaluations.db');

// Minimal CLI parsing: `--name value` options and bare `--name` flags.
const args = process.argv.slice(2);

/** Returns the token following `--name`, or null when absent/empty. */
const getOption = (name) => {
  const position = args.indexOf(`--${name}`);
  if (position === -1) return null;
  return args[position + 1] ? args[position + 1] : null;
};

/** True when the bare flag `--name` was passed. */
const hasFlag = (name) => args.includes(`--${name}`);

const runIdFilter = getOption('run');
const verbose = hasFlag('verbose');
46
+
47
+ // Statistics helpers
48
/** Arithmetic mean of a numeric array; 0 for an empty array. */
function mean(arr) {
  if (arr.length === 0) return 0;
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}
52
+
53
/** Sample standard deviation (n-1 denominator); 0 when fewer than 2 values. */
function std(arr) {
  if (arr.length < 2) return 0;
  const center = mean(arr);
  let sumSquares = 0;
  for (const value of arr) {
    sumSquares += (value - center) ** 2;
  }
  return Math.sqrt(sumSquares / (arr.length - 1));
}
59
+
60
/**
 * Sample Pearson correlation coefficient between two equal-length series.
 * Returns null when lengths differ, n < 3, or either series is constant
 * (zero standard deviation makes the coefficient undefined).
 */
function pearsonCorrelation(x, y) {
  if (x.length !== y.length || x.length < 3) return null;

  const mx = mean(x);
  const my = mean(y);
  const sx = std(x);
  const sy = std(y);
  if (sx === 0 || sy === 0) return null;

  const crossSum = x.reduce((acc, xi, i) => acc + (xi - mx) * (y[i] - my), 0);
  return crossSum / ((x.length - 1) * sx * sy);
}
74
+
75
/**
 * Spearman rank correlation between two equal-length series.
 * Returns null when lengths differ or n < 3.
 *
 * Fix: tied values now receive average (fractional) ranks — the standard
 * Spearman treatment. The previous rankify gave ties arbitrary distinct
 * ranks based on sort order, which biased ρ whenever judges produced
 * identical scores (common with rubric scoring).
 */
function spearmanCorrelation(x, y) {
  if (x.length !== y.length || x.length < 3) return null;

  // Convert values to 1-based ranks; tied values share the mean of the
  // rank positions they occupy.
  const rankify = (arr) => {
    const sorted = arr.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
    const ranks = new Array(arr.length);
    let pos = 0;
    while (pos < sorted.length) {
      let end = pos;
      while (end + 1 < sorted.length && sorted[end + 1].v === sorted[pos].v) {
        end++;
      }
      // Positions pos..end (0-based) correspond to ranks pos+1..end+1.
      const avgRank = (pos + end + 2) / 2;
      for (let k = pos; k <= end; k++) {
        ranks[sorted[k].i] = avgRank;
      }
      pos = end + 1;
    }
    return ranks;
  };

  return pearsonCorrelation(rankify(x), rankify(y));
}
92
+
93
/** Mean of |x[i] - y[i]|; null when lengths differ or the arrays are empty. */
function meanAbsoluteDifference(x, y) {
  if (x.length !== y.length || x.length === 0) return null;
  const totalGap = x.reduce((acc, xi, i) => acc + Math.abs(xi - y[i]), 0);
  return totalGap / x.length;
}
101
+
102
/**
 * Cronbach's alpha for internal consistency across raters.
 *
 * @param {number[][]} items - one inner array of scores per rater; every
 *   inner array must have the same length (one score per subject).
 * @returns {number|null} alpha, or null for degenerate input: fewer than
 *   2 raters or 2 subjects, ragged inner arrays, or zero total variance.
 *
 * Fix: ragged input (inner arrays of differing lengths) previously read
 * undefined elements and silently produced NaN; it now returns null.
 */
function cronbachAlpha(items) {
  if (items.length < 2 || items[0].length < 2) return null;

  const k = items.length; // number of raters
  const n = items[0].length; // number of subjects

  // Guard against ragged input, which would corrupt the variance sums.
  if (!items.every((item) => item.length === n)) return null;

  // Sample variance of each rater's scores.
  const itemVariances = items.map(item => {
    const m = mean(item);
    return item.reduce((s, v) => s + (v - m) ** 2, 0) / (n - 1);
  });

  // Per-subject totals summed across raters, and their variance.
  const totals = [];
  for (let i = 0; i < n; i++) {
    totals.push(items.reduce((s, item) => s + item[i], 0));
  }
  const mTotals = mean(totals);
  const totalVariance = totals.reduce((s, v) => s + (v - mTotals) ** 2, 0) / (n - 1);

  const sumItemVariances = itemVariances.reduce((s, v) => s + v, 0);

  if (totalVariance === 0) return null;
  return (k / (k - 1)) * (1 - sumItemVariances / totalVariance);
}
131
+
132
+ // Simple hash for grouping identical responses
133
/**
 * Cheap 32-bit content hash (hex string) used to group identical response
 * texts. Returns null for null/undefined/empty input. Not cryptographic —
 * collisions are possible but unlikely enough for grouping purposes.
 */
function simpleHash(str) {
  if (!str) return null;
  let acc = 0;
  // Iterate UTF-16 code units (matches charCodeAt semantics exactly).
  for (let i = 0; i < str.length; i++) {
    // Equivalent to acc * 31 + code, then truncated to signed 32 bits.
    acc = (acc << 5) - acc + str.charCodeAt(i);
    acc |= 0;
  }
  return acc.toString(16);
}
143
+
144
+ // Main analysis
145
// Main analysis
/**
 * Loads all judged results, groups identical responses by a content hash of
 * `suggestions`, pairs up scores from different judges on the same response,
 * and reports agreement statistics (Pearson, Spearman, mean absolute
 * difference) overall and per rubric dimension.
 *
 * Fixes vs. previous version:
 * - `--run` filter is bound as a SQL parameter instead of being
 *   string-interpolated into the WHERE clause (quoting/injection bug).
 * - Per-dimension scores are filtered pairwise, so a null on one side can
 *   no longer misalign the two arrays.
 * - Removed the unused `pairsData` accumulator and a dead array
 *   initialization that was immediately overwritten.
 * - `db.close()` is guaranteed via try/finally, even if analysis throws.
 */
function analyzeJudgeReliability() {
  const db = new Database(DB_PATH, { readonly: true });
  try {
    console.log('Inter-Judge Reliability Analysis');
    console.log('='.repeat(60));
    console.log('');

    // Find all judge models
    const judges = db.prepare(`
      SELECT DISTINCT judge_model
      FROM evaluation_results
      WHERE judge_model IS NOT NULL
    `).all().map(r => r.judge_model);

    console.log(`Judges found: ${judges.join(', ')}`);
    console.log('');

    // Find paired judgments - must be SAME response content judged by different models.
    // Match on suggestions content (the actual tutor response), not just scenario/profile.
    let whereClause = 'WHERE judge_model IS NOT NULL AND overall_score IS NOT NULL AND suggestions IS NOT NULL';
    const queryParams = [];
    if (runIdFilter) {
      // Bound parameter (not string interpolation) so run ids containing
      // quotes cannot break or inject into the SQL.
      whereClause += ' AND run_id = ?';
      queryParams.push(runIdFilter);
    }

    const pairedQuery = `
      SELECT
        run_id,
        scenario_id,
        profile_name,
        judge_model,
        overall_score,
        score_relevance,
        score_specificity,
        score_pedagogical,
        score_personalization,
        score_actionability,
        score_tone,
        suggestions
      FROM evaluation_results
      ${whereClause}
      ORDER BY suggestions, judge_model
    `;

    const results = db.prepare(pairedQuery).all(...queryParams);

    // Group by RESPONSE CONTENT (suggestions hash) - not scenario/profile.
    // This ensures we only compare when the exact same response was judged
    // multiple times.
    const responseGroups = new Map();
    for (const r of results) {
      const contentHash = simpleHash(r.suggestions);
      if (!contentHash) continue;
      if (!responseGroups.has(contentHash)) {
        responseGroups.set(contentHash, []);
      }
      responseGroups.get(contentHash).push(r);
    }

    // Count how many responses have judgments from more than one judge.
    let responsesWithMultipleJudges = 0;
    for (const [, group] of responseGroups) {
      const uniqueJudges = new Set(group.map(r => r.judge_model));
      if (uniqueJudges.size > 1) {
        responsesWithMultipleJudges++;
      }
    }

    if (responsesWithMultipleJudges === 0) {
      console.log('⚠️ No paired judgments found!');
      console.log('');
      console.log('To analyze inter-judge reliability, you need the SAME response');
      console.log('scored by multiple judges. Generate this data by rejudging a run:');
      console.log('');
      console.log('  # First, pick a completed run:');
      console.log('  node scripts/eval-cli.js list');
      console.log('');
      console.log('  # Then rejudge with different models:');
      console.log('  node scripts/eval-cli.js rejudge <runId> --judge openrouter/anthropic/claude-sonnet-4.5');
      console.log('  node scripts/eval-cli.js rejudge <runId> --judge openrouter/moonshotai/kimi-k2.5');
      console.log('');
      console.log('  # Then run this analysis again');
      console.log('');
      return;
    }

    console.log(`Responses with multiple judges: ${responsesWithMultipleJudges}`);
    console.log('');

    // "judgeA|judgeB" -> [{score1, score2, diff, dimensions, ...}]
    const judgePairs = new Map();

    for (const [, group] of responseGroups) {
      // Keep one row per judge. If a judge scored the same content more than
      // once, the last row wins (matches previous behavior).
      const judgeScores = new Map();
      for (const r of group) {
        judgeScores.set(r.judge_model, r);
      }

      // Only consider responses scored by multiple judges.
      if (judgeScores.size < 2) continue;

      const judgeList = Array.from(judgeScores.keys()).sort();

      // Create one entry per unordered judge pair.
      for (let i = 0; i < judgeList.length; i++) {
        for (let j = i + 1; j < judgeList.length; j++) {
          const pairKey = `${judgeList[i]}|${judgeList[j]}`;
          if (!judgePairs.has(pairKey)) {
            judgePairs.set(pairKey, []);
          }

          const s1 = judgeScores.get(judgeList[i]);
          const s2 = judgeScores.get(judgeList[j]);

          judgePairs.get(pairKey).push({
            judge1: judgeList[i],
            judge2: judgeList[j],
            score1: s1.overall_score,
            score2: s2.overall_score,
            diff: Math.abs(s1.overall_score - s2.overall_score),
            dimensions: {
              relevance: [s1.score_relevance, s2.score_relevance],
              specificity: [s1.score_specificity, s2.score_specificity],
              pedagogical: [s1.score_pedagogical, s2.score_pedagogical],
              personalization: [s1.score_personalization, s2.score_personalization],
              actionability: [s1.score_actionability, s2.score_actionability],
              tone: [s1.score_tone, s2.score_tone]
            },
            scenario: s1.scenario_id,
            profile: s1.profile_name
          });
        }
      }
    }

    if (judgePairs.size === 0) {
      console.log('No paired judgments found (same response scored by multiple judges).');
      console.log('');
      console.log('To generate paired data, use the rejudge command with a different model:');
      console.log('  node scripts/eval-cli.js rejudge <runId> --judge openrouter/anthropic/claude-sonnet-4.5');
      return;
    }

    console.log(`Found ${judgePairs.size} judge pair combinations`);
    console.log('');

    // Analyze each judge pair.
    const overallScores1 = [];
    const overallScores2 = [];
    const allDisagreements = [];

    for (const [pairKey, pairs] of judgePairs) {
      const [judge1, judge2] = pairKey.split('|');
      const n = pairs.length;

      const scores1 = pairs.map(p => p.score1);
      const scores2 = pairs.map(p => p.score2);

      overallScores1.push(...scores1);
      overallScores2.push(...scores2);

      const pearson = pearsonCorrelation(scores1, scores2);
      const spearman = spearmanCorrelation(scores1, scores2);
      const mad = meanAbsoluteDifference(scores1, scores2);

      console.log(`\n${judge1.split('/').pop()} vs ${judge2.split('/').pop()}`);
      console.log('-'.repeat(50));
      console.log(`  Paired responses: ${n}`);
      console.log(`  Pearson r: ${pearson !== null ? pearson.toFixed(3) : 'N/A'}`);
      console.log(`  Spearman ρ: ${spearman !== null ? spearman.toFixed(3) : 'N/A'}`);
      console.log(`  Mean Abs Diff: ${mad !== null ? mad.toFixed(2) : 'N/A'} pts`);
      console.log(`  Mean scores: ${mean(scores1).toFixed(1)} vs ${mean(scores2).toFixed(1)}`);

      // Per-dimension analysis
      const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
      console.log('\n  Per-dimension correlations:');

      for (const dim of dimensions) {
        // Filter PAIRWISE: keep only pairs where both judges scored this
        // dimension, so the two arrays stay index-aligned.
        const validPairs = pairs
          .map(p => p.dimensions[dim])
          .filter(([a, b]) => a != null && b != null);
        const d1 = validPairs.map(([a]) => a);
        const d2 = validPairs.map(([, b]) => b);

        if (d1.length >= 3 && d2.length >= 3) {
          const r = pearsonCorrelation(d1, d2);
          console.log(`    ${dim.padEnd(16)} r = ${r !== null ? r.toFixed(3) : 'N/A'}`);
        }
      }

      // Identify major disagreements (diff > 20)
      const bigDisagreements = pairs.filter(p => p.diff > 20);
      if (bigDisagreements.length > 0) {
        allDisagreements.push(...bigDisagreements);
        console.log(`\n  Major disagreements (diff > 20): ${bigDisagreements.length}`);

        if (verbose) {
          for (const d of bigDisagreements.slice(0, 5)) {
            console.log(`    ${d.scenario} / ${d.profile}: ${d.score1} vs ${d.score2} (Δ${d.diff.toFixed(0)})`);
          }
        }
      }
    }

    // Overall summary
    console.log('\n' + '='.repeat(60));
    console.log('OVERALL RELIABILITY SUMMARY');
    console.log('='.repeat(60));

    const totalPairs = overallScores1.length;
    const overallPearson = pearsonCorrelation(overallScores1, overallScores2);
    const overallSpearman = spearmanCorrelation(overallScores1, overallScores2);
    const overallMAD = meanAbsoluteDifference(overallScores1, overallScores2);

    console.log(`\nTotal paired judgments: ${totalPairs}`);
    console.log(`Overall Pearson r: ${overallPearson !== null ? overallPearson.toFixed(3) : 'N/A'}`);
    console.log(`Overall Spearman ρ: ${overallSpearman !== null ? overallSpearman.toFixed(3) : 'N/A'}`);
    console.log(`Overall Mean Abs Diff: ${overallMAD !== null ? overallMAD.toFixed(2) : 'N/A'} pts`);

    // Interpretation (conventional correlation-strength bands).
    console.log('\nInterpretation:');
    if (overallPearson !== null) {
      if (overallPearson >= 0.8) {
        console.log('  ✓ Excellent agreement (r ≥ 0.8)');
      } else if (overallPearson >= 0.6) {
        console.log('  ○ Good agreement (0.6 ≤ r < 0.8)');
      } else if (overallPearson >= 0.4) {
        console.log('  △ Moderate agreement (0.4 ≤ r < 0.6)');
      } else {
        console.log('  ✗ Poor agreement (r < 0.4)');
      }
    }

    if (overallMAD !== null) {
      console.log(`  Average score difference: ${overallMAD.toFixed(1)} points on 100-point scale`);
    }

    if (allDisagreements.length > 0) {
      console.log(`  ${allDisagreements.length} major disagreements (>20 pts) found`);
    }

    console.log('');
  } finally {
    // Always release the read-only handle, even on early return or throw.
    db.close();
  }
}
394
+
395
+ // Run
396
// Entry point: run the analysis; exit non-zero with a readable message on failure.
try {
  analyzeJudgeReliability();
} catch (error) {
  console.error('Error:', error.message);
  process.exit(1);
}
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+
4
// Summarize one evaluation run from its JSONL progress log: per-profile
// averages, a 2x2x2 factorial breakdown inferred from profile-name
// substrings, and per-scenario grouping.
// Usage: node scripts/analyze-run.js [runId]
const runId = process.argv[2] || 'eval-2026-02-03-c8d32121';
const logPath = `./logs/eval-progress/${runId}.jsonl`;

if (!fs.existsSync(logPath)) {
  console.error('Log file not found:', logPath);
  process.exit(1);
}

// One JSON event per non-blank line.
const events = fs
  .readFileSync(logPath, 'utf8')
  .split('\n')
  .filter((line) => line.trim())
  .map((line) => JSON.parse(line));

// Keep only completed tests that actually produced a score.
const successful = events.filter(
  (e) => e.eventType === 'test_complete' && e.success === true && e.overallScore != null
);

const average = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;

console.log('Run:', runId);
console.log('Total successful results:', successful.length);
console.log('');

// --- Per-profile score averages, best first ---
const byProfile = {};
for (const result of successful) {
  if (!byProfile[result.profileName]) byProfile[result.profileName] = [];
  byProfile[result.profileName].push(result.overallScore);
}

console.log('By Profile (avg score):');
const rankedProfiles = Object.entries(byProfile).sort(
  (a, b) => average(b[1]) - average(a[1])
);
for (const [profile, scores] of rankedProfiles) {
  const avg = average(scores);
  console.log(`  ${profile}: ${avg.toFixed(1)} (n=${scores.length})`);
}

// --- Factor analysis: each factor is split by a profile-name substring ---
const factors = {
  'Factor A (recognition)': { on: [], off: [] },
  'Factor B (tutor arch)': { multi: [], single: [] },
  'Factor C (learner arch)': { psycho: [], unified: [] }
};

for (const result of successful) {
  const name = result.profileName;
  const score = result.overallScore;

  // Factor A: Recognition (cells 5-8 = on, cells 1-4 = off)
  const factorA = factors['Factor A (recognition)'];
  (name.includes('recog') ? factorA.on : factorA.off).push(score);

  // Factor B: Tutor arch (cells 3,4,7,8 = multi, cells 1,2,5,6 = single)
  const factorB = factors['Factor B (tutor arch)'];
  (name.includes('multi') ? factorB.multi : factorB.single).push(score);

  // Factor C: Learner arch (cells 2,4,6,8 = psycho, cells 1,3,5,7 = unified)
  const factorC = factors['Factor C (learner arch)'];
  (name.includes('psycho') ? factorC.psycho : factorC.unified).push(score);
}

console.log('');
console.log('Factor Analysis:');
for (const [factor, levels] of Object.entries(factors)) {
  const [level1, level2] = Object.keys(levels);
  const n1 = levels[level1].length;
  const n2 = levels[level2].length;
  // Skip factors where one level has no data (delta would be meaningless).
  if (n1 === 0 || n2 === 0) continue;
  const avg1 = average(levels[level1]);
  const avg2 = average(levels[level2]);
  const delta = avg1 - avg2;
  console.log(`  ${factor}:`);
  console.log(`    ${level1}: ${avg1.toFixed(1)} (n=${n1})`);
  console.log(`    ${level2}: ${avg2.toFixed(1)} (n=${n2})`);
  console.log(`    Delta: ${delta > 0 ? '+' : ''}${delta.toFixed(1)}`);
}

// --- Group results by scenario for the final report section ---
console.log('');
console.log('By Scenario:');
const byScenario = {};
for (const result of successful) {
  if (!byScenario[result.scenarioId]) byScenario[result.scenarioId] = [];
  byScenario[result.scenarioId].push({ profile: result.profileName, score: result.overallScore });
}
+ for (const [scenario, data] of Object.entries(byScenario)) {
95
+ const avg = data.reduce((s,d) => s + d.score, 0) / data.length;
96
+ console.log(` ${scenario}: avg=${avg.toFixed(1)} (n=${data.length})`);
97
+ }