@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,513 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Statistical Analysis for Evaluation Results
5
+ *
6
+ * Computes effect sizes (Cohen's d), confidence intervals, and p-values
7
+ * for comparing tutor profiles across evaluation scenarios.
8
+ *
9
+ * Usage:
10
+ * node scripts/analyze-eval-results.js --profiles baseline,recognition
11
+ * node scripts/analyze-eval-results.js --run-id eval-2026-01-11-xxx
12
+ * node scripts/analyze-eval-results.js --export results.csv
13
+ */
14
+
15
+ import 'dotenv/config';
16
+ import Database from 'better-sqlite3';
17
+ import path from 'path';
18
+ import { fileURLToPath } from 'url';
19
+ import fs from 'fs';
20
+
21
// ES modules do not provide __filename/__dirname, so derive them from the
// module URL and locate the package root and its data directory from there.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT_DIR = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT_DIR, 'data');

// ANSI escape sequences for terminal styling, keyed by style name.
// Each entry is built from its SGR code: ESC [ <code> m.
const c = Object.fromEntries(
  [
    ['reset', 0],
    ['bold', 1],
    ['dim', 2],
    ['red', 31],
    ['green', 32],
    ['yellow', 33],
    ['blue', 34],
    ['cyan', 36],
  ].map(([styleName, sgrCode]) => [styleName, `\x1b[${sgrCode}m`]),
);
37
+
38
+ // Statistical functions
39
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The average of `arr`, or 0 when `arr` is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  arr.forEach((value) => {
    total += value;
  });
  return total / count;
}
43
+
44
/**
 * Sample standard deviation (divides by n - 1).
 *
 * @param {number[]} arr - Sample values.
 * @returns {number} The sample standard deviation, or 0 when fewer than two
 *   values are supplied.
 */
function standardDeviation(arr) {
  const count = arr.length;
  if (count < 2) {
    return 0;
  }

  const center = mean(arr);
  let squaredDeviations = 0;
  arr.forEach((value) => {
    squaredDeviations += Math.pow(value - center, 2);
  });

  const variance = squaredDeviations / (count - 1);
  return Math.sqrt(variance);
}
49
+
50
/**
 * Pooled standard deviation of two samples, weighting each sample variance
 * by its degrees of freedom (n - 1).
 *
 * @param {number[]} arr1 - First sample.
 * @param {number[]} arr2 - Second sample.
 * @returns {number} The pooled standard deviation, or 0 when either sample
 *   has fewer than two values.
 */
function pooledStandardDeviation(arr1, arr2) {
  const n1 = arr1.length;
  const n2 = arr2.length;
  const eitherTooSmall = n1 < 2 || n2 < 2;
  if (eitherTooSmall) {
    return 0;
  }

  const variance1 = Math.pow(standardDeviation(arr1), 2);
  const variance2 = Math.pow(standardDeviation(arr2), 2);

  const weightedSum = (n1 - 1) * variance1 + (n2 - 1) * variance2;
  const pooledVariance = weightedSum / (n1 + n2 - 2);
  return Math.sqrt(pooledVariance);
}
60
+
61
/**
 * Cohen's d effect size: the difference of the group means divided by the
 * pooled standard deviation.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {number} mean(group1) - mean(group2) in pooled-SD units, or 0 when
 *   the pooled standard deviation is 0.
 */
function cohensD(group1, group2) {
  const spread = pooledStandardDeviation(group1, group2);
  return spread === 0 ? 0 : (mean(group1) - mean(group2)) / spread;
}
66
+
67
/**
 * Standard error of the mean: sample standard deviation over sqrt(n).
 *
 * @param {number[]} arr - Sample values.
 * @returns {number} The standard error, or 0 when fewer than two values are
 *   supplied.
 */
function standardError(arr) {
  const count = arr.length;
  return count < 2 ? 0 : standardDeviation(arr) / Math.sqrt(count);
}
71
+
72
/**
 * Normal-approximation confidence interval for the mean of a sample.
 *
 * The interval is mean ± z · SE, where z is a standard-normal critical value.
 * A normal (not Student's t) critical value is used, so the interval is
 * narrower than a t-based interval for small samples.
 *
 * @param {number[]} arr - Sample values.
 * @param {number} [confidence=0.95] - Two-sided confidence level, strictly
 *   between 0 and 1.
 * @returns {{mean: number, lower: number, upper: number, se: number}} The
 *   sample mean, the interval bounds and the standard error.
 * @throws {RangeError} If `confidence` is not strictly between 0 and 1.
 */
function confidenceInterval(arr, confidence = 0.95) {
  // Written as a negated conjunction so that NaN is rejected as well.
  if (!(confidence > 0 && confidence < 1)) {
    throw new RangeError(
      `confidence must be strictly between 0 and 1, got ${confidence}`,
    );
  }

  const m = mean(arr);
  const se = standardError(arr);
  const z = normalCriticalValue(confidence);

  return {
    mean: m,
    lower: m - z * se,
    upper: m + z * se,
    se,
  };
}

/**
 * Two-sided standard-normal critical value for a confidence level in (0, 1).
 *
 * @param {number} confidence - Two-sided confidence level.
 * @returns {number} z such that P(|Z| <= z) is approximately `confidence`.
 */
function normalCriticalValue(confidence) {
  // Conventional rounded values, kept so results for the common levels are
  // exactly what earlier versions of this script reported.
  if (confidence === 0.95) return 1.96;
  if (confidence === 0.99) return 2.576;
  if (confidence === 0.9) return 1.645;

  // Any other level: rational approximation of the inverse normal CDF
  // (Abramowitz & Stegun 26.2.23, absolute error below 4.5e-4), applied to
  // the upper-tail probability of a two-sided interval.
  const tail = (1 - confidence) / 2;
  const t = Math.sqrt(-2 * Math.log(tail));
  const numerator = 2.515517 + 0.802853 * t + 0.010328 * t * t;
  const denominator = 1 + 1.432788 * t + 0.189269 * t * t + 0.001308 * t * t * t;
  return t - numerator / denominator;
}
84
+
85
/**
 * Welch's two-sample t-test, which does not assume equal variances.
 *
 * @param {number[]} group1 - First sample.
 * @param {number[]} group2 - Second sample.
 * @returns {{t: number, df: number, p: number}} The t statistic, the
 *   Welch-Satterthwaite degrees of freedom and the two-sided p-value.
 *   Returns `{ t: 0, df: 0, p: 1 }` when either sample has fewer than two
 *   values, and `{ t: 0, df: n1 + n2 - 2, p: 1 }` when the combined standard
 *   error is 0.
 */
function welchTTest(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;

  const eitherTooSmall = n1 < 2 || n2 < 2;
  if (eitherTooSmall) {
    return { t: 0, df: 0, p: 1 };
  }

  // Each sample's variance divided by its size; both the standard error and
  // the degrees of freedom are built from these two terms.
  const varOverN1 = Math.pow(standardDeviation(group1), 2) / n1;
  const varOverN2 = Math.pow(standardDeviation(group2), 2) / n2;
  const combined = varOverN1 + varOverN2;

  const se = Math.sqrt(combined);
  if (se === 0) {
    return { t: 0, df: n1 + n2 - 2, p: 1 };
  }

  const t = (mean(group1) - mean(group2)) / se;

  // Welch-Satterthwaite degrees of freedom, falling back to the pooled
  // n1 + n2 - 2 when the denominator is not positive.
  const dfDenominator =
    Math.pow(varOverN1, 2) / (n1 - 1) + Math.pow(varOverN2, 2) / (n2 - 1);
  const df = dfDenominator > 0 ? Math.pow(combined, 2) / dfDenominator : n1 + n2 - 2;

  // Two-sided p-value from the t-distribution CDF.
  const p = 2 * (1 - tCDF(Math.abs(t), df));

  return { t, df, p };
}
115
+
116
/**
 * Cumulative distribution function of Student's t-distribution.
 *
 * Relies on the identity P(|T| > |t|) = I_x(df/2, 1/2) with
 * x = df / (df + t^2), which holds when `incompleteBeta` returns the
 * regularized incomplete beta function I_x(a, b).
 *
 * @param {number} t - Value at which to evaluate the CDF.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t).
 */
function tCDF(t, df) {
  const x = df / (df + t * t);
  // One-sided tail probability beyond |t|.
  const tail = 0.5 * incompleteBeta(df / 2, 0.5, x);
  // x depends only on t^2, so the sign of t must be applied explicitly:
  // negative t lies in the lower tail, non-negative t in the upper half.
  return t < 0 ? tail : 1 - tail;
}
122
+
123
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with the standard continued-fraction expansion, using log-gamma
 * for the normalizing constant so that large `a` or `b` (large degrees of
 * freedom) neither overflow nor need a special case.
 *
 * @param {number} a - First shape parameter; expected to be > 0.
 * @param {number} b - Second shape parameter; expected to be > 0.
 * @param {number} x - Upper integration limit; values <= 0 give 0 and
 *   values >= 1 give 1.
 * @returns {number} I_x(a, b), a value in [0, 1].
 */
function incompleteBeta(a, b, x) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // x^a (1 - x)^b / B(a, b), computed in log space to avoid overflow.
  const logFront =
    logGamma(a + b) - logGamma(a) - logGamma(b) + a * Math.log(x) + b * Math.log(1 - x);
  const front = Math.exp(logFront);

  // The continued fraction converges quickly only for x < (a+1)/(a+b+2);
  // beyond that, use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
  if (x < (a + 1) / (a + b + 2)) {
    return (front * betaContinuedFraction(a, b, x)) / a;
  }
  return 1 - (front * betaContinuedFraction(b, a, 1 - x)) / b;
}

/**
 * Continued fraction for the incomplete beta function, evaluated with the
 * modified Lentz method.
 *
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @param {number} x - Evaluation point in (0, 1).
 * @returns {number} Value of the continued fraction; the best estimate so far
 *   is returned if the iteration limit is reached.
 */
function betaContinuedFraction(a, b, x) {
  const MAX_ITERATIONS = 300;
  const EPSILON = 1e-14;
  const TINY = 1e-300; // Guards against division by zero in the recurrences.

  const clampAwayFromZero = (value) => (Math.abs(value) < TINY ? TINY : value);

  const qab = a + b;
  const qap = a + 1;
  const qam = a - 1;

  let c = 1;
  let d = 1 / clampAwayFromZero(1 - (qab * x) / qap);
  let h = d;

  for (let m = 1; m <= MAX_ITERATIONS; m++) {
    const m2 = 2 * m;

    // Even step of the recurrence.
    let aa = (m * (b - m) * x) / ((qam + m2) * (a + m2));
    d = 1 / clampAwayFromZero(1 + aa * d);
    c = clampAwayFromZero(1 + aa / c);
    h *= d * c;

    // Odd step of the recurrence.
    aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2));
    d = 1 / clampAwayFromZero(1 + aa * d);
    c = clampAwayFromZero(1 + aa / c);
    const delta = d * c;
    h *= delta;

    if (Math.abs(delta - 1) < EPSILON) break;
  }

  return h;
}

/**
 * Natural log of the gamma function for z > 0, via the Lanczos approximation
 * (g = 7, 9 coefficients) with the reflection formula for z < 0.5.
 *
 * @param {number} z - Argument; expected to be > 0.
 * @returns {number} ln Γ(z).
 */
function logGamma(z) {
  if (z < 0.5) {
    // Reflection: Γ(z) Γ(1 - z) = π / sin(πz).
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - logGamma(1 - z);
  }

  const LANCZOS = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
    771.32342877765313, -176.61502916214059, 12.507343278686905,
    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];

  const shifted = z - 1;
  let series = LANCZOS[0];
  for (let i = 1; i < LANCZOS.length; i++) {
    series += LANCZOS[i] / (shifted + i);
  }

  const t = shifted + 7.5; // shifted + g + 0.5 with g = 7
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
147
+
148
/**
 * Gamma function Γ(n) via the Lanczos approximation (g = 7, 9 coefficients).
 * Arguments below 0.5 are handled with the reflection formula
 * Γ(n) Γ(1 - n) = π / sin(πn).
 *
 * @param {number} n - Argument of the gamma function.
 * @returns {number} Approximate value of Γ(n).
 */
function gamma(n) {
  if (n < 0.5) {
    return Math.PI / (Math.sin(Math.PI * n) * gamma(1 - n));
  }

  const g = 7;
  const coefficients = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
    771.32342877765313, -176.61502916214059, 12.507343278686905,
    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];

  // The Lanczos series is expressed in terms of n - 1.
  const z = n - 1;
  const series = coefficients
    .slice(1, g + 2)
    .reduce((acc, coefficient, idx) => acc + coefficient / (z + (idx + 1)), coefficients[0]);

  const t = z + g + 0.5;
  return Math.sqrt(2 * Math.PI) * Math.pow(t, z + 0.5) * Math.exp(-t) * series;
}
164
+
165
/**
 * Map a Cohen's d value to a conventional verbal label based on its
 * magnitude.
 *
 * @param {number} d - Effect size; only its absolute value matters.
 * @returns {string} 'negligible' (|d| < 0.2), 'small' (< 0.5),
 *   'medium' (< 0.8) or 'large' otherwise.
 */
function interpretEffectSize(d) {
  const magnitude = Math.abs(d);
  // Upper bounds (exclusive) in ascending order; the first match wins.
  const bands = [
    [0.2, 'negligible'],
    [0.5, 'small'],
    [0.8, 'medium'],
  ];
  const band = bands.find(([upperBound]) => magnitude < upperBound);
  return band ? band[1] : 'large';
}
173
+
174
/**
 * Parse command-line flags from `process.argv`.
 *
 * Comma-separated values (`--profiles`, `--scenarios`, `--dimensions`) are
 * split, trimmed and stripped of empty entries, so input such as
 * `"baseline, recognition"` or a trailing comma does not produce names that
 * can never match. Unrecognized arguments are ignored. `--help` / `-h` prints
 * usage and exits the process with status 0.
 *
 * @returns {{profiles: string[], runId: (string|null), scenarios: string[],
 *   dimensions: string[], export: (string|null), verbose: boolean}} The
 *   parsed options, with defaults for anything not supplied.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {
    profiles: [],
    runId: null,
    scenarios: [],
    dimensions: ['overall_score', 'score_relevance', 'score_specificity',
      'score_pedagogical', 'score_personalization', 'score_actionability', 'score_tone'],
    export: null,
    verbose: false,
  };

  // Split a comma-separated flag value into clean, non-empty entries.
  const parseList = (value) =>
    value
      .split(',')
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0);

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    // A value-taking flag is honoured only when a non-empty value follows it.
    const next = args[i + 1];

    switch (arg) {
      case '--profiles':
        if (next) {
          options.profiles = parseList(next);
          i++;
        }
        break;
      case '--run-id':
        if (next) {
          options.runId = next;
          i++;
        }
        break;
      case '--scenarios':
        if (next) {
          options.scenarios = parseList(next);
          i++;
        }
        break;
      case '--dimensions':
        if (next) {
          options.dimensions = parseList(next);
          i++;
        }
        break;
      case '--export':
        if (next) {
          options.export = next;
          i++;
        }
        break;
      case '--verbose':
      case '-v':
        options.verbose = true;
        break;
      case '--help':
      case '-h':
        console.log([
          '',
          'Statistical Analysis for Evaluation Results',
          '',
          'Usage:',
          'node scripts/analyze-eval-results.js [options]',
          '',
          'Options:',
          '--profiles <p1,p2,...> Compare specific profiles (e.g., baseline,recognition)',
          '--run-id <id> Analyze specific evaluation run',
          '--scenarios <s1,s2,...> Filter to specific scenarios',
          '--dimensions <d1,d2,...> Specify dimensions to analyze',
          '--export <file> Export results to CSV/JSON',
          '--verbose, -v Show detailed output',
          '--help, -h Show this help',
          '',
          'Examples:',
          'node scripts/analyze-eval-results.js --profiles baseline,recognition',
          'node scripts/analyze-eval-results.js --profiles baseline,recognition --scenarios productive_struggle_arc',
          'node scripts/analyze-eval-results.js --export results.csv',
          '',
        ].join('\n'));
        process.exit(0);
        break;
      default:
        break;
    }
  }

  return options;
}
228
+
229
/**
 * Query stored evaluation results, print a statistical report to stdout and
 * optionally export the computed statistics.
 *
 * The report contains per-profile summary statistics and, when at least two
 * profiles are present, pairwise comparisons, dimension-level effect sizes
 * and a per-scenario breakdown. If the database file does not exist the
 * process exits with status 1.
 *
 * The function is `async` only so that callers can attach `.catch()`; it
 * performs no awaits.
 *
 * @param {object} options - Parsed CLI options.
 * @param {string|null} options.runId - Restrict to one evaluation run.
 * @param {string[]} options.profiles - Restrict to these profile names.
 * @param {string[]} options.scenarios - Restrict to these scenario ids.
 * @param {string[]} [options.dimensions] - Score columns to include in the
 *   dimension-level table; missing or empty means all of them.
 * @param {string|null} options.export - Output path; a `.json` suffix selects
 *   JSON, anything else CSV.
 * @returns {Promise<void>}
 */
async function analyzeResults(options) {
  const dbPath = path.join(DATA_DIR, 'evaluations.db');

  if (!fs.existsSync(dbPath)) {
    console.error(`${c.red}Error: Database not found at ${dbPath}${c.reset}`);
    console.log('Run some evaluations first with: node scripts/eval-tutor.js run');
    process.exit(1);
  }

  const results = loadEvaluationRows(dbPath, options);

  if (results.length === 0) {
    console.log(`${c.yellow}No evaluation results found matching criteria${c.reset}`);
    return;
  }

  // Group rows by profile, and by scenario then profile.
  const byProfile = {};
  const byScenario = {};

  for (const r of results) {
    const profile = r.profile_name || 'unknown';
    const scenario = r.scenario_id;

    if (!byProfile[profile]) byProfile[profile] = [];
    byProfile[profile].push(r);

    if (!byScenario[scenario]) byScenario[scenario] = {};
    if (!byScenario[scenario][profile]) byScenario[scenario][profile] = [];
    byScenario[scenario][profile].push(r);
  }

  const profiles = Object.keys(byProfile).sort();

  console.log(`\n${c.bold}${c.cyan}═══════════════════════════════════════════════════════════════════════════════${c.reset}`);
  console.log(`${c.bold} STATISTICAL ANALYSIS OF EVALUATION RESULTS${c.reset}`);
  console.log(`${c.cyan}═══════════════════════════════════════════════════════════════════════════════${c.reset}\n`);

  console.log(`${c.dim}Total results: ${results.length}${c.reset}`);
  console.log(`${c.dim}Profiles: ${profiles.join(', ')}${c.reset}`);
  console.log(`${c.dim}Scenarios: ${Object.keys(byScenario).length}${c.reset}\n`);

  // Profile summary statistics
  console.log(`${c.bold}PROFILE SUMMARY${c.reset}`);
  console.log(`${'─'.repeat(80)}`);
  console.log(`${'Profile'.padEnd(20)} ${'N'.padStart(6)} ${'Mean'.padStart(8)} ${'SD'.padStart(8)} ${'95% CI'.padStart(20)}`);
  console.log(`${'─'.repeat(80)}`);

  const profileStats = {};
  for (const profile of profiles) {
    const scores = byProfile[profile].map((r) => r.overall_score);
    const ci = confidenceInterval(scores);
    profileStats[profile] = { scores, ci, n: scores.length };

    console.log(
      `${profile.padEnd(20)} ` +
      `${scores.length.toString().padStart(6)} ` +
      `${ci.mean.toFixed(2).padStart(8)} ` +
      `${standardDeviation(scores).toFixed(2).padStart(8)} ` +
      `[${ci.lower.toFixed(2)}, ${ci.upper.toFixed(2)}]`.padStart(20)
    );
  }
  console.log();

  // Declared outside the comparison branch so that an export can still be
  // written when only one profile is present.
  const comparisons = [];
  const dimensionResults = [];

  // Pairwise comparisons
  if (profiles.length >= 2) {
    console.log(`${c.bold}PAIRWISE COMPARISONS (Overall Score)${c.reset}`);
    console.log(`${'─'.repeat(90)}`);
    console.log(
      `${'Comparison'.padEnd(30)} ` +
      `${'Δ Mean'.padStart(10)} ` +
      `${"Cohen's d".padStart(12)} ` +
      `${'Effect'.padStart(12)} ` +
      `${'t'.padStart(8)} ` +
      `${'p-value'.padStart(10)}`
    );
    console.log(`${'─'.repeat(90)}`);

    for (let i = 0; i < profiles.length; i++) {
      for (let j = i + 1; j < profiles.length; j++) {
        const p1 = profiles[i];
        const p2 = profiles[j];
        const s1 = profileStats[p1].scores;
        const s2 = profileStats[p2].scores;

        const d = cohensD(s1, s2);
        const ttest = welchTTest(s1, s2);
        const diff = mean(s1) - mean(s2);

        const effectLabel = interpretEffectSize(d);
        const pStr = formatPValue(ttest.p);
        const sigMarker = ttest.p < 0.05 ? (ttest.p < 0.01 ? '**' : '*') : '';

        console.log(
          `${`${p1} vs ${p2}`.padEnd(30)} ` +
          // The sign is part of the padded text so positive and negative
          // deltas occupy the same column width.
          `${formatSigned(diff, 2, 10)} ` +
          `${d.toFixed(3).padStart(12)} ` +
          `${effectLabel.padStart(12)} ` +
          `${ttest.t.toFixed(2).padStart(8)} ` +
          `${pStr.padStart(8)}${sigMarker.padStart(2)}`
        );

        comparisons.push({ p1, p2, diff, d, effectLabel, t: ttest.t, p: ttest.p });
      }
    }
    console.log(`\n${c.dim}* p < 0.05, ** p < 0.01${c.reset}\n`);

    // Dimension-level analysis
    console.log(`${c.bold}DIMENSION-LEVEL EFFECT SIZES${c.reset}`);
    console.log(`${'─'.repeat(100)}`);

    const allDimensions = [
      { key: 'score_relevance', name: 'Relevance' },
      { key: 'score_specificity', name: 'Specificity' },
      { key: 'score_pedagogical', name: 'Pedagogical' },
      { key: 'score_personalization', name: 'Personalization' },
      { key: 'score_actionability', name: 'Actionability' },
      { key: 'score_tone', name: 'Tone' },
    ];

    // Honour options.dimensions; a missing or empty list means "all".
    const requestedDimensions =
      Array.isArray(options.dimensions) && options.dimensions.length > 0
        ? new Set(options.dimensions)
        : null;
    const dimensions = allDimensions.filter(
      (dim) => requestedDimensions === null || requestedDimensions.has(dim.key),
    );

    // Header
    let header = 'Dimension'.padEnd(20);
    for (const profile of profiles) {
      header += profile.substring(0, 12).padStart(14);
    }
    if (profiles.length === 2) {
      header += " Cohen's d".padStart(14) + ' Effect'.padStart(12);
    }
    console.log(header);
    console.log(`${'─'.repeat(100)}`);

    for (const dim of dimensions) {
      let row = dim.name.padEnd(20);

      const dimScores = {};
      for (const profile of profiles) {
        const scores = byProfile[profile]
          .map((r) => r[dim.key])
          .filter((s) => s !== null && s !== undefined);
        dimScores[profile] = scores;
        row += mean(scores).toFixed(2).padStart(14);
      }

      // Effect sizes are only reported when exactly two profiles are compared.
      if (profiles.length === 2) {
        const d = cohensD(dimScores[profiles[0]], dimScores[profiles[1]]);
        const effect = interpretEffectSize(d);
        row += d.toFixed(3).padStart(14) + effect.padStart(12);
        dimensionResults.push({ dimension: dim.name, d, effect });
      }

      console.log(row);
    }
    console.log();

    // Sort dimensions by effect size for visualization
    if (dimensionResults.length > 0) {
      dimensionResults.sort((a, b) => Math.abs(b.d) - Math.abs(a.d));

      console.log(`${c.bold}EFFECT SIZE RANKING${c.reset}`);
      console.log(`${'─'.repeat(60)}`);

      for (const dr of dimensionResults) {
        // One bar cell per 0.05 of |d|, capped at 40 cells.
        const bar = '█'.repeat(Math.min(Math.round(Math.abs(dr.d) * 20), 40));
        const direction = dr.d >= 0 ? c.green : c.red;
        console.log(
          `${dr.dimension.padEnd(20)} ` +
          `${direction}${formatSigned(dr.d, 3, 8)}${c.reset} ` +
          `${direction}${bar}${c.reset} ` +
          `${dr.effect}`
        );
      }
      console.log();
    }

    // Scenario-level breakdown
    console.log(`${c.bold}SCENARIO-LEVEL RESULTS${c.reset}`);
    console.log(`${'─'.repeat(90)}`);

    const scenarios = Object.keys(byScenario).sort();
    for (const scenario of scenarios) {
      const scenarioData = byScenario[scenario];
      const availableProfiles = Object.keys(scenarioData);

      // A scenario run under a single profile has nothing to compare.
      if (availableProfiles.length < 2) continue;

      console.log(`\n${c.cyan}${scenario}${c.reset}`);

      for (let i = 0; i < availableProfiles.length; i++) {
        for (let j = i + 1; j < availableProfiles.length; j++) {
          const p1 = availableProfiles[i];
          const p2 = availableProfiles[j];
          const s1 = scenarioData[p1].map((r) => r.overall_score);
          const s2 = scenarioData[p2].map((r) => r.overall_score);

          const d = cohensD(s1, s2);
          const ttest = welchTTest(s1, s2);
          const mean1 = mean(s1);
          const mean2 = mean(s2);
          const diff = mean1 - mean2;
          // Relative change of p1 over p2; 0 when p2's mean is 0.
          const pctImprove = mean2 !== 0 ? ((mean1 - mean2) / mean2 * 100) : 0;

          console.log(
            ` ${p1}(n=${s1.length}, μ=${mean1.toFixed(1)}) vs ` +
            `${p2}(n=${s2.length}, μ=${mean2.toFixed(1)}): ` +
            `Δ=${diff >= 0 ? '+' : ''}${diff.toFixed(1)} (${pctImprove >= 0 ? '+' : ''}${pctImprove.toFixed(0)}%), ` +
            `d=${d.toFixed(2)}, p=${formatPValue(ttest.p)}`
          );
        }
      }
    }
    console.log();
  }

  // Export if requested
  if (options.export) {
    const exportData = {
      generated_at: new Date().toISOString(),
      profiles: profileStats,
      comparisons,
      dimensions: dimensionResults,
      scenarios: byScenario,
    };

    if (options.export.endsWith('.json')) {
      fs.writeFileSync(options.export, JSON.stringify(exportData, null, 2));
    } else {
      // CSV export
      let csv = 'Comparison,Delta_Mean,Cohens_d,Effect_Size,t_statistic,p_value\n';
      for (const comp of comparisons) {
        // Double embedded quotes so the quoted label stays one CSV field.
        const label = `${comp.p1} vs ${comp.p2}`.replace(/"/g, '""');
        csv += `"${label}",${comp.diff.toFixed(4)},${comp.d.toFixed(4)},${comp.effectLabel},${comp.t.toFixed(4)},${comp.p.toFixed(6)}\n`;
      }
      fs.writeFileSync(options.export, csv);
    }
    console.log(`${c.green}Results exported to ${options.export}${c.reset}`);
  }
}

/**
 * Read the matching successful evaluation rows from the SQLite database.
 * The connection is opened read-only and always closed, including when the
 * query throws.
 *
 * @param {string} dbPath - Path to the SQLite database file.
 * @param {object} options - Parsed CLI options (`runId`, `profiles`,
 *   `scenarios` are used as filters).
 * @returns {object[]} Result rows ordered by profile, scenario and creation
 *   time.
 */
function loadEvaluationRows(dbPath, options) {
  const db = new Database(dbPath, { readonly: true });
  try {
    let query = `
      SELECT
        profile_name,
        scenario_id,
        overall_score,
        score_relevance,
        score_specificity,
        score_pedagogical,
        score_personalization,
        score_actionability,
        score_tone,
        created_at
      FROM evaluation_results
      WHERE success = 1
        AND overall_score IS NOT NULL
    `;

    // Filter values are always bound as parameters, never interpolated.
    const params = [];

    if (options.runId) {
      query += ' AND run_id = ?';
      params.push(options.runId);
    }

    if (options.profiles.length > 0) {
      query += ` AND profile_name IN (${options.profiles.map(() => '?').join(',')})`;
      params.push(...options.profiles);
    }

    if (options.scenarios.length > 0) {
      query += ` AND scenario_id IN (${options.scenarios.map(() => '?').join(',')})`;
      params.push(...options.scenarios);
    }

    query += ' ORDER BY profile_name, scenario_id, created_at';

    return db.prepare(query).all(...params);
  } finally {
    db.close();
  }
}

/**
 * Format a number with an explicit '+' for non-negative values, left-padded
 * to a fixed width.
 *
 * @param {number} value - Number to format.
 * @param {number} digits - Digits after the decimal point.
 * @param {number} width - Minimum width of the returned string.
 * @returns {string} The signed, padded text.
 */
function formatSigned(value, digits, width) {
  const text = `${value >= 0 ? '+' : ''}${value.toFixed(digits)}`;
  return text.padStart(width);
}

/**
 * Format a p-value for display: three decimals, or '<0.001' below that.
 *
 * @param {number} p - The p-value.
 * @returns {string} Display text.
 */
function formatPValue(p) {
  return p < 0.001 ? '<0.001' : p.toFixed(3);
}
507
+
508
// Entry point: parse the CLI flags, run the analysis, and turn any rejection
// into an error message plus a non-zero exit status.
const options = parseArgs();

function reportFailureAndExit(err) {
  console.error(`${c.red}Error: ${err.message}${c.reset}`);
  process.exit(1);
}

analyzeResults(options).catch(reportFailureAndExit);