@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,498 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Post-hoc analysis of modulation (Drama Machine) and synthetic learning outcomes.
4
+ *
5
+ * Track 1 analysis — uses existing data only, no new evaluations.
6
+ *
7
+ * (a) MODULATION METRICS (N=350 factorial):
8
+ * - Response length variability (CV) by condition
9
+ * - Vocabulary richness (type-token ratio) by condition
10
+ * - Dimension score variance (within-cell) — proxy for behavioral range
11
+ * - Ego-Superego negotiation rounds (multi-agent only)
12
+ *
13
+ * (b) SYNTHETIC LEARNING OUTCOME INDEX (N=118 bilateral):
14
+ * - Composite from existing learner rubric: revision_signals × 0.35
15
+ * + question_quality × 0.30 + conceptual_engagement × 0.35
16
+ * - Learning arc trajectory (turn 1 → turn N score progression)
17
+ * - Per-condition breakdown (recognition × architecture 2×2)
18
+ *
19
+ * Usage:
20
+ * node scripts/analyze-modulation-learning.js
21
+ */
22
+
23
+ import Database from 'better-sqlite3';
24
+ import path from 'path';
25
+ import { fileURLToPath } from 'url';
26
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
27
+
28
+ const DB_PATH = path.join(__dirname, '..', 'data', 'evaluations.db');
29
+ const db = new Database(DB_PATH, { readonly: true });
30
+
31
+ // ═══════════════════════════════════════════════════════════════════════════
32
+ // HELPERS
33
+ // ═══════════════════════════════════════════════════════════════════════════
34
+
35
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} arr - Values to average.
 * @returns {number} The average, or 0 when the list is empty.
 */
function mean(arr) {
  const count = arr.length;
  if (!count) return 0;
  let total = 0;
  arr.forEach((value) => {
    total = total + value;
  });
  return total / count;
}
39
+
40
/**
 * Sample standard deviation (divides by n - 1).
 *
 * @param {number[]} arr - Values to summarise.
 * @returns {number} The standard deviation, or 0 when fewer than two values are given.
 */
function std(arr) {
  const count = arr.length;
  if (count < 2) return 0;
  const center = mean(arr);
  let squaredDeviations = 0;
  arr.forEach((value) => {
    squaredDeviations = squaredDeviations + (value - center) ** 2;
  });
  return Math.sqrt(squaredDeviations / (count - 1));
}
45
+
46
/**
 * Coefficient of variation: sample standard deviation divided by the mean.
 *
 * @param {number[]} arr - Values to summarise.
 * @returns {number} std / mean, or 0 when the mean is not strictly positive.
 */
function cv(arr) {
  const center = mean(arr);
  if (center > 0) {
    return std(arr) / center;
  }
  return 0;
}
50
+
51
/**
 * Type-token ratio: unique words / total words.
 *
 * Text is lower-cased, every character that is neither a letter nor whitespace
 * is removed, and words of one or two characters are ignored.
 *
 * Letters are matched with the Unicode `\p{L}` class rather than `[a-z]`, so
 * accented and non-Latin words are kept intact instead of being truncated or
 * dropped. Results for plain-ASCII text are unchanged.
 *
 * @param {string} text - Text to analyse; null/undefined is treated as empty.
 * @returns {number} Ratio in (0, 1], or 0 when no countable words remain.
 */
function ttr(text) {
  const words = String(text ?? '')
    .toLowerCase()
    .replace(/[^\p{L}\s]/gu, '')
    .split(/\s+/)
    .filter((w) => w.length > 2);
  if (!words.length) return 0;
  const unique = new Set(words);
  return unique.size / words.length;
}
58
+
59
/**
 * Cohen's d for two independent groups, using the pooled sample standard deviation.
 *
 * @param {number[]} group1 - Observations of the first group.
 * @param {number[]} group2 - Observations of the second group.
 * @returns {number} (mean1 - mean2) / pooled SD. Returns 0 when the effect size
 *   cannot be estimated: either group is empty, there are fewer than three
 *   observations in total, or the pooled SD is not positive.
 */
function cohensD(group1, group2) {
  const n1 = group1.length;
  const n2 = group2.length;
  // mean([]) is 0, so an empty group would otherwise be compared as if its mean were 0.
  // n1 + n2 < 3 leaves no degrees of freedom for the pooled variance.
  if (n1 === 0 || n2 === 0 || n1 + n2 < 3) return 0;

  const s1 = std(group1);
  const s2 = std(group2);
  const pooled = Math.sqrt(((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / (n1 + n2 - 2));
  return pooled > 0 ? (mean(group1) - mean(group2)) / pooled : 0;
}
65
+
66
/**
 * Simple 2×2 between-subjects ANOVA.
 *
 * Main-effect sums of squares are computed from the marginal means of the
 * pooled observations around the grand mean, so they are not adjusted for
 * unequal cell sizes. The interaction F uses the single-degree-of-freedom
 * contrast (a1b1 - a1b0) - (a0b1 - a0b0) with SS = contrast² / Σ(1 / n_cell).
 *
 * @param {{a0b0: number[], a0b1: number[], a1b0: number[], a1b1: number[]}} data
 *   Observations per cell; `a` is the level of factor A, `b` the level of factor B.
 * @returns {{
 *   mainA: {F: number, marginals: number[], delta: number},
 *   mainB: {F: number, marginals: number[], delta: number},
 *   interaction: number,
 *   interactionF: number,
 *   cellMeans: {a0b0: number, a0b1: number, a1b0: number, a1b1: number},
 *   msW: number,
 *   dfW: number
 * }} `interaction` is the difference-of-differences of the cell means.
 *   F values (and `interaction`) are NaN when they cannot be estimated: a cell
 *   is empty or there are no within-cell degrees of freedom.
 */
function fTest2x2(data) {
  const cellKeys = ['a0b0', 'a0b1', 'a1b0', 'a1b1'];
  const { a0b0, a0b1, a1b0, a1b1 } = data;

  const grandMean = mean([...a0b0, ...a0b1, ...a1b0, ...a1b1]);
  const n = a0b0.length + a0b1.length + a1b0.length + a1b1.length;

  const cellMeans = {
    a0b0: mean(a0b0), a0b1: mean(a0b1),
    a1b0: mean(a1b0), a1b1: mean(a1b1)
  };

  const margA0 = mean([...a0b0, ...a0b1]);
  const margA1 = mean([...a1b0, ...a1b1]);
  const margB0 = mean([...a0b0, ...a1b0]);
  const margB1 = mean([...a0b1, ...a1b1]);

  // SS for main effects
  const nA0 = a0b0.length + a0b1.length;
  const nA1 = a1b0.length + a1b1.length;
  const ssA = nA0 * (margA0 - grandMean) ** 2 + nA1 * (margA1 - grandMean) ** 2;

  const nB0 = a0b0.length + a1b0.length;
  const nB1 = a0b1.length + a1b1.length;
  const ssB = nB0 * (margB0 - grandMean) ** 2 + nB1 * (margB1 - grandMean) ** 2;

  // mean([]) is 0, so a contrast involving an empty cell would be meaningless.
  const hasEmptyCell = cellKeys.some((key) => data[key].length === 0);
  const contrast = (cellMeans.a1b1 - cellMeans.a1b0) - (cellMeans.a0b1 - cellMeans.a0b0);
  const interaction = hasEmptyCell ? NaN : contrast;

  // SS within (error) — only the four design cells, ignoring any extra keys on `data`
  let ssW = 0;
  for (const key of cellKeys) {
    const m = cellMeans[key];
    ssW += data[key].reduce((s, x) => s + (x - m) ** 2, 0);
  }
  const dfW = n - 4;
  const msW = dfW > 0 ? ssW / dfW : NaN;

  const estimable = dfW > 0 && !hasEmptyCell;
  const fRatio = (ss) => (estimable ? ss / msW : NaN);

  // SS of the interaction contrast: contrast² / Σ(c² / n_cell), with every c = ±1
  const inverseCellSizes = cellKeys.reduce((s, key) => s + 1 / data[key].length, 0);
  const ssAB = contrast ** 2 / inverseCellSizes;

  return {
    mainA: { F: fRatio(ssA), marginals: [margA0, margA1], delta: margA1 - margA0 },
    mainB: { F: fRatio(ssB), marginals: [margB0, margB1], delta: margB1 - margB0 },
    interaction,
    interactionF: fRatio(ssAB),
    cellMeans,
    msW,
    dfW
  };
}
112
+
113
+ // ═══════════════════════════════════════════════════════════════════════════
114
+ // (a) MODULATION METRICS — N=350 factorial
115
+ // ═══════════════════════════════════════════════════════════════════════════
116
+
117
+ console.log('═══════════════════════════════════════════════════════════════');
118
+ console.log(' MODULATION ANALYSIS (Drama Machine Evidence)');
119
+ console.log(' N=350 factorial, Kimi K2.5 ego, Opus judge');
120
+ console.log('═══════════════════════════════════════════════════════════════\n');
121
+
122
+ const factorialRows = db.prepare(`
123
+ SELECT profile_name, suggestions, dialogue_rounds,
124
+ score_relevance, score_specificity, score_pedagogical,
125
+ score_personalization, score_actionability, score_tone,
126
+ overall_score, scores_with_reasoning, scenario_name
127
+ FROM evaluation_results
128
+ WHERE run_id IN ('eval-2026-02-03-f5d4dd93', 'eval-2026-02-06-a933d745')
129
+ AND overall_score IS NOT NULL
130
+ AND judge_model LIKE '%claude%'
131
+ `).all();
132
+
133
+ console.log(`Factorial rows: ${factorialRows.length}\n`);
134
+
135
// Classify into 4 conditions (recognition × architecture), keyed by substrings
// of profile_name: 'recog' selects recognition, 'multi' selects multi-agent.
const conditions = {
  base_single: [], base_multi: [],
  recog_single: [], recog_multi: []
};

// Extra rubric dimensions read from the scores_with_reasoning JSON when present
const EXTENDED_DIMENSIONS = [
  'mutual_recognition', 'dialectical_responsiveness', 'memory_integration',
  'transformative_potential', 'tutor_adaptation', 'learner_growth',
  'productive_struggle', 'epistemic_honesty'
];

// Rows whose scores_with_reasoning could not be parsed; reported after the loop
// instead of being dropped silently.
let unparsedScoreRows = 0;

for (const row of factorialRows) {
  const isRecog = row.profile_name.includes('recog');
  const isMulti = row.profile_name.includes('multi');
  const key = `${isRecog ? 'recog' : 'base'}_${isMulti ? 'multi' : 'single'}`;

  // Extract message text from JSON suggestions; if the column is not a JSON
  // array of suggestion objects, fall back to the raw column text (best effort).
  let messageText = '';
  try {
    const suggestions = JSON.parse(row.suggestions);
    messageText = suggestions.map(s => [s.message, s.title, s.reason].filter(Boolean).join(' ')).join(' ');
  } catch { messageText = row.suggestions || ''; }

  // Dimension scores as array (for variance computation)
  const dimScores = [
    row.score_relevance, row.score_specificity, row.score_pedagogical,
    row.score_personalization, row.score_actionability, row.score_tone
  ].filter(x => x != null);

  // Extended dimensions from JSON if available
  const extDimScores = [...dimScores];
  if (row.scores_with_reasoning) {
    try {
      const parsed = JSON.parse(row.scores_with_reasoning);
      for (const dim of EXTENDED_DIMENSIONS) {
        if (parsed[dim]?.score != null) extDimScores.push(parsed[dim].score);
      }
    } catch {
      // Keep the row (base dimensions are still usable) but count the failure.
      unparsedScoreRows += 1;
    }
  }

  conditions[key].push({
    responseLength: messageText.length,
    wordCount: messageText.split(/\s+/).filter(w => w.length > 0).length,
    ttr: ttr(messageText),
    dialogueRounds: row.dialogue_rounds || 0,
    overallScore: row.overall_score,
    // Despite the property names, these hold standard deviations (std), not variances.
    dimScoreVariance: std(dimScores),
    extDimScoreVariance: std(extDimScores),
    scenario: row.scenario_name
  });
}

if (unparsedScoreRows > 0) {
  console.warn(`Warning: ${unparsedScoreRows} row(s) had unparseable scores_with_reasoning; extended dimensions were skipped for them`);
}
183
+
184
+ // ── Report: Response Length ──────────────────────────────────────────────
185
+
186
+ console.log('─── Response Length (chars) ─────────────────────────────────');
187
+ console.log('Condition | N | Mean | SD | CV');
188
+ console.log('───────────────────|──────|─────────|─────────|────────');
189
+ for (const [key, items] of Object.entries(conditions)) {
190
+ const lens = items.map(i => i.responseLength);
191
+ console.log(`${key.padEnd(19)}| ${items.length.toString().padStart(4)} | ${mean(lens).toFixed(1).padStart(7)} | ${std(lens).toFixed(1).padStart(7)} | ${cv(lens).toFixed(3).padStart(6)}`);
192
+ }
193
+
194
+ // Cross-condition comparisons
195
+ const allSingle = [...conditions.base_single, ...conditions.recog_single];
196
+ const allMulti = [...conditions.base_multi, ...conditions.recog_multi];
197
+ const allBase = [...conditions.base_single, ...conditions.base_multi];
198
+ const allRecog = [...conditions.recog_single, ...conditions.recog_multi];
199
+
200
+ console.log('\nLength CV by factor:');
201
+ console.log(` Single-agent: ${cv(allSingle.map(i => i.responseLength)).toFixed(3)}`);
202
+ console.log(` Multi-agent: ${cv(allMulti.map(i => i.responseLength)).toFixed(3)}`);
203
+ console.log(` Base: ${cv(allBase.map(i => i.responseLength)).toFixed(3)}`);
204
+ console.log(` Recognition: ${cv(allRecog.map(i => i.responseLength)).toFixed(3)}`);
205
+
206
+ // ── Report: Vocabulary Richness (Type-Token Ratio) ───────────────────────
207
+
208
+ console.log('\n─── Vocabulary Richness (Type-Token Ratio) ─────────────────');
209
+ console.log('Condition | N | Mean TTR | SD');
210
+ console.log('───────────────────|──────|──────────|────────');
211
+ for (const [key, items] of Object.entries(conditions)) {
212
+ const ttrs = items.map(i => i.ttr);
213
+ console.log(`${key.padEnd(19)}| ${items.length.toString().padStart(4)} | ${mean(ttrs).toFixed(4).padStart(8)} | ${std(ttrs).toFixed(4).padStart(6)}`);
214
+ }
215
+
216
+ // ── Report: Dimension Score Variance (Behavioral Range) ──────────────────
217
+
218
+ console.log('\n─── Dimension Score Variance (Behavioral Range Proxy) ──────');
219
+ console.log('Higher variance = more differentiated scoring across dimensions');
220
+ console.log('= tutor modulates behavior across pedagogical dimensions\n');
221
+ console.log('Condition | N | Mean σ(6-dim) | Mean σ(14-dim)');
222
+ console.log('───────────────────|──────|───────────────|───────────────');
223
+ for (const [key, items] of Object.entries(conditions)) {
224
+ const dim6 = items.map(i => i.dimScoreVariance);
225
+ const dim14 = items.map(i => i.extDimScoreVariance);
226
+ console.log(`${key.padEnd(19)}| ${items.length.toString().padStart(4)} | ${mean(dim6).toFixed(3).padStart(13)} | ${mean(dim14).toFixed(3).padStart(13)}`);
227
+ }
228
+
229
+ // ── Report: Within-Scenario Response Diversity ───────────────────────────
230
+
231
+ console.log('\n─── Within-Scenario Response Diversity ─────────────────────');
232
+ console.log('CV of overall_score within each (condition × scenario) cell');
233
+ console.log('Higher CV = more varied quality across attempts = modulation\n');
234
+
235
// Group overall scores by "<condition>|<scenario>" cell
const scenarioMap = {};
Object.entries(conditions).forEach(([conditionName, entries]) => {
  entries.forEach((entry) => {
    const cellKey = `${conditionName}|${entry.scenario}`;
    scenarioMap[cellKey] = scenarioMap[cellKey] || [];
    scenarioMap[cellKey].push(entry.overallScore);
  });
});

// Aggregate: one CV per cell, collected per condition.
// Cells with fewer than 3 scores register the condition but contribute no CV.
const conditionCVs = {};
for (const cellKey of Object.keys(scenarioMap)) {
  const cellScores = scenarioMap[cellKey];
  const [conditionName] = cellKey.split('|');
  conditionCVs[conditionName] = conditionCVs[conditionName] || [];
  if (cellScores.length < 3) continue;
  conditionCVs[conditionName].push(cv(cellScores));
}
251
+
252
+ console.log('Condition | Mean CV(score) | N cells');
253
+ console.log('───────────────────|────────────────|────────');
254
+ for (const [key, cvArr] of Object.entries(conditionCVs)) {
255
+ console.log(`${key.padEnd(19)}| ${mean(cvArr).toFixed(4).padStart(14)} | ${cvArr.length.toString().padStart(6)}`);
256
+ }
257
+
258
+ // ── Report: Ego-Superego Negotiation Rounds ──────────────────────────────
259
+
260
+ console.log('\n─── Ego-Superego Negotiation Rounds (Multi-Agent Only) ─────');
261
// Per-condition dialogue round counts for the two multi-agent conditions
const multiRounds = {};
for (const key of ['base_multi', 'recog_multi']) {
  const rounds = conditions[key].map((item) => item.dialogueRounds);
  multiRounds[key] = rounds;
  // Math.min()/Math.max() with no arguments are Infinity/-Infinity, so a
  // condition without rows gets an explicit placeholder instead.
  const range = rounds.length > 0
    ? `[${Math.min(...rounds)}, ${Math.max(...rounds)}]`
    : '[n/a]';
  console.log(`${key.padEnd(19)}: mean=${mean(rounds).toFixed(2)}, sd=${std(rounds).toFixed(2)}, range=${range}`);
}
267
+
268
+ // ── Summary Statistics ───────────────────────────────────────────────────
269
+
270
+ console.log('\n─── MODULATION SUMMARY (2×2: Recognition × Architecture) ───');
271
+
272
+ // Response length 2×2
273
+ const lenData = {
274
+ a0b0: conditions.base_single.map(i => i.responseLength),
275
+ a0b1: conditions.base_multi.map(i => i.responseLength),
276
+ a1b0: conditions.recog_single.map(i => i.responseLength),
277
+ a1b1: conditions.recog_multi.map(i => i.responseLength)
278
+ };
279
+ const lenAnova = fTest2x2(lenData);
280
+ console.log(`\nResponse Length:`);
281
+ console.log(` Recognition effect: ${lenAnova.mainA.delta > 0 ? '+' : ''}${lenAnova.mainA.delta.toFixed(1)} chars, F=${lenAnova.mainA.F.toFixed(2)}`);
282
+ console.log(` Architecture effect: ${lenAnova.mainB.delta > 0 ? '+' : ''}${lenAnova.mainB.delta.toFixed(1)} chars, F=${lenAnova.mainB.F.toFixed(2)}`);
283
+ console.log(` Interaction: ${lenAnova.interaction > 0 ? '+' : ''}${lenAnova.interaction.toFixed(1)} chars`);
284
+
285
+ // Dimension variance 2×2
286
+ const varData = {
287
+ a0b0: conditions.base_single.map(i => i.extDimScoreVariance),
288
+ a0b1: conditions.base_multi.map(i => i.extDimScoreVariance),
289
+ a1b0: conditions.recog_single.map(i => i.extDimScoreVariance),
290
+ a1b1: conditions.recog_multi.map(i => i.extDimScoreVariance)
291
+ };
292
+ const varAnova = fTest2x2(varData);
293
+ console.log(`\nDimension Score Variance (14-dim):`);
294
+ console.log(` Recognition effect: ${varAnova.mainA.delta > 0 ? '+' : ''}${varAnova.mainA.delta.toFixed(3)}, F=${varAnova.mainA.F.toFixed(2)}`);
295
+ console.log(` Architecture effect: ${varAnova.mainB.delta > 0 ? '+' : ''}${varAnova.mainB.delta.toFixed(3)}, F=${varAnova.mainB.F.toFixed(2)}`);
296
+ console.log(` Interaction: ${varAnova.interaction > 0 ? '+' : ''}${varAnova.interaction.toFixed(3)}`);
297
+
298
+ // TTR 2×2
299
+ const ttrData = {
300
+ a0b0: conditions.base_single.map(i => i.ttr),
301
+ a0b1: conditions.base_multi.map(i => i.ttr),
302
+ a1b0: conditions.recog_single.map(i => i.ttr),
303
+ a1b1: conditions.recog_multi.map(i => i.ttr)
304
+ };
305
+ const ttrAnova = fTest2x2(ttrData);
306
+ console.log(`\nVocabulary Richness (TTR):`);
307
+ console.log(` Recognition effect: ${ttrAnova.mainA.delta > 0 ? '+' : ''}${ttrAnova.mainA.delta.toFixed(4)}, F=${ttrAnova.mainA.F.toFixed(2)}`);
308
+ console.log(` Architecture effect: ${ttrAnova.mainB.delta > 0 ? '+' : ''}${ttrAnova.mainB.delta.toFixed(4)}, F=${ttrAnova.mainB.F.toFixed(2)}`);
309
+ console.log(` Interaction: ${ttrAnova.interaction > 0 ? '+' : ''}${ttrAnova.interaction.toFixed(4)}`);
310
+
311
+ // Cohen's d for key comparisons
312
+ console.log('\n─── Effect Sizes (Cohen\'s d) ────────────────────────────────');
313
+ console.log(`Response length: recognition d=${cohensD(allRecog.map(i => i.responseLength), allBase.map(i => i.responseLength)).toFixed(2)}`);
314
+ console.log(` architecture d=${cohensD(allMulti.map(i => i.responseLength), allSingle.map(i => i.responseLength)).toFixed(2)}`);
315
+ console.log(`TTR: recognition d=${cohensD(allRecog.map(i => i.ttr), allBase.map(i => i.ttr)).toFixed(2)}`);
316
+ console.log(` architecture d=${cohensD(allMulti.map(i => i.ttr), allSingle.map(i => i.ttr)).toFixed(2)}`);
317
+ console.log(`Dim variance: recognition d=${cohensD(allRecog.map(i => i.extDimScoreVariance), allBase.map(i => i.extDimScoreVariance)).toFixed(2)}`);
318
+ console.log(` architecture d=${cohensD(allMulti.map(i => i.extDimScoreVariance), allSingle.map(i => i.extDimScoreVariance)).toFixed(2)}`);
319
+
320
+
321
+ // ═══════════════════════════════════════════════════════════════════════════
322
+ // (b) SYNTHETIC LEARNING OUTCOME INDEX — N=118 bilateral
323
+ // ═══════════════════════════════════════════════════════════════════════════
324
+
325
+ console.log('\n\n═══════════════════════════════════════════════════════════════');
326
+ console.log(' SYNTHETIC LEARNING OUTCOME ANALYSIS');
327
+ console.log(' N=118 bilateral run, 3 multi-turn scenarios, Opus judge');
328
+ console.log('═══════════════════════════════════════════════════════════════\n');
329
+
330
+ const bilateralRows = db.prepare(`
331
+ SELECT profile_name, learner_scores, learner_overall_score, scenario_name
332
+ FROM evaluation_results
333
+ WHERE run_id = 'eval-2026-02-07-b6d75e87'
334
+ AND overall_score IS NOT NULL
335
+ AND learner_scores IS NOT NULL
336
+ `).all();
337
+
338
+ console.log(`Bilateral rows with learner scores: ${bilateralRows.length}\n`);
339
+
340
+ // Parse learner scores and compute composite learning outcome
341
// Per-condition learner records, classified by substrings of profile_name:
// 'recog' selects recognition, 'multi' selects multi-agent.
const learnerConditions = {
  base_single: [], base_multi: [],
  recog_single: [], recog_multi: []
};

for (const row of bilateralRows) {
  const isRecog = row.profile_name.includes('recog');
  const isMulti = row.profile_name.includes('multi');
  const key = `${isRecog ? 'recog' : 'base'}_${isMulti ? 'multi' : 'single'}`;

  let learnerData;
  try { learnerData = JSON.parse(row.learner_scores); } catch { continue; }
  // Valid JSON such as `null` or a bare number holds no turns to analyse.
  if (learnerData === null || typeof learnerData !== 'object') continue;

  // Extract per-turn scores for learning outcome dimensions
  const turnScores = [];
  for (const turnKey of Object.keys(learnerData).sort()) {
    const turn = learnerData[turnKey];
    if (!turn?.scores) continue;
    const s = turn.scores;

    // NOTE(review): a missing dimension is scored as 0, which lies below the
    // 1-5 scale and pulls the composite under 0 — confirm this is intended.
    const revisionScore = s.revision_signals?.score || 0;
    const questionScore = s.question_quality?.score || 0;
    const conceptualScore = s.conceptual_engagement?.score || 0;

    // Composite: weighted average of learning-relevant dimensions
    // Maps 1-5 → 0-100 for comparability
    const composite = ((revisionScore * 0.35 + questionScore * 0.30 + conceptualScore * 0.35) - 1) / 4 * 100;

    turnScores.push({
      // `??` keeps a legitimate turnIndex of 0; the key is only a fallback.
      turnIndex: turn.turnIndex ?? Number.parseInt(turnKey, 10),
      revision: revisionScore,
      question: questionScore,
      conceptual: conceptualScore,
      composite,
      overallLearner: turn.overallScore || 0
    });
  }

  if (turnScores.length === 0) continue;

  // The key sort above is lexicographic ("10" < "2"). When every turn has a
  // numeric index, order by it so "first" and "last" really are the first and
  // last turns. Otherwise the key order is kept.
  if (turnScores.every((t) => Number.isFinite(Number(t.turnIndex)))) {
    turnScores.sort((a, b) => Number(a.turnIndex) - Number(b.turnIndex));
  }

  const firstTurn = turnScores[0];
  const lastTurn = turnScores[turnScores.length - 1];

  // Learning arc: last turn composite minus first turn composite
  const learningArc = turnScores.length > 1
    ? lastTurn.composite - firstTurn.composite
    : 0;

  // Average composite across turns
  const avgComposite = mean(turnScores.map(t => t.composite));

  // Final turn composite (strongest signal of learning outcome)
  const finalComposite = lastTurn.composite;

  // Revision signal progression: does revision_signals increase?
  const revisionProgression = turnScores.length > 1
    ? lastTurn.revision - firstTurn.revision
    : 0;

  learnerConditions[key].push({
    avgComposite,
    finalComposite,
    learningArc,
    revisionProgression,
    overallLearner: row.learner_overall_score,
    turnCount: turnScores.length,
    turnScores,
    scenario: row.scenario_name
  });
}
408
+
409
+ // ── Report: Composite Learning Outcome ───────────────────────────────────
410
+
411
+ console.log('─── Synthetic Learning Outcome (Composite Index, 0–100) ────');
412
+ console.log('Weights: revision_signals (35%) + question_quality (30%) + conceptual_engagement (35%)\n');
413
+ console.log('Condition | N | Avg Composite | Final Turn | Learning Arc');
414
+ console.log('───────────────────|──────|───────────────|────────────|─────────────');
415
+ for (const [key, items] of Object.entries(learnerConditions)) {
416
+ const avgC = items.map(i => i.avgComposite);
417
+ const finalC = items.map(i => i.finalComposite);
418
+ const arc = items.map(i => i.learningArc);
419
+ console.log(`${key.padEnd(19)}| ${items.length.toString().padStart(4)} | ${mean(avgC).toFixed(1).padStart(13)} | ${mean(finalC).toFixed(1).padStart(10)} | ${mean(arc).toFixed(1).padStart(11)}`);
420
+ }
421
+
422
+ // ── 2×2 ANOVA on Composite Learning Outcome ─────────────────────────────
423
+
424
+ console.log('\n─── 2×2 ANOVA: Synthetic Learning Outcome ──────────────────');
425
+ const sloData = {
426
+ a0b0: learnerConditions.base_single.map(i => i.avgComposite),
427
+ a0b1: learnerConditions.base_multi.map(i => i.avgComposite),
428
+ a1b0: learnerConditions.recog_single.map(i => i.avgComposite),
429
+ a1b1: learnerConditions.recog_multi.map(i => i.avgComposite)
430
+ };
431
+ const sloAnova = fTest2x2(sloData);
432
+ console.log(`Recognition (A): delta=${sloAnova.mainA.delta > 0 ? '+' : ''}${sloAnova.mainA.delta.toFixed(1)}, F=${sloAnova.mainA.F.toFixed(2)}, marginals=[${sloAnova.mainA.marginals.map(m => m.toFixed(1)).join(', ')}]`);
433
+ console.log(`Architecture (B): delta=${sloAnova.mainB.delta > 0 ? '+' : ''}${sloAnova.mainB.delta.toFixed(1)}, F=${sloAnova.mainB.F.toFixed(2)}, marginals=[${sloAnova.mainB.marginals.map(m => m.toFixed(1)).join(', ')}]`);
434
+ console.log(`A×B Interaction: ${sloAnova.interaction > 0 ? '+' : ''}${sloAnova.interaction.toFixed(1)}`);
435
+ console.log(`Cell means: base_single=${sloAnova.cellMeans.a0b0.toFixed(1)}, base_multi=${sloAnova.cellMeans.a0b1.toFixed(1)}, recog_single=${sloAnova.cellMeans.a1b0.toFixed(1)}, recog_multi=${sloAnova.cellMeans.a1b1.toFixed(1)}`);
436
+
437
+ // ── Learning Arc 2×2 ────────────────────────────────────────────────────
438
+
439
+ console.log('\n─── 2×2 ANOVA: Learning Arc (Final − First Turn) ─────────');
440
+ const arcData = {
441
+ a0b0: learnerConditions.base_single.map(i => i.learningArc),
442
+ a0b1: learnerConditions.base_multi.map(i => i.learningArc),
443
+ a1b0: learnerConditions.recog_single.map(i => i.learningArc),
444
+ a1b1: learnerConditions.recog_multi.map(i => i.learningArc)
445
+ };
446
+ const arcAnova = fTest2x2(arcData);
447
+ console.log(`Recognition (A): delta=${arcAnova.mainA.delta > 0 ? '+' : ''}${arcAnova.mainA.delta.toFixed(1)}, F=${arcAnova.mainA.F.toFixed(2)}`);
448
+ console.log(`Architecture (B): delta=${arcAnova.mainB.delta > 0 ? '+' : ''}${arcAnova.mainB.delta.toFixed(1)}, F=${arcAnova.mainB.F.toFixed(2)}`);
449
+ console.log(`A×B Interaction: ${arcAnova.interaction > 0 ? '+' : ''}${arcAnova.interaction.toFixed(1)}`);
450
+ console.log(`Cell means: base_single=${arcAnova.cellMeans.a0b0.toFixed(1)}, base_multi=${arcAnova.cellMeans.a0b1.toFixed(1)}, recog_single=${arcAnova.cellMeans.a1b0.toFixed(1)}, recog_multi=${arcAnova.cellMeans.a1b1.toFixed(1)}`);
451
+
452
+ // ── Revision Signal Progression ──────────────────────────────────────────
453
+
454
+ console.log('\n─── Revision Signal Progression (Turn N − Turn 1) ─────────');
455
+ for (const [key, items] of Object.entries(learnerConditions)) {
456
+ const prog = items.map(i => i.revisionProgression);
457
+ console.log(`${key.padEnd(19)}: mean=${mean(prog).toFixed(2)}, sd=${std(prog).toFixed(2)}, d=${cohensD(prog, learnerConditions.base_single.map(i => i.revisionProgression)).toFixed(2)} vs base_single`);
458
+ }
459
+
460
+ // ── Effect Sizes ─────────────────────────────────────────────────────────
461
+
462
+ const allBaseLearner = [...learnerConditions.base_single, ...learnerConditions.base_multi];
463
+ const allRecogLearner = [...learnerConditions.recog_single, ...learnerConditions.recog_multi];
464
+ const allSingleLearner = [...learnerConditions.base_single, ...learnerConditions.recog_single];
465
+ const allMultiLearner = [...learnerConditions.base_multi, ...learnerConditions.recog_multi];
466
+
467
+ console.log('\n─── Effect Sizes (Cohen\'s d) ────────────────────────────────');
468
+ console.log(`Avg Composite: recognition d=${cohensD(allRecogLearner.map(i => i.avgComposite), allBaseLearner.map(i => i.avgComposite)).toFixed(2)}`);
469
+ console.log(` architecture d=${cohensD(allMultiLearner.map(i => i.avgComposite), allSingleLearner.map(i => i.avgComposite)).toFixed(2)}`);
470
+ console.log(`Learning Arc: recognition d=${cohensD(allRecogLearner.map(i => i.learningArc), allBaseLearner.map(i => i.learningArc)).toFixed(2)}`);
471
+ console.log(` architecture d=${cohensD(allMultiLearner.map(i => i.learningArc), allSingleLearner.map(i => i.learningArc)).toFixed(2)}`);
472
+ console.log(`Final Turn: recognition d=${cohensD(allRecogLearner.map(i => i.finalComposite), allBaseLearner.map(i => i.finalComposite)).toFixed(2)}`);
473
+ console.log(` architecture d=${cohensD(allMultiLearner.map(i => i.finalComposite), allSingleLearner.map(i => i.finalComposite)).toFixed(2)}`);
474
+
475
+
476
+ // ═══════════════════════════════════════════════════════════════════════════
477
+ // (c) COMBINED INTERPRETATION
478
+ // ═══════════════════════════════════════════════════════════════════════════
479
+
480
+ console.log('\n\n═══════════════════════════════════════════════════════════════');
481
+ console.log(' COMBINED INTERPRETATION');
482
+ console.log('═══════════════════════════════════════════════════════════════\n');
483
+
484
+ console.log('MODULATION (Does internal ego-superego tension produce more varied behavior?):');
485
+ console.log(' Metrics: response length CV, vocabulary richness (TTR), dimension score variance,');
486
+ console.log(' within-scenario score variability');
487
+ console.log(' Key comparison: multi-agent vs single-agent (Factor B)');
488
+ console.log(' Secondary: recognition vs base (Factor A) — recognition may induce more');
489
+ console.log(' context-sensitive modulation even without multi-agent architecture\n');
490
+
491
+ console.log('SYNTHETIC LEARNING OUTCOMES (Does the learner show evidence of conceptual growth?):');
492
+ console.log(' Composite index from: revision_signals (35%), question_quality (30%),');
493
+ console.log(' conceptual_engagement (35%)');
494
+ console.log(' Key metric: Learning Arc (final turn − first turn composite)');
495
+ console.log(' Operationalizes Drama Machine "transformation" claim empirically\n');
496
+
497
+ db.close();
498
+ console.log('Done.');
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env node
2
+ import Database from 'better-sqlite3';
3
// Open the evaluation store relative to the current working directory.
// `fileMustExist` makes a wrong working directory fail at open time instead of
// letting the driver create a new, empty database file that the query below
// could not use.
const db = new Database('data/evaluations.db', { fileMustExist: true });

// Evaluation run to analyse. Defaults to the run this script was written for;
// pass a different run id as the first CLI argument to reuse the analysis.
const RUN_ID = process.argv[2] ?? 'eval-2026-02-17-25aaae85';

// 1. Extended dimension analysis by cell
// Load every result of the run that has an overall score. The run id is bound
// as a statement parameter rather than concatenated into the SQL text.
const rows = db
  .prepare(
    'SELECT profile_name, scenario_name, scores_with_reasoning, overall_score FROM evaluation_results WHERE run_id = ? AND overall_score IS NOT NULL'
  )
  .all(RUN_ID);
11
+
12
// Per-dimension score lists bucketed by cell:
//   dims[dimension] = { descriptive: number[], prescriptive: number[], adversary: number[] }
const dims = {};
for (const row of rows) {
  // The query only filters on overall_score, so this column may still be NULL;
  // JSON.parse(null) yields null, which Object.entries() would reject.
  const scored = JSON.parse(row.scores_with_reasoning) ?? {};

  // The cell is derived from the profile name: names containing '66' are
  // descriptive, names containing '67' are prescriptive, the rest adversary.
  let cell = 'adversary';
  if (row.profile_name.includes('66')) {
    cell = 'descriptive';
  } else if (row.profile_name.includes('67')) {
    cell = 'prescriptive';
  }

  for (const [dim, val] of Object.entries(scored)) {
    // Skip entries without a usable numeric score: a null would be summed as 0
    // and an undefined would turn the whole mean into NaN.
    if (!Number.isFinite(val?.score)) continue;
    if (!(dim in dims)) dims[dim] = { descriptive: [], prescriptive: [], adversary: [] };
    dims[dim][cell].push(val.score);
  }
}
21
+
22
/**
 * Arithmetic mean of `arr`, formatted for table output.
 *
 * @param {number[]} arr - values to average
 * @returns {string} the mean with two decimals, or 'n/a' when `arr` is empty
 */
const avg = (arr) => {
  if (!arr.length) {
    return 'n/a';
  }
  let total = 0;
  for (const value of arr) {
    total = total + value;
  }
  return (total / arr.length).toFixed(2);
};
23
+
24
console.log('=== Extended Dimension Means (sorted by pooled, ascending) ===\n');
console.log('Dimension'.padEnd(30) + '| Descript | Prescrip | Adversary | Pooled');
console.log('-'.repeat(30) + '|----------|----------|-----------|-------');

// All scores of one dimension, concatenated across the three cells.
const pooledScores = (cells) => [...cells.descriptive, ...cells.prescriptive, ...cells.adversary];

// Unrounded pooled mean, used only as the sort key.
const pooledMean = (cells) => {
  const pool = pooledScores(cells);
  return pool.reduce((sum, score) => sum + score, 0) / pool.length;
};

// Dimensions ordered from lowest to highest pooled mean.
const sorted = Object.entries(dims).sort(
  ([, left], [, right]) => pooledMean(left) - pooledMean(right)
);

// One table row per dimension: per-cell means followed by the pooled mean.
for (const [dim, cells] of sorted) {
  const columns = [
    dim.padEnd(30),
    avg(cells.descriptive).padEnd(9),
    avg(cells.prescriptive).padEnd(9),
    avg(cells.adversary).padEnd(10),
    avg(pooledScores(cells)),
  ];
  console.log(columns.join('| '));
}
44
+
45
// 2. By scenario
console.log('\n\n=== Dimension Means by Scenario (pooled across cells) ===\n');

// Score lists keyed by '<dimension>|<scenario>', where scenario is
// 'misconception' when the scenario name contains 'Misconception' and
// 'mutual_transform' otherwise.
const byScenario = {};
for (const row of rows) {
  const scen = row.scenario_name.includes('Misconception') ? 'misconception' : 'mutual_transform';
  // A NULL column parses to null; treat it as a row without dimension scores.
  const scored = JSON.parse(row.scores_with_reasoning) ?? {};

  for (const [dim, val] of Object.entries(scored)) {
    // Ignore missing or non-numeric scores so they cannot distort the means.
    if (!Number.isFinite(val?.score)) continue;
    const key = dim + '|' + scen;
    if (!(key in byScenario)) byScenario[key] = [];
    byScenario[key].push(val.score);
  }
}
57
+
58
// Distinct dimension names, recovered from the '<dimension>|<scenario>' keys.
const allDims = [...new Set(Object.keys(byScenario).map(k => k.split('|')[0]))];
console.log('Dimension'.padEnd(30) + '| Misconc | Mutual T | Delta');
console.log('-'.repeat(30) + '|----------|----------|------');

/**
 * Misconception mean minus mutual-transform mean for one dimension, computed
 * from the two-decimal values shown in the table so the delta column always
 * agrees with its neighbours.
 *
 * @param {string} dim - dimension name
 * @returns {number|null} the delta, or null when either scenario has no usable
 *   scores (avg() then yields a non-numeric string)
 */
const scenarioDelta = (dim) => {
  const misconception = byScenario[dim + '|misconception'] || [];
  const mutual = byScenario[dim + '|mutual_transform'] || [];
  const delta = Number(avg(misconception)) - Number(avg(mutual));
  return Number.isFinite(delta) ? delta : null;
};

allDims
  .sort((a, b) => {
    const deltaA = scenarioDelta(a);
    const deltaB = scenarioDelta(b);
    // A comparator must never return NaN; dimensions without a computable
    // delta are placed after all comparable ones.
    if (deltaA === null && deltaB === null) return 0;
    if (deltaA === null) return 1;
    if (deltaB === null) return -1;
    return deltaA - deltaB;
  })
  .forEach(dim => {
    const m = byScenario[dim + '|misconception'] || [];
    const t = byScenario[dim + '|mutual_transform'] || [];
    const delta = scenarioDelta(dim);
    const deltaText = delta === null ? 'n/a' : delta.toFixed(2);
    console.log(dim.padEnd(30) + '| ' + avg(m).padEnd(9) + '| ' + avg(t).padEnd(9) + '| ' + deltaText);
  });
71
+
72
// 3. Qualitative reasoning extraction: most common failure patterns
console.log('\n\n=== Failure Pattern Analysis (scores <= 2) ===\n');

// failPatterns[dimension] = { count, reasons[] } for every dimension score <= 2.
const failPatterns = {};
for (const row of rows) {
  // A NULL column parses to null; treat it as a row without dimension scores.
  const scored = JSON.parse(row.scores_with_reasoning) ?? {};

  for (const [dim, val] of Object.entries(scored)) {
    // Require a real number first: `null <= 2` is true in JavaScript, so a
    // missing score would otherwise be counted as a failure.
    if (!Number.isFinite(val?.score) || val.score > 2) continue;

    if (!(dim in failPatterns)) failPatterns[dim] = { count: 0, reasons: [] };
    failPatterns[dim].count++;
    // Always store a string so later text processing cannot hit undefined.
    failPatterns[dim].reasons.push(typeof val.reasoning === 'string' ? val.reasoning : '');
  }
}
86
+
87
// Label → substrings that trigger it (matched against the lower-cased judge
// reasoning). A reasoning text may trigger several labels, each at most once.
const FAILURE_PHRASES = [
  ['ignores context/history', ['ignor']],
  ['resets to turn 0', ['reset', 'turn 0', 'turn-0']],
  ['fails to adapt', ['adapt']],
  ['fabricates/invents', ['fabricat', 'invent']],
  ['repeats same content', ['repeat']],
  ['ignores learner rejection', ['reject', 'denied', 'disavow']],
];

// Report dimensions from most to least frequently failed.
Object.entries(failPatterns)
  .sort(([, a], [, b]) => b.count - a.count)
  .forEach(([dim, data]) => {
    // Share is relative to the number of loaded dialogues (one row each),
    // not a fixed run size, so it stays correct for any run.
    const share = (data.count / rows.length * 100).toFixed(0);
    console.log(`${dim}: ${data.count} failures (${share}% of dialogues)`);

    // Extract key phrases
    const phrases = {};
    data.reasons.forEach(reason => {
      // Reasoning may be absent on a score entry; treat it as empty text.
      const lower = typeof reason === 'string' ? reason.toLowerCase() : '';
      for (const [label, needles] of FAILURE_PHRASES) {
        if (needles.some(needle => lower.includes(needle))) {
          phrases[label] = (phrases[label] || 0) + 1;
        }
      }
    });

    Object.entries(phrases)
      .sort((a, b) => b[1] - a[1])
      .forEach(([p, c]) => {
        console.log(` - ${p}: ${c}/${data.count}`);
      });

    // First recorded reasoning, truncated to 100 characters.
    console.log(' Sample: "' + String(data.reasons[0] ?? '').substring(0, 100) + '"');
    console.log('');
  });
108
+
109
// 4. High vs low score qualitative comparison
console.log('\n=== High vs Low Score Comparison ===\n');

// Best-first and worst-first selections; filter() returns fresh arrays, so the
// in-place sort does not reorder `rows`.
const highRows = rows.filter(r => r.overall_score >= 70).sort((a, b) => b.overall_score - a.overall_score);
const lowRows = rows.filter(r => r.overall_score <= 35).sort((a, b) => a.overall_score - b.overall_score);

// Printed label → key inside the parsed scores_with_reasoning object.
const EXAMPLE_DIMENSIONS = [
  ['tutor_adaptation', 'tutor_adaptation'],
  ['mutual_recognition', 'mutual_recognition'],
  ['dialectical', 'dialectical_responsiveness'],
];

/**
 * Print a summary line for a group of rows, then the score and reasoning of
 * three selected dimensions for the group's first three rows.
 *
 * @param {string} heading - text placed before ": N=…, mean=…"
 * @param {object[]} group - rows, already in display order
 */
const printScoreExamples = (heading, group) => {
  console.log(`${heading}: N=${group.length}, mean=${avg(group.map(r => r.overall_score))}`);
  group.slice(0, 3).forEach(r => {
    // A NULL column parses to null; fall back to an empty object so the
    // optional lookups below print `undefined` instead of throwing.
    const s = JSON.parse(r.scores_with_reasoning) ?? {};
    console.log(` Score ${r.overall_score.toFixed(1)}:`);
    for (const [label, key] of EXAMPLE_DIMENSIONS) {
      console.log(` ${label}: ${s[key]?.score} — "${s[key]?.reasoning}"`);
    }
  });
};

printScoreExamples('High scorers (>=70)', highRows);
printScoreExamples('\nLow scorers (<=35)', lowRows);
131
+
132
// 5. Bimodality check
console.log('\n\n=== Score Distribution (bins of 10) ===\n');

// bins[lowerBound] = number of rows whose overall score falls in that decade.
const bins = {};
for (const { overall_score: score } of rows) {
  const lowerBound = Math.floor(score / 10) * 10;
  bins[lowerBound] = (bins[lowerBound] || 0) + 1;
}

// Text histogram, lowest bin first: one '#' per row plus the count.
const binStarts = Object.keys(bins).sort((a, b) => a - b);
for (const start of binStarts) {
  const count = bins[start];
  const range = `${String(start).padStart(3)}-${String(Number(start) + 9).padStart(3)}`;
  console.log(`${range}: ${'#'.repeat(count)} (${count})`);
}

db.close();