@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,688 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Qualitative Analysis of Evaluation Transcripts
5
+ *
6
+ * Extracts and analyzes suggestion text from the evaluation database:
7
+ * 1. High-contrast transcript pairs (base vs recognition, same scenario)
8
+ * 2. Word frequency analysis (unigrams + bigrams, differential)
9
+ * 3. Lexical diversity metrics (TTR, word/sentence length, vocabulary)
10
+ * 4. Thematic coding with chi-square significance tests
11
+ *
12
+ * Outputs:
13
+ * exports/qualitative-analysis.json — structured data
14
+ * exports/qualitative-analysis.md — paper-ready summary
15
+ */
16
+
17
+ import Database from 'better-sqlite3';
18
+ import path from 'path';
19
+ import fs from 'fs';
20
+
21
// ── Stopwords ────────────────────────────────────────────────────────────

// Function words and high-frequency filler terms excluded by
// tokenizeFiltered() before the unigram/bigram frequency analysis.
// Includes contractions in both intact form ("don't") and bare-fragment
// form ('don', 'll', 've') — presumably to cover tokens produced when
// apostrophes are stripped upstream; TODO confirm against callers.
// A few words appear twice ('been', 'being', 'really', 'would', 'can');
// harmless, since Set construction deduplicates.
const STOPWORDS = new Set([
  'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
  'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been',
  'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can',
  'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does',
  'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for',
  'from', 'further', 'get', 'got', 'had', 'hadn\'t', 'has', 'hasn\'t',
  'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her',
  'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how',
  'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into',
  'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'let', 'let\'s',
  'like', 'make', 'me', 'might', 'more', 'most', 'mustn\'t', 'my',
  'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
  'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
  'really', 'right', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll',
  'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'take', 'than',
  'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
  'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re',
  'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
  'up', 'us', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re',
  'we\'ve', 'well', 'were', 'weren\'t', 'what', 'what\'s', 'when',
  'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s',
  'whom', 'why', 'why\'s', 'will', 'with', 'won\'t', 'would', 'wouldn\'t',
  'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours',
  'yourself', 'yourselves', 'also', 'been', 'being', 'come', 'even',
  'first', 'going', 'good', 'know', 'look', 'much', 'need', 'new', 'now',
  'one', 'people', 'really', 'see', 'think', 'thing', 'time', 'two',
  'use', 'want', 'way', 'work', 'would', 'year', 'back', 'long', 'say',
  'still', 'tell', 'try', 'give', 'go', 'help', 'keep', 'many',
  'may', 'put', 'seem', 'show', 'start', 'turn', 'big', 'end', 'set',
  'll', 've', 're', 's', 't', 'd', 'don', 'isn', 'doesn', 'didn',
  'won', 'can', 'couldn', 'shouldn', 'wasn', 'weren', 'hasn', 'haven',
  'hadn', 'aren', 'mustn', 'shan', 'ain',
]);
57
+
58
// ── Thematic coding categories ───────────────────────────────────────────
// Based on patterns from dialogueTraceAnalyzer.js and turnComparisonAnalyzer.js
//
// Each category: { label, description, patterns } where `patterns` are
// case-insensitive global regexes. The /g flag is needed so
// String.match() counts every occurrence in the corpus.
// NOTE(review): /g regexes are stateful — RegExp.prototype.test() and
// .exec() advance `lastIndex` between calls. Any caller using
// pattern.test() on these shared objects must reset `lastIndex` first,
// or matches will be silently skipped on subsequent strings.

const THEMATIC_CATEGORIES = {
  // Second-person references to the learner's own contributions.
  engagement: {
    label: 'Engagement markers',
    description: 'Second-person engagement with learner contributions',
    patterns: [
      /your insight/gi,
      /building on your/gi,
      /your question/gi,
      /your point/gi,
      /your observation/gi,
      /your analysis/gi,
      /your argument/gi,
      /your critique/gi,
      /you've (raised|identified|highlighted|noticed|pointed out)/gi,
      /you're (asking|raising|pushing|exploring|getting at)/gi,
    ],
  },
  // Tutor signalling that the learner changed the tutor's own view.
  transformation: {
    label: 'Transformation language',
    description: 'Markers of mutual change or perspective shift',
    patterns: [
      /reconsidering/gi,
      /that changes (how I|my)/gi,
      /I hadn't (thought|considered)/gi,
      /revising (my|the)/gi,
      /let me (revise|adjust|rethink)/gi,
      /you've (helped|pushed|made) me/gi,
      /your .{1,20} (complicates|enriches|changes)/gi,
      /shifts? (my|the|our) (understanding|framing|approach)/gi,
    ],
  },
  // Framing confusion/difficulty as productive rather than a failure.
  struggle_honoring: {
    label: 'Struggle-honoring',
    description: 'Acknowledging productive confusion or difficulty',
    patterns: [
      /wrestling with/gi,
      /productive confusion/gi,
      /working through/gi,
      /grappling with/gi,
      /sitting with (the|this)/gi,
      /tension (between|here|you)/gi,
      /difficulty (is|here)/gi,
      /struggle (with|is|here)/gi,
      /not (easy|simple|straightforward)/gi,
    ],
  },
  // Treating the learner's own thinking as the object of discussion.
  learner_as_subject: {
    label: 'Learner-as-subject framing',
    description: 'Treating learner as autonomous intellectual agent',
    patterns: [
      /your interpretation/gi,
      /your analysis/gi,
      /your understanding/gi,
      /you're grappling with/gi,
      /your perspective/gi,
      /your framework/gi,
      /your reading/gi,
      /what you're (doing|building|developing|constructing)/gi,
      /your (intellectual|philosophical|analytical)/gi,
    ],
  },
  // Top-down, expert-to-novice instruction (contrast condition).
  directive: {
    label: 'Directive framing',
    description: 'Expert-to-novice instructional markers',
    patterns: [
      /you should/gi,
      /you need to/gi,
      /you must/gi,
      /the correct (answer|approach|way)/gi,
      /the answer is/gi,
      /let me explain/gi,
      /here's what/gi,
      /make sure (to|you)/gi,
      /first,? you/gi,
    ],
  },
  // Boilerplate pedagogy-speak with no learner-specific content.
  generic: {
    label: 'Generic/placeholder',
    description: 'Vague pedagogical language without specificity',
    patterns: [
      /foundational/gi,
      /key concepts/gi,
      /learning objectives/gi,
      /knowledge base/gi,
      /solid foundation/gi,
      /core concepts/gi,
      /build (a|your) (solid|strong)/gi,
      /comprehensive (understanding|overview|review)/gi,
    ],
  },
};
152
+
153
+ // ── Helpers ──────────────────────────────────────────────────────────────
154
+
155
/**
 * Lowercase `text`, strip everything except letters, apostrophes,
 * hyphens and whitespace, and split into words of length > 1.
 * @param {string} text - raw text to tokenize
 * @returns {string[]} lowercase word tokens (apostrophes/hyphens kept)
 */
function tokenize(text) {
  const cleaned = text.toLowerCase().replace(/[^a-z'\s-]/g, ' ');
  const words = [];
  for (const candidate of cleaned.split(/\s+/)) {
    // Single characters (and the empty fragments split can produce)
    // are dropped.
    if (candidate.length > 1) {
      words.push(candidate);
    }
  }
  return words;
}
162
+
163
/**
 * Tokenize `text` and drop stopwords.
 * @param {string} text - raw text
 * @returns {string[]} content-word tokens (STOPWORDS removed)
 */
function tokenizeFiltered(text) {
  const kept = [];
  for (const token of tokenize(text)) {
    if (!STOPWORDS.has(token)) {
      kept.push(token);
    }
  }
  return kept;
}
166
+
167
/**
 * Count sentences by splitting on runs of terminal punctuation.
 * Always reports at least 1 so it is safe as a division denominator.
 * @param {string} text
 * @returns {number} sentence count, minimum 1
 */
function countSentences(text) {
  let count = 0;
  for (const fragment of text.split(/[.!?]+/)) {
    if (fragment.trim().length > 0) {
      count += 1;
    }
  }
  return count > 0 ? count : 1;
}
171
+
172
/**
 * Build the list of adjacent token pairs ("word1 word2").
 * @param {string[]} tokens - ordered token list
 * @returns {string[]} tokens.length - 1 bigram strings (empty for 0/1 tokens)
 */
function getBigrams(tokens) {
  // slice(0, -1) yields one entry per adjacent pair; empty for <2 tokens.
  return tokens.slice(0, -1).map((tok, i) => `${tok} ${tokens[i + 1]}`);
}
179
+
180
/**
 * Count occurrences of each item.
 *
 * Uses a null-prototype object: with a plain `{}`, corpus words that
 * collide with Object.prototype properties (e.g. the real English word
 * "constructor") would read the inherited property as the "previous
 * count", producing string concatenation / NaN instead of a number.
 *
 * @param {string[]} items
 * @returns {Object<string, number>} item → occurrence count
 */
function countFrequencies(items) {
  const freq = Object.create(null); // no inherited keys
  for (const item of items) {
    freq[item] = (freq[item] || 0) + 1;
  }
  return freq;
}
187
+
188
/**
 * Return the `n` highest-count entries of a frequency object.
 * @param {Object<string, number>} freqObj - item → count
 * @param {number} n - how many entries to keep
 * @returns {Array<[string, number]>} [item, count] pairs, count descending
 */
function topN(freqObj, n) {
  const ranked = Object.entries(freqObj);
  ranked.sort(([, countA], [, countB]) => countB - countA);
  return ranked.slice(0, n);
}
193
+
194
/**
 * Chi-square test of independence for a 2×2 contingency table, with
 * Yates continuity correction (appropriate for the small per-cell
 * counts this script produces).
 *
 * Cells: [condition1_present, condition1_absent, condition2_present, condition2_absent]
 *
 * @param {number} a - condition 1, feature present
 * @param {number} b - condition 1, feature absent
 * @param {number} c - condition 2, feature present
 * @param {number} d - condition 2, feature absent
 * @returns {{chi2: number, p: number, sig: boolean}} statistic, binned
 *   p-value, and significance flag at α = .05
 */
function chiSquare2x2(a, b, c, d) {
  const n = a + b + c + d;
  // Degenerate table. Fix: include `sig` so the result shape matches the
  // normal path — callers read `.sig` unconditionally and previously got
  // `undefined` here.
  if (n === 0) return { chi2: 0, p: 1, sig: false };
  const expected = [
    ((a + b) * (a + c)) / n,
    ((a + b) * (b + d)) / n,
    ((c + d) * (a + c)) / n,
    ((c + d) * (b + d)) / n,
  ];
  // Yates correction: subtract 0.5 from each |observed − expected|.
  const chi2 = [a, b, c, d].reduce((sum, obs, i) => {
    const exp = expected[i];
    if (exp === 0) return sum; // skip empty-margin cells
    return sum + (Math.abs(obs - exp) - 0.5) ** 2 / exp;
  }, 0);

  // Coarse p-value bins from the df=1 chi-square critical values
  // (10.83 → .001, 6.63 → .01, 3.84 → .05, 2.71 → .10).
  let p;
  if (chi2 > 10.83) p = 0.001;
  else if (chi2 > 6.63) p = 0.01;
  else if (chi2 > 3.84) p = 0.05;
  else if (chi2 > 2.71) p = 0.10;
  else p = 0.25;

  return { chi2, p, sig: p < 0.05 };
}
224
+
225
/**
 * Parse a JSON column of suggestion objects and pull out the non-empty
 * `message` and `reasoning` strings.
 *
 * Any failure — invalid JSON, a non-array payload, or null entries —
 * yields empty lists rather than throwing.
 *
 * @param {string} suggestionsJson - JSON text from the DB `suggestions` column
 * @returns {{messages: string[], reasonings: string[]}}
 */
function extractSuggestionTexts(suggestionsJson) {
  try {
    const entries = JSON.parse(suggestionsJson);
    if (!Array.isArray(entries)) {
      return { messages: [], reasonings: [] };
    }
    const messages = [];
    const reasonings = [];
    for (const entry of entries) {
      // Truthiness check drops empty strings, matching the original
      // `.map(...).filter(Boolean)`; a null entry throws and is caught.
      if (entry.message) messages.push(entry.message);
      if (entry.reasoning) reasonings.push(entry.reasoning);
    }
    return { messages, reasonings };
  } catch {
    // Malformed rows are treated as having no text.
    return { messages: [], reasonings: [] };
  }
}
236
+
237
+ // ── Main ─────────────────────────────────────────────────────────────────
238
+
239
/**
 * Entry point: mines the evaluation SQLite database and emits a
 * qualitative comparison of the "base" vs "recognition" tutor conditions.
 *
 * Steps:
 *   1. Select high-contrast transcript pairs (best recognition vs worst
 *      base result for the same scenario).
 *   2. Build a message/reasoning corpus per condition.
 *   3. Differential unigram/bigram frequency analysis.
 *   4. Lexical diversity metrics (TTR, word/sentence length).
 *   5. Thematic coding with chi-square significance tests.
 *   6. Write exports/qualitative-analysis.json and .md.
 *
 * Side effects: reads data/evaluations.db, creates exports/, writes two
 * files, logs progress to stdout; exits with code 1 if the DB is missing.
 */
async function main() {
  console.log('='.repeat(70));
  console.log('QUALITATIVE ANALYSIS OF EVALUATION TRANSCRIPTS');
  console.log('='.repeat(70));
  console.log('');

  const dbPath = path.join(process.cwd(), 'data', 'evaluations.db');
  if (!fs.existsSync(dbPath)) {
    console.error('Database not found:', dbPath);
    process.exit(1);
  }

  const db = new Database(dbPath);

  // Ensure exports directory exists
  const exportsDir = path.join(process.cwd(), 'exports');
  if (!fs.existsSync(exportsDir)) {
    fs.mkdirSync(exportsDir, { recursive: true });
  }

  // ── 1. Transcript Pair Selection ────────────────────────────────────

  console.log('1. TRANSCRIPT PAIR SELECTION');
  console.log('-'.repeat(70));

  // Factorial-design cell names: cells 1-4 are the base condition,
  // cells 5-8 the recognition condition.
  const baseCells = ['cell_1_base_single_unified', 'cell_2_base_single_psycho',
    'cell_3_base_multi_unified', 'cell_4_base_multi_psycho'];
  const recogCells = ['cell_5_recog_single_unified', 'cell_6_recog_single_psycho',
    'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho'];

  // Scenarios to find pairs for (high contrast)
  const pairScenarios = [
    'struggling_learner',
    'recognition_seeking_learner',
    'adversarial_tester',
  ];

  const pairs = [];

  for (const scenario of pairScenarios) {
    // Highest-scoring recognition result
    const bestRecog = db.prepare(`
      SELECT id, scenario_id, profile_name, overall_score, suggestions
      FROM evaluation_results
      WHERE success = 1 AND overall_score IS NOT NULL
        AND scenario_id = ?
        AND profile_name IN (${recogCells.map(() => '?').join(',')})
        AND suggestions IS NOT NULL
      ORDER BY overall_score DESC
      LIMIT 1
    `).get(scenario, ...recogCells);

    // Lowest-scoring base result (exclude score=0 which are error cases)
    const worstBase = db.prepare(`
      SELECT id, scenario_id, profile_name, overall_score, suggestions
      FROM evaluation_results
      WHERE success = 1 AND overall_score IS NOT NULL AND overall_score > 0
        AND scenario_id = ?
        AND profile_name IN (${baseCells.map(() => '?').join(',')})
        AND suggestions IS NOT NULL
      ORDER BY overall_score ASC
      LIMIT 1
    `).get(scenario, ...baseCells);

    if (bestRecog && worstBase) {
      const recTexts = extractSuggestionTexts(bestRecog.suggestions);
      const baseTexts = extractSuggestionTexts(worstBase.suggestions);

      const pair = {
        scenario,
        recognition: {
          id: bestRecog.id,
          profile: bestRecog.profile_name,
          score: bestRecog.overall_score,
          message: recTexts.messages.join('\n\n'),
          reasoning: recTexts.reasonings.join('\n\n'),
        },
        base: {
          id: worstBase.id,
          profile: worstBase.profile_name,
          score: worstBase.overall_score,
          message: baseTexts.messages.join('\n\n'),
          reasoning: baseTexts.reasonings.join('\n\n'),
        },
        scoreDiff: bestRecog.overall_score - worstBase.overall_score,
      };
      pairs.push(pair);

      console.log(`\n ${scenario}:`);
      console.log(` Recognition: id=${pair.recognition.id}, profile=${pair.recognition.profile}, score=${pair.recognition.score.toFixed(1)}`);
      console.log(` Base: id=${pair.base.id}, profile=${pair.base.profile}, score=${pair.base.score.toFixed(1)}`);
      console.log(` Score gap: ${pair.scoreDiff.toFixed(1)} points`);
    }
  }

  // ── 2. Corpus Construction ──────────────────────────────────────────

  console.log('\n\n2. CORPUS CONSTRUCTION');
  console.log('-'.repeat(70));

  // Gather all suggestion text for base and recognition conditions
  const allRows = db.prepare(`
    SELECT profile_name, suggestions
    FROM evaluation_results
    WHERE success = 1
      AND suggestions IS NOT NULL
      AND profile_name IN (${[...baseCells, ...recogCells].map(() => '?').join(',')})
  `).all(...baseCells, ...recogCells);

  const corpus = {
    base: { messages: [], reasonings: [] },
    recognition: { messages: [], reasonings: [] },
  };

  for (const row of allRows) {
    const condition = baseCells.includes(row.profile_name) ? 'base' : 'recognition';
    const texts = extractSuggestionTexts(row.suggestions);
    corpus[condition].messages.push(...texts.messages);
    corpus[condition].reasonings.push(...texts.reasonings);
  }

  console.log(` Base: ${corpus.base.messages.length} messages, ${corpus.base.reasonings.length} reasonings`);
  console.log(` Recognition: ${corpus.recognition.messages.length} messages, ${corpus.recognition.reasonings.length} reasonings`);

  // ── 3. Word Frequency Analysis ──────────────────────────────────────

  console.log('\n\n3. WORD FREQUENCY ANALYSIS');
  console.log('-'.repeat(70));

  const baseMessageText = corpus.base.messages.join(' ');
  const recogMessageText = corpus.recognition.messages.join(' ');

  const baseTokens = tokenizeFiltered(baseMessageText);
  const recogTokens = tokenizeFiltered(recogMessageText);

  const baseUnigrams = countFrequencies(baseTokens);
  const recogUnigrams = countFrequencies(recogTokens);

  // Reuse the already-computed token arrays; the original re-tokenized
  // the same text a second time for no behavioral difference.
  const baseBigrams = countFrequencies(getBigrams(baseTokens));
  const recogBigrams = countFrequencies(getBigrams(recogTokens));

  console.log('\n Top 30 Base Unigrams:');
  const baseTop30 = topN(baseUnigrams, 30);
  baseTop30.forEach(([w, c], i) => console.log(` ${(i + 1).toString().padStart(2)}. ${w.padEnd(20)} ${c}`));

  console.log('\n Top 30 Recognition Unigrams:');
  const recogTop30 = topN(recogUnigrams, 30);
  recogTop30.forEach(([w, c], i) => console.log(` ${(i + 1).toString().padStart(2)}. ${w.padEnd(20)} ${c}`));

  // Differential: words disproportionately more frequent in one condition,
  // normalized by corpus size.
  const baseTotal = baseTokens.length;
  const recogTotal = recogTokens.length;

  const allWords = new Set([...Object.keys(baseUnigrams), ...Object.keys(recogUnigrams)]);
  const differential = [];
  for (const word of allWords) {
    const baseCount = baseUnigrams[word] || 0;
    const recogCount = recogUnigrams[word] || 0;
    if (baseCount + recogCount < 10) continue; // minimum combined frequency
    const baseRate = baseCount / baseTotal;
    const recogRate = recogCount / recogTotal;
    if (baseRate === 0 && recogRate === 0) continue;
    // ratio > 1: recognition-skewed; < 1: base-skewed; Infinity/0 when a
    // word appears in only one condition.
    const ratio = recogRate > 0 && baseRate > 0
      ? recogRate / baseRate
      : recogRate > 0 ? Infinity : 0;
    differential.push({ word, baseCount, recogCount, baseRate, recogRate, ratio });
  }

  // Require finite ratios and a minimum of 10 hits in the dominant condition.
  const recogSkewed = differential
    .filter(d => d.ratio !== Infinity && d.ratio > 1 && d.recogCount >= 10)
    .sort((a, b) => b.ratio - a.ratio)
    .slice(0, 15);

  const baseSkewed = differential
    .filter(d => d.ratio > 0 && d.ratio < 1 && d.baseCount >= 10)
    .sort((a, b) => a.ratio - b.ratio)
    .slice(0, 15);

  console.log('\n Top 15 Recognition-Skewed Words:');
  recogSkewed.forEach(d => {
    console.log(` ${d.word.padEnd(20)} base=${d.baseCount}, recog=${d.recogCount}, ratio=${d.ratio === Infinity ? '∞' : d.ratio.toFixed(2)}×`);
  });

  console.log('\n Top 15 Base-Skewed Words:');
  baseSkewed.forEach(d => {
    console.log(` ${d.word.padEnd(20)} base=${d.baseCount}, recog=${d.recogCount}, ratio=${d.ratio.toFixed(2)}×`);
  });

  // Bigram differential (lower minimum-frequency threshold: bigrams are rarer).
  const allBigrams = new Set([...Object.keys(baseBigrams), ...Object.keys(recogBigrams)]);
  const bigramDiff = [];
  for (const bg of allBigrams) {
    const bc = baseBigrams[bg] || 0;
    const rc = recogBigrams[bg] || 0;
    if (bc + rc < 5) continue;
    const br = bc / baseTotal;
    const rr = rc / recogTotal;
    const ratio = rr > 0 && br > 0 ? rr / br : rr > 0 ? Infinity : 0;
    bigramDiff.push({ bigram: bg, baseCount: bc, recogCount: rc, ratio });
  }

  const recogBigramSkewed = bigramDiff.filter(d => d.ratio > 1).sort((a, b) => b.ratio - a.ratio).slice(0, 10);
  const baseBigramSkewed = bigramDiff.filter(d => d.ratio < 1 && d.ratio > 0).sort((a, b) => a.ratio - b.ratio).slice(0, 10);

  console.log('\n Top 10 Recognition-Skewed Bigrams:');
  recogBigramSkewed.forEach(d => {
    console.log(` ${d.bigram.padEnd(30)} base=${d.baseCount}, recog=${d.recogCount}, ratio=${d.ratio === Infinity ? '∞' : d.ratio.toFixed(2)}×`);
  });

  // ── 4. Lexical Diversity ────────────────────────────────────────────

  console.log('\n\n4. LEXICAL DIVERSITY METRICS');
  console.log('-'.repeat(70));

  // Computes token/type counts, type-token ratio, and mean word/sentence
  // lengths over unfiltered tokens (stopwords included on purpose).
  function computeLexicalMetrics(text, label) {
    const allTokens = tokenize(text);
    const types = new Set(allTokens);
    const sentences = countSentences(text);
    const ttr = allTokens.length > 0 ? types.size / allTokens.length : 0;
    const meanWordLen = allTokens.length > 0
      ? allTokens.reduce((sum, w) => sum + w.length, 0) / allTokens.length
      : 0;
    const meanSentLen = sentences > 0 ? allTokens.length / sentences : 0;

    return {
      label,
      tokens: allTokens.length,
      types: types.size,
      ttr,
      meanWordLength: meanWordLen,
      meanSentenceLength: meanSentLen,
      vocabularySize: types.size,
    };
  }

  const lexical = {
    base_message: computeLexicalMetrics(baseMessageText, 'Base (message)'),
    recog_message: computeLexicalMetrics(recogMessageText, 'Recognition (message)'),
    base_reasoning: computeLexicalMetrics(corpus.base.reasonings.join(' '), 'Base (reasoning)'),
    recog_reasoning: computeLexicalMetrics(corpus.recognition.reasonings.join(' '), 'Recognition (reasoning)'),
  };

  for (const m of Object.values(lexical)) {
    console.log(`\n ${m.label}:`);
    console.log(` Tokens: ${m.tokens.toLocaleString()}`);
    console.log(` Type-Token Ratio: ${m.ttr.toFixed(4)}`);
    console.log(` Vocabulary size: ${m.types.toLocaleString()}`);
    console.log(` Mean word length: ${m.meanWordLength.toFixed(2)} chars`);
    console.log(` Mean sentence length: ${m.meanSentenceLength.toFixed(1)} words`);
  }

  // ── 5. Thematic Coding ─────────────────────────────────────────────

  console.log('\n\n5. THEMATIC CODING');
  console.log('-'.repeat(70));

  // Total occurrences of any pattern in `text`. String.match with a /g
  // regex always scans from the start, so no lastIndex handling needed.
  function countThematicMatches(text, patterns) {
    let total = 0;
    for (const pattern of patterns) {
      const matches = text.match(pattern);
      if (matches) total += matches.length;
    }
    return total;
  }

  // Denominators for the per-1000-words rates (unfiltered token counts).
  const baseWordCount = tokenize(baseMessageText).length;
  const recogWordCount = tokenize(recogMessageText).length;

  // Number of messages containing at least one pattern (for chi-square).
  function countResponsePresence(messages, patterns) {
    let present = 0;
    for (const msg of messages) {
      let found = false;
      for (const pattern of patterns) {
        // BUGFIX: these shared regexes carry the /g flag, and
        // RegExp.prototype.test() advances `lastIndex` on a match.
        // Without a reset, subsequent messages are scanned from a stale
        // offset and real matches are silently missed, undercounting
        // presence. Reset before every test.
        pattern.lastIndex = 0;
        if (pattern.test(msg)) {
          found = true;
          break;
        }
      }
      if (found) present++;
    }
    return present;
  }

  const thematicResults = {};

  for (const [category, config] of Object.entries(THEMATIC_CATEGORIES)) {
    const baseRawCount = countThematicMatches(baseMessageText, config.patterns);
    const recogRawCount = countThematicMatches(recogMessageText, config.patterns);

    const basePer1000 = baseWordCount > 0 ? (baseRawCount / baseWordCount) * 1000 : 0;
    const recogPer1000 = recogWordCount > 0 ? (recogRawCount / recogWordCount) * 1000 : 0;

    const ratio = basePer1000 > 0 ? recogPer1000 / basePer1000 : (recogPer1000 > 0 ? Infinity : 1);

    // Chi-square: response-level presence/absence
    const basePresent = countResponsePresence(corpus.base.messages, config.patterns);
    const baseAbsent = corpus.base.messages.length - basePresent;
    const recogPresent = countResponsePresence(corpus.recognition.messages, config.patterns);
    const recogAbsent = corpus.recognition.messages.length - recogPresent;

    const chi = chiSquare2x2(basePresent, baseAbsent, recogPresent, recogAbsent);

    thematicResults[category] = {
      label: config.label,
      description: config.description,
      baseRawCount,
      recogRawCount,
      basePer1000,
      recogPer1000,
      ratio,
      basePresent,
      baseTotal: corpus.base.messages.length,
      recogPresent,
      recogTotal: corpus.recognition.messages.length,
      chi2: chi.chi2,
      p: chi.p,
      sig: chi.sig,
    };

    console.log(`\n ${config.label}:`);
    console.log(` Base: ${baseRawCount} occurrences (${basePer1000.toFixed(1)}/1000 words), ${basePresent}/${corpus.base.messages.length} responses`);
    console.log(` Recognition: ${recogRawCount} occurrences (${recogPer1000.toFixed(1)}/1000 words), ${recogPresent}/${corpus.recognition.messages.length} responses`);
    console.log(` Ratio: ${ratio === Infinity ? '∞' : ratio.toFixed(2)}×`);
    console.log(` χ²(1) = ${chi.chi2.toFixed(2)}, p ${chi.p < 0.05 ? '< .05 *' : chi.p < 0.10 ? '< .10 †' : `≈ ${chi.p.toFixed(2)}`}`);
  }

  // ── 6. Build Output ────────────────────────────────────────────────

  console.log('\n\n6. GENERATING OUTPUT');
  console.log('-'.repeat(70));

  // JSON output
  const jsonOutput = {
    generated: new Date().toISOString(),
    transcriptPairs: pairs,
    wordFrequency: {
      baseCorpusSize: baseTotal,
      recogCorpusSize: recogTotal,
      baseTop30Unigrams: baseTop30.map(([w, c]) => ({ word: w, count: c })),
      recogTop30Unigrams: recogTop30.map(([w, c]) => ({ word: w, count: c })),
      recogSkewedUnigrams: recogSkewed,
      baseSkewedUnigrams: baseSkewed,
      recogSkewedBigrams: recogBigramSkewed,
      baseSkewedBigrams: baseBigramSkewed,
    },
    lexicalDiversity: lexical,
    thematicCoding: thematicResults,
  };

  const jsonPath = path.join(exportsDir, 'qualitative-analysis.json');
  fs.writeFileSync(jsonPath, JSON.stringify(jsonOutput, null, 2));
  console.log(` JSON: ${jsonPath}`);

  // ── Markdown output ────────────────────────────────────────────────

  let md = `# Qualitative Analysis of Evaluation Transcripts

**Generated:** ${new Date().toISOString()}

## 1. Transcript Pairs (High-Contrast Base vs Recognition)

`;

  const scenarioLabels = {
    struggling_learner: 'Struggling Learner',
    recognition_seeking_learner: 'Recognition-Seeking Learner',
    adversarial_tester: 'Adversarial Tester',
  };

  for (const pair of pairs) {
    md += `### ${scenarioLabels[pair.scenario] || pair.scenario}\n\n`;
    md += `**Score gap:** ${pair.scoreDiff.toFixed(1)} points (base ${pair.base.score.toFixed(1)} → recognition ${pair.recognition.score.toFixed(1)})\n\n`;

    md += `**Base response** (${pair.base.profile}, score ${pair.base.score.toFixed(1)}):\n\n`;
    md += `> ${pair.base.message.replace(/\n/g, '\n> ')}\n\n`;

    md += `**Recognition response** (${pair.recognition.profile}, score ${pair.recognition.score.toFixed(1)}):\n\n`;
    md += `> ${pair.recognition.message.replace(/\n/g, '\n> ')}\n\n`;
    md += `---\n\n`;
  }

  md += `## 2. Lexical Diversity Metrics

| Metric | Base (message) | Recognition (message) | Base (reasoning) | Recognition (reasoning) |
|--------|----------------|----------------------|------------------|------------------------|
| Tokens | ${lexical.base_message.tokens.toLocaleString()} | ${lexical.recog_message.tokens.toLocaleString()} | ${lexical.base_reasoning.tokens.toLocaleString()} | ${lexical.recog_reasoning.tokens.toLocaleString()} |
| Type-Token Ratio | ${lexical.base_message.ttr.toFixed(4)} | ${lexical.recog_message.ttr.toFixed(4)} | ${lexical.base_reasoning.ttr.toFixed(4)} | ${lexical.recog_reasoning.ttr.toFixed(4)} |
| Vocabulary Size | ${lexical.base_message.types.toLocaleString()} | ${lexical.recog_message.types.toLocaleString()} | ${lexical.base_reasoning.types.toLocaleString()} | ${lexical.recog_reasoning.types.toLocaleString()} |
| Mean Word Length | ${lexical.base_message.meanWordLength.toFixed(2)} | ${lexical.recog_message.meanWordLength.toFixed(2)} | ${lexical.base_reasoning.meanWordLength.toFixed(2)} | ${lexical.recog_reasoning.meanWordLength.toFixed(2)} |
| Mean Sentence Length | ${lexical.base_message.meanSentenceLength.toFixed(1)} | ${lexical.recog_message.meanSentenceLength.toFixed(1)} | ${lexical.base_reasoning.meanSentenceLength.toFixed(1)} | ${lexical.recog_reasoning.meanSentenceLength.toFixed(1)} |

`;

  md += `## 3. Differential Word Frequency

### Recognition-Skewed Terms

| Word | Base Count | Recognition Count | Rate Ratio |
|------|-----------|-------------------|------------|
`;
  for (const d of recogSkewed) {
    md += `| ${d.word} | ${d.baseCount} | ${d.recogCount} | ${d.ratio === Infinity ? '∞' : d.ratio.toFixed(2)}× |\n`;
  }

  md += `\n### Base-Skewed Terms

| Word | Base Count | Recognition Count | Rate Ratio |
|------|-----------|-------------------|------------|
`;
  for (const d of baseSkewed) {
    md += `| ${d.word} | ${d.baseCount} | ${d.recogCount} | ${d.ratio.toFixed(2)}× |\n`;
  }

  md += `\n### Recognition-Skewed Bigrams

| Bigram | Base Count | Recognition Count | Rate Ratio |
|--------|-----------|-------------------|------------|
`;
  for (const d of recogBigramSkewed) {
    md += `| ${d.bigram} | ${d.baseCount} | ${d.recogCount} | ${d.ratio === Infinity ? '∞' : d.ratio.toFixed(2)}× |\n`;
  }

  md += `\n## 4. Thematic Code Frequency

| Category | Base (per 1000 words) | Recognition (per 1000 words) | Ratio | χ²(1) | Sig |
|----------|----------------------|------------------------------|-------|-------|-----|
`;
  for (const r of Object.values(thematicResults)) {
    md += `| ${r.label} | ${r.basePer1000.toFixed(1)} | ${r.recogPer1000.toFixed(1)} | ${r.ratio === Infinity ? '∞' : r.ratio.toFixed(2)}× | ${r.chi2.toFixed(2)} | ${r.sig ? '*' : r.p < 0.10 ? '†' : ''} |\n`;
  }

  md += `\n*\\* p < .05, † p < .10. Chi-square tests on response-level presence/absence (base N=${corpus.base.messages.length}, recognition N=${corpus.recognition.messages.length}).*\n`;

  const mdPath = path.join(exportsDir, 'qualitative-analysis.md');
  fs.writeFileSync(mdPath, md);
  console.log(` Markdown: ${mdPath}`);

  console.log('\n' + '='.repeat(70));
  console.log('Done.');

  db.close();
}

main().catch((err) => {
  // Report the failure and exit non-zero; the original .catch(console.error)
  // logged but let the process exit 0 on a crash.
  console.error(err);
  process.exitCode = 1;
});