@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,1313 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * AI-Based Thematic Analysis of Evaluation Transcripts
5
+ *
6
+ * Two modes:
7
+ * Option 1 (--mode classify): Classify responses against existing 6 thematic categories
8
+ * Option 2 (--mode discover): Open-ended theme discovery with no predefined categories
9
+ *
10
+ * Usage:
11
+ * node scripts/qualitative-analysis-ai.js --mode classify --model claude-code [--sample 50]
12
+ * node scripts/qualitative-analysis-ai.js --mode discover --model haiku [--sample 50]
13
+ * node scripts/qualitative-analysis-ai.js --mode both --model claude-code [--sample 100]
14
+ * node scripts/qualitative-analysis-ai.js --cost-estimate
15
+ *
16
+ * Models:
17
+ * claude-code — Uses Claude Code CLI (your subscription, no API cost)
18
+ * haiku — OpenRouter Haiku (~$0.003/call)
19
+ * sonnet — OpenRouter Sonnet (~$0.008/call)
20
+ * opus — OpenRouter Opus (~$0.04/call)
21
+ *
22
+ * Default: claude-code (free via subscription)
23
+ */
24
+
25
+ import 'dotenv/config';
26
+ import Database from 'better-sqlite3';
27
+ import fs from 'fs';
28
+ import path from 'path';
29
+ import { spawn } from 'child_process';
30
+ import readline from 'readline';
31
+
32
// ── Model Configuration ─────────────────────────────────────────────────

/**
 * Maps CLI-friendly model aliases to backend identifiers.
 * The 'claude-code' alias is special-cased by callModel(): it routes to a
 * Claude Code CLI subprocess (subscription, no API cost) instead of the
 * OpenRouter API.
 */
const MODEL_MAP = {
  'claude-code': 'claude-code', // Uses Claude Code CLI subprocess (subscription)
  'haiku': 'anthropic/claude-haiku-4.5',
  'sonnet': 'anthropic/claude-sonnet-4.5',
  'opus': 'anthropic/claude-opus-4.5',
};
40
+
41
// OpenRouter pricing per million tokens (as of Feb 2026)
// Each entry gives { input, output } USD rates for the matching MODEL_MAP
// alias; 'claude-code' is absent because it bills through the subscription.
const PRICING = {
  haiku: { input: 1.0, output: 5.0 },
  sonnet: { input: 3.0, output: 15.0 },
  opus: { input: 15.0, output: 75.0 },
};
47
+
48
// ── Existing Regex Categories (from qualitative-analysis.js) ────────────

// Codebook for --mode classify. Each category carries a display label, a
// coder-facing description, and positive/negative example utterances; all
// three are interpolated verbatim into the classification prompt (see
// buildClassifyPrompt), so edits here change the prompt the model sees.
const THEMATIC_CATEGORIES = {
  // Second-person uptake of the learner's specific contributions.
  engagement: {
    label: 'Engagement markers',
    description: 'Second-person engagement with learner contributions (e.g., "your insight", "building on your", "you\'ve raised")',
    examples_positive: [
      'Your insight about alienation connects to...',
      'Building on your earlier point about power...',
      'You\'ve raised an important question here.',
    ],
    examples_negative: [
      'The concept of alienation is important.',
      'Let me explain how this works.',
      'You should review the next lecture.',
    ],
  },
  // Markers that a perspective shifted through the exchange (mutual change).
  transformation: {
    label: 'Transformation language',
    description: 'Markers of mutual change or perspective shift (e.g., "reconsidering", "that changes how I", "I hadn\'t thought")',
    examples_positive: [
      'That changes how I think about this passage.',
      'I hadn\'t considered that angle before.',
      'Your critique enriches the standard reading.',
    ],
    examples_negative: [
      'The correct interpretation is...',
      'You should think about it this way.',
      'Here is the standard view.',
    ],
  },
  // Treats confusion/difficulty as productive rather than something to erase.
  struggle_honoring: {
    label: 'Struggle-honoring',
    description: 'Acknowledging productive confusion or difficulty (e.g., "wrestling with", "productive confusion", "grappling with")',
    examples_positive: [
      'You\'re wrestling with a genuinely hard question.',
      'This productive confusion is where real learning happens.',
      'The tension between these ideas is worth sitting with.',
    ],
    examples_negative: [
      'Don\'t worry, this is easy once you understand it.',
      'The answer is straightforward.',
      'Let me simplify this for you.',
    ],
  },
  // Frames the learner as an autonomous thinker with their own framework.
  learner_as_subject: {
    label: 'Learner-as-subject framing',
    description: 'Treating learner as autonomous intellectual agent (e.g., "your interpretation", "your framework", "what you\'re building")',
    examples_positive: [
      'Your interpretation of the text offers...',
      'Your framework for understanding this...',
      'What you\'re developing is a sophisticated reading.',
    ],
    examples_negative: [
      'The correct framework is...',
      'Students typically understand this as...',
      'The textbook says...',
    ],
  },
  // Expert-to-novice instruction; for this codebook "present" means the
  // directive markers appear, regardless of pedagogical desirability.
  directive: {
    label: 'Directive framing',
    description: 'Expert-to-novice instructional markers (e.g., "you should", "you need to", "the correct answer is")',
    examples_positive: [
      'You should review the next lecture.',
      'You need to understand this before moving on.',
      'The correct approach is...',
    ],
    examples_negative: [
      'What if we explored this differently?',
      'Your approach suggests...',
      'Consider how this connects to...',
    ],
  },
  // Vague boilerplate pedagogy with no content-specific engagement.
  generic: {
    label: 'Generic/placeholder',
    description: 'Vague pedagogical language without specificity (e.g., "foundational", "key concepts", "solid foundation")',
    examples_positive: [
      'This covers foundational concepts.',
      'Building a solid foundation is key.',
      'Review the key concepts before proceeding.',
    ],
    examples_negative: [
      'The dialectic between master and slave reveals...',
      'Your reading of commodity fetishism...',
      'Consider how reification operates in...',
    ],
  },
};
136
+
137
// Regex patterns for inter-method agreement scoring
//
// Mirrors the THEMATIC_CATEGORIES codebook with the surface patterns used
// by the regex-only analysis (qualitative-analysis.js), so AI judgments can
// be compared per-response against regex judgments.
// NOTE: every pattern carries the /g flag, which makes .test() stateful via
// lastIndex — regexClassify() resets lastIndex before each test.
const REGEX_PATTERNS = {
  engagement: [
    /your insight/gi, /building on your/gi, /your question/gi, /your point/gi,
    /your observation/gi, /your analysis/gi, /your argument/gi, /your critique/gi,
    /you've (raised|identified|highlighted|noticed|pointed out)/gi,
    /you're (asking|raising|pushing|exploring|getting at)/gi,
  ],
  transformation: [
    /reconsidering/gi, /that changes (how I|my)/gi, /I hadn't (thought|considered)/gi,
    /revising (my|the)/gi, /let me (revise|adjust|rethink)/gi,
    /you've (helped|pushed|made) me/gi, /your .{1,20} (complicates|enriches|changes)/gi,
    /shifts? (my|the|our) (understanding|framing|approach)/gi,
  ],
  struggle_honoring: [
    /wrestling with/gi, /productive confusion/gi, /working through/gi,
    /grappling with/gi, /sitting with (the|this)/gi, /tension (between|here|you)/gi,
    /difficulty (is|here)/gi, /struggle (with|is|here)/gi,
    /not (easy|simple|straightforward)/gi,
  ],
  learner_as_subject: [
    /your interpretation/gi, /your analysis/gi, /your understanding/gi,
    /you're grappling with/gi, /your perspective/gi, /your framework/gi,
    /your reading/gi, /what you're (doing|building|developing|constructing)/gi,
    /your (intellectual|philosophical|analytical)/gi,
  ],
  directive: [
    /you should/gi, /you need to/gi, /you must/gi,
    /the correct (answer|approach|way)/gi, /the answer is/gi,
    /let me explain/gi, /here's what/gi, /make sure (to|you)/gi,
    /first,? you/gi,
  ],
  generic: [
    /foundational/gi, /key concepts/gi, /learning objectives/gi,
    /knowledge base/gi, /solid foundation/gi, /core concepts/gi,
    /build (a|your) (solid|strong)/gi,
    /comprehensive (understanding|overview|review)/gi,
  ],
};
176
+
177
// ── Prompts ─────────────────────────────────────────────────────────────

/**
 * Build the closed-coding prompt for --mode classify.
 *
 * Interpolates every THEMATIC_CATEGORIES entry (description plus its
 * positive/negative examples) into a single instruction asking the model to
 * judge presence/confidence/evidence/strength per category and to return
 * strict JSON only.
 *
 * @param {string} responseText - The tutor response under analysis.
 * @param {string} condition - 'base' or 'recognition'; disclosed to the
 *   model inside the prompt. NOTE(review): disclosing the condition may
 *   bias the coder model toward expected differences — confirm intentional.
 * @returns {string} Full prompt text.
 */
function buildClassifyPrompt(responseText, condition) {
  const catDescriptions = Object.entries(THEMATIC_CATEGORIES).map(([key, cat]) => {
    const posExamples = cat.examples_positive.map(e => ` + "${e}"`).join('\n');
    const negExamples = cat.examples_negative.map(e => ` - "${e}"`).join('\n');
    return `**${key}** (${cat.label}): ${cat.description}\nPresent examples:\n${posExamples}\nAbsent examples:\n${negExamples}`;
  }).join('\n\n');

  return `You are a qualitative coding expert analyzing AI tutor responses in an educational technology study.

The study compares tutor responses generated under two conditions:
- "base": Tutors with standard pedagogical prompts
- "recognition": Tutors with recognition-theory prompts (Hegelian mutual recognition)

This response was generated under the **${condition}** condition.

## Task

Analyze the following tutor response for the presence/absence of each thematic category. For each category, provide:
1. **present**: true/false — is this theme present in the response?
2. **confidence**: 0.0-1.0 — how confident are you?
3. **evidence**: A brief quote or description of what led to your judgment (max 30 words)
4. **strength**: "none" | "weak" | "moderate" | "strong" — how prominently does this theme appear?

## Categories

${catDescriptions}

## Response to Analyze

${responseText}

## Output Format

Return a JSON object with this exact structure:
{
  "categories": {
    "engagement": { "present": true/false, "confidence": 0.0-1.0, "evidence": "...", "strength": "none|weak|moderate|strong" },
    "transformation": { ... },
    "struggle_honoring": { ... },
    "learner_as_subject": { ... },
    "directive": { ... },
    "generic": { ... }
  },
  "dominant_theme": "category_key",
  "overall_quality_note": "One sentence summary of the response's pedagogical character (max 40 words)"
}

Return ONLY the JSON object, no other text.`;
}
228
+
229
/**
 * Build the open-coding prompt for --mode discover.
 *
 * Unlike buildClassifyPrompt, no predefined categories are supplied: the
 * model is asked to surface 3-5 emergent themes plus overall stance and
 * epistemic-orientation labels, returned as strict JSON only.
 *
 * @param {string} responseText - The tutor response under analysis.
 * @param {string} condition - 'base' or 'recognition'; disclosed to the
 *   model inside the prompt. NOTE(review): disclosing the condition may
 *   bias the coder model toward expected differences — confirm intentional.
 * @returns {string} Full prompt text.
 */
function buildDiscoverPrompt(responseText, condition) {
  return `You are a qualitative coding expert performing open-ended thematic analysis on AI tutor responses in an educational technology study.

The study compares tutor responses generated under two conditions:
- "base": Tutors with standard pedagogical prompts
- "recognition": Tutors with recognition-theory prompts (Hegelian mutual recognition)

This response was generated under the **${condition}** condition.

## Task

Read the following tutor response carefully and identify the **3-5 most prominent themes** you observe. These should be emergent themes — do NOT use predefined categories. Focus on:

- Pedagogical stance (how does the tutor position itself relative to the learner?)
- Epistemic orientation (how does the tutor treat knowledge — as fixed or constructed?)
- Relational quality (how does the tutor relate to the learner — as equal, authority, guide?)
- Engagement depth (surface-level vs deep intellectual engagement)
- Language patterns (what's distinctive about the word choices and framing?)

## Response to Analyze

${responseText}

## Output Format

Return a JSON object with this exact structure:
{
  "themes": [
    {
      "name": "short_theme_name",
      "label": "Human-Readable Theme Label",
      "description": "1-2 sentence description of this theme",
      "evidence": "Brief quote or paraphrase from the response (max 30 words)",
      "salience": "low|medium|high"
    }
  ],
  "pedagogical_stance": "One of: authoritative, collaborative, facilitative, directive, dialogical, or a custom term",
  "epistemic_orientation": "One of: transmissive, constructivist, dialectical, or a custom term",
  "overall_impression": "2-3 sentence qualitative description of this response's character (max 60 words)"
}

Return ONLY the JSON object, no other text.`;
}
272
+
273
// ── Model Calls ─────────────────────────────────────────────────────────

/**
 * Dispatch a prompt to the selected backend.
 *
 * 'claude-code' runs the Claude Code CLI as a subprocess (uses the
 * subscription, no API cost); any other key is resolved through the
 * OpenRouter API.
 *
 * @param {string} prompt - Full prompt text to send.
 * @param {string} modelKey - One of: claude-code, haiku, sonnet, opus.
 * @returns {Promise<{content: string, usage: object}>}
 */
async function callModel(prompt, modelKey) {
  return modelKey === 'claude-code'
    ? callClaudeCode(prompt)
    : callOpenRouter(prompt, modelKey);
}
286
+
287
/**
 * Send a prompt to the Claude Code CLI (`claude -p -`) via stdin and return
 * its stdout as the model response. ANTHROPIC_API_KEY is removed from the
 * child environment so the CLI uses the subscription path rather than
 * direct API billing.
 *
 * @param {string} prompt - Prompt text piped to the CLI's stdin.
 * @returns {Promise<{content: string, usage: object}>} Trimmed stdout;
 *   usage is always empty — the CLI's text output mode reports no token
 *   counts.
 * @throws {Error} If the binary cannot be spawned or exits non-zero
 *   (stderr/stdout text is surfaced in the error message).
 */
async function callClaudeCode(prompt) {
  const claudeArgs = ['-p', '-', '--output-format', 'text'];

  const stdout = await new Promise((resolve, reject) => {
    const env = { ...process.env };
    delete env.ANTHROPIC_API_KEY; // force subscription path
    const child = spawn('claude', claudeArgs, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    let out = '';
    let err = '';
    child.stdout.on('data', d => { out += d; });
    child.stderr.on('data', d => { err += d; });
    child.on('error', e => reject(new Error(`Failed to spawn claude: ${e.message}`)));
    child.on('close', code => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });
    // FIX: if the child dies before consuming stdin (bad args, missing
    // binary, early crash), an unhandled EPIPE on the write would crash the
    // whole run. Swallow stream errors here — 'close'/'error' above still
    // fire and carry the real failure.
    child.stdin.on('error', () => {});
    child.stdin.write(prompt);
    child.stdin.end();
  });

  return { content: stdout.trim(), usage: {} };
}
312
+
313
/**
 * Call OpenRouter's chat completions endpoint with a single user message.
 *
 * Requests strict JSON output (response_format json_object) at low
 * temperature, with a 90-second abort timeout.
 *
 * @param {string} prompt - User message content.
 * @param {string} modelKey - Alias resolved through MODEL_MAP (haiku/sonnet/opus).
 * @returns {Promise<{content: string, usage: object}>} Model text plus the
 *   provider-reported token usage (empty object if the API omits it).
 * @throws {Error} If OPENROUTER_API_KEY is unset, the alias is unknown, the
 *   request aborts/times out, the HTTP status is non-OK, or the response
 *   contains no content.
 */
async function callOpenRouter(prompt, modelKey) {
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) throw new Error('OPENROUTER_API_KEY not set. Export it before running.');

  const model = MODEL_MAP[modelKey];
  if (!model) throw new Error(`Unknown model: ${modelKey}. Use: claude-code, haiku, sonnet, opus`);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 90000);

  try {
    const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        max_tokens: 1000,
        temperature: 0.1, // near-deterministic coding judgments
        include_reasoning: false,
        response_format: { type: 'json_object' },
        messages: [{ role: 'user', content: prompt }],
      }),
      signal: controller.signal,
    });

    if (!res.ok) {
      const body = await res.text();
      throw new Error(`OpenRouter ${res.status}: ${body.slice(0, 200)}`);
    }

    const data = await res.json();
    const content = data.choices?.[0]?.message?.content;
    if (!content) throw new Error('No content in OpenRouter response');

    return { content, usage: data.usage || {} };
  } finally {
    // Single cleanup point for the abort timer — the original cleared it
    // separately on the success and error paths.
    clearTimeout(timeout);
  }
}
359
+
360
/**
 * Parse a model response expected to be a JSON object.
 *
 * Models sometimes wrap JSON in markdown fences or surround it with prose
 * despite "return ONLY the JSON" instructions, so parsing falls back in
 * order: raw parse → fenced ```json block → first {...} span in the text.
 *
 * @param {string} content - Raw model output.
 * @returns {*} The parsed JSON value.
 * @throws {Error} If no parseable JSON can be extracted (message carries
 *   the first 200 chars of the content for debugging).
 */
function parseJsonResponse(content) {
  try {
    return JSON.parse(content);
  } catch {
    // Try to extract JSON from markdown code blocks
    const fenced = content.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) return JSON.parse(fenced[1].trim());
    // Last resort: grab the outermost {...} span from surrounding prose.
    const bare = content.match(/\{[\s\S]*\}/);
    if (bare) {
      try {
        return JSON.parse(bare[0]);
      } catch { /* fall through to the error below */ }
    }
    throw new Error(`Failed to parse JSON: ${content.slice(0, 200)}`);
  }
}
370
+
371
// ── Regex Comparison ────────────────────────────────────────────────────

/**
 * Classify a text against every REGEX_PATTERNS category.
 *
 * @param {string} text - Response text to scan.
 * @returns {Object<string, boolean>} category key → whether any of its
 *   patterns matched.
 */
function regexClassify(text) {
  const results = {};
  for (const [category, patterns] of Object.entries(REGEX_PATTERNS)) {
    results[category] = patterns.some(pattern => {
      // /g regexes are stateful: reset lastIndex so .test() starts fresh.
      pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
  return results;
}
388
+
389
// ── Data Loading ────────────────────────────────────────────────────────

/**
 * Load successful, Opus-judged evaluation rows for the given factorial
 * cells and flatten each row's suggestions JSON into analyzable text.
 *
 * Cells 1-4 belong to the "base" condition and 5-8 to "recognition"; each
 * cell number expands to its four profile-name variants
 * (single/multi × unified/psycho).
 *
 * @param {object} db - better-sqlite3 database handle.
 * @param {number[]} cells - Factorial cell numbers (1-8).
 * @param {number|null} sampleSize - Unused here: sampling is applied by the
 *   callers after checkpoint-based deduplication (see runClassification /
 *   runDiscovery). Kept for interface compatibility.
 * @returns {Array<object>} Rows with id, scenario_id, profile_name,
 *   overall_score, condition, and joined messageText/reasoningText/fullText.
 *   Rows whose suggestions yield no message text are dropped.
 */
function loadData(db, cells, sampleSize) {
  // Build cell name list from cell numbers
  // (FIX: removed dead `cellList` local that was computed but never used.)
  const baseCells = [];
  const recogCells = [];
  for (const c of cells) {
    if (c <= 4) {
      baseCells.push(`cell_${c}_base_single_unified`, `cell_${c}_base_single_psycho`,
        `cell_${c}_base_multi_unified`, `cell_${c}_base_multi_psycho`);
    } else {
      recogCells.push(`cell_${c}_recog_single_unified`, `cell_${c}_recog_single_psycho`,
        `cell_${c}_recog_multi_unified`, `cell_${c}_recog_multi_psycho`);
    }
  }

  const allCells = [...baseCells, ...recogCells];
  const placeholders = allCells.map(() => '?').join(',');

  const query = `
    SELECT id, scenario_id, profile_name, overall_score, suggestions,
      CASE WHEN profile_name LIKE 'cell_1_%' OR profile_name LIKE 'cell_2_%'
        OR profile_name LIKE 'cell_3_%' OR profile_name LIKE 'cell_4_%'
      THEN 'base' ELSE 'recognition' END as condition
    FROM evaluation_results
    WHERE success = 1
    AND overall_score IS NOT NULL
    AND suggestions IS NOT NULL
    AND judge_model IN ('claude-code', 'claude-code/opus')
    AND profile_name IN (${placeholders})
  `;

  // Note: when resuming with --sample, we load all data and let the
  // checkpoint filter handle deduplication. The sample limit only applies
  // to NEW items to process (see runClassification/runDiscovery).
  const rows = db.prepare(query).all(...allCells);

  return rows.map(row => {
    let messages = [], reasonings = [];
    try {
      const parsed = JSON.parse(row.suggestions);
      if (Array.isArray(parsed)) {
        messages = parsed.map(s => s.message || '').filter(Boolean);
        reasonings = parsed.map(s => s.reasoning || '').filter(Boolean);
      }
    } catch { /* malformed suggestions JSON → row is dropped below */ }

    return {
      id: row.id,
      scenario_id: row.scenario_id,
      profile_name: row.profile_name,
      overall_score: row.overall_score,
      condition: row.condition,
      messageText: messages.join('\n\n'),
      reasoningText: reasonings.join('\n\n'),
      fullText: [...messages, ...reasonings].join('\n\n'),
    };
  }).filter(r => r.messageText.length > 0);
}
450
+
451
// ── Cost Estimation ─────────────────────────────────────────────────────

/**
 * Print a cost/time estimate table for running the analysis (--cost-estimate).
 *
 * Queries the evaluation DB for response counts (factorial cells 1-8 vs all
 * Opus-judged data), then prints per-model cost projections from PRICING
 * using fixed per-call token estimates. Purely informational: reads the DB,
 * writes nothing.
 *
 * @param {object} db - better-sqlite3 database handle.
 */
function printCostEstimate(db) {
  console.log('='.repeat(70));
  console.log('COST ESTIMATE: AI THEMATIC ANALYSIS');
  console.log('='.repeat(70));

  // Count responses
  const factorialCount = db.prepare(`
    SELECT COUNT(*) as n FROM evaluation_results
    WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
    AND judge_model IN ('claude-code', 'claude-code/opus')
    AND (profile_name LIKE 'cell_1_%' OR profile_name LIKE 'cell_2_%'
    OR profile_name LIKE 'cell_3_%' OR profile_name LIKE 'cell_4_%'
    OR profile_name LIKE 'cell_5_%' OR profile_name LIKE 'cell_6_%'
    OR profile_name LIKE 'cell_7_%' OR profile_name LIKE 'cell_8_%')
  `).get().n;

  const allCount = db.prepare(`
    SELECT COUNT(*) as n FROM evaluation_results
    WHERE success = 1 AND overall_score IS NOT NULL AND suggestions IS NOT NULL
    AND judge_model IN ('claude-code', 'claude-code/opus')
  `).get().n;

  // Estimated tokens per call (rough fixed averages for these prompts)
  const inputTokens = 1200; // prompt + response text
  const outputTokens = 300; // JSON output

  console.log(`\nData volumes:`);
  console.log(` Factorial (cells 1-8): ${factorialCount} responses`);
  console.log(` All Opus-judged data: ${allCount} responses`);
  console.log(`\nPer-call estimate: ~${inputTokens} input tokens, ~${outputTokens} output tokens`);
  console.log(`\nNote: Running both classify + discover modes doubles the cost.\n`);

  console.log(`| Model | Per Call | Factorial (N=${factorialCount}) | All Data (N=${allCount}) | Both Modes × All |`);
  console.log('|--------|---------|------------------|-----------------|------------------|');

  // Per-model totals: per-call cost × row counts (both-modes doubles "all").
  for (const [model, pricing] of Object.entries(PRICING)) {
    const perCall = (inputTokens * pricing.input / 1e6) + (outputTokens * pricing.output / 1e6);
    const factorial = perCall * factorialCount;
    const all = perCall * allCount;
    const bothAll = all * 2;
    console.log(`| ${model.padEnd(6)} | $${perCall.toFixed(4).padEnd(6)} | $${factorial.toFixed(2).padStart(16)} | $${all.toFixed(2).padStart(15)} | $${bothAll.toFixed(2).padStart(16)} |`);
  }

  console.log('\nSampled estimates (--sample flag):');
  for (const sampleSize of [50, 100, 200]) {
    console.log(` --sample ${sampleSize}:`);
    for (const [model, pricing] of Object.entries(PRICING)) {
      const perCall = (inputTokens * pricing.input / 1e6) + (outputTokens * pricing.output / 1e6);
      const cost = perCall * sampleSize;
      const bothCost = cost * 2;
      console.log(` ${model}: $${cost.toFixed(2)} (one mode), $${bothCost.toFixed(2)} (both modes)`);
    }
  }

  // Time estimates assume ~15s per sequential claude-code CLI call.
  console.log('\n claude-code model uses Claude Code CLI (your subscription) — $0 API cost.');
  console.log(' Runs sequentially (~10-20s per call). Estimated time:');
  console.log(` --sample 50: ~${Math.ceil(50 * 15 / 60)} min (one mode), ~${Math.ceil(100 * 15 / 60)} min (both)`);
  console.log(` Full dataset: ~${Math.ceil(factorialCount * 15 / 3600)} hrs (one mode), ~${Math.ceil(factorialCount * 2 * 15 / 3600)} hrs (both)`);

  console.log('\nRecommended approach:');
  console.log(' 1. Start with --sample 50 --model claude-code --mode both (free, ~25 min)');
  console.log(' 2. Review results, adjust prompts if needed');
  console.log(' 3. Full run with --model haiku --mode both (~$18, ~20 min with concurrency)');
  console.log(' 4. Compare claude-code vs haiku for calibration');
  console.log(' 5. Optionally --sample 100 --model sonnet for cross-model validation (~$2.40)');
}
519
+
520
// ── Checkpoint / Resume ─────────────────────────────────────────────────

/**
 * Path of the JSONL checkpoint file for a given model/mode pair.
 *
 * @param {string} exportsDir - Directory where checkpoints live.
 * @param {string} modelKey - Model alias (claude-code/haiku/sonnet/opus).
 * @param {string} mode - Analysis mode ('classify' or 'discover').
 * @returns {string} Absolute/relative checkpoint file path.
 */
function checkpointPath(exportsDir, modelKey, mode) {
  const filename = `.checkpoint-${modelKey}-${mode}.jsonl`;
  return path.join(exportsDir, filename);
}
525
+
526
/**
 * Read a JSONL checkpoint file into an array of result objects.
 *
 * A missing file yields an empty array; corrupt lines (e.g. from an
 * interrupted write) are silently skipped so a resume never aborts.
 *
 * @param {string} filepath - Checkpoint file path.
 * @returns {Array<object>} Parsed results, in file order.
 */
function loadCheckpoint(filepath) {
  if (!fs.existsSync(filepath)) return [];
  const raw = fs.readFileSync(filepath, 'utf-8');
  const results = [];
  for (const line of raw.split('\n')) {
    if (!line) continue; // skip blank lines (incl. trailing newline)
    try {
      results.push(JSON.parse(line));
    } catch {
      // skip corrupt lines
    }
  }
  return results;
}
536
+
537
/**
 * Append one result as a single JSON line to the checkpoint file, enabling
 * crash-safe resume via loadCheckpoint().
 *
 * @param {string} filepath - Checkpoint file path (created if absent).
 * @param {object} result - Result object to persist.
 */
function appendCheckpoint(filepath, result) {
  const line = `${JSON.stringify(result)}\n`;
  fs.appendFileSync(filepath, line);
}
540
+
541
/**
 * Human-readable ETA extrapolated from progress so far.
 *
 * @param {number} elapsedMs - Milliseconds elapsed since the run started.
 * @param {number} completed - Items finished so far.
 * @param {number} total - Total items to process.
 * @returns {string} '??' before the first completion, then 'Ns', 'Nm', or
 *   'NhMm' depending on magnitude.
 */
function formatEta(elapsedMs, completed, total) {
  if (completed === 0) return '??'; // no rate to extrapolate from yet
  const remainingMs = (total - completed) * (elapsedMs / completed);
  if (remainingMs < 60000) return `${Math.ceil(remainingMs / 1000)}s`;
  if (remainingMs < 3600000) return `${Math.ceil(remainingMs / 60000)}m`;
  const hrs = Math.floor(remainingMs / 3600000);
  const mins = Math.ceil((remainingMs % 3600000) / 60000);
  return `${hrs}h${mins}m`;
}
551
+
552
// ── Interactive Pause ────────────────────────────────────────────────────

/**
 * Prompt on stdin and resolve true when the user answers yes.
 * An empty answer (just Enter) counts as yes; anything other than
 * y/yes/empty counts as no.
 *
 * @param {string} message - Prompt text shown to the user.
 * @returns {Promise<boolean>}
 */
async function askContinue(message) {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  return new Promise(resolve => {
    rl.question(message, answer => {
      rl.close();
      const normalized = answer.trim().toLowerCase();
      resolve(['', 'y', 'yes'].includes(normalized));
    });
  });
}
564
+
565
/**
 * Print a boxed progress summary to the console during a run: counts per
 * condition, an ETA via formatEta(), plus mode-specific tallies —
 * per-category presence percentages for classify, top pedagogical stances
 * for discover ('both' prints both sections).
 *
 * @param {string} mode - 'classify', 'discover', or 'both'.
 * @param {Array<object>} results - Results processed so far (each carries
 *   condition, and ai_categories and/or pedagogical_stance).
 * @param {number} startTime - Run start, as a Date.now() timestamp.
 * @param {number} totalTodo - Total items planned, for the ETA.
 */
function printInterimSummary(mode, results, startTime, totalTodo) {
  const elapsed = Date.now() - startTime;
  const n = results.length;
  const base = results.filter(r => r.condition === 'base').length;
  const recog = results.filter(r => r.condition === 'recognition').length;
  const eta = formatEta(elapsed, n, totalTodo);

  console.log('\n ┌─ Interim Summary ──────────────────────────────────');
  console.log(` │ ${n} processed (base=${base}, recog=${recog}), ETA for rest: ${eta}`);

  if (mode === 'classify' || mode === 'both') {
    // Quick category presence counts
    // cats[cat][condition] = [presentCount, totalCount]
    const cats = {};
    for (const r of results) {
      for (const [cat, val] of Object.entries(r.ai_categories || {})) {
        if (!cats[cat]) cats[cat] = { base: [0,0], recognition: [0,0] };
        if (val?.present) cats[cat][r.condition][0]++;
        cats[cat][r.condition][1]++;
      }
    }
    for (const [cat, counts] of Object.entries(cats)) {
      // '-' when a condition has no data yet (avoids divide-by-zero)
      const bPct = counts.base[1] ? (counts.base[0]/counts.base[1]*100).toFixed(0) : '-';
      const rPct = counts.recognition[1] ? (counts.recognition[0]/counts.recognition[1]*100).toFixed(0) : '-';
      console.log(` │ ${cat.padEnd(22)} base=${bPct}% recog=${rPct}%`);
    }
  }

  if (mode === 'discover' || mode === 'both') {
    // Quick stance tally
    const stances = { base: {}, recognition: {} };
    for (const r of results) {
      const s = r.pedagogical_stance?.toLowerCase() || 'unknown';
      stances[r.condition][s] = (stances[r.condition][s] || 0) + 1;
    }
    for (const cond of ['base', 'recognition']) {
      // Top 4 stances per condition, most frequent first
      const sorted = Object.entries(stances[cond]).sort((a,b) => b[1]-a[1]).slice(0,4);
      if (sorted.length) {
        console.log(` │ ${cond} stances: ${sorted.map(([s,n]) => `${s}(${n})`).join(', ')}`);
      }
    }
  }

  console.log(' └────────────────────────────────────────────────────');
}
609
+
610
689
+
690
+ // ── Main Analysis ───────────────────────────────────────────────────────
691
+
692
/**
 * Run structured classification (Option 1) across responses, with
 * checkpoint-based resume, optional random sampling, a live report and
 * interactive pause points.
 *
 * Results are appended to the JSONL checkpoint as they arrive, so re-running
 * with the same checkpoint file continues where a previous run stopped.
 *
 * @param {Array<object>} data - Responses ({ id, messageText, condition, ... }).
 * @param {string} modelKey - Model identifier understood by callModel().
 * @param {number} concurrency - Parallel model calls per batch.
 * @param {string} checkpointFile - JSONL checkpoint path.
 * @param {object} [options]
 * @param {number} [options.pauseEvery] - Pause for review after N new responses.
 * @param {string} [options.exportsDir] - When set, rewrite the live report each batch.
 * @param {number} [options.sampleSize] - Cap total (existing + new) results.
 * @returns {Promise<Array<object>>} Combined existing + new classification results.
 */
async function runClassification(data, modelKey, concurrency, checkpointFile, { pauseEvery, exportsDir, sampleSize } = {}) {
  const existing = loadCheckpoint(checkpointFile);
  const doneIds = new Set(existing.map(r => r.id));
  let todo = data.filter(d => !doneIds.has(d.id));
  // For --sample: limit total results (existing + new) to sampleSize.
  // Use an unbiased Fisher-Yates shuffle before slicing; the previous
  // sort(() => Math.random() - 0.5) trick yields a biased, engine-dependent
  // ordering because the comparator is inconsistent.
  if (sampleSize && existing.length + todo.length > sampleSize) {
    const remaining = Math.max(0, sampleSize - existing.length);
    for (let k = todo.length - 1; k > 0; k--) {
      const j = Math.floor(Math.random() * (k + 1));
      [todo[k], todo[j]] = [todo[j], todo[k]];
    }
    todo = todo.slice(0, remaining);
  }

  console.log(`\nRunning CLASSIFICATION (Option 1) with ${modelKey}...`);
  if (existing.length > 0) {
    console.log(` Resuming: ${existing.length} already done, ${todo.length} remaining`);
  } else {
    console.log(` ${data.length} responses to process`);
  }
  if (pauseEvery) {
    console.log(` Will pause every ${pauseEvery} responses for review`);
  }

  const results = [...existing];
  let newCompleted = 0;
  let totalInput = 0;
  let totalOutput = 0;
  let errors = 0;
  const startTime = Date.now();

  // Process the remaining items in batches of `concurrency` parallel calls.
  for (let i = 0; i < todo.length; i += concurrency) {
    const batch = todo.slice(i, i + concurrency);
    const promises = batch.map(async (item) => {
      const prompt = buildClassifyPrompt(item.messageText, item.condition);
      try {
        const { content, usage } = await callModel(prompt, modelKey);
        const parsed = parseJsonResponse(content);
        totalInput += usage.prompt_tokens || 0;
        totalOutput += usage.completion_tokens || 0;

        // Regex classification runs alongside the AI call so we can compute
        // inter-method agreement later.
        const regexResults = regexClassify(item.messageText);

        return {
          id: item.id,
          scenario_id: item.scenario_id,
          profile_name: item.profile_name,
          condition: item.condition,
          overall_score: item.overall_score,
          ai_categories: parsed.categories || {},
          dominant_theme: parsed.dominant_theme,
          quality_note: parsed.overall_quality_note,
          regex_categories: regexResults,
        };
      } catch (err) {
        // Failed items are logged and skipped; they stay out of the
        // checkpoint so a resume will retry them.
        errors++;
        console.error(` Error on ${item.id}: ${err.message}`);
        return null;
      }
    });

    const batchResults = await Promise.all(promises);
    for (const r of batchResults) {
      if (r) {
        results.push(r);
        appendCheckpoint(checkpointFile, r);
      }
    }
    newCompleted += batch.length;
    const totalDone = existing.length + newCompleted;
    const totalAll = existing.length + todo.length;
    const eta = formatEta(Date.now() - startTime, newCompleted, todo.length);

    // Update live report after every response
    if (exportsDir) writeLiveReport(exportsDir, modelKey, results, null);

    // Log every response for sequential models, every 10 for parallel
    const logInterval = concurrency <= 1 ? 1 : 10;
    if (newCompleted % logInterval === 0 || newCompleted === todo.length) {
      console.log(` [${totalDone}/${totalAll}] ${errors} errors, ETA ${eta}`);
    }

    // Pause point
    if (pauseEvery && newCompleted > 0 && newCompleted % pauseEvery === 0 && newCompleted < todo.length) {
      printInterimSummary('classify', results, startTime, todo.length);
      const cont = await askContinue(`\n Continue? [Y/n] `);
      if (!cont) {
        console.log(' Stopped by user. Progress saved to checkpoint.');
        return results;
      }
    }
  }

  console.log(` Done. ${results.length} total (${newCompleted - errors} new), ${errors} errors.`);
  if (totalInput > 0) {
    console.log(` Tokens: ${totalInput.toLocaleString()} input, ${totalOutput.toLocaleString()} output`);
  }

  return results;
}
788
+
789
/**
 * Run open-ended theme discovery (Option 2) across responses, with
 * checkpoint-based resume, optional random sampling, a live report and
 * interactive pause points. Mirrors runClassification() but uses the
 * discovery prompt and result shape.
 *
 * @param {Array<object>} data - Responses ({ id, messageText, condition, ... }).
 * @param {string} modelKey - Model identifier understood by callModel().
 * @param {number} concurrency - Parallel model calls per batch.
 * @param {string} checkpointFile - JSONL checkpoint path.
 * @param {object} [options]
 * @param {number} [options.pauseEvery] - Pause for review after N new responses.
 * @param {string} [options.exportsDir] - When set, rewrite the live report each batch.
 * @param {number} [options.sampleSize] - Cap total (existing + new) results.
 * @returns {Promise<Array<object>>} Combined existing + new discovery results.
 */
async function runDiscovery(data, modelKey, concurrency, checkpointFile, { pauseEvery, exportsDir, sampleSize } = {}) {
  const existing = loadCheckpoint(checkpointFile);
  const doneIds = new Set(existing.map(r => r.id));
  let todo = data.filter(d => !doneIds.has(d.id));
  // For --sample: limit total results (existing + new) to sampleSize.
  // Unbiased Fisher-Yates shuffle (a random sort comparator is biased).
  if (sampleSize && existing.length + todo.length > sampleSize) {
    const remaining = Math.max(0, sampleSize - existing.length);
    for (let k = todo.length - 1; k > 0; k--) {
      const j = Math.floor(Math.random() * (k + 1));
      [todo[k], todo[j]] = [todo[j], todo[k]];
    }
    todo = todo.slice(0, remaining);
  }

  console.log(`\nRunning DISCOVERY (Option 2) with ${modelKey}...`);
  if (existing.length > 0) {
    console.log(` Resuming: ${existing.length} already done, ${todo.length} remaining`);
  } else {
    console.log(` ${data.length} responses to process`);
  }
  if (pauseEvery) {
    console.log(` Will pause every ${pauseEvery} responses for review`);
  }

  const results = [...existing];
  let newCompleted = 0;
  let totalInput = 0;
  let totalOutput = 0;
  let errors = 0;
  const startTime = Date.now();

  // Process the remaining items in batches of `concurrency` parallel calls.
  for (let i = 0; i < todo.length; i += concurrency) {
    const batch = todo.slice(i, i + concurrency);
    const promises = batch.map(async (item) => {
      const prompt = buildDiscoverPrompt(item.messageText, item.condition);
      try {
        const { content, usage } = await callModel(prompt, modelKey);
        const parsed = parseJsonResponse(content);
        totalInput += usage.prompt_tokens || 0;
        totalOutput += usage.completion_tokens || 0;

        return {
          id: item.id,
          scenario_id: item.scenario_id,
          profile_name: item.profile_name,
          condition: item.condition,
          overall_score: item.overall_score,
          themes: parsed.themes || [],
          pedagogical_stance: parsed.pedagogical_stance,
          epistemic_orientation: parsed.epistemic_orientation,
          overall_impression: parsed.overall_impression,
        };
      } catch (err) {
        // Failed items are logged and skipped; they stay out of the
        // checkpoint so a resume will retry them.
        errors++;
        console.error(` Error on ${item.id}: ${err.message}`);
        return null;
      }
    });

    const batchResults = await Promise.all(promises);
    for (const r of batchResults) {
      if (r) {
        results.push(r);
        appendCheckpoint(checkpointFile, r);
      }
    }
    newCompleted += batch.length;
    const totalDone = existing.length + newCompleted;
    const totalAll = existing.length + todo.length;
    const eta = formatEta(Date.now() - startTime, newCompleted, todo.length);

    // Update live report after every response
    if (exportsDir) writeLiveReport(exportsDir, modelKey, null, results);

    // Log every response for sequential models, every 10 for parallel
    const logInterval = concurrency <= 1 ? 1 : 10;
    if (newCompleted % logInterval === 0 || newCompleted === todo.length) {
      console.log(` [${totalDone}/${totalAll}] ${errors} errors, ETA ${eta}`);
    }

    // Pause point
    if (pauseEvery && newCompleted > 0 && newCompleted % pauseEvery === 0 && newCompleted < todo.length) {
      printInterimSummary('discover', results, startTime, todo.length);
      const cont = await askContinue(`\n Continue? [Y/n] `);
      if (!cont) {
        console.log(' Stopped by user. Progress saved to checkpoint.');
        return results;
      }
    }
  }

  console.log(` Done. ${results.length} total (${newCompleted - errors} new), ${errors} errors.`);
  if (totalInput > 0) {
    console.log(` Tokens: ${totalInput.toLocaleString()} input, ${totalOutput.toLocaleString()} output`);
  }

  return results;
}
881
+
882
+ // ── Agreement Analysis ──────────────────────────────────────────────────
883
+
884
/**
 * Aggregate structured-classification results: per-category presence and
 * strength counts split by condition, plus AI-vs-regex inter-method
 * agreement (percent agreement and Cohen's kappa) for every category in
 * THEMATIC_CATEGORIES.
 *
 * @param {Array<object>} results - Classification records with condition,
 *   ai_categories and regex_categories fields.
 * @returns {object} { n, byCondition, categoryStats, interMethodAgreement }
 */
function analyzeClassificationResults(results) {
  const catKeys = Object.keys(THEMATIC_CATEGORIES);

  // Pre-seed one stats bucket and one 2x2 contingency table per category.
  const categoryStats = {};
  const interMethodAgreement = {};
  for (const cat of catKeys) {
    categoryStats[cat] = {
      base: { present: 0, absent: 0, strengths: {} },
      recognition: { present: 0, absent: 0, strengths: {} },
    };
    interMethodAgreement[cat] = {
      bothPresent: 0, bothAbsent: 0,
      aiOnlyPresent: 0, regexOnlyPresent: 0,
    };
  }

  const analysis = {
    n: results.length,
    byCondition: { base: { n: 0 }, recognition: { n: 0 } },
    categoryStats,
    interMethodAgreement,
  };

  results.forEach((record) => {
    const cond = record.condition;
    analysis.byCondition[cond].n += 1;

    catKeys.forEach((cat) => {
      const fromAi = record.ai_categories?.[cat]?.present === true;
      const fromRegex = record.regex_categories?.[cat] === true;
      const strength = record.ai_categories?.[cat]?.strength || 'none';

      const bucket = categoryStats[cat][cond];
      if (fromAi) {
        bucket.present += 1;
      } else {
        bucket.absent += 1;
      }
      bucket.strengths[strength] = (bucket.strengths[strength] || 0) + 1;

      // Fill the AI-vs-regex contingency cell for this response.
      const cell = interMethodAgreement[cat];
      if (fromAi && fromRegex) cell.bothPresent += 1;
      else if (fromAi) cell.aiOnlyPresent += 1;
      else if (fromRegex) cell.regexOnlyPresent += 1;
      else cell.bothAbsent += 1;
    });
  });

  // Derive percent agreement and Cohen's kappa from each contingency table.
  for (const cat of catKeys) {
    const cell = interMethodAgreement[cat];
    const total = cell.bothPresent + cell.bothAbsent + cell.aiOnlyPresent + cell.regexOnlyPresent;
    if (total === 0) {
      cell.percentAgreement = 'N/A';
      cell.kappa = 'N/A';
      continue;
    }
    const observed = (cell.bothPresent + cell.bothAbsent) / total;
    cell.percentAgreement = (observed * 100).toFixed(1);

    // Chance agreement from each method's marginal presence rate.
    const pAi = (cell.bothPresent + cell.aiOnlyPresent) / total;
    const pRegex = (cell.bothPresent + cell.regexOnlyPresent) / total;
    const expected = pAi * pRegex + (1 - pAi) * (1 - pRegex);
    cell.kappa = expected < 1 ? ((observed - expected) / (1 - expected)).toFixed(3) : '1.000';
  }

  return analysis;
}
947
+
948
/**
 * Aggregate open-ended discovery results: emergent-theme frequency plus
 * pedagogical-stance and epistemic-orientation distributions, all split by
 * condition ('base' vs 'recognition').
 *
 * @param {Array<object>} results - Discovery records with condition, themes,
 *   pedagogical_stance and epistemic_orientation fields.
 * @returns {object} { n, byCondition, themeFrequency, stanceDistribution, epistemicDistribution }
 */
function analyzeDiscoveryResults(results) {
  const analysis = {
    n: results.length,
    byCondition: { base: { n: 0 }, recognition: { n: 0 } },
    themeFrequency: {},
    stanceDistribution: { base: {}, recognition: {} },
    epistemicDistribution: { base: {}, recognition: {} },
  };

  // Lowercase a free-text field, falling back to 'unknown' when missing.
  const normalize = (value) => value?.toLowerCase() || 'unknown';

  for (const record of results) {
    const cond = record.condition;
    analysis.byCondition[cond].n += 1;

    // Tally emergent themes under a sanitized snake_case key; the first
    // occurrence's label/description are kept for reporting.
    for (const theme of (record.themes || [])) {
      const key = theme.name?.toLowerCase()?.replace(/[^a-z0-9_]/g, '_') || 'unknown';
      let entry = analysis.themeFrequency[key];
      if (!entry) {
        entry = {
          label: theme.label,
          description: theme.description,
          base: 0, recognition: 0,
        };
        analysis.themeFrequency[key] = entry;
      }
      entry[cond] += 1;
    }

    const stanceCounts = analysis.stanceDistribution[cond];
    const stance = normalize(record.pedagogical_stance);
    stanceCounts[stance] = (stanceCounts[stance] || 0) + 1;

    const epistemicCounts = analysis.epistemicDistribution[cond];
    const epistemic = normalize(record.epistemic_orientation);
    epistemicCounts[epistemic] = (epistemicCounts[epistemic] || 0) + 1;
  }

  return analysis;
}
987
+
988
+ // ── Output Generation ───────────────────────────────────────────────────
989
+
990
/**
 * Build the final markdown report from the aggregated analyses.
 *
 * @param {object|null} classifyAnalysis - Output of analyzeClassificationResults(), or null if classification did not run.
 * @param {object|null} discoveryAnalysis - Output of analyzeDiscoveryResults(), or null if discovery did not run.
 * @param {string} modelKey - Short model identifier; resolved via MODEL_MAP for the header.
 * @param {number|null} sampleSize - Sample size used, or null/undefined for the full dataset.
 * @returns {string} Complete markdown document.
 */
function generateMarkdown(classifyAnalysis, discoveryAnalysis, modelKey, sampleSize) {
  let md = `# AI Thematic Analysis Results

**Generated:** ${new Date().toISOString()}
**Model:** ${modelKey} (${MODEL_MAP[modelKey]})
**Sample:** ${sampleSize || 'full dataset'}

`;

  if (classifyAnalysis) {
    md += `## Option 1: Structured Classification (N=${classifyAnalysis.n})

Base: N=${classifyAnalysis.byCondition.base.n}, Recognition: N=${classifyAnalysis.byCondition.recognition.n}

### Category Presence by Condition

| Category | Base Present | Base % | Recog Present | Recog % | Difference |
|----------|-------------|--------|---------------|---------|------------|
`;
    // Presence-rate row per category; Difference = recog% - base%.
    for (const [cat, stats] of Object.entries(classifyAnalysis.categoryStats)) {
      const baseN = stats.base.present + stats.base.absent;
      const recogN = stats.recognition.present + stats.recognition.absent;
      const basePct = baseN > 0 ? (stats.base.present / baseN * 100).toFixed(1) : '0.0';
      const recogPct = recogN > 0 ? (stats.recognition.present / recogN * 100).toFixed(1) : '0.0';
      const diff = (parseFloat(recogPct) - parseFloat(basePct)).toFixed(1);
      md += `| ${THEMATIC_CATEGORIES[cat].label} | ${stats.base.present}/${baseN} | ${basePct}% | ${stats.recognition.present}/${recogN} | ${recogPct}% | ${diff > 0 ? '+' : ''}${diff}% |\n`;
    }

    md += `\n### Inter-Method Agreement (AI vs Regex)

| Category | Both Present | Both Absent | AI Only | Regex Only | Agreement % | Cohen's κ |
|----------|-------------|-------------|---------|-----------|------------|----------|
`;
    for (const [cat, a] of Object.entries(classifyAnalysis.interMethodAgreement)) {
      md += `| ${THEMATIC_CATEGORIES[cat].label} | ${a.bothPresent} | ${a.bothAbsent} | ${a.aiOnlyPresent} | ${a.regexOnlyPresent} | ${a.percentAgreement}% | ${a.kappa} |\n`;
    }

    md += `\n### Strength Distribution by Condition

`;
    // Raw strength tallies (e.g. {"strong": 3, "none": 12}) per category/condition.
    for (const [cat, stats] of Object.entries(classifyAnalysis.categoryStats)) {
      md += `**${THEMATIC_CATEGORIES[cat].label}:**\n`;
      md += `- Base: ${JSON.stringify(stats.base.strengths)}\n`;
      md += `- Recognition: ${JSON.stringify(stats.recognition.strengths)}\n\n`;
    }
  }

  if (discoveryAnalysis) {
    md += `## Option 2: Open-Ended Theme Discovery (N=${discoveryAnalysis.n})

Base: N=${discoveryAnalysis.byCondition.base.n}, Recognition: N=${discoveryAnalysis.byCondition.recognition.n}

### Emergent Theme Frequency

| Theme | Label | Base | Recog | Total | Difference |
|-------|-------|------|-------|-------|------------|
`;
    // Most frequent themes first; report caps the table at 30 rows.
    const sortedThemes = Object.entries(discoveryAnalysis.themeFrequency)
      .sort((a, b) => (b[1].base + b[1].recognition) - (a[1].base + a[1].recognition));

    for (const [name, freq] of sortedThemes.slice(0, 30)) {
      const total = freq.base + freq.recognition;
      const diff = freq.recognition - freq.base;
      md += `| ${name} | ${freq.label || ''} | ${freq.base} | ${freq.recognition} | ${total} | ${diff > 0 ? '+' : ''}${diff} |\n`;
    }

    md += `\n### Pedagogical Stance Distribution

| Stance | Base | Recognition |
|--------|------|-------------|
`;
    // Union of stance keys from both conditions so every row shows both counts.
    const allStances = new Set([
      ...Object.keys(discoveryAnalysis.stanceDistribution.base),
      ...Object.keys(discoveryAnalysis.stanceDistribution.recognition),
    ]);
    for (const stance of [...allStances].sort()) {
      md += `| ${stance} | ${discoveryAnalysis.stanceDistribution.base[stance] || 0} | ${discoveryAnalysis.stanceDistribution.recognition[stance] || 0} |\n`;
    }

    md += `\n### Epistemic Orientation Distribution

| Orientation | Base | Recognition |
|-------------|------|-------------|
`;
    const allEpistemic = new Set([
      ...Object.keys(discoveryAnalysis.epistemicDistribution.base),
      ...Object.keys(discoveryAnalysis.epistemicDistribution.recognition),
    ]);
    for (const ep of [...allEpistemic].sort()) {
      md += `| ${ep} | ${discoveryAnalysis.epistemicDistribution.base[ep] || 0} | ${discoveryAnalysis.epistemicDistribution.recognition[ep] || 0} |\n`;
    }
  }

  return md;
}
1085
+
1086
+ // ── CLI ─────────────────────────────────────────────────────────────────
1087
+
1088
/**
 * Parse CLI flags into an options object.
 *
 * Fix: all numeric flags now use Number.parseInt(..., 10) — the radix was
 * previously omitted, which is the classic parseInt pitfall.
 *
 * @returns {{mode: string, model: string, sample: number|null, cells: number[],
 *   concurrency: number, pauseEvery: number|null, costEstimate: boolean,
 *   clean: boolean}} Parsed options with defaults applied.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = {
    mode: 'both', // classify, discover, both
    model: 'claude-code', // claude-code, haiku, sonnet, opus
    sample: null, // null = all, number = sample size
    cells: [1,2,3,4,5,6,7,8],
    concurrency: 5,
    pauseEvery: null, // pause after N responses for review
    costEstimate: false,
    clean: false, // delete checkpoint files before starting
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--mode': opts.mode = args[++i]; break;
      case '--model': opts.model = args[++i]; break;
      case '--sample': opts.sample = Number.parseInt(args[++i], 10); break;
      case '--cells': opts.cells = args[++i].split(',').map(Number); break;
      case '--concurrency': opts.concurrency = Number.parseInt(args[++i], 10); break;
      case '--pause-every': opts.pauseEvery = Number.parseInt(args[++i], 10); break;
      case '--cost-estimate': opts.costEstimate = true; break;
      case '--clean': opts.clean = true; break;
      case '--help':
        console.log(`Usage: node scripts/qualitative-analysis-ai.js [options]

Options:
  --mode <classify|discover|both>  Analysis mode (default: both)
  --model <model>                  Model to use (default: claude-code)
                                   claude-code — Claude Code CLI (subscription, free)
                                   haiku — OpenRouter Haiku (~$0.003/call)
                                   sonnet — OpenRouter Sonnet (~$0.008/call)
                                   opus — OpenRouter Opus (~$0.04/call)
  --sample <N>                     Random sample size (default: all)
  --cells <1,2,...>                Cell numbers to include (default: 1-8)
  --concurrency <N>                Parallel calls (default: 5; forced to 1 for claude-code)
  --pause-every <N>                Pause after every N responses to review interim results
  --clean                          Delete checkpoint files and start fresh
  --cost-estimate                  Print cost estimate and exit
  --help                           Show this help

Resume & monitoring:
  Runs are checkpointed to exports/.checkpoint-<model>-<mode>.jsonl
  Re-running the same command resumes from where it left off.
  Use --clean to discard checkpoints and start over.

  During a run, monitor live results from another terminal:
    while true; do clear; cat exports/.live-report.md 2>/dev/null || echo "Waiting..."; sleep 5; done
  Or check raw checkpoint progress:
    wc -l exports/.checkpoint-claude-code-classify.jsonl`);
        process.exit(0);
    }
  }

  // Claude Code CLI must run sequentially (one subprocess at a time)
  if (opts.model === 'claude-code') {
    opts.concurrency = 1;
  }

  return opts;
}
1149
+
1150
+ // ── Main ────────────────────────────────────────────────────────────────
1151
+
1152
/**
 * CLI entry point: loads evaluation responses from the SQLite database, runs
 * the requested analysis mode(s) with checkpoint/resume support, then writes
 * JSON and markdown reports into exports/ and prints console summaries.
 *
 * Side effects: reads data/evaluations.db, creates exports/, writes/deletes
 * checkpoint and report files, calls process.exit(1) when the DB is missing.
 */
async function main() {
  const opts = parseArgs();

  // The evaluation database is required for every mode, including --cost-estimate.
  const dbPath = path.join(process.cwd(), 'data', 'evaluations.db');
  if (!fs.existsSync(dbPath)) {
    console.error('Database not found:', dbPath);
    process.exit(1);
  }

  const db = new Database(dbPath);

  // --cost-estimate prints projected API costs and exits without analyzing.
  if (opts.costEstimate) {
    printCostEstimate(db);
    db.close();
    return;
  }

  console.log('='.repeat(70));
  console.log('AI THEMATIC ANALYSIS OF EVALUATION TRANSCRIPTS');
  console.log('='.repeat(70));
  console.log(`Mode: ${opts.mode} | Model: ${opts.model} | Sample: ${opts.sample || 'all'}`);
  console.log(`Cells: ${opts.cells.join(', ')}`);
  if (opts.pauseEvery) {
    console.log(`Pause every: ${opts.pauseEvery} responses`);
  }
  console.log(`\nMonitor live results from another terminal:`);
  console.log(` while true; do clear; cat exports/.live-report.md 2>/dev/null || echo "Waiting for first result..."; sleep 5; done`);

  // Load data
  const data = loadData(db, opts.cells, opts.sample);
  console.log(`\nLoaded ${data.length} responses`);
  const baseCount = data.filter(d => d.condition === 'base').length;
  const recogCount = data.filter(d => d.condition === 'recognition').length;
  console.log(` Base: ${baseCount}, Recognition: ${recogCount}`);

  if (data.length === 0) {
    console.error('No data found. Check cell numbers and database.');
    db.close();
    return;
  }

  // Ensure exports directory
  const exportsDir = path.join(process.cwd(), 'exports');
  if (!fs.existsSync(exportsDir)) {
    fs.mkdirSync(exportsDir, { recursive: true });
  }

  // Checkpoint files
  const classifyCheckpoint = checkpointPath(exportsDir, opts.model, 'classify');
  const discoverCheckpoint = checkpointPath(exportsDir, opts.model, 'discover');

  // --clean discards prior progress so the run starts from scratch.
  if (opts.clean) {
    for (const cp of [classifyCheckpoint, discoverCheckpoint]) {
      if (fs.existsSync(cp)) {
        fs.unlinkSync(cp);
        console.log(` Deleted checkpoint: ${path.basename(cp)}`);
      }
    }
  }

  let classifyResults = null;
  let classifyAnalysis = null;
  let discoveryResults = null;
  let discoveryAnalysis = null;

  const runOpts = { pauseEvery: opts.pauseEvery, exportsDir, sampleSize: opts.sample };

  // Run classification
  if (opts.mode === 'classify' || opts.mode === 'both') {
    classifyResults = await runClassification(data, opts.model, opts.concurrency, classifyCheckpoint, runOpts);
    classifyAnalysis = analyzeClassificationResults(classifyResults);
    writeLiveReport(exportsDir, opts.model, classifyResults, null);
  }

  // Run discovery
  if (opts.mode === 'discover' || opts.mode === 'both') {
    discoveryResults = await runDiscovery(data, opts.model, opts.concurrency, discoverCheckpoint, runOpts);
    discoveryAnalysis = analyzeDiscoveryResults(discoveryResults);
    writeLiveReport(exportsDir, opts.model, classifyResults, discoveryResults);
  }

  // Clean up checkpoint and live report files on successful completion
  for (const cp of [classifyCheckpoint, discoverCheckpoint, path.join(exportsDir, '.live-report.md')]) {
    if (fs.existsSync(cp)) {
      fs.unlinkSync(cp);
    }
  }

  // Save raw results
  const timestamp = new Date().toISOString().slice(0, 10);
  const suffix = opts.sample ? `-sample${opts.sample}` : '';

  const jsonOutput = {
    generated: new Date().toISOString(),
    model: opts.model,
    modelId: MODEL_MAP[opts.model],
    sample: opts.sample,
    cells: opts.cells,
    classification: classifyResults ? {
      results: classifyResults,
      analysis: classifyAnalysis,
    } : null,
    discovery: discoveryResults ? {
      results: discoveryResults,
      analysis: discoveryAnalysis,
    } : null,
  };

  const jsonPath = path.join(exportsDir, `qualitative-ai-${opts.model}${suffix}-${timestamp}.json`);
  fs.writeFileSync(jsonPath, JSON.stringify(jsonOutput, null, 2));
  console.log(`\nJSON: ${jsonPath}`);

  // Generate markdown report
  const md = generateMarkdown(classifyAnalysis, discoveryAnalysis, opts.model, opts.sample);
  const mdPath = path.join(exportsDir, `qualitative-ai-${opts.model}${suffix}-${timestamp}.md`);
  fs.writeFileSync(mdPath, md);
  console.log(`Markdown: ${mdPath}`);

  // Print summary to console
  if (classifyAnalysis) {
    console.log('\n' + '─'.repeat(70));
    console.log('CLASSIFICATION SUMMARY');
    console.log('─'.repeat(70));
    for (const [cat, stats] of Object.entries(classifyAnalysis.categoryStats)) {
      const baseN = stats.base.present + stats.base.absent;
      const recogN = stats.recognition.present + stats.recognition.absent;
      const basePct = baseN > 0 ? (stats.base.present / baseN * 100).toFixed(1) : '0.0';
      const recogPct = recogN > 0 ? (stats.recognition.present / recogN * 100).toFixed(1) : '0.0';
      const agree = classifyAnalysis.interMethodAgreement[cat];
      console.log(` ${THEMATIC_CATEGORIES[cat].label.padEnd(28)} base=${basePct}% recog=${recogPct}% | AI-regex κ=${agree.kappa}`);
    }
  }

  if (discoveryAnalysis) {
    console.log('\n' + '─'.repeat(70));
    console.log('DISCOVERY SUMMARY');
    console.log('─'.repeat(70));
    // Themes ranked by combined frequency across both conditions.
    const sorted = Object.entries(discoveryAnalysis.themeFrequency)
      .sort((a, b) => (b[1].base + b[1].recognition) - (a[1].base + a[1].recognition));
    console.log(' Top 15 emergent themes:');
    for (const [name, freq] of sorted.slice(0, 15)) {
      const total = freq.base + freq.recognition;
      const diff = freq.recognition - freq.base;
      console.log(` ${(freq.label || name).padEnd(35)} total=${total} (base=${freq.base}, recog=${freq.recognition}, diff=${diff > 0 ? '+' : ''}${diff})`);
    }

    console.log('\n Pedagogical stances:');
    for (const cond of ['base', 'recognition']) {
      const sorted = Object.entries(discoveryAnalysis.stanceDistribution[cond])
        .sort((a, b) => b[1] - a[1]);
      console.log(` ${cond}: ${sorted.map(([s, n]) => `${s}(${n})`).join(', ')}`);
    }
  }

  db.close();
  console.log('\nDone.');
}
1309
+
1310
// Entry point: surface any unhandled failure and exit non-zero so shell
// callers and CI can detect the error.
main().catch(err => {
  console.error('Fatal error:', err);
  process.exit(1);
});