@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,853 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Qualitative Transcript Assessment
5
+ *
6
+ * Sends multi-turn dialogue transcripts to Claude for rich narrative assessment.
7
+ * Complements numeric rubric scores with interpretive analysis of pedagogical
8
+ * dynamics, recognition moments, superego effectiveness, and learner trajectory.
9
+ *
10
+ * Usage:
11
+ * node scripts/assess-transcripts.js <runId> [options]
12
+ *
13
+ * Options:
14
+ * --scenario <id> Filter by scenario ID
15
+ * --condition <cond> Filter: recog | base
16
+ * --profile <name> Filter by profile name (substring match)
17
+ * --limit <n> Max dialogues to assess
18
+ * --model <m> claude-code (default) | haiku | sonnet | gpt
19
+ * --parallelism <n> Concurrent assessments (default: 2)
20
+ * --output <path> Output file path (default: exports/transcript-assessment-<runId>.md)
21
+ * --resume Skip already-assessed dialogues (checks DB)
22
+ * --force Re-assess even if already assessed in DB
23
+ * --help Show this help
24
+ *
25
+ * Assessments are stored in the DB on evaluation_results (qualitative_assessment,
26
+ * qualitative_model columns). A JSONL backup is also written to exports/.
27
+ */
28
+
29
+ import 'dotenv/config';
30
+ import Database from 'better-sqlite3';
31
+ import fs from 'fs';
32
+ import path from 'path';
33
+ import { fileURLToPath } from 'url';
34
+ import { spawn } from 'child_process';
35
+ import { formatTranscript } from '../services/transcriptFormatter.js';
36
+
37
// Resolve paths relative to this script file so the tool works from any CWD.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Directory of per-dialogue JSON trace files (searched by dialogue id).
const LOGS_DIR = path.resolve(__dirname, '..', 'logs', 'tutor-dialogues');
// SQLite database holding evaluation_results rows for each run.
const DB_PATH = path.resolve(__dirname, '..', 'data', 'evaluations.db');
40
+
41
+ // ── Assessment Tags ──────────────────────────────────────────────────────
42
+
43
// Closed tag vocabulary for the model's "tags" field. The assessment prompt
// lists these, and any tag outside this list is filtered out after parsing,
// so tag frequencies stay comparable across dialogues and runs.
const VALID_TAGS = [
  'recognition_moment', 'superego_overcorrection', 'learner_breakthrough',
  'strategy_shift', 'missed_scaffold', 'ego_compliance', 'ego_autonomy',
  'productive_impasse', 'emotional_attunement', 'stalling', 'regression',
];
48
+
49
+ // ── Model Calls ──────────────────────────────────────────────────────────
50
+
51
// Maps CLI --model keys to backends. 'claude-code' is handled locally by
// spawning the Claude Code CLI; every other value is an OpenRouter model
// slug sent over HTTP.
const MODEL_MAP = {
  'claude-code': 'claude-code',
  haiku: 'anthropic/claude-haiku-4.5',
  sonnet: 'anthropic/claude-sonnet-4.5',
  gpt: 'openai/gpt-5.2',
};
57
+
58
/**
 * Dispatch an assessment prompt to the configured backend.
 *
 * @param {string} prompt - Full assessment prompt text.
 * @param {string} modelKey - 'claude-code' for the local CLI, otherwise an OpenRouter key.
 * @returns {Promise<string>} Raw model output.
 */
async function callModel(prompt, modelKey) {
  return modelKey === 'claude-code'
    ? callClaudeCode(prompt)
    : callOpenRouter(prompt, modelKey);
}
62
+
63
/**
 * Run a prompt through the local `claude` CLI (Claude Code) via stdin.
 *
 * @param {string} prompt - Prompt text piped to `claude -p -`.
 * @returns {Promise<string>} Trimmed stdout of the CLI.
 * @throws {Error} If the binary cannot be spawned or exits non-zero.
 */
async function callClaudeCode(prompt) {
  const stdout = await new Promise((resolve, reject) => {
    const env = { ...process.env };
    // Remove the raw API key so the CLI uses its own (subscription) auth.
    delete env.ANTHROPIC_API_KEY;
    const child = spawn('claude', ['-p', '-', '--output-format', 'text'], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    let out = '';
    let err = '';
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', d => { out += d; });
    child.stderr.on('data', d => { err += d; });
    child.on('error', e => reject(new Error(`Failed to spawn claude: ${e.message}`)));
    child.on('close', code => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });
    // Fix: if the child dies before consuming stdin (e.g. binary missing or
    // immediate crash), write() emits EPIPE on the stdin stream; an unhandled
    // 'error' there would crash the whole script. Swallow it so the
    // 'error'/'close' handlers above report the real failure instead.
    child.stdin.on('error', () => {});
    child.stdin.write(prompt);
    child.stdin.end();
  });
  return stdout.trim();
}
85
+
86
/**
 * Send a prompt to an OpenRouter chat-completions model.
 *
 * Aborts the request after 180s via AbortController. Requires the
 * OPENROUTER_API_KEY environment variable.
 *
 * @param {string} prompt - User-role message content.
 * @param {string} modelKey - Key into MODEL_MAP (haiku | sonnet | gpt).
 * @returns {Promise<string>} Raw message content from the first choice.
 * @throws {Error} On missing key, unknown model, HTTP error, or empty response.
 */
async function callOpenRouter(prompt, modelKey) {
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');
  const model = MODEL_MAP[modelKey];
  if (!model) throw new Error(`Unknown model: ${modelKey}`);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 180000);
  try {
    const payload = {
      model,
      max_tokens: 3000,
      temperature: 0.2,
      include_reasoning: false,
      response_format: { type: 'json_object' },
      messages: [{ role: 'user', content: prompt }],
    };
    const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`,
      },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });
    if (!res.ok) {
      const body = await res.text();
      throw new Error(`OpenRouter ${res.status}: ${body.slice(0, 200)}`);
    }
    const data = await res.json();
    const content = data.choices?.[0]?.message?.content;
    if (!content) throw new Error('No content in response');
    return content;
  } finally {
    // Single cleanup point replaces the duplicated clearTimeout calls.
    clearTimeout(timer);
  }
}
125
+
126
/**
 * Parse a model response that should be JSON but may be wrapped in prose
 * or a markdown code fence.
 *
 * Strategies, in order: direct parse → fenced ```json block → first
 * `{...}` brace span.
 *
 * Fix: previously, if a code fence was present but its contents failed to
 * parse, the error propagated immediately and the brace-extraction fallback
 * was never attempted. Each strategy now falls through to the next.
 *
 * @param {string} content - Raw model output.
 * @returns {object} Parsed JSON value.
 * @throws {Error} If no strategy yields valid JSON.
 */
function parseJsonResponse(content) {
  try {
    return JSON.parse(content);
  } catch {
    const match = content.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (match) {
      try {
        return JSON.parse(match[1].trim());
      } catch {
        // Fenced block was not valid JSON — fall through to brace extraction.
      }
    }
    const first = content.indexOf('{');
    const last = content.lastIndexOf('}');
    if (first !== -1 && last > first) {
      return JSON.parse(content.slice(first, last + 1));
    }
    throw new Error(`Failed to parse JSON: ${content.slice(0, 300)}`);
  }
}
141
+
142
+ // ── DB Schema ────────────────────────────────────────────────────────────
143
+
144
+ function ensureColumns(db) {
145
+ const cols = db.prepare("PRAGMA table_info(evaluation_results)").all().map(c => c.name);
146
+ if (!cols.includes('qualitative_assessment')) {
147
+ db.exec("ALTER TABLE evaluation_results ADD COLUMN qualitative_assessment TEXT");
148
+ }
149
+ if (!cols.includes('qualitative_model')) {
150
+ db.exec("ALTER TABLE evaluation_results ADD COLUMN qualitative_model TEXT");
151
+ }
152
+ if (!cols.includes('blinded_qualitative_assessment')) {
153
+ db.exec("ALTER TABLE evaluation_results ADD COLUMN blinded_qualitative_assessment TEXT");
154
+ }
155
+ if (!cols.includes('blinded_qualitative_model')) {
156
+ db.exec("ALTER TABLE evaluation_results ADD COLUMN blinded_qualitative_model TEXT");
157
+ }
158
+ }
159
+
160
+ // ── Data Loading ─────────────────────────────────────────────────────────
161
+
162
+ function loadMultiTurnResults(db, runId, filters = {}) {
163
+ let sql = `
164
+ SELECT id, scenario_id, scenario_name, profile_name, overall_score,
165
+ dialogue_id, dialogue_rounds, factor_recognition,
166
+ factor_multi_agent_tutor, learner_architecture,
167
+ judge_model, ego_model, superego_model,
168
+ qualitative_assessment, qualitative_model,
169
+ blinded_qualitative_assessment, blinded_qualitative_model
170
+ FROM evaluation_results
171
+ WHERE run_id = ? AND success = 1 AND dialogue_id IS NOT NULL
172
+ `;
173
+ const params = [runId];
174
+
175
+ if (filters.scenario) {
176
+ sql += ` AND scenario_id LIKE ?`;
177
+ params.push(`%${filters.scenario}%`);
178
+ }
179
+ if (filters.condition === 'recog' || filters.condition === 'recognition') {
180
+ sql += ` AND (profile_name LIKE '%recog%' OR factor_recognition = 1)`;
181
+ } else if (filters.condition === 'base') {
182
+ sql += ` AND profile_name NOT LIKE '%recog%' AND (factor_recognition = 0 OR factor_recognition IS NULL)`;
183
+ }
184
+ if (filters.profile) {
185
+ sql += ` AND profile_name LIKE ?`;
186
+ params.push(`%${filters.profile}%`);
187
+ }
188
+
189
+ sql += ` ORDER BY scenario_id, profile_name, id`;
190
+
191
+ const rows = db.prepare(sql).all(...params);
192
+
193
+ if (filters.limit && rows.length > filters.limit) {
194
+ return rows.slice(0, filters.limit);
195
+ }
196
+
197
+ return rows;
198
+ }
199
+
200
/**
 * Locate and parse the JSON trace file for a dialogue id.
 *
 * @param {string} dialogueId - Dialogue identifier (substring of the filename).
 * @returns {?{trace: Array, totalTurns: number, profileName: string}} Null if
 *   the logs directory, a matching file, or valid JSON cannot be found.
 */
function loadDialogueTrace(dialogueId) {
  if (!dialogueId || !fs.existsSync(LOGS_DIR)) return null;

  // First file whose name contains the dialogue id, in directory order.
  const fileName = fs.readdirSync(LOGS_DIR).find(f => f.includes(dialogueId));
  if (!fileName) return null;

  try {
    const raw = fs.readFileSync(path.join(LOGS_DIR, fileName), 'utf-8');
    const dialogue = JSON.parse(raw);
    return {
      trace: dialogue.dialogueTrace || [],
      totalTurns: dialogue.totalTurns || 0,
      profileName: dialogue.profileName,
    };
  } catch {
    // Unreadable or malformed trace file — treat as missing.
    return null;
  }
}
217
+
218
+ // ── Condition / Mechanism Detection ──────────────────────────────────────
219
+
220
/**
 * Classify a result row as 'recognition' or 'base'.
 *
 * Recognition is signalled either by the factor_recognition column or by
 * 'recog' appearing in the profile name.
 *
 * @param {object} row - evaluation_results row.
 * @returns {'recognition'|'base'} Condition label.
 */
function detectCondition(row) {
  const isRecognition =
    row.factor_recognition === 1 || Boolean(row.profile_name?.includes('recog'));
  return isRecognition ? 'recognition' : 'base';
}
225
+
226
/**
 * Infer the superego mechanism variant from a profile name.
 *
 * Matching is an ordered first-hit scan: more specific suffixes (e.g.
 * '_suspicious') must be tested before broader ones (e.g. 'dialectical').
 *
 * @param {?string} profileName - Cell/profile name, may be null.
 * @returns {string} Mechanism label, 'standard' if nothing matches.
 */
function detectMechanism(profileName) {
  if (!profileName) return 'standard';
  const name = profileName.toLowerCase();
  // [substring, label] pairs — order matters, first match wins.
  const patterns = [
    ['_combined', 'combined'],
    ['_quantitative', 'quantitative_disposition'],
    ['_erosion', 'prompt_erosion'],
    ['_intersubjective', 'intersubjective'],
    ['_profile_', 'other_ego_profiling'],
    ['_advocate', 'advocate'],
    ['_adversary', 'adversary'],
    ['_suspicious', 'dialectical_suspicious'],
    ['_self_reflect', 'self_reflection'],
    ['dialectical', 'dialectical'],
    ['enhanced', 'enhanced'],
    ['placebo', 'placebo'],
  ];
  for (const [needle, mechanism] of patterns) {
    if (name.includes(needle)) return mechanism;
  }
  return 'standard';
}
243
+
244
+ // ── Assessment Prompt ────────────────────────────────────────────────────
245
+
246
/**
 * Build the qualitative-assessment prompt for one dialogue.
 *
 * In blinded mode the cell name, recognition condition, and numeric score
 * are withheld from the metadata so the assessor cannot infer the
 * experimental condition; scenario, mechanism, and turn count remain.
 *
 * @param {object} row - evaluation_results row for the dialogue.
 * @param {string} transcript - Formatted transcript text.
 * @param {object} [options] - { blinded } flag.
 * @returns {string} Complete prompt requesting a JSON-only response.
 */
function buildAssessmentPrompt(row, transcript, { blinded = false } = {}) {
  const condition = detectCondition(row);
  const mechanism = detectMechanism(row.profile_name);

  // Condition-revealing lines are gated on !blinded.
  const metadataLines = [];
  if (!blinded) metadataLines.push(`- Cell: ${row.profile_name}`);
  metadataLines.push(`- Scenario: ${row.scenario_id}`);
  if (!blinded) metadataLines.push(`- Recognition condition: ${condition}`);
  metadataLines.push(`- Mechanism: ${mechanism}`);
  if (!blinded) metadataLines.push(`- Numeric score: ${row.overall_score != null ? row.overall_score.toFixed(1) : 'N/A'}/100`);
  metadataLines.push(`- Turns: ${row.dialogue_rounds || 'unknown'}`);

  return `You are analyzing a multi-turn AI tutoring dialogue. The dialogue uses an
ego-superego architecture where:
- The EGO generates tutoring responses
- The SUPEREGO critiques and may request revisions
- Between turns, both may reflect on their own practice
- Some dialogues include intersubjective responses (ego responding to superego's reflections)

METADATA:
${metadataLines.join('\n')}

TRANSCRIPT:
${transcript}

ASSESSMENT INSTRUCTIONS:
Provide a qualitative assessment across these axes. Write 2-4 sentences per axis.
Focus on specific moments in the dialogue — cite turn numbers (ACT numbers) and quote key phrases.

1. **Pedagogical Strategy Arc** — How does the tutor's approach evolve? What triggers shifts?
2. **Recognition Dynamics** — Where does the tutor treat the learner as autonomous subject vs. object? Moments of genuine mutual recognition?
3. **Superego Effectiveness** — Is the internal critic helpful or counterproductive? Does ego learn from it?
4. **Learner Trajectory** — Engagement evolution: confusion → engagement → understanding, or stalling?
5. **Missed Opportunities** — What could the tutor have done differently?
6. **Key Turning Point** — The single most consequential moment and why

Return a JSON object with this exact structure:
{
  "pedagogical_arc": "...",
  "recognition_dynamics": "...",
  "superego_effectiveness": "...",
  "learner_trajectory": "...",
  "missed_opportunities": "...",
  "key_turning_point": { "turn": <number>, "description": "..." },
  "overall_narrative": "A 3-5 sentence synthesis of the dialogue's quality and significance.",
  "tags": ["tag1", "tag2"]
}

Valid tags (use only from this list, pick 2-5 that apply):
${VALID_TAGS.map(t => ` - ${t}`).join('\n')}

Return ONLY the JSON object, no other text.`;
}
299
+
300
+ // ── Concurrency Control ──────────────────────────────────────────────────
301
+
302
/**
 * Execute async task factories with a bounded worker pool.
 *
 * Results are stored at the index of their task, so output order matches
 * input order regardless of completion order. Index handoff is safe
 * because the increment is synchronous on the single JS thread.
 *
 * @param {Array<() => Promise<*>>} tasks - Task factories to invoke.
 * @param {number} concurrency - Maximum tasks in flight at once.
 * @returns {Promise<Array<*>>} Results in task order.
 */
async function runWithConcurrency(tasks, concurrency) {
  const results = new Array(tasks.length);
  let next = 0;

  const worker = async () => {
    while (next < tasks.length) {
      const slot = next;
      next += 1;
      results[slot] = await tasks[slot]();
    }
  };

  const pool = [];
  const size = Math.min(concurrency, tasks.length);
  for (let i = 0; i < size; i += 1) {
    pool.push(worker());
  }
  await Promise.all(pool);
  return results;
}
317
+
318
+ // ── Report Generation ────────────────────────────────────────────────────
319
+
320
/**
 * Render the full markdown assessment report for a run.
 *
 * Sections: header, summary table, tag frequencies split by condition,
 * cross-dialogue theme stats, one section per assessed dialogue, and an
 * error list for failed assessments.
 *
 * @param {string} runId - Run identifier for the title.
 * @param {Array<object>} assessments - Entries with either `assessment` or `error`.
 * @param {string} modelKey - Model used, recorded in the header.
 * @returns {string} Markdown document.
 */
function generateReport(runId, assessments, modelKey) {
  // Partition into successful assessments and failures.
  const scored = assessments.filter(a => a.assessment);
  const errored = assessments.filter(a => a.error);

  let md = `# Qualitative Transcript Assessment: ${runId}
Generated: ${new Date().toISOString().slice(0, 10)} | Model: ${modelKey} | N=${scored.length} dialogues`;

  if (errored.length > 0) {
    md += ` (${errored.length} errors)`;
  }
  md += '\n\n';

  // ── Summary Table ──
  md += `## Summary\n\n`;
  md += `| # | Scenario | Cell | Cond | Score | Tags |\n`;
  md += `|---|----------|------|------|-------|------|\n`;

  for (let i = 0; i < scored.length; i++) {
    const a = scored[i];
    // Drop the 'cell_<n>_' prefix for a compact table cell.
    const shortProfile = a.profile_name?.replace(/^cell_\d+_/, '') || '';
    const tags = (a.assessment.tags || []).join(', ');
    const score = a.overall_score != null ? a.overall_score.toFixed(1) : '--';
    md += `| ${i + 1} | ${a.scenario_id} | ${shortProfile} | ${a.condition} | ${score} | ${tags} |\n`;
  }

  // ── Tag Frequencies ──
  md += `\n### Tag Frequencies\n\n`;
  const tagCounts = {};
  // Per-tag counts broken down by base vs. recognition condition.
  const tagsByCondition = {};
  for (const a of scored) {
    const tags = a.assessment.tags || [];
    for (const tag of tags) {
      tagCounts[tag] = (tagCounts[tag] || 0) + 1;
      if (!tagsByCondition[tag]) tagsByCondition[tag] = { base: 0, recognition: 0 };
      tagsByCondition[tag][a.condition]++;
    }
  }
  // Most frequent tags first.
  const sortedTags = Object.entries(tagCounts).sort((a, b) => b[1] - a[1]);
  for (const [tag, count] of sortedTags) {
    const bc = tagsByCondition[tag];
    md += `- **${tag}**: ${count} (base=${bc.base}, recog=${bc.recognition})\n`;
  }

  // ── Cross-Dialogue Themes ──
  md += `\n### Cross-Dialogue Themes\n\n`;

  // Aggregate narratives for a synthesis
  const baseNarratives = scored.filter(a => a.condition === 'base').map(a => a.assessment.overall_narrative).filter(Boolean);
  const recogNarratives = scored.filter(a => a.condition === 'recognition').map(a => a.assessment.overall_narrative).filter(Boolean);

  if (baseNarratives.length > 0) {
    const baseScores = scored.filter(a => a.condition === 'base').map(a => a.overall_score).filter(s => s != null);
    const baseMean = baseScores.length > 0 ? (baseScores.reduce((a, b) => a + b, 0) / baseScores.length).toFixed(1) : '--';
    md += `**Base condition** (N=${baseNarratives.length}, mean score=${baseMean}): `;
    md += `See individual assessments below.\n\n`;
  }
  if (recogNarratives.length > 0) {
    const recogScores = scored.filter(a => a.condition === 'recognition').map(a => a.overall_score).filter(s => s != null);
    const recogMean = recogScores.length > 0 ? (recogScores.reduce((a, b) => a + b, 0) / recogScores.length).toFixed(1) : '--';
    md += `**Recognition condition** (N=${recogNarratives.length}, mean score=${recogMean}): `;
    md += `See individual assessments below.\n\n`;
  }

  // ── Individual Assessments ──
  for (let i = 0; i < scored.length; i++) {
    const a = scored[i];
    md += `---\n\n`;
    md += `## Dialogue ${i + 1}: ${a.profile_name} × ${a.scenario_id}`;
    md += ` (Score: ${a.overall_score != null ? a.overall_score.toFixed(1) : '--'})\n\n`;

    const ax = a.assessment;
    md += `**Pedagogical Arc**: ${ax.pedagogical_arc}\n\n`;
    md += `**Recognition Dynamics**: ${ax.recognition_dynamics}\n\n`;
    md += `**Superego Effectiveness**: ${ax.superego_effectiveness}\n\n`;
    md += `**Learner Trajectory**: ${ax.learner_trajectory}\n\n`;
    md += `**Missed Opportunities**: ${ax.missed_opportunities}\n\n`;

    if (ax.key_turning_point) {
      const tp = ax.key_turning_point;
      md += `**Key Turning Point** (Turn ${tp.turn}): ${tp.description}\n\n`;
    }

    md += `**Overall Narrative**: ${ax.overall_narrative}\n\n`;

    if (ax.tags?.length > 0) {
      md += `**Tags**: ${ax.tags.join(', ')}\n\n`;
    }
  }

  // ── Errors ──
  if (errored.length > 0) {
    md += `---\n\n## Errors\n\n`;
    for (const a of errored) {
      md += `- **${a.profile_name} × ${a.scenario_id}** (id=${a.id}): ${a.error}\n`;
    }
  }

  return md;
}
419
+
420
+ // ── CLI ──────────────────────────────────────────────────────────────────
421
+
422
/**
 * Parse CLI arguments into an options object.
 *
 * The first non-flag argument becomes the run ID; a second positional
 * argument is an error. Flag values are consumed via `args[++i]`.
 * Exits 0 after printing --help; exits 1 on a missing run ID (unless
 * --import is given) or on an extra positional argument.
 *
 * NOTE(review): unrecognized `--flags` fall through the switch silently
 * and are ignored rather than reported — confirm this is intentional.
 *
 * @returns {object} Parsed options with defaults applied.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = {
    runId: null,
    scenario: null,
    condition: null,
    profile: null,
    limit: null,
    model: 'claude-code',
    parallelism: 2,
    output: null,
    resume: false,
    force: false,
    blinded: false,
    importFile: null,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--scenario': opts.scenario = args[++i]; break;
      case '--condition': opts.condition = args[++i]; break;
      case '--profile': opts.profile = args[++i]; break;
      case '--limit': opts.limit = parseInt(args[++i], 10); break;
      case '--model': opts.model = args[++i]; break;
      case '--parallelism': opts.parallelism = parseInt(args[++i], 10); break;
      case '--output': opts.output = args[++i]; break;
      case '--resume': opts.resume = true; break;
      case '--force': opts.force = true; break;
      case '--blinded': opts.blinded = true; break;
      case '--import': opts.importFile = args[++i]; break;
      case '--help':
        console.log(`Usage: node scripts/assess-transcripts.js <runId> [options]

Options:
  --scenario <id>      Filter by scenario ID (substring match)
  --condition <cond>   Filter: recog | base
  --profile <name>     Filter by profile name (substring match)
  --limit <n>          Max dialogues to assess
  --model <m>          claude-code (default) | haiku | sonnet | gpt
  --parallelism <n>    Concurrent assessments (default: 2)
  --output <path>      Output file path
  --resume             Skip dialogues already assessed in DB
  --force              Re-assess even if already in DB (overwrites)
  --blinded            Strip condition labels from metadata and transcript header
  --import <jsonl>     Import assessments from a JSONL file into DB
  --help               Show this help

Examples:
  node scripts/assess-transcripts.js eval-2026-02-14-abcd1234 --limit 3
  node scripts/assess-transcripts.js eval-2026-02-14-abcd1234 --scenario epistemic --condition recog
  node scripts/assess-transcripts.js eval-2026-02-14-abcd1234 --model haiku --parallelism 4`);
        process.exit(0);
      default:
        // First bare argument is the run ID; a second one is rejected.
        if (!args[i].startsWith('--') && !opts.runId) {
          opts.runId = args[i];
        } else if (!args[i].startsWith('--')) {
          console.error(`Unknown argument: ${args[i]}`);
          process.exit(1);
        }
    }
  }

  // --import mode can run without a run ID; everything else requires one.
  if (!opts.runId && !opts.importFile) {
    console.error('Error: run ID is required.\nUsage: node scripts/assess-transcripts.js <runId> [options]');
    process.exit(1);
  }

  return opts;
}
491
+
492
+ // ── Import from JSONL ────────────────────────────────────────────────────
493
+
494
/**
 * Import previously generated assessments from a JSONL or JSON file into
 * the qualitative_assessment / qualitative_model columns.
 *
 * JSONL files are parsed line by line (bad lines skipped); JSON files may
 * be a bare array or an object with an `assessments` array. Entries
 * lacking an `assessment` or `id` are counted as skipped. All updates run
 * in a single transaction.
 *
 * @param {object} db - Open SQLite database handle.
 * @param {string} filePath - Path to the .jsonl or .json file.
 * @param {string} modelKey - Model label written to qualitative_model.
 * @returns {number} Count of imported entries.
 */
function importFromFile(db, filePath, modelKey) {
  ensureColumns(db);

  if (!fs.existsSync(filePath)) {
    console.error(`File not found: ${filePath}`);
    process.exit(1);
  }

  const updateStmt = db.prepare(`
    UPDATE evaluation_results
    SET qualitative_assessment = ?, qualitative_model = ?
    WHERE id = ?
  `);

  const raw = fs.readFileSync(filePath, 'utf-8').trim();
  let entries = [];

  if (filePath.endsWith('.jsonl')) {
    // JSONL: one entry per non-blank line; unparseable lines are dropped.
    for (const line of raw.split('\n')) {
      if (!line.trim()) continue;
      try { entries.push(JSON.parse(line)); } catch { /* skip */ }
    }
  } else {
    // JSON: bare array, or an object wrapping an `assessments` array.
    try {
      const parsed = JSON.parse(raw);
      entries = Array.isArray(parsed) ? parsed : (parsed.assessments || []);
    } catch (err) {
      console.error(`Failed to parse JSON: ${err.message}`);
      process.exit(1);
    }
  }

  let imported = 0;
  let skipped = 0;

  // Single transaction: either all rows import or none do.
  db.transaction(() => {
    for (const entry of entries) {
      if (!entry.assessment || !entry.id) {
        skipped++;
      } else {
        updateStmt.run(JSON.stringify(entry.assessment), modelKey, entry.id);
        imported++;
      }
    }
  })();

  console.log(`Imported ${imported} assessments from ${filePath}`);
  if (skipped > 0) console.log(` Skipped ${skipped} (no assessment or no id)`);

  return imported;
}
550
+
551
+ // ── Main ─────────────────────────────────────────────────────────────────
552
+
553
+ async function main() {
554
+ const opts = parseArgs();
555
+
556
+ if (!fs.existsSync(DB_PATH)) {
557
+ console.error('Database not found:', DB_PATH);
558
+ process.exit(1);
559
+ }
560
+
561
+ const db = new Database(DB_PATH);
562
+ ensureColumns(db);
563
+
564
+ // Handle --import mode (accepts .json or .jsonl)
565
+ if (opts.importFile) {
566
+ const n = importFromFile(db, opts.importFile, opts.model);
567
+ db.close();
568
+ console.log(`Done. ${n} assessments written to DB.`);
569
+ return;
570
+ }
571
+
572
+ // Prepare DB statements — blinded assessments go to separate columns
573
+ const assessCol = opts.blinded ? 'blinded_qualitative_assessment' : 'qualitative_assessment';
574
+ const modelCol = opts.blinded ? 'blinded_qualitative_model' : 'qualitative_model';
575
+ const updateStmt = db.prepare(`
576
+ UPDATE evaluation_results
577
+ SET ${assessCol} = ?, ${modelCol} = ?
578
+ WHERE id = ?
579
+ `);
580
+
581
+ console.log('='.repeat(70));
582
+ console.log(`QUALITATIVE TRANSCRIPT ASSESSMENT${opts.blinded ? ' (BLINDED)' : ''}`);
583
+ console.log('='.repeat(70));
584
+ console.log(`Run: ${opts.runId} | Model: ${opts.model} | Parallelism: ${opts.parallelism}${opts.blinded ? ' | BLINDED' : ''}`);
585
+
586
+ // Load results with multi-turn dialogues
587
+ const rows = loadMultiTurnResults(db, opts.runId, {
588
+ scenario: opts.scenario,
589
+ condition: opts.condition,
590
+ profile: opts.profile,
591
+ limit: opts.limit,
592
+ });
593
+
594
+ console.log(`\nFound ${rows.length} results with dialogue logs`);
595
+
596
+ if (rows.length === 0) {
597
+ console.log('No multi-turn dialogues found. Ensure the run has dialogue_id values.');
598
+ db.close();
599
+ return;
600
+ }
601
+
602
+ // Load transcripts and filter to those with valid traces
603
+ const dialogues = [];
604
+ let skipped = 0;
605
+ for (const row of rows) {
606
+ const dialogue = loadDialogueTrace(row.dialogue_id);
607
+ if (!dialogue || dialogue.trace.length === 0) {
608
+ skipped++;
609
+ continue;
610
+ }
611
+
612
+ const transcript = formatTranscript(dialogue.trace, {
613
+ detail: 'play',
614
+ scenarioName: row.scenario_name || row.scenario_id,
615
+ profileName: opts.blinded ? '' : row.profile_name,
616
+ totalTurns: dialogue.totalTurns,
617
+ });
618
+
619
+ dialogues.push({
620
+ row,
621
+ transcript,
622
+ condition: detectCondition(row),
623
+ mechanism: detectMechanism(row.profile_name),
624
+ });
625
+ }
626
+
627
+ if (skipped > 0) {
628
+ console.log(` Skipped ${skipped} results without dialogue logs`);
629
+ }
630
+ console.log(` ${dialogues.length} dialogues ready for assessment`);
631
+
632
+ if (dialogues.length === 0) {
633
+ console.log('No dialogue traces found.');
634
+ db.close();
635
+ return;
636
+ }
637
+
638
+ // Estimate and confirm
639
+ const conditions = [...new Set(dialogues.map(d => d.condition))];
640
+ const scenarios = [...new Set(dialogues.map(d => d.row.scenario_id))];
641
+ console.log(` Conditions: ${conditions.join(', ')}`);
642
+ console.log(` Scenarios: ${scenarios.join(', ')}`);
643
+
644
+ if (opts.model === 'claude-code') {
645
+ console.log(` Cost: Free (Claude Code subscription)`);
646
+ } else {
647
+ const estTokens = dialogues.length * 3000;
648
+ console.log(` Estimated tokens: ~${(estTokens / 1000).toFixed(0)}K`);
649
+ }
650
+
651
+ // Ensure exports directory
652
+ const exportsDir = path.resolve(__dirname, '..', 'exports');
653
+ if (!fs.existsSync(exportsDir)) {
654
+ fs.mkdirSync(exportsDir, { recursive: true });
655
+ }
656
+
657
+ const outputPath = opts.output || path.join(exportsDir, `transcript-assessment-${opts.runId}.md`);
658
+ const jsonlPath = outputPath.replace(/\.md$/, '.jsonl');
659
+
660
+ // Determine which dialogues need assessment
661
+ let remaining;
662
+ // Use blinded column when --blinded, otherwise the standard column
663
+ const checkCol = opts.blinded ? 'blinded_qualitative_assessment' : 'qualitative_assessment';
664
+ const hasAssessment = (d) => d.row[checkCol] != null;
665
+
666
+ if (opts.force) {
667
+ remaining = dialogues;
668
+ console.log(` --force: will re-assess all ${dialogues.length} dialogues`);
669
+ } else if (opts.resume) {
670
+ // Check DB for existing assessments
671
+ const alreadyDone = dialogues.filter(hasAssessment);
672
+ remaining = dialogues.filter(d => !hasAssessment(d));
673
+ if (alreadyDone.length > 0) {
674
+ console.log(`\n ${alreadyDone.length} already assessed in DB, ${remaining.length} remaining`);
675
+ }
676
+ } else {
677
+ // Default: skip rows that already have assessments (same as --resume)
678
+ remaining = dialogues.filter(d => !hasAssessment(d));
679
+ const alreadyDone = dialogues.length - remaining.length;
680
+ if (alreadyDone > 0) {
681
+ console.log(` ${alreadyDone} already assessed in DB (use --force to re-assess)`);
682
+ }
683
+ }
684
+
685
+ if (remaining.length === 0) {
686
+ console.log('All dialogues already assessed. Regenerating report...');
687
+ }
688
+
689
+ // Clear JSONL backup for fresh run (append mode for resume)
690
+ if (!opts.resume && !opts.force) {
691
+ // Fresh: truncate
692
+ }
693
+
694
+ // Run assessments
695
+ const startTime = Date.now();
696
+ let completed = 0;
697
+ let errors = 0;
698
+
699
+ if (remaining.length > 0) {
700
+ console.log(`\nAssessing ${remaining.length} dialogues...\n`);
701
+
702
+ const tasks = remaining.map((d, idx) => async () => {
703
+ const label = `[${idx + 1}/${remaining.length}]`;
704
+ process.stdout.write(` ${label} ${d.row.scenario_id} / ${d.row.profile_name} (${d.condition})...`);
705
+
706
+ const prompt = buildAssessmentPrompt(d.row, d.transcript, { blinded: opts.blinded });
707
+
708
+ // Retry once on failure
709
+ for (let attempt = 0; attempt < 2; attempt++) {
710
+ try {
711
+ const content = await callModel(prompt, opts.model);
712
+ const assessment = parseJsonResponse(content);
713
+
714
+ // Validate tags
715
+ if (assessment.tags) {
716
+ assessment.tags = assessment.tags.filter(t => VALID_TAGS.includes(t));
717
+ }
718
+
719
+ // Write to DB immediately
720
+ updateStmt.run(JSON.stringify(assessment), opts.model, d.row.id);
721
+
722
+ completed++;
723
+ const tags = (assessment.tags || []).slice(0, 3).join(', ');
724
+ console.log(` OK [${tags}]`);
725
+
726
+ const result = {
727
+ id: d.row.id,
728
+ scenario_id: d.row.scenario_id,
729
+ profile_name: d.row.profile_name,
730
+ overall_score: d.row.overall_score,
731
+ condition: d.condition,
732
+ mechanism: d.mechanism,
733
+ assessment,
734
+ };
735
+
736
+ // Append to JSONL backup
737
+ fs.appendFileSync(jsonlPath, JSON.stringify(result) + '\n');
738
+
739
+ return result;
740
+ } catch (err) {
741
+ if (attempt === 0) {
742
+ process.stdout.write(` retry...`);
743
+ continue;
744
+ }
745
+ errors++;
746
+ console.log(` ERROR: ${err.message.slice(0, 80)}`);
747
+
748
+ return {
749
+ id: d.row.id,
750
+ scenario_id: d.row.scenario_id,
751
+ profile_name: d.row.profile_name,
752
+ overall_score: d.row.overall_score,
753
+ condition: d.condition,
754
+ mechanism: d.mechanism,
755
+ error: err.message,
756
+ };
757
+ }
758
+ }
759
+ });
760
+
761
+ await runWithConcurrency(tasks, opts.parallelism);
762
+ }
763
+
764
+ const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
765
+ console.log(`\nAssessment complete: ${completed} new, ${errors} errors, ${elapsed}s`);
766
+
767
+ // Reload all assessments from DB (the source of truth)
768
+ const allAssessments = db.prepare(`
769
+ SELECT id, scenario_id, profile_name, overall_score, factor_recognition,
770
+ ${assessCol} AS qualitative_assessment, ${modelCol} AS qualitative_model
771
+ FROM evaluation_results
772
+ WHERE run_id = ? AND success = 1 AND ${assessCol} IS NOT NULL
773
+ ORDER BY scenario_id, profile_name, id
774
+ `).all(opts.runId).map(row => ({
775
+ id: row.id,
776
+ scenario_id: row.scenario_id,
777
+ profile_name: row.profile_name,
778
+ overall_score: row.overall_score,
779
+ condition: detectCondition(row),
780
+ mechanism: detectMechanism(row.profile_name),
781
+ assessment: JSON.parse(row.qualitative_assessment),
782
+ }));
783
+
784
+ if (allAssessments.length === 0) {
785
+ console.error('No successful assessments in DB.');
786
+ db.close();
787
+ return;
788
+ }
789
+
790
+ console.log(`Total assessments in DB: ${allAssessments.length}`);
791
+
792
+ // Generate report from DB
793
+ const report = generateReport(opts.runId, allAssessments, opts.model);
794
+ fs.writeFileSync(outputPath, report);
795
+ console.log(`\nReport: ${outputPath}`);
796
+
797
+ // Also write a consolidated JSON
798
+ const jsonPath = outputPath.replace(/\.md$/, '.json');
799
+ fs.writeFileSync(jsonPath, JSON.stringify({
800
+ generated: new Date().toISOString(),
801
+ runId: opts.runId,
802
+ model: opts.model,
803
+ n: allAssessments.length,
804
+ errors,
805
+ assessments: allAssessments,
806
+ }, null, 2));
807
+ console.log(`JSON: ${jsonPath}`);
808
+
809
+ // Print summary
810
+ console.log('\n' + '─'.repeat(70));
811
+ console.log('TAG DISTRIBUTION');
812
+ console.log('─'.repeat(70));
813
+
814
+ const tagCounts = {};
815
+ for (const a of allAssessments) {
816
+ if (!a.assessment) continue;
817
+ for (const tag of (a.assessment.tags || [])) {
818
+ tagCounts[tag] = (tagCounts[tag] || 0) + 1;
819
+ }
820
+ }
821
+ const sorted = Object.entries(tagCounts).sort((a, b) => b[1] - a[1]);
822
+ for (const [tag, count] of sorted) {
823
+ const bar = '█'.repeat(Math.min(count, 20));
824
+ console.log(` ${tag.padEnd(28)} ${String(count).padStart(3)} ${bar}`);
825
+ }
826
+
827
+ // Score comparison by condition
828
+ const byCondition = { base: [], recognition: [] };
829
+ for (const a of allAssessments) {
830
+ if (a.overall_score != null && a.condition) {
831
+ byCondition[a.condition]?.push(a.overall_score);
832
+ }
833
+ }
834
+ if (byCondition.base.length > 0 || byCondition.recognition.length > 0) {
835
+ console.log('\n' + '─'.repeat(70));
836
+ console.log('SCORE SUMMARY');
837
+ console.log('─'.repeat(70));
838
+ for (const [cond, scores] of Object.entries(byCondition)) {
839
+ if (scores.length === 0) continue;
840
+ const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
841
+ const sd = Math.sqrt(scores.reduce((s, v) => s + (v - mean) ** 2, 0) / scores.length);
842
+ console.log(` ${cond.padEnd(15)} N=${String(scores.length).padStart(3)} M=${mean.toFixed(1)} SD=${sd.toFixed(1)}`);
843
+ }
844
+ }
845
+
846
+ db.close();
847
+ console.log('\nDone.');
848
+ }
849
+
850
+ // Entry point: run the CLI; surface any unhandled rejection from main() as a fatal, non-zero exit.
+ main().catch(err => {
851
+ console.error('Fatal error:', err);
852
+ process.exit(1);
853
+ });