@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,694 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Render Sequence Diagram
5
+ *
6
+ * Generates standalone HTML files with SVG sequence diagrams showing
7
+ * the message flow between tutor ego, tutor superego, learner ego,
8
+ * and learner superego. Includes judge adjudication panel.
9
+ *
10
+ * Usage:
11
+ * node scripts/render-sequence-diagram.js <runId> [options]
12
+ *
13
+ * Options:
14
+ * --scenario <id> Filter by scenario
15
+ * --profile <name> Filter by profile name
16
+ * --dialogue <id> Render a specific dialogue by ID
17
+ * --limit <N> Max number of diagrams to render
18
+ * --output <dir> Output directory (default: exports/)
19
+ * --open Open first diagram in browser after rendering
20
+ *
21
+ * Examples:
22
+ * node scripts/render-sequence-diagram.js eval-2026-02-07-b6d75e87 --dialogue dialogue-1770448315802-zmvmm0
23
+ * node scripts/render-sequence-diagram.js eval-2026-02-07-b6d75e87 --profile cell_8_recog_multi_psycho --open
24
+ * node scripts/render-sequence-diagram.js eval-2026-02-07-b6d75e87 --scenario mutual_transformation_journey --limit 4
25
+ */
26
+
27
+ import fs from 'fs';
28
+ import path from 'path';
29
+ import { execSync } from 'child_process';
30
+ import Database from 'better-sqlite3';
31
+ import YAML from 'yaml';
32
+
33
// All data locations are resolved relative to the directory one level above
// this script (the usage text invokes it as scripts/render-sequence-diagram.js).
const PACKAGE_ROOT = path.join(import.meta.dirname, '..');
const DB_PATH = path.join(PACKAGE_ROOT, 'data', 'evaluations.db');
const LOGS_DIR = path.join(PACKAGE_ROOT, 'logs', 'tutor-dialogues');
const DEFAULT_OUTPUT = path.join(PACKAGE_ROOT, 'exports');

// ── CLI parsing ──────────────────────────────────────────────────────────────

const args = process.argv.slice(2);

// Print usage and exit successfully when help is requested explicitly or when
// the script is invoked with no arguments at all.
const helpRequested = ['--help', '-h'].some((flag) => args.includes(flag));
if (helpRequested || args.length === 0) {
  console.log(`
Usage: render-sequence-diagram.js <runId> [options]

Options:
--scenario <id> Filter by scenario
--profile <name> Filter by profile name
--dialogue <id> Render a specific dialogue by ID
--limit <N> Max number of diagrams (default: all)
--output <dir> Output directory (default: exports/)
--open Open first diagram in browser
`);
  process.exit(0);
}
54
+
55
/**
 * Look up the value supplied for a `--<name>` command-line option.
 *
 * Only the first occurrence of the option is considered.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|null} The argument immediately following `--<name>`, or
 *   null when the option is absent or is the final argument.
 */
function getOption(name) {
  const flagPosition = args.indexOf(`--${name}`);
  if (flagPosition < 0) {
    return null;
  }
  const valuePosition = flagPosition + 1;
  if (valuePosition >= args.length) {
    return null;
  }
  return args[valuePosition];
}
59
/**
 * Report whether the boolean switch `--<name>` was passed on the command line.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {boolean} True when `--<name>` appears anywhere in the arguments.
 */
function getFlag(name) {
  const flag = `--${name}`;
  return args.some((arg) => arg === flag);
}
60
+
61
// Options that consume the argument immediately after them. Any non-flag
// argument that directly follows one of these is that option's value, not a
// positional argument.
const VALUE_OPTION_FLAGS = new Set(['--scenario', '--profile', '--dialogue', '--limit', '--output']);

// The run ID is the first positional argument. Positionals are identified by
// *position* (not preceded by a value-taking option) rather than by comparing
// against option values, so a run ID that happens to equal some option's value
// is still found.
const runId = args.find((arg, i) => !arg.startsWith('--') && !VALUE_OPTION_FLAGS.has(args[i - 1]));
const scenarioFilter = getOption('scenario');
const profileFilter = getOption('profile');
const dialogueFilter = getOption('dialogue');

// --limit must be a positive base-10 integer; anything else (missing, zero,
// negative, or unparsable) means "no limit". A negative LIMIT would be treated
// as unbounded by SQLite anyway, so this keeps the effective behaviour while
// never sending a nonsensical value to the query.
const limitArg = getOption('limit');
const parsedLimit = limitArg === null ? Number.NaN : Number.parseInt(limitArg, 10);
const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : null;

const outputDir = getOption('output') || DEFAULT_OUTPUT;
const shouldOpen = getFlag('open');

if (!runId) {
  console.error('Error: run ID required');
  process.exit(1);
}
73
+
74
+ // ── DB queries ───────────────────────────────────────────────────────────────
75
+
76
// Open the evaluations database read-only; this script never writes to it.
const db = new Database(DB_PATH, { readonly: true });

// Base query: every result row of the run that has an associated dialogue.
let query = `SELECT id, profile_name, scenario_id, dialogue_id, overall_score, judge_model,
ego_model, superego_model,
score_relevance, score_specificity, score_pedagogical, score_personalization,
score_actionability, score_tone, scores_with_reasoning, qualitative_assessment, qualitative_model
FROM evaluation_results
WHERE run_id = ? AND dialogue_id IS NOT NULL`;
const params = [runId];

// Each optional CLI filter contributes one SQL fragment and one bound value.
// Dialogue IDs match exactly; scenario and profile match as substrings.
const optionalFilters = [
  [dialogueFilter, ' AND dialogue_id = ?', dialogueFilter],
  [scenarioFilter, ' AND scenario_id LIKE ?', '%' + scenarioFilter + '%'],
  [profileFilter, ' AND profile_name LIKE ?', '%' + profileFilter + '%'],
];
for (const [filterValue, sqlFragment, boundValue] of optionalFilters) {
  if (filterValue) {
    query += sqlFragment;
    params.push(boundValue);
  }
}

// Highest-scoring dialogues first, optionally capped by --limit.
query += ' ORDER BY overall_score DESC';
if (limit) {
  query += ' LIMIT ?';
  params.push(limit);
}

const results = db.prepare(query).all(...params);

if (!results.length) {
  console.log('No multi-turn dialogues found matching filters.');
  process.exit(0);
}

console.log(`Found ${results.length} dialogue(s) to render.`);
101
+
102
+ // ── Trace → sequence steps ───────────────────────────────────────────────────
103
+
104
/**
 * Reduce a model identifier to a short display name.
 *
 * Removes a leading "openrouter." prefix, keeps only the text after the last
 * "/", and drops everything from the first ":" onward, e.g.
 * "openrouter.kimi-k2.5" and "moonshotai/kimi-k2.5" both become "kimi-k2.5".
 *
 * @param {*} m - Model identifier; any falsy value is treated as unknown.
 * @returns {string} The shortened name, or "?" when no model was given.
 */
function shortModel(m) {
  if (!m) {
    return '?';
  }
  const withoutProvider = String(m).replace(/^openrouter\./, '');
  const pathSegments = withoutProvider.split('/');
  const lastSegment = pathSegments[pathSegments.length - 1];
  const [baseName] = lastSegment.split(':');
  return baseName;
}
109
+
110
/**
 * Pull the learner's message out of a trace entry's raw context text.
 *
 * Two layouts are tried in order: a "Learner Message(s):" section (read up to
 * a following "\n</" or the end of the text), then the first quoted User line
 * under "Recent Chat History".
 *
 * @param {object} entry - Trace entry; only `entry.rawContext` is read.
 * @returns {string|null} The trimmed learner text, or null if neither layout matches.
 */
function extractLearnerQuery(entry) {
  const contextText = entry.rawContext || '';
  const layouts = [
    /Learner Messages?:\s*(.+?)(?:\n<\/|$)/s,
    /Recent Chat History\n-\s*User:\s*"(.+?)"/s,
  ];
  for (const layout of layouts) {
    const found = contextText.match(layout);
    if (found) {
      return found[1].trim();
    }
  }
  return null;
}
116
+
117
/**
 * Return the leading portion of an entry's display text.
 *
 * @param {object} entry - Trace entry, passed through to fullContent().
 * @param {number} [maxLen=90] - Maximum number of characters to keep.
 * @returns {string} The first `maxLen` characters of the entry's full content.
 */
function snippet(entry, maxLen = 90) {
  const text = fullContent(entry);
  return text.substring(0, maxLen);
}
120
+
121
/**
 * Choose the text that represents a trace entry in the transcript.
 *
 * Precedence:
 *   1. superego review  → its feedback (direct or nested under `verdict`)
 *   2. any entry with suggestions → each suggestion's message/text/title,
 *      joined by blank lines
 *   3. user context_input → the extracted learner query, or a placeholder
 *   4. user turn_action   → contextSummary, then detail
 *   5. everything else    → detail, then contextSummary
 *
 * @param {object} entry - Trace entry.
 * @returns {string} Display text; empty string when nothing applies.
 */
function fullContent(entry) {
  const { agent, action, suggestions } = entry;

  if (agent === 'superego' && action === 'review') {
    return entry.feedback || entry.verdict?.feedback || '';
  }

  if (suggestions?.length > 0) {
    const suggestionTexts = suggestions.map((item) => item.message || item.text || item.title || '');
    return suggestionTexts.join('\n\n');
  }

  if (agent === 'user') {
    if (action === 'context_input') {
      return extractLearnerQuery(entry) || '(scenario context)';
    }
    if (action === 'turn_action') {
      return entry.contextSummary || entry.detail || '';
    }
  }

  return entry.detail || entry.contextSummary || '';
}
136
+
137
/**
 * Convert a dialogue trace into the ordered list of arrows ("steps") drawn in
 * the sequence diagram and listed in the transcript.
 *
 * Each step records the sending and receiving actor column (`from`/`to`), a
 * short `label`, truncated `detail`, the untruncated `fullDetail`, an arrow
 * `type` ('front', 'back' or 'response'), a `speaker` tag, and — for some
 * tutor steps — `latency`, `model` and `approved`.
 *
 * Entries from `system`, `learner_synthesis`, `learner_ego_revision`, and the
 * user's `final_output` produce no step; unrecognised entries are ignored.
 *
 * @param {object[]} trace - Trace entries, each carrying `agent` and `action`.
 * @returns {object[]} Steps in display order.
 */
function traceToSteps(trace) {
  const EGO_OUTPUT_ACTIONS = ['generate', 'revise', 'incorporate-feedback'];

  const isEgoOutput = (entry) => entry.agent === 'ego' && EGO_OUTPUT_ACTIONS.includes(entry.action);
  const isSuperegoReview = (entry) => entry.agent === 'superego' && entry.action === 'review';
  const isContextInput = (entry) => entry.agent === 'user' && entry.action === 'context_input';
  // A learner block begins with the learner ego's initial entry.
  const opensLearnerBlock = (entry) => entry.agent === 'learner_ego_initial';

  // Text of the most recent tutor-ego output strictly before `index`, or ''.
  const latestEgoTextBefore = (index) => {
    for (let k = index - 1; k >= 0; k -= 1) {
      if (isEgoOutput(trace[k])) {
        return fullContent(trace[k]);
      }
    }
    return '';
  };

  // True when a superego review occurs after `index` before the next learner
  // block or the next context input.
  const reviewComesNext = (index) => {
    for (let k = index + 1; k < trace.length; k += 1) {
      const upcoming = trace[k];
      if (isSuperegoReview(upcoming)) return true;
      if (opensLearnerBlock(upcoming)) return false;
      if (isContextInput(upcoming)) return false;
    }
    return false;
  };

  const steps = [];
  let contextInputsSeen = 0;
  // Set once the learner has spoken; cleared when the tutor's Response arrow
  // for that exchange has been emitted.
  let responsePending = false;

  // Emit the front-stage tutor → learner arrow carrying the delivered text.
  const pushTutorResponse = (text) => {
    steps.push({ from: 'tutor_ego', to: 'learner_ego', label: 'Response', detail: '', fullDetail: text, type: 'response', speaker: 'TUTOR EGO' });
    responsePending = false;
  };

  for (const [index, entry] of trace.entries()) {
    const { agent, action } = entry;

    // Entering a learner block while a response is still owed: deliver the
    // latest tutor-ego output before showing the learner's reaction.
    if (opensLearnerBlock(entry) && responsePending) {
      pushTutorResponse(latestEgoTextBefore(index));
    }

    // Agents that never produce an arrow (the learner's revised message is
    // represented by the later turn_action entry instead).
    if (agent === 'system' || agent === 'learner_synthesis' || agent === 'learner_ego_revision') {
      continue;
    }

    if (agent === 'user') {
      if (action === 'context_input') {
        contextInputsSeen += 1;
        // Only the very first context input is drawn as the opening query.
        if (contextInputsSeen === 1) {
          const opening = extractLearnerQuery(entry) || '(scenario prompt)';
          steps.push({
            from: 'learner_ego', to: 'tutor_ego',
            label: 'Initial query',
            detail: opening.substring(0, 120),
            fullDetail: opening,
            type: 'front', speaker: 'LEARNER',
          });
        }
        responsePending = true;
      } else if (action === 'turn_action') {
        // The learner's externally visible message for the next turn.
        const message = fullContent(entry);
        steps.push({
          from: 'learner_ego', to: 'tutor_ego',
          label: `Turn ${contextInputsSeen + 1}`,
          detail: snippet(entry, 120), fullDetail: message, type: 'front',
          speaker: 'LEARNER',
        });
        responsePending = true;
      }
      // final_output and any other user action produce nothing.
      continue;
    }

    if (agent === 'ego') {
      if (!isEgoOutput(entry)) continue;

      const text = fullContent(entry);
      const latency = entry.metrics?.latencyMs || null;
      const model = entry.metrics?.model || null;
      const isDraft = action === 'generate';

      if (!isDraft && !reviewComesNext(index)) {
        // A revision that no superego will review goes straight to the learner.
        steps.push({
          from: 'tutor_ego', to: 'learner_ego', label: 'Response',
          detail: '', fullDetail: text, type: 'response',
          latency,
          speaker: 'TUTOR EGO', model,
        });
        responsePending = false;
      } else {
        steps.push({
          from: 'tutor_ego', to: 'tutor_superego', label: isDraft ? 'Draft' : 'Revised',
          detail: snippet(entry, 120), fullDetail: text, type: 'back',
          latency,
          speaker: isDraft ? 'TUTOR EGO (draft)' : 'TUTOR EGO (revised)',
          model,
        });
      }
      continue;
    }

    if (agent === 'superego') {
      if (action !== 'review') continue;

      const verdictText = fullContent(entry);
      const latency = entry.metrics?.latencyMs || null;
      const model = entry.metrics?.model || null;

      if (entry.approved) {
        steps.push({
          from: 'tutor_superego', to: 'tutor_ego', label: 'Approved \u2713',
          detail: snippet(entry, 120), fullDetail: verdictText, type: 'back', approved: true,
          latency,
          speaker: 'SUPEREGO', model,
        });
        // Approval releases the most recent ego output to the learner.
        pushTutorResponse(latestEgoTextBefore(index));
      } else {
        steps.push({
          from: 'tutor_superego', to: 'tutor_ego', label: 'Revise \u21BB',
          detail: snippet(entry, 120), fullDetail: verdictText, type: 'back', approved: false,
          latency,
          speaker: 'SUPEREGO', model,
        });
      }
      continue;
    }

    if (agent === 'learner_ego_initial' && action === 'deliberation') {
      steps.push({
        from: 'learner_ego', to: 'learner_superego', label: 'Reaction',
        detail: snippet(entry, 120), fullDetail: fullContent(entry), type: 'back',
        speaker: 'LEARNER EGO',
      });
      continue;
    }

    if (agent === 'learner_superego' && action === 'deliberation') {
      steps.push({
        from: 'learner_superego', to: 'learner_ego', label: 'Critique',
        detail: snippet(entry, 120), fullDetail: fullContent(entry), type: 'back',
        speaker: 'LEARNER SUPEREGO',
      });
      continue;
    }
  }

  return steps;
}
299
+
300
+ // ── HTML template ────────────────────────────────────────────────────────────
301
+
302
/**
 * Escape a value for safe inclusion in HTML text or attribute values.
 *
 * Non-string values (numbers, arrays, objects from parsed JSON) are converted
 * with String() first, so they no longer raise a TypeError and a numeric 0 is
 * rendered as "0" rather than dropped. The characters & < > " ' are replaced
 * with entities; the single quote is included so the result is also safe
 * inside single-quoted attributes.
 *
 * @param {*} text - Value to escape.
 * @returns {string} Escaped text; '' for null, undefined, false or ''.
 */
function escapeHtml(text) {
  // `false` stays empty so `escapeHtml(cond && value)` keeps rendering nothing.
  if (text == null || text === false) return '';
  return String(text)
    .replace(/&/g, '&amp;') // must run first so later entities are not re-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
306
+
307
/**
 * Build a standalone HTML page for one dialogue: an SVG sequence diagram on
 * the left, a click-synchronised transcript on the right, and a collapsible
 * judge panel (per-dimension scores plus qualitative assessment) below.
 *
 * Every value that originates from the database, the logs or model output is
 * HTML-escaped before being interpolated into the page.
 *
 * @param {object} result - Evaluation row. Reads `profile_name`, `scenario_id`,
 *   `overall_score`, `scores_with_reasoning` (JSON text) and
 *   `qualitative_assessment` (JSON text).
 * @param {object[]} steps - Steps as produced by traceToSteps().
 * @param {object[]} trace - Raw trace; accepted for interface compatibility
 *   but not read by this function.
 * @param {object} [meta={}] - Display metadata. Reads `egoModel`,
 *   `superegoModel`, `learnerEgoModel`, `learnerSuperegoModel`, `judgeModel`,
 *   `totalTurns`, `runId` and `dialogueId`.
 * @returns {string} The complete HTML document.
 */
function generateHtml(result, steps, trace, meta = {}) {
  // Best-effort JSON parsing: malformed text or a non-object value (e.g. the
  // literal `null`) yields an empty object so the page still renders, just
  // without that panel.
  const parseJsonObject = (jsonText) => {
    try {
      const parsed = JSON.parse(jsonText || '{}');
      return parsed !== null && typeof parsed === 'object' ? parsed : {};
    } catch {
      return {};
    }
  };

  const profile = result.profile_name;
  const scenario = result.scenario_id;
  const score = result.overall_score?.toFixed(1) || '--';

  // Badge colour by overall score; a missing score ('--') parses to NaN and
  // therefore falls through to the lowest band.
  const scoreValue = Number.parseFloat(score);
  const scoreBadgeColor = scoreValue >= 90 ? '#1b5e20' : scoreValue >= 70 ? '#e65100' : '#b71c1c';

  const judgeScores = parseJsonObject(result.scores_with_reasoning);
  const qualitative = parseJsonObject(result.qualitative_assessment);

  const isRecog = /recog/i.test(profile);
  const condLabel = isRecog ? 'Recognition' : 'Base';

  // ── Actors & SVG dimensions ──
  const actors = [
    { id: 'learner_superego', label: 'L.Superego', model: shortModel(meta.learnerSuperegoModel), color: '#fce4ec', textColor: '#c62828', stroke: '#ef5350' },
    { id: 'learner_ego', label: 'L.Ego', model: shortModel(meta.learnerEgoModel), color: '#f3e5f5', textColor: '#6a1b9a', stroke: '#ab47bc' },
    { id: 'tutor_ego', label: 'T.Ego', model: shortModel(meta.egoModel), color: '#e3f2fd', textColor: '#1565c0', stroke: '#42a5f5' },
    { id: 'tutor_superego', label: 'T.Superego', model: shortModel(meta.superegoModel), color: '#e8f5e9', textColor: '#2e7d32', stroke: '#66bb6a' },
  ];
  const colMap = {};
  actors.forEach((a, i) => { colMap[a.id] = i; });

  const colWidth = 140;
  const rowHeight = 38;
  const headerHeight = 56;
  const padding = 20;
  const svgWidth = colWidth * actors.length + padding * 2;
  const svgHeight = headerHeight + steps.length * rowHeight + 30;

  // ── Build SVG ──
  let svg = '';

  // Actor column headers with model subtitle and a dashed lifeline.
  actors.forEach((a, i) => {
    const x = padding + i * colWidth;
    const cx = x + colWidth / 2;
    svg += `<rect x="${x + 8}" y="4" width="${colWidth - 16}" height="40" rx="5" fill="${a.color}" stroke="${a.stroke}" stroke-width="1"/>`;
    svg += `<text x="${cx}" y="21" text-anchor="middle" font-size="11" font-weight="600" fill="${a.textColor}">${a.label}</text>`;
    // Model names come from run metadata, so they are escaped like any other external text.
    svg += `<text x="${cx}" y="36" text-anchor="middle" font-size="8.5" fill="${a.textColor}" opacity="0.65">${escapeHtml(a.model)}</text>`;
    svg += `<line x1="${cx}" y1="${headerHeight}" x2="${cx}" y2="${svgHeight - 10}" stroke="#333" stroke-width="1" stroke-dasharray="3,3"/>`;
  });

  // Turn separator lines: one at the first step of each new turn number.
  const turnBoundaries = [];
  let prevTurn = '';
  steps.forEach((s, i) => {
    if (s.label.startsWith('Turn ') || s.label === 'Initial query') {
      const num = s.label === 'Initial query' ? 1 : Number.parseInt(s.label.replace('Turn ', ''), 10);
      if (num !== prevTurn) { turnBoundaries.push({ index: i, turn: num }); prevTurn = num; }
    }
  });
  turnBoundaries.forEach(tb => {
    const y = headerHeight + tb.index * rowHeight;
    svg += `<line x1="${padding}" y1="${y}" x2="${svgWidth - padding}" y2="${y}" stroke="#444" stroke-width="0.5"/>`;
    svg += `<text x="${svgWidth - padding + 3}" y="${y + 12}" font-size="9" fill="#666" font-weight="600">T${tb.turn}</text>`;
  });

  // Arrow groups — each clickable
  steps.forEach((step, i) => {
    const fromCol = colMap[step.from];
    const toCol = colMap[step.to];
    if (fromCol === undefined || toCol === undefined) return;

    const fromX = padding + fromCol * colWidth + colWidth / 2;
    const toX = padding + toCol * colWidth + colWidth / 2;
    const y = headerHeight + i * rowHeight + rowHeight / 2;
    const isLR = fromX < toX;

    // Front-stage arrows (learner ↔ tutor) share one neutral colour and are
    // drawn thicker; internal arrows take the sending actor's colour.
    const isFrontStage = step.type === 'front' || step.type === 'response';
    const color = isFrontStage ? '#78909c' : actors[fromCol].stroke;
    const sw = isFrontStage ? 2.2 : 1.2;
    // Stop the line short of the target so the arrowhead tip lands on the lifeline.
    const tipOff = isLR ? -6 : 6;

    svg += `<g data-step="${i}" class="arrow-group" style="cursor:pointer" onclick="highlight(${i})">`;
    // Invisible wider hit area for clicking
    svg += `<line x1="${fromX}" y1="${y}" x2="${toX}" y2="${y}" stroke="transparent" stroke-width="20"/>`;
    svg += `<line x1="${fromX}" y1="${y}" x2="${toX + tipOff}" y2="${y}" stroke="${color}" stroke-width="${sw}" class="arrow-line"/>`;
    if (isLR) {
      svg += `<polygon points="${toX - 6},${y - 3.5} ${toX},${y} ${toX - 6},${y + 3.5}" fill="${color}"/>`;
    } else {
      svg += `<polygon points="${toX + 6},${y - 3.5} ${toX},${y} ${toX + 6},${y + 3.5}" fill="${color}"/>`;
    }

    const labelX = (fromX + toX) / 2;
    let labelColor = '#bbb';
    if (step.approved === true) labelColor = '#66bb6a';
    if (step.approved === false) labelColor = '#ff7043';

    svg += `<text x="${labelX}" y="${y - 6}" text-anchor="middle" font-size="9.5" font-weight="500" fill="${labelColor}">${escapeHtml(step.label)}</text>`;
    if (step.latency) {
      const lat = step.latency < 1000 ? step.latency + 'ms' : (step.latency / 1000).toFixed(1) + 's';
      svg += `<text x="${labelX}" y="${y + 13}" text-anchor="middle" font-size="8" fill="#555">${lat}</text>`;
    }
    svg += `</g>`;
  });

  // ── Build transcript entries ──
  const speakerColors = {
    'TUTOR EGO': '#42a5f5', 'TUTOR EGO (draft)': '#42a5f5', 'TUTOR EGO (revised)': '#42a5f5',
    'SUPEREGO': '#66bb6a', 'LEARNER EGO': '#ab47bc', 'LEARNER SUPEREGO': '#ef5350',
    'LEARNER': '#78909c',
  };

  let transcriptHtml = '';
  steps.forEach((step, i) => {
    const speaker = step.speaker || step.label;
    const color = speakerColors[speaker] || '#999';
    const content = step.fullDetail || step.detail || '';
    if (!content && step.type === 'response') return; // skip empty response arrows without content

    let badge = '';
    if (step.approved === true) badge = '<span class="badge approved">APPROVED</span>';
    else if (step.approved === false) badge = '<span class="badge revise">REVISE</span>';

    const modelStr = step.model ? `<span class="entry-model">${escapeHtml(String(step.model).split('/').pop().split(':')[0])}</span>` : '';

    transcriptHtml += `<div class="entry" id="entry-${i}" data-step="${i}" onclick="highlight(${i})">
<div class="entry-speaker" style="color:${color}">${escapeHtml(speaker)} ${badge} ${modelStr}</div>
<div class="entry-content">${escapeHtml(content)}</div>
</div>\n`;
  });

  // ── Judge table ──
  let judgeRows = '';
  for (const [dim, data] of Object.entries(judgeScores)) {
    // Dimension names are JSON keys written by the judge model, so escape them.
    const label = escapeHtml(dim.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()));
    // Tolerate null entries and non-numeric scores instead of crashing or
    // emitting "NaN%" widths.
    const rawScore = Number(data?.score);
    const sv = Number.isFinite(rawScore) ? rawScore : 0;
    const reasoning = data?.reasoning ? escapeHtml(String(data.reasoning)) : '';
    // The bar is scaled against a maximum of 5 and clamped to the track.
    const barW = Math.max(0, Math.min(100, (sv / 5) * 100));
    const barC = sv >= 4 ? '#4caf50' : sv >= 3 ? '#ff9800' : '#f44336';
    judgeRows += `<tr>
<td class="jd">${label}</td><td class="js" style="color:${barC}">${escapeHtml(String(sv))}</td>
<td class="jb"><div class="bar-bg"><div class="bar-fg" style="width:${barW}%;background:${barC}"></div></div></td>
<td class="jr">${reasoning}</td></tr>`;
  }

  // ── Qualitative ──
  let qualHtml = '';
  const axes = [
    ['pedagogical_arc', 'Pedagogical Arc'], ['recognition_dynamics', 'Recognition Dynamics'],
    ['superego_effectiveness', 'Superego Effectiveness'], ['learner_trajectory', 'Learner Trajectory'],
    ['missed_opportunities', 'Missed Opportunities'], ['overall_narrative', 'Overall Narrative'],
  ];
  for (const [k, lab] of axes) {
    if (qualitative[k]) {
      qualHtml += `<div class="qual-item"><div class="qual-label">${lab}</div><div class="qual-text">${escapeHtml(String(qualitative[k]))}</div></div>`;
    }
  }
  if (qualitative.key_turning_point) {
    const ktp = qualitative.key_turning_point;
    qualHtml += `<div class="qual-ktp"><div class="qual-label" style="color:#ffab40">Key Turning Point (Turn ${escapeHtml(String(ktp.turn || '?'))})</div><div class="qual-text">${escapeHtml(String(ktp.description || ''))}</div></div>`;
  }
  if (Array.isArray(qualitative.tags) && qualitative.tags.length) {
    qualHtml += `<div class="qual-tags">${qualitative.tags.map(t => `<span class="tag">${escapeHtml(String(t))}</span>`).join('')}</div>`;
  }

  // The superego column falls back to the ego model when no separate superego
  // model was recorded. The fallback is applied to the raw value because
  // shortModel() returns the truthy placeholder '?' for a missing model.
  const tutorSuperegoLabel = shortModel(meta.superegoModel || meta.egoModel);
  const turnsLabel = meta.totalTurns ? ' · ' + escapeHtml(String(meta.totalTurns)) + ' turns' : '';

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>${escapeHtml(scenario)} — ${escapeHtml(profile)}</title>
<style>
:root { --bg:#0d1117; --surface:#161b22; --border:#30363d; --text:#e6edf3; --muted:#8b949e; }
* { box-sizing:border-box; margin:0; padding:0; }
body { font-family:'SF Mono','Fira Code','JetBrains Mono',monospace; background:var(--bg); color:var(--text); height:100vh; overflow:hidden; display:flex; flex-direction:column; }

.top-bar { padding:12px 20px; border-bottom:1px solid var(--border); background:var(--surface); display:flex; align-items:center; justify-content:space-between; flex-shrink:0; gap:20px; }
.top-bar h1 { font-size:14px; font-weight:600; margin-bottom:4px; }
.top-bar .meta-grid { display:grid; grid-template-columns:auto auto auto; gap:2px 16px; font-size:11px; }
.top-bar .meta-label { color:var(--muted); }
.top-bar .meta-value { color:var(--text); font-weight:500; }
.top-bar .meta-id { font-size:9px; color:#555; margin-top:3px; }
.top-bar .score-badge { padding:3px 12px; border-radius:12px; font-weight:700; font-size:14px; color:#fff;
background:${scoreBadgeColor}; }

.split { display:flex; flex:1; overflow:hidden; }

/* Left: sequence diagram */
.left-pane { width:50%; overflow:auto; border-right:1px solid var(--border); padding:12px; flex-shrink:0; }
.left-pane svg { display:block; margin:0 auto; }
svg text { font-family:'SF Mono','Fira Code',monospace; }
.arrow-group:hover .arrow-line { stroke-width:3 !important; }
.arrow-group.active .arrow-line { stroke-width:3.5 !important; filter:drop-shadow(0 0 4px currentColor); }
.arrow-group.active text { font-weight:700 !important; }

.legend { display:flex; gap:14px; justify-content:center; padding:8px; font-size:10px; color:var(--muted); flex-shrink:0; }
.legend span { display:flex; align-items:center; gap:3px; }
.legend .sw { width:12px; height:3px; border-radius:2px; }

/* Right: transcript (scrolls independently) */
.right-pane { width:50%; overflow-y:auto; padding:12px 16px; }

.entry { padding:10px 12px; margin:4px 0; border-radius:6px; border:1px solid transparent; cursor:pointer; transition:all 0.15s; }
.entry:hover { background:rgba(255,255,255,0.03); border-color:var(--border); }
.entry.active { background:rgba(88,166,255,0.08); border-color:#58a6ff; box-shadow:0 0 12px rgba(88,166,255,0.15); }
.entry-speaker { font-size:10px; font-weight:700; text-transform:uppercase; letter-spacing:0.5px; margin-bottom:4px; display:flex; align-items:center; gap:6px; }
.entry-model { font-weight:400; color:var(--muted); font-size:9px; }
.entry-content { font-size:12px; line-height:1.6; color:#ccc; white-space:pre-wrap; word-wrap:break-word; }

.badge { font-size:9px; padding:1px 6px; border-radius:8px; font-weight:600; }
.badge.approved { background:rgba(102,187,106,0.2); color:#66bb6a; }
.badge.revise { background:rgba(255,112,67,0.2); color:#ff7043; }

/* Judge panel — collapsible below split */
.judge-panel { flex-shrink:0; border-top:1px solid var(--border); background:var(--surface); }
.judge-toggle { padding:10px 20px; cursor:pointer; font-size:11px; text-transform:uppercase; letter-spacing:1.5px; color:var(--muted); font-weight:600; list-style:none; user-select:none; }
.judge-toggle::-webkit-details-marker { display:none; }
.judge-toggle::before { content:'▸ '; }
.judge-panel[open] .judge-toggle::before { content:'▾ '; }
.judge-body { padding:4px 20px 16px; max-height:50vh; overflow-y:auto; }
table { width:100%; border-collapse:collapse; font-size:11px; }
tr { border-bottom:1px solid #1e1e1e; }
.jd { padding:5px 8px; font-weight:500; white-space:nowrap; color:#ccc; }
.js { padding:5px 6px; text-align:center; font-weight:700; }
.jb { padding:5px 8px; }
.jr { padding:5px 8px; color:var(--muted); font-size:10px; }
.bar-bg { background:#262626; border-radius:3px; height:5px; width:80px; }
.bar-fg { border-radius:3px; height:5px; }

.qual-item { margin-bottom:12px; }
.qual-label { font-weight:600; color:#90caf9; font-size:11px; margin-bottom:3px; }
.qual-text { color:#aaa; font-size:11px; line-height:1.6; }
.qual-ktp { margin:12px 0; padding:10px; background:#1a237e; border-radius:6px; }
.qual-tags { margin-top:8px; }
.tag { display:inline-block; padding:2px 8px; margin:2px; border-radius:10px; font-size:10px; font-weight:600; background:#263238; color:#80cbc4; }
</style>
</head>
<body>

<div class="top-bar">
<div>
<h1>${escapeHtml(scenario?.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()) || '')}</h1>
<div class="meta-grid">
<span class="meta-label">Cell</span><span class="meta-value">${escapeHtml(profile)}</span><span class="meta-value">${condLabel}${turnsLabel}</span>
<span class="meta-label">Tutor</span><span class="meta-value">ego ${escapeHtml(shortModel(meta.egoModel))}</span><span class="meta-value">superego ${escapeHtml(tutorSuperegoLabel)}</span>
<span class="meta-label">Learner</span><span class="meta-value">ego ${escapeHtml(shortModel(meta.learnerEgoModel))}</span><span class="meta-value">superego ${escapeHtml(shortModel(meta.learnerSuperegoModel))}</span>
<span class="meta-label">Judge</span><span class="meta-value">${escapeHtml(shortModel(meta.judgeModel))}</span><span></span>
</div>
<div class="meta-id">${escapeHtml(meta.runId)} · ${escapeHtml(meta.dialogueId)}</div>
</div>
<span class="score-badge">${score}</span>
</div>

<div class="legend">
<span><span class="sw" style="background:#78909c"></span> Front stage</span>
<span><span class="sw" style="background:#ef5350"></span> L.Superego</span>
<span><span class="sw" style="background:#ab47bc"></span> L.Ego</span>
<span><span class="sw" style="background:#42a5f5"></span> T.Ego</span>
<span><span class="sw" style="background:#66bb6a"></span> T.Superego</span>
</div>

<div class="split">
<div class="left-pane">
<svg width="${svgWidth + 20}" height="${svgHeight}" xmlns="http://www.w3.org/2000/svg">${svg}</svg>
</div>
<div class="right-pane" id="transcript">
${transcriptHtml}
</div>
</div>

${(judgeRows || qualHtml) ? `<details class="judge-panel">
<summary class="judge-toggle">Judge Adjudication &mdash; ${score}/100</summary>
<div class="judge-body">
${judgeRows ? `<table>${judgeRows}</table>` : ''}
${qualHtml ? `<div style="margin-top:16px">${qualHtml}</div>` : ''}
</div>
</details>` : ''}

<script>
let activeStep = -1;
function highlight(idx) {
// Clear previous
document.querySelectorAll('.arrow-group.active').forEach(g => g.classList.remove('active'));
document.querySelectorAll('.entry.active').forEach(e => e.classList.remove('active'));

// Activate
const arrow = document.querySelector('.arrow-group[data-step="'+idx+'"]');
const entry = document.getElementById('entry-' + idx);
if (arrow) arrow.classList.add('active');
if (entry) {
entry.classList.add('active');
entry.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
// Also scroll SVG to show the arrow
if (arrow) {
const rect = arrow.getBoundingClientRect();
const pane = document.querySelector('.left-pane');
const paneRect = pane.getBoundingClientRect();
if (rect.top < paneRect.top + 60 || rect.bottom > paneRect.bottom - 20) {
const y = pane.scrollTop + rect.top - paneRect.top - paneRect.height / 2;
pane.scrollTo({ top: y, behavior: 'smooth' });
}
}
activeStep = idx;
}

// Keyboard navigation
document.addEventListener('keydown', e => {
const maxStep = document.querySelectorAll('.arrow-group').length - 1;
if (e.key === 'ArrowDown' || e.key === 'j') { e.preventDefault(); highlight(Math.min(activeStep + 1, maxStep)); }
if (e.key === 'ArrowUp' || e.key === 'k') { e.preventDefault(); highlight(Math.max(activeStep - 1, 0)); }
});
</script>
</body>
</html>`;
}
618
+
619
// ── Main loop ────────────────────────────────────────────────────────────────
//
// For each result row: find the dialogue log whose filename contains the
// result's dialogue id, turn its trace into sequence steps, render a standalone
// HTML diagram and write it into outputDir. Results that cannot be rendered
// (no log, unreadable log, empty trace, no steps) are reported and skipped.

fs.mkdirSync(outputDir, { recursive: true });
const rendered = [];

// Directory listing of LOGS_DIR, read on first use and reused for every result
// (the loop only writes into outputDir, so the listing is loop-invariant).
let logFileNames = null;

// Learner profiles from config/learner-agents.yaml, loaded at most once.
let learnerProfiles = null;
const getLearnerProfiles = () => {
  if (learnerProfiles === null) {
    try {
      const learnerYaml = YAML.parse(
        fs.readFileSync(path.join(process.cwd(), 'config/learner-agents.yaml'), 'utf8'),
      );
      learnerProfiles = learnerYaml?.profiles ?? {};
    } catch (err) {
      // Best effort: the header simply shows blank learner models, but say why.
      console.log(`  ⚠ Could not load config/learner-agents.yaml (${err.message}); learner models left blank`);
      learnerProfiles = {};
    }
  }
  return learnerProfiles;
};

// Formats an agent config entry as "provider.model" (provider part optional).
const describeAgentModel = (agent) =>
  `${agent?.provider ? `${agent.provider}.` : ''}${agent?.model || ''}`;

try {
  for (const result of results) {
    const dialogueId = result.dialogue_id;

    if (logFileNames === null) {
      logFileNames = fs.readdirSync(LOGS_DIR);
    }
    // An empty id would match every filename via includes(''), so treat a
    // missing/empty id as "no log file" instead of rendering an arbitrary log.
    const logFiles = dialogueId ? logFileNames.filter((f) => f.includes(dialogueId)) : [];

    if (logFiles.length === 0) {
      console.log(`  ⚠ No log file for ${dialogueId}, skipping`);
      continue;
    }

    const logPath = path.join(LOGS_DIR, logFiles[0]);
    let log;
    try {
      log = JSON.parse(fs.readFileSync(logPath, 'utf8'));
    } catch (err) {
      // One unreadable/corrupt log should not abort the remaining diagrams.
      console.log(`  ⚠ Could not read log for ${dialogueId} (${err.message}), skipping`);
      continue;
    }
    const trace = log.consolidatedTrace || log.dialogueTrace || [];

    if (trace.length === 0) {
      console.log(`  ⚠ Empty trace for ${dialogueId}, skipping`);
      continue;
    }

    const steps = traceToSteps(trace);
    if (steps.length === 0) {
      console.log(`  ⚠ No sequence steps for ${dialogueId}, skipping`);
      continue;
    }

    // Collect metadata from log + DB result for the header.
    // Learner models are resolved from the learner config profile named by the
    // log's learnerArchitecture (defaulting to 'unified').
    let learnerEgoModel = '';
    let learnerSuperegoModel = '';
    const arch = log.learnerArchitecture || 'unified';
    const prof = getLearnerProfiles()?.[arch];
    if (prof?.ego) {
      learnerEgoModel = describeAgentModel(prof.ego);
      learnerSuperegoModel = describeAgentModel(prof.superego);
    } else if (prof?.unified_learner) {
      // A unified learner has a single model; report it for both roles.
      learnerEgoModel = describeAgentModel(prof.unified_learner);
      learnerSuperegoModel = learnerEgoModel;
    }

    const meta = {
      runId,
      egoModel: result.ego_model || log.model || '',
      superegoModel: result.superego_model || '',
      judgeModel: result.judge_model || '',
      learnerArch: log.learnerArchitecture || '',
      learnerEgoModel,
      learnerSuperegoModel,
      totalTurns: log.totalTurns || '',
      dialogueId,
    };

    const html = generateHtml(result, steps, trace, meta);
    const filename = `sequence-${result.profile_name}-${result.scenario_id}-${result.overall_score?.toFixed(0) || '0'}.html`;
    const outPath = path.join(outputDir, filename);

    fs.writeFileSync(outPath, html);
    rendered.push(outPath);
    // Show 'n/a' rather than the literal "undefined" when the result has no score.
    console.log(`  ✓ ${filename} (${steps.length} steps, score ${result.overall_score?.toFixed(1) ?? 'n/a'})`);
  }

  console.log(`\nRendered ${rendered.length} diagram(s) to ${outputDir}/`);

  if (shouldOpen && rendered.length > 0) {
    try {
      // NOTE(review): the path is interpolated into a shell command; a filename
      // containing `"` or `$` would break the quoting — confirm that profile and
      // scenario ids are shell-safe. Failures to launch are deliberately ignored.
      execSync(`open "${rendered[0]}"`);
      console.log(`Opened: ${path.basename(rendered[0])}`);
    } catch { /* ignore */ }
  }
} finally {
  // Close the database handle even if rendering threw part-way through.
  db.close();
}