@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,2626 @@
1
+ #!/usr/bin/env node
2
+
3
+ import 'dotenv/config';
4
+
5
+ /**
6
+ * Evaluation CLI
7
+ *
8
+ * Command-line interface for running tutor evaluations.
9
+ *
10
+ * Usage:
11
+ * node scripts/eval-cli.js # List available options
12
+ * node scripts/eval-cli.js quick # Run a quick test with defaults
13
+ * node scripts/eval-cli.js test # Run a quick test (alias)
14
+ * node scripts/eval-cli.js run # Run 2x2x2 factorial evaluation (default)
15
+ * node scripts/eval-cli.js runs # List past evaluation runs
16
+ * node scripts/eval-cli.js report <runId> # Show report for a previous run
17
+ * node scripts/eval-cli.js transcript <runId> # Show full transcripts for a run
18
+ * node scripts/eval-cli.js status <runId> # Quick snapshot of a run's state
19
+ * node scripts/eval-cli.js watch <runId> # Live-updating progress table
20
+ * node scripts/eval-cli.js export <runId> # Export results to file for offline review
21
+ * node scripts/eval-cli.js cleanup # Preview stale runs (dry-run by default)
22
+ * node scripts/eval-cli.js cleanup --force # Actually mark stale runs as completed
23
+ * node scripts/eval-cli.js resume <runId> # Resume an incomplete run (re-run missing tests)
24
+ * node scripts/eval-cli.js revert <runId> # Revert a completed/failed run to 'running'
25
+ * node scripts/eval-cli.js rejudge <runId> # Re-run AI judge (adds new rows for reliability)
26
+ * node scripts/eval-cli.js rejudge <runId> --overwrite # Re-run AI judge (replaces existing)
27
+ * node scripts/eval-cli.js evaluate <runId> # Judge skip-rubric results via claude CLI
28
+ * node scripts/eval-cli.js evaluate <runId> --follow # Poll & judge results as they appear
29
+ * node scripts/eval-cli.js evaluate-learner <runId> # Score learner turns from multi-turn interactions
30
+ * node scripts/eval-cli.js chat # AI conversational interface
31
+ *
32
+ * Options:
33
+ * --scenario <id> Scenario ID or comma-separated IDs (default: all scenarios)
34
+ * --cluster <name> Scenario cluster filter: single-turn, multi-turn, core, mood, benchmark, recognition, multi_turn (comma-separated OK)
35
+ * --profile <name> Override profile(s) — comma-separated or single name
36
+ * --all-profiles Use ALL profiles instead of the 8 factorial cells
37
+ * --skip-rubric Skip AI-based rubric evaluation
38
+ * --verbose Enable verbose output
39
+ * --runs <n> Replications per cell (for 'run' command, default: 1)
40
+ * --parallelism <n> Parallel test count (for 'run' command, default: 2)
41
+ * --description <text> Description for the evaluation run
42
+ * --db Use SQLite instead of JSONL for 'watch' (slower but persistent)
43
+ * --follow Poll for new results in 'evaluate' (live follow mode)
44
+ * --refresh <ms> Refresh interval for 'watch' (default: 2000) or 'evaluate --follow' (default: 5000)
45
+ * --force Actually complete stale runs (for 'cleanup'; dry-run without it)
46
+ * --older-than <min> Staleness threshold in minutes (for 'cleanup', default: 30)
47
+ *
48
+ * The default `run` uses the 2x2x2 factorial design:
49
+ * Factor A: Recognition prompts (off / on)
50
+ * Factor B: Multi-agent tutor (single / ego+superego)
51
+ * Factor C: Multi-agent learner (unified / ego_superego)
52
+ * = 8 cells, all nemotron (free tier) to isolate architecture effects.
53
+ *
54
+ * Examples:
55
+ * eval-cli.js run --runs 3 # 8 cells × all scenarios × 3 reps
56
+ * eval-cli.js run --runs 1 --scenario new_user_first_visit # Quick single-scenario check
57
+ * eval-cli.js run --cluster multi-turn --runs 1 # Only multi-turn scenarios
58
+ * eval-cli.js run --cluster core,mood --runs 1 # Core + mood scenarios
59
+ * eval-cli.js run --profile budget,baseline # Override: only these profiles
60
+ * eval-cli.js run --all-profiles --runs 1 # Legacy: every profile in tutor-agents.yaml
61
+ */
62
+
63
+ import * as evaluationRunner from '../services/evaluationRunner.js';
64
+ import * as anovaStats from '../services/anovaStats.js';
65
+ import * as evaluationStore from '../services/evaluationStore.js';
66
+ import { getAvailableJudge, buildEvaluationPrompt, calculateOverallScore, calculateBaseScore, calculateRecognitionScore } from '../services/rubricEvaluator.js';
67
+ import { buildLearnerEvaluationPrompt, calculateLearnerOverallScore } from '../services/learnerRubricEvaluator.js';
68
+ import { readProgressLog, getProgressLogPath } from '../services/progressLogger.js';
69
+ import * as evalConfigLoader from '../services/evalConfigLoader.js';
70
+ const { getScenario } = evalConfigLoader;
71
+ import { spawn } from 'child_process';
72
+ import readline from 'readline';
73
+ import fs from 'fs';
74
+ import path from 'path';
75
+ import { fileURLToPath } from 'url';
76
+
77
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
78
+ const LOGS_DIR = path.resolve(__dirname, '..', 'logs', 'tutor-dialogues');
79
+
80
+ const args = process.argv.slice(2);
81
+ const command = args.find(a => !a.startsWith('--')) || 'list';
82
+
83
+ function getFlag(name) {
84
+ return args.includes(`--${name}`);
85
+ }
86
+
87
+ function getOption(name, defaultValue = undefined) {
88
+ const idx = args.indexOf(`--${name}`);
89
+ if (idx === -1 || idx + 1 >= args.length) return defaultValue;
90
+ return args[idx + 1];
91
+ }
92
+
93
+ import { isPidAlive } from '../services/processUtils.js';
94
+
95
+ // ── watch / status helpers ────────────────────────────────────────
96
+
97
/**
 * Render a millisecond duration as a short human-readable string:
 * under 1s → "123ms", under 1min → "4.5s", otherwise → "2m 5s".
 */
function formatMs(ms) {
  if (ms >= 60000) {
    const minutes = Math.floor(ms / 60000);
    const seconds = Math.round((ms % 60000) / 1000);
    return `${minutes}m ${seconds}s`;
  }
  return ms >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${ms}ms`;
}
104
+
105
/**
 * Format a single dialogue trace entry for display.
 * Supports both the legacy shape ({role|speaker, content|message|text})
 * and the structured multi-agent shape ({agent, action, ...}).
 */
function formatTraceEntry(entry) {
  // Legacy entries carry an explicit role/speaker field.
  if (entry.role || entry.speaker) {
    const who = (entry.role || entry.speaker).toUpperCase();
    const body = entry.content || entry.message || entry.text || '';
    return `[${who}] ${body}`;
  }

  const agentName = (entry.agent || 'unknown').toUpperCase();
  const actionName = entry.action || '';

  if (actionName === 'context_input') {
    const data = entry.contextData || {};
    const bits = [];
    if (data.currentPage) bits.push(data.currentPage.replace(/^\*+:\s*/, ''));
    if (data.strugglesCount) bits.push(`${data.strugglesCount} struggle signals`);
    if (data.sessions) bits.push(`${data.sessions} prior sessions`);
    return `[CONTEXT] ${bits.length ? bits.join(', ') : '(scenario input)'}`;
  }

  if (actionName === 'generate') {
    const titleList = (entry.suggestions || []).map(s => s.title || s.type).join('; ');
    return `[EGO → SUPEREGO] Generated: ${titleList}`;
  }

  if (actionName === 'review') {
    const verdict = entry.verdict || {};
    const wasApproved = entry.approved ?? verdict.approved;
    const label = wasApproved ? '✓ APPROVED' : '→ REVISE';
    const fb = entry.feedback || verdict.feedback || '';
    // Keep reviewer feedback to a single readable line.
    const short = fb.length > 200 ? fb.substring(0, 200) + '…' : fb;
    return `[SUPEREGO ${label}] ${short}`;
  }

  if (actionName === 'revise') {
    const titleList = (entry.suggestions || []).map(s => s.title || s.type).join('; ');
    return `[EGO revised] ${titleList}`;
  }

  if (actionName === 'final_output') {
    const summary = entry.contextSummary || entry.detail || `Turn ${(entry.turnIndex || 0) + 1} complete`;
    return `[OUTPUT] ${summary}`;
  }

  if (actionName === 'turn_action') {
    return `[LEARNER] ${entry.contextSummary || entry.detail || ''}`;
  }

  // Unknown action: show the raw agent/action pair with best-effort content.
  const fallback = entry.content || entry.message || entry.text || entry.contextSummary || actionName;
  return `[${agentName}:${actionName}] ${fallback}`;
}
161
+
162
/**
 * Build a scenario×profile grid from JSONL progress events.
 *
 * Scans run_start / test_complete / test_error / run_complete events and
 * aggregates them into a lookup table keyed by scenario name, then profile.
 *
 * @param {Array<object>} events - Parsed JSONL event objects in log order.
 * @returns {{scenarios: string[], profiles: string[], grid: object,
 *            completedTests: number, totalTests: number, runDone: boolean,
 *            durationMs: (number|null), isResumed: boolean}}
 */
function buildGridFromEvents(events) {
  let scenarios = [];
  let profiles = [];
  let originalTotalTests = 0; // From first run_start (original plan)
  let completedTests = 0;
  let runDone = false;
  let durationMs = null;
  let isResumed = false;
  const grid = {}; // grid[scenarioName][profileName] = { score, success, ... }

  for (const ev of events) {
    if (ev.eventType === 'run_start') {
      scenarios = ev.scenarios || [];
      profiles = ev.profiles || [];
      // Keep the FIRST run_start's totalTests (original plan); a resume
      // emits a second run_start with a smaller remaining count.
      if (originalTotalTests === 0) {
        originalTotalTests = ev.totalTests || 0;
      } else {
        isResumed = true; // This is a resume
      }
    } else if (ev.eventType === 'test_complete') {
      // Count actual events instead of relying on per-event completedCount
      completedTests++;
      const sName = ev.scenarioName || ev.scenarioId;
      const pName = ev.profileName || '?';
      if (!grid[sName]) grid[sName] = {};
      grid[sName][pName] = {
        score: ev.overallScore,
        success: ev.success,
        latencyMs: ev.latencyMs,
      };
    } else if (ev.eventType === 'test_error') {
      completedTests++;
      const sName = ev.scenarioName || ev.scenarioId;
      const pName = ev.profileName || '?';
      if (!grid[sName]) grid[sName] = {};
      grid[sName][pName] = {
        score: null,
        success: false,
        error: ev.errorMessage,
      };
    } else if (ev.eventType === 'run_complete') {
      runDone = true;
      durationMs = ev.durationMs;
    }
  }

  // If no run_start was found (e.g. truncated log), infer the axes from grid.
  if (scenarios.length === 0) {
    scenarios = Object.keys(grid);
  }
  if (profiles.length === 0) {
    const profileSet = new Set();
    for (const scenarioData of Object.values(grid)) {
      for (const profile of Object.keys(scenarioData)) {
        profileSet.add(profile);
      }
    }
    profiles = [...profileSet];
  }

  // Bug fix: isResumed was previously computed but dropped from the result;
  // expose it so callers can tell a resumed run from a fresh one.
  return { scenarios, profiles, grid, completedTests, totalTests: originalTotalTests, runDone, durationMs, isResumed };
}
229
+
230
/**
 * Render the scenario×profile grid as a plain-text table, preceded by a
 * one-line progress summary. Expects the shape from buildGridFromEvents().
 */
function renderGrid({ scenarios, profiles, grid, completedTests, totalTests, runDone, durationMs }) {
  const out = [];
  const pct = totalTests > 0 ? Math.round((completedTests / totalTests) * 100) : 0;
  out.push(`Progress: ${completedTests}/${totalTests} (${pct}%)${runDone ? ' DONE' : ' running...'}${durationMs ? ` ${formatMs(durationMs)}` : ''}`);
  out.push('');

  // Column widths track the longest scenario/profile names, with minimums.
  const nameWidth = Math.max(20, ...scenarios.map(s => s.length));
  const colWidth = Math.max(8, ...profiles.map(p => p.length));

  const headerRow = ''.padEnd(nameWidth) + ' | ' + profiles.map(p => p.padEnd(colWidth)).join(' | ');
  out.push(headerRow);
  out.push('-'.repeat(headerRow.length));

  for (const scenario of scenarios) {
    const rendered = [];
    for (const profile of profiles) {
      const cell = grid[scenario]?.[profile];
      let text;
      if (!cell) {
        text = '';
      } else if (cell.error) {
        text = 'ERR';
      } else if (!cell.success) {
        text = 'FAIL';
      } else {
        text = cell.score != null ? cell.score.toFixed(1) : '--';
      }
      rendered.push(text.padEnd(colWidth));
    }
    out.push(scenario.substring(0, nameWidth).padEnd(nameWidth) + ' | ' + rendered.join(' | '));
  }

  return out.join('\n');
}
264
+
265
+ // ── chat command ─────────────────────────────────────────────────
266
+
267
// OpenAI-style function-calling tool schema exposed to the chat model in
// `runChat`. Each tool's `name` maps to a case in `executeTool`.
const CHAT_TOOLS = [
  {
    type: 'function',
    function: {
      name: 'list_runs',
      description: 'List recent evaluation runs. Returns run IDs, statuses, scores, and descriptions.',
      parameters: {
        type: 'object',
        properties: {
          limit: { type: 'number', description: 'Max runs to return (default 20)' },
          status: { type: 'string', description: 'Filter by status: running, completed, failed' },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'get_run_report',
      description: 'Generate a full text report for a run including rankings, dimension breakdown, scenario performance, and ANOVA.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'get_transcript',
      description: 'Get dialogue transcripts for a run, optionally filtered to a single scenario.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
          scenarioId: { type: 'string', description: 'Optional scenario ID to filter' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'run_anova',
      description: 'Run a 2x2x2 three-way ANOVA on factorial cell data for a given run. Requires factor-tagged results.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'run_evaluation',
      description: 'Start a new evaluation run. Can specify scenarios, profiles, cluster filters, and replications.',
      parameters: {
        type: 'object',
        properties: {
          scenarios: {
            type: 'array',
            items: { type: 'string' },
            description: 'Scenario IDs to run (omit for all)',
          },
          profiles: {
            type: 'array',
            items: { type: 'string' },
            description: 'Profile names to test (omit for default factorial)',
          },
          cluster: {
            type: 'string',
            description: 'Scenario cluster filter: single-turn, multi-turn, or category names (core, mood, benchmark, recognition, multi_turn). Comma-separated for multiple.',
          },
          runs: { type: 'number', description: 'Replications per cell (default 1)' },
          description: { type: 'string', description: 'Description for this run' },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'quick_test',
      description: 'Run a quick single-scenario test with one profile.',
      parameters: {
        type: 'object',
        properties: {
          scenarioId: { type: 'string', description: 'Scenario ID (default: new_user_first_visit)' },
          profile: { type: 'string', description: 'Profile name (default: budget)' },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'cleanup_stale',
      description: 'Find and optionally complete stale runs stuck in "running" state.',
      parameters: {
        type: 'object',
        properties: {
          olderThanMinutes: { type: 'number', description: 'Staleness threshold (default 30)' },
          force: { type: 'boolean', description: 'Actually complete them (default false = dry run)' },
        },
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'list_options',
      description: 'List available scenarios, configurations, and profiles.',
      parameters: { type: 'object', properties: {} },
    },
  },
  {
    type: 'function',
    function: {
      name: 'export_results',
      description: 'Export full results for a run as JSON (run metadata, stats, scenario stats, individual results).',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'complete_run',
      description: 'Mark an incomplete run as completed with whatever results exist.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'revert_run',
      description: 'Revert a completed/failed run back to "running" status.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  {
    type: 'function',
    function: {
      name: 'get_run_status',
      description: 'Get detailed status of a run including per-profile stats and scenario breakdown.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
];
446
+
447
/**
 * Clamp tool output to maxLen characters so it fits in a chat context.
 * Non-string values are pretty-printed as JSON first; truncated output
 * gains a trailing note with the omitted character count.
 */
function truncate(str, maxLen = 4000) {
  const text = typeof str === 'string' ? str : JSON.stringify(str, null, 2);
  if (text.length <= maxLen) return text;
  const omitted = text.length - maxLen;
  return `${text.slice(0, maxLen)}\n... (truncated, ${omitted} chars omitted)`;
}
452
+
453
/**
 * Dispatch a single chat tool call by name.
 * Each case maps a CHAT_TOOLS entry to the corresponding service call and
 * returns a string for the model; large payloads are clipped via truncate().
 *
 * @param {string} name - Tool name from the model's tool_call.
 * @param {object} params - Parsed JSON arguments for the tool.
 * @returns {Promise<string>} Result text (or an error/usage message).
 */
async function executeTool(name, params) {
  switch (name) {
    case 'list_runs': {
      const runs = evaluationStore.listRuns({
        limit: params.limit || 20,
        status: params.status || null,
      });
      return JSON.stringify(runs, null, 2);
    }
    case 'get_run_report': {
      const report = evaluationRunner.generateReport(params.runId);
      return truncate(report);
    }
    case 'get_transcript': {
      const results = evaluationStore.getResults(params.runId, {
        scenarioId: params.scenarioId || null,
      });
      if (results.length === 0) return 'No results found for this run.';

      const lines = [];
      for (const r of results) {
        lines.push(`--- ${r.scenarioName || r.scenarioId} | ${r.profileName} | score=${r.overallScore?.toFixed(1) ?? '--'} ---`);
        let printed = false;
        if (r.dialogueId) {
          // Dialogue logs live in LOGS_DIR as JSON files whose names
          // contain the dialogue ID; take the first match.
          const files = fs.existsSync(LOGS_DIR)
            ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(r.dialogueId))
            : [];
          if (files.length > 0) {
            try {
              const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
              for (const entry of (dialogue.dialogueTrace || [])) {
                lines.push(`[${(entry.role || 'unknown').toUpperCase()}] ${entry.content || ''}`);
              }
              printed = true;
            } catch (e) { /* fall through */ }
          }
        }
        // Fallback when no trace file was readable: list raw suggestions.
        if (!printed && r.suggestions?.length > 0) {
          lines.push('Suggestions:');
          for (const s of r.suggestions) {
            lines.push(` • ${typeof s === 'string' ? s : (s.text || s.message || JSON.stringify(s))}`);
          }
        }
        if (r.evaluationReasoning) lines.push(`Judge: ${r.evaluationReasoning}`);
        lines.push('');
      }
      return truncate(lines.join('\n'));
    }
    case 'run_anova': {
      // One ANOVA per score column; columns with ≤ 8 samples are reported
      // as insufficient rather than analysed.
      const scoreTypes = [
        { column: 'overall_score', label: 'Overall Score' },
        { column: 'base_score', label: 'Base Score' },
        { column: 'recognition_score', label: 'Recognition Score' },
      ];
      const parts = [];
      for (const { column, label } of scoreTypes) {
        const cellData = evaluationStore.getFactorialCellData(params.runId, { scoreColumn: column });
        const totalSamples = Object.values(cellData).reduce((s, arr) => s + arr.length, 0);
        if (totalSamples === 0) continue;
        if (totalSamples <= 8) {
          parts.push(`${label}: Only ${totalSamples} samples — need > 8 for ANOVA.`);
          continue;
        }
        const result = anovaStats.runThreeWayANOVA(cellData);
        parts.push(anovaStats.formatANOVAReport(result, { scoreLabel: label }));
      }
      return parts.length > 0 ? parts.join('\n') : 'No factorial cell data found for this run.';
    }
    case 'run_evaluation': {
      // 'all' / 'factorial' are sentinel strings understood by the runner.
      const scenarios = params.scenarios?.length > 0 ? params.scenarios : 'all';
      let configurations = 'factorial';
      if (params.profiles?.length > 0) {
        configurations = params.profiles.map(name => ({
          provider: null, model: null, profileName: name, label: name,
        }));
      }
      const result = await evaluationRunner.runEvaluation({
        scenarios,
        configurations,
        runsPerConfig: params.runs || 1,
        description: params.description || 'Chat-initiated evaluation',
        scenarioFilter: params.cluster || null,
      });
      return JSON.stringify(result, null, 2);
    }
    case 'quick_test': {
      const config = { profileName: params.profile || 'budget' };
      const result = await evaluationRunner.quickTest(config, {
        scenarioId: params.scenarioId || 'new_user_first_visit',
      });
      return truncate(JSON.stringify(result, null, 2));
    }
    case 'cleanup_stale': {
      // Dry-run unless the model explicitly passes force=true.
      const result = evaluationStore.autoCompleteStaleRuns({
        olderThanMinutes: params.olderThanMinutes || 30,
        dryRun: !params.force,
      });
      return JSON.stringify(result, null, 2);
    }
    case 'list_options': {
      const opts = evaluationRunner.listOptions();
      return truncate(JSON.stringify({
        scenarios: opts.scenarios.map(s => ({ id: s.id, name: s.name, isMultiTurn: s.isMultiTurn })),
        profiles: opts.profiles?.map(p => ({ name: p.name, description: p.description })),
      }, null, 2));
    }
    case 'export_results': {
      const data = evaluationStore.exportToJson(params.runId);
      return truncate(JSON.stringify(data, null, 2));
    }
    case 'complete_run': {
      const result = evaluationStore.completeRun(params.runId);
      return JSON.stringify(result, null, 2);
    }
    case 'revert_run': {
      const run = evaluationStore.getRun(params.runId);
      if (!run) return `Run not found: ${params.runId}`;
      if (run.status === 'running') return `Run ${params.runId} is already running.`;
      evaluationStore.updateRun(params.runId, { status: 'running' });
      return `Reverted run ${params.runId} from '${run.status}' to 'running'.`;
    }
    case 'get_run_status': {
      const runData = evaluationRunner.getRunResults(params.runId);
      return truncate(JSON.stringify({
        run: runData.run,
        stats: runData.stats,
        resultCount: runData.results.length,
      }, null, 2));
    }
    default:
      return `Unknown tool: ${name}`;
  }
}
586
+
587
/**
 * POST a chat-completion request (with the eval tool schema attached) to
 * OpenRouter and return the parsed JSON response. Throws on any non-2xx
 * status, including up to 300 characters of the response body.
 */
async function callOpenRouter(messages, model, apiKey) {
  const payload = {
    model,
    messages,
    tools: CHAT_TOOLS,
    temperature: 0.3,
    max_tokens: 4096,
  };

  const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(payload),
  });

  if (response.ok) return response.json();

  const errBody = await response.text().catch(() => '');
  throw new Error(`OpenRouter API error: ${response.status} — ${errBody.slice(0, 300)}`);
}
610
+
611
/**
 * Interactive chat REPL over stdin/stdout. User input is sent to OpenRouter
 * with the CHAT_TOOLS schema; any tool calls the model makes are executed
 * locally via executeTool() and fed back until the model returns plain text.
 * Exits the process on "quit"/"exit" or when stdin closes.
 */
async function runChat() {
  const judge = getAvailableJudge();
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) {
    console.error('OPENROUTER_API_KEY not set. Required for chat mode.');
    process.exit(1);
  }
  // NOTE(review): `model` is computed but never used below — `chatModel` is
  // what gets sent to the API. Looks like dead code; confirm and remove.
  const model = `${judge.provider === 'openrouter' ? '' : judge.provider + '/'}${judge.model}`;
  // OpenRouter expects "provider/model" IDs unless the judge is already an
  // openrouter-hosted model.
  const chatModel = judge.provider === 'openrouter' ? judge.model : `${judge.provider}/${judge.model}`;

  console.log(`\nEval Chat (model: ${chatModel})`);
  console.log('Type your questions about evaluation runs. Use "quit" or "exit" to leave.\n');

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: 'eval> ',
  });

  // Conversation history re-sent on every request: system prompt + turns.
  const messages = [
    {
      role: 'system',
      content: `You are an AI assistant for a tutor evaluation system. You help users inspect evaluation runs, view reports, run ANOVA analyses, start new evaluations, and manage run lifecycle.

You have access to tools that query a SQLite database of evaluation runs and results. Each run tests tutor AI configurations against pedagogical scenarios and scores them with an AI judge.

Key concepts:
- Runs contain multiple test results (scenario × profile combinations)
- The 2×2×2 factorial design tests: Recognition prompts (A), Multi-agent tutor (B), Multi-agent learner (C)
- ANOVA analyses test significance of these factors
- Profiles define tutor configurations (model, architecture, etc.)
- Scenarios define learner situations to evaluate

When showing data, be concise. Summarise key findings rather than dumping raw JSON. Use tables where helpful.
When the user asks to see "recent runs" or "latest", use list_runs.
When asked about a specific run, use get_run_report or get_run_status.
For statistical analysis, use run_anova.
To see available test scenarios and profiles, use list_options.`,
    },
  ];

  const prompt = () => rl.prompt();

  rl.on('close', () => {
    console.log('\nBye.');
    process.exit(0);
  });

  prompt();

  // readline interfaces are async-iterable: each iteration is one user line.
  for await (const line of rl) {
    const input = line.trim();
    if (!input) { prompt(); continue; }
    if (input === 'quit' || input === 'exit') {
      console.log('Bye.');
      process.exit(0);
    }

    messages.push({ role: 'user', content: input });

    try {
      // Tool loop: keep calling the model until it answers with text
      // instead of tool calls.
      let done = false;
      while (!done) {
        const response = await callOpenRouter(messages, chatModel, apiKey);
        const choice = response.choices?.[0];
        if (!choice) {
          console.log('[No response from model]');
          done = true;
          break;
        }

        const msg = choice.message;
        messages.push(msg);

        // Handle tool calls
        if (msg.tool_calls?.length > 0) {
          for (const tc of msg.tool_calls) {
            const fnName = tc.function.name;
            let fnArgs = {};
            try { fnArgs = JSON.parse(tc.function.arguments || '{}'); } catch (e) { /* empty */ }

            process.stdout.write(` [calling ${fnName}...]\n`);
            let result;
            try {
              result = await executeTool(fnName, fnArgs);
            } catch (err) {
              // Report tool failures back to the model rather than crashing.
              result = `Error: ${err.message}`;
            }

            messages.push({
              role: 'tool',
              tool_call_id: tc.id,
              content: typeof result === 'string' ? result : JSON.stringify(result),
            });
          }
          // Loop back to get the model's summary of tool results
        } else {
          // Text response — print it
          const text = msg.content || '';
          console.log(`\n${text}\n`);
          done = true;
        }
      }
    } catch (err) {
      console.error(`\nError: ${err.message}\n`);
    }

    prompt();
  }
}
721
+
722
+ async function main() {
723
+ try {
724
+ switch (command) {
725
+ case 'list': {
726
+ const options = evaluationRunner.listOptions();
727
+
728
+ // Factorial design — the default run mode
729
+ if (options.profiles?.length) {
730
+ const cellProfiles = options.profiles.filter(p => p.name.startsWith('cell_'));
731
+ const regularProfiles = options.profiles.filter(p => !p.name.startsWith('cell_'));
732
+
733
+ if (cellProfiles.length > 0) {
734
+ console.log('\n2x2x2 Factorial Cells (default `run` configuration):');
735
+ console.log(' A: Recognition B: Tutor arch. C: Learner arch.\n');
736
+ for (const p of cellProfiles) {
737
+ const arch = p.dialogueEnabled ? 'ego+superego' : 'single-agent';
738
+ console.log(` ${p.name.padEnd(32)} ${arch.padEnd(14)} ${p.description || ''}`);
739
+ }
740
+ }
741
+
742
+ if (regularProfiles.length > 0) {
743
+ console.log('\nOther Profiles (use --profile <name> or --all-profiles):');
744
+ for (const p of regularProfiles) {
745
+ const ego = p.egoProvider && p.egoModel ? ` [${p.egoProvider}/${p.egoModel}]` : '';
746
+ const dialogue = p.dialogueEnabled ? ` (dialogue: ${p.maxRounds}r)` : ' (single)';
747
+ console.log(` ${p.name}${ego}${dialogue} - ${p.description || ''}`);
748
+ }
749
+ }
750
+ }
751
+
752
+ console.log('\nScenarios:');
753
+ for (const s of options.scenarios) {
754
+ const mt = s.isMultiTurn ? ` [${s.turnCount}T]` : '';
755
+ console.log(` ${s.id}${mt} - ${s.name || s.id}`);
756
+ }
757
+
758
+ console.log('\nProvider Configurations:');
759
+ for (const c of options.configurations) {
760
+ console.log(` ${c.provider}/${c.model}`);
761
+ }
762
+ break;
763
+ }
764
+
765
+ case 'quick':
766
+ case 'test': {
767
+ const scenarioId = getOption('scenario', 'new_user_first_visit');
768
+ const profile = getOption('profile', 'budget');
769
+ const verbose = getFlag('verbose');
770
+ const evalSettingsQt = evalConfigLoader.getEvalSettings();
771
+ const skipRubricEval = getFlag('skip-rubric') || !evalSettingsQt.useAIJudge;
772
+ const config = { profileName: profile };
773
+
774
+ console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId})...\n`);
775
+ const result = await evaluationRunner.quickTest(config, {
776
+ scenarioId,
777
+ verbose,
778
+ skipRubricEval,
779
+ });
780
+ console.log('\nResult:');
781
+ console.log(JSON.stringify(result, null, 2));
782
+ break;
783
+ }
784
+
785
+ case 'run': {
786
+ const verbose = getFlag('verbose');
787
+ // CLI --use-rubric forces rubric on; --skip-rubric forces off; otherwise use config default
788
+ const evalSettings = evalConfigLoader.getEvalSettings();
789
+ const skipRubricEval = getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge);
790
+ const runsPerConfig = parseInt(getOption('runs', '1'), 10);
791
+ const parallelism = parseInt(getOption('parallelism', '2'), 10);
792
+ const description = getOption('description');
793
+ const clusterOpt = getOption('cluster');
794
+ const scenarioOpt = getOption('scenario') || getOption('scenarios');
795
+ const allProfiles = getFlag('all-profiles');
796
+ const modelOverride = getOption('model');
797
+ const egoModelOverride = getOption('ego-model');
798
+ const superegoModelOverride = getOption('superego-model');
799
+
800
+ // --cluster and --scenario are mutually exclusive
801
+ if (clusterOpt && scenarioOpt) {
802
+ console.error('Error: --cluster and --scenario are mutually exclusive.');
803
+ process.exit(1);
804
+ }
805
+
806
+ const scenarios = scenarioOpt
807
+ ? scenarioOpt.split(',').map(s => s.trim())
808
+ : 'all';
809
+
810
+ // Determine configurations: explicit --profile overrides everything,
811
+ // --all-profiles loads every profile, default is the 8 factorial cells.
812
+ const profileOpt = getOption('config') || getOption('profile') || getOption('profiles');
813
+ let configurations;
814
+ let isFactorial = false;
815
+
816
+ if (profileOpt) {
817
+ // Explicit profile selection (single or comma-separated)
818
+ const profileNames = profileOpt.includes(',')
819
+ ? profileOpt.split(',').map(s => s.trim())
820
+ : [profileOpt];
821
+ configurations = profileNames.map(name => ({
822
+ provider: null,
823
+ model: null,
824
+ profileName: name,
825
+ label: name,
826
+ }));
827
+ // Check if the selection happens to be factorial cells
828
+ isFactorial = profileNames.every(n => n.startsWith('cell_'));
829
+ } else if (allProfiles) {
830
+ configurations = 'profiles';
831
+ } else {
832
+ // Default: 2×2×2 factorial design
833
+ isFactorial = true;
834
+ configurations = 'factorial';
835
+ }
836
+
837
+ if (isFactorial) {
838
+ const cellCount = 8;
839
+ console.log('\n2x2x2 Factorial Design');
840
+ console.log(` Factor A: Recognition (off / on)`);
841
+ console.log(` Factor B: Tutor arch. (single / ego+superego)`);
842
+ console.log(` Factor C: Learner arch. (unified / ego_superego)`);
843
+ console.log(` Cells: ${cellCount} | Runs/cell: ${runsPerConfig} | Per scenario: ${cellCount * runsPerConfig}`);
844
+ if (modelOverride) {
845
+ console.log(` Model override: ${modelOverride}`);
846
+ } else if (egoModelOverride || superegoModelOverride) {
847
+ if (egoModelOverride) console.log(` Ego model override: ${egoModelOverride}`);
848
+ if (superegoModelOverride) console.log(` Superego model override: ${superegoModelOverride}`);
849
+ }
850
+ console.log('');
851
+ }
852
+
853
+ if (clusterOpt) {
854
+ console.log(`Cluster filter: ${clusterOpt}\n`);
855
+ }
856
+ console.log('Starting evaluation run...\n');
857
+ const result = await evaluationRunner.runEvaluation({
858
+ scenarios,
859
+ configurations,
860
+ runsPerConfig,
861
+ parallelism,
862
+ skipRubricEval,
863
+ description: description || (isFactorial ? '2x2x2 Factorial Evaluation' : null),
864
+ verbose,
865
+ scenarioFilter: clusterOpt || null,
866
+ modelOverride: modelOverride || null,
867
+ egoModelOverride: egoModelOverride || null,
868
+ superegoModelOverride: superegoModelOverride || null,
869
+ });
870
+ // Extract unique model aliases used across all configs (ego + superego)
871
// Strip a leading "provider." prefix from a model identifier so only the
// model alias remains (e.g. "openrouter.gpt-4o" → "gpt-4o").
// Falsy input yields null; identifiers without a dot pass through unchanged.
const extractAlias = (raw) => {
  if (!raw) return null;
  const sep = raw.indexOf('.');
  return sep === -1 ? raw : raw.slice(sep + 1);
};
876
+ const modelAliases = [...new Set(
877
+ (result.stats || []).flatMap(s => [
878
+ extractAlias(s.egoModel || s.model),
879
+ extractAlias(s.superegoModel),
880
+ ]).filter(Boolean)
881
+ )];
882
+
883
+ console.log('\nEvaluation complete.');
884
+ if (modelAliases.length > 0) {
885
+ console.log(`Models: ${modelAliases.join(', ')}`);
886
+ }
887
+ console.log(JSON.stringify(result, null, 2));
888
+
889
+ // Factorial post-analysis: print cell means and ANOVA for each score type
890
+ if (result.runId) {
891
+ const scoreTypes = [
892
+ { column: 'overall_score', label: 'Overall Score' },
893
+ { column: 'base_score', label: 'Base Score' },
894
+ { column: 'recognition_score', label: 'Recognition Score' },
895
+ ];
896
+
897
+ for (const { column, label } of scoreTypes) {
898
+ const cellData = evaluationStore.getFactorialCellData(result.runId, { scoreColumn: column });
899
+ const cellKeys = Object.keys(cellData);
900
+ const totalSamples = cellKeys.reduce((sum, k) => sum + cellData[k].length, 0);
901
+
902
+ if (totalSamples === 0) continue;
903
+
904
+ console.log('\n' + '='.repeat(70));
905
+ console.log(` FACTORIAL ANALYSIS: ${label.toUpperCase()}`);
906
+ console.log('='.repeat(70));
907
+
908
+ for (const key of cellKeys.sort()) {
909
+ const scores = cellData[key];
910
+ const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
911
+ const sd = scores.length > 1
912
+ ? Math.sqrt(scores.reduce((acc, s) => acc + (s - mean) ** 2, 0) / (scores.length - 1))
913
+ : 0;
914
+ const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
915
+ `Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
916
+ );
917
+ console.log(` ${cellLabel.padEnd(52)} mean=${mean.toFixed(1)} sd=${sd.toFixed(1)} n=${scores.length}`);
918
+ }
919
+
920
+ if (totalSamples > 8) {
921
+ const anovaResult = anovaStats.runThreeWayANOVA(cellData);
922
+ console.log(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
923
+ } else {
924
+ console.log(`\n Need > 8 total samples for ANOVA (have ${totalSamples}). Increase --runs.`);
925
+ }
926
+ }
927
+ }
928
+ break;
929
+ }
930
+
931
+ case 'runs': {
932
+ const limitOpt = getOption('limit');
933
+ const limit = limitOpt ? parseInt(limitOpt, 10) : null;
934
+ const statusFilter = getOption('status') || null;
935
+ const runs = evaluationStore.listRuns({ limit, status: statusFilter });
936
+
937
+ if (runs.length === 0) {
938
+ console.log('\nNo evaluation runs found.');
939
+ break;
940
+ }
941
+
942
+ console.log(`\nEvaluation runs (${runs.length} total):\n`);
943
+ console.log(
944
+ ' ' +
945
+ 'ID'.padEnd(40) +
946
+ 'Status'.padEnd(12) +
947
+ 'Progress'.padEnd(18) +
948
+ 'Avg'.padEnd(7) +
949
+ 'Duration'.padEnd(10) +
950
+ 'Created'.padEnd(24) +
951
+ 'Description'
952
+ );
953
+ console.log(' ' + '-'.repeat(130));
954
+
955
+ for (const run of runs) {
956
+ const created = run.createdAt
957
+ ? new Date(run.createdAt).toLocaleString()
958
+ : '--';
959
+ // Progress: show completed/total (pct%)
960
+ let progress = '--';
961
+ if (run.totalTests > 0) {
962
+ const pct = run.progressPct != null ? run.progressPct : 100;
963
+ progress = `${run.completedResults}/${run.totalTests} (${pct}%)`;
964
+ } else if (run.completedResults > 0) {
965
+ progress = `${run.completedResults} done`;
966
+ }
967
+ const avg = run.avgScore != null ? run.avgScore.toFixed(1) : '--';
968
+ // Duration formatting
969
+ let duration = '--';
970
+ if (run.durationMs != null) {
971
+ const totalSec = Math.round(run.durationMs / 1000);
972
+ const m = Math.floor(totalSec / 60);
973
+ const s = totalSec % 60;
974
+ duration = m > 0 ? `${m}m ${s}s` : `${s}s`;
975
+ }
976
+ const desc = run.description || '';
977
+ const models = (run.models && run.models.length > 0) ? run.models.join(', ') : '--';
978
+ console.log(
979
+ ' ' +
980
+ run.id.padEnd(40) +
981
+ (run.status || '--').padEnd(12) +
982
+ progress.padEnd(18) +
983
+ avg.padEnd(7) +
984
+ duration.padEnd(10) +
985
+ created.padEnd(24) +
986
+ desc
987
+ );
988
+ if (models !== '--') {
989
+ console.log(' ' + ` Models: ${models}`);
990
+ }
991
+ }
992
+ console.log('');
993
+ break;
994
+ }
995
+
996
+ case 'report': {
997
+ const runId = args.find(a => !a.startsWith('--') && a !== 'report');
998
+ if (!runId) {
999
+ console.error('Usage: eval-cli.js report <runId>');
1000
+ process.exit(1);
1001
+ }
1002
+ const report = evaluationRunner.generateReport(runId);
1003
+ console.log(report);
1004
+ break;
1005
+ }
1006
+
1007
+ case 'status': {
1008
+ // Quick snapshot of a run's current state
1009
+ const runId = args.find(a => !a.startsWith('--') && a !== 'status');
1010
+ if (!runId) {
1011
+ console.error('Usage: eval-cli.js status <runId>');
1012
+ process.exit(1);
1013
+ }
1014
+
1015
+ // Try JSONL first for in-progress runs
1016
+ const events = readProgressLog(runId);
1017
+ if (events.length > 0) {
1018
+ let { scenarios, profiles, grid, completedTests, totalTests, runDone, durationMs } = buildGridFromEvents(events);
1019
+
1020
+ // Check if process is still alive (for running runs)
1021
+ let statusLabel = runDone ? 'completed' : 'running';
1022
+ const runData = evaluationRunner.getRunResults(runId);
1023
+ const pid = runData?.run?.metadata?.pid;
1024
+
1025
+ // If JSONL has no run_start (totalTests=0), fall back to DB for the total
1026
+ if (totalTests === 0 && runData?.run) {
1027
+ totalTests = (runData.run.totalScenarios || scenarios.length) * (runData.run.totalConfigurations || profiles.length);
1028
+ }
1029
+
1030
+ // For resumed runs, completed can exceed total - cap display at total
1031
+ const displayCompleted = Math.min(completedTests, totalTests);
1032
+ const pct = totalTests > 0 ? Math.min(100, Math.round((displayCompleted / totalTests) * 100)) : 0;
1033
+ if (!runDone && pid) {
1034
+ const alive = isPidAlive(pid);
1035
+ if (!alive) {
1036
+ statusLabel = `STALE (pid ${pid} dead)`;
1037
+ } else {
1038
+ statusLabel = `running (pid ${pid})`;
1039
+ }
1040
+ }
1041
+
1042
+ console.log(`\nRun: ${runId}`);
1043
+ console.log(`Status: ${statusLabel}`);
1044
+ console.log(`Progress: ${displayCompleted}/${totalTests} tests (${pct}%)${completedTests > totalTests ? ` [${completedTests - totalTests} retried]` : ''}`);
1045
+ if (durationMs) console.log(`Duration: ${formatMs(durationMs)}`);
1046
+ console.log(`Scenarios: ${scenarios.length} | Profiles: ${profiles.length}`);
1047
+
1048
+ // Per-scenario completion counts
1049
+ if (scenarios.length > 0) {
1050
+ console.log('\nScenario completion:');
1051
+ for (const s of scenarios) {
1052
+ const done = profiles.filter(p => grid[s]?.[p]).length;
1053
+ const scores = profiles
1054
+ .filter(p => grid[s]?.[p]?.score != null)
1055
+ .map(p => grid[s][p].score);
1056
+ const avg = scores.length > 0
1057
+ ? (scores.reduce((a, b) => a + b, 0) / scores.length).toFixed(1)
1058
+ : '--';
1059
+ console.log(` ${s}: ${done}/${profiles.length} profiles done, avg=${avg}`);
1060
+ }
1061
+ }
1062
+
1063
+ // Top performers
1064
+ const profileScores = {};
1065
+ for (const s of scenarios) {
1066
+ for (const p of profiles) {
1067
+ const cell = grid[s]?.[p];
1068
+ if (cell?.score != null) {
1069
+ if (!profileScores[p]) profileScores[p] = [];
1070
+ profileScores[p].push(cell.score);
1071
+ }
1072
+ }
1073
+ }
1074
+ const ranked = Object.entries(profileScores)
1075
+ .map(([name, scores]) => ({
1076
+ name,
1077
+ avg: scores.reduce((a, b) => a + b, 0) / scores.length,
1078
+ count: scores.length,
1079
+ }))
1080
+ .sort((a, b) => b.avg - a.avg);
1081
+ if (ranked.length > 0) {
1082
+ console.log('\nTop performers:');
1083
+ for (const r of ranked.slice(0, 5)) {
1084
+ console.log(` ${r.name}: avg=${r.avg.toFixed(1)} (${r.count} tests)`);
1085
+ }
1086
+ }
1087
+ } else {
1088
+ // Fallback: read from SQLite
1089
+ const runData = evaluationRunner.getRunResults(runId);
1090
+ console.log(`\nRun: ${runId}`);
1091
+ console.log(`Status: ${runData.run.status}`);
1092
+ const createdLocal = runData.run.createdAt
1093
+ ? new Date(runData.run.createdAt).toLocaleString()
1094
+ : '--';
1095
+ console.log(`Created: ${createdLocal}`);
1096
+ console.log(`Description: ${runData.run.description || 'N/A'}`);
1097
+ // Count unique (scenario, profile) pairs to handle rejudge duplicates
1098
+ const uniqueTests = new Set(runData.results.map(r => `${r.scenarioId}:${r.profileName}`)).size;
1099
+ console.log(`Tests: ${runData.run.totalTests || uniqueTests}`);
1100
+
1101
+ if (runData.stats.length > 0) {
1102
+ console.log('\nTop performers:');
1103
+ for (const stat of runData.stats.slice(0, 10)) {
1104
+ const label = stat.profileName || `${stat.provider}/${stat.model}`;
1105
+ const base = stat.avgBaseScore != null ? ` base=${stat.avgBaseScore.toFixed(1)}` : '';
1106
+ const recog = stat.avgRecognitionScore != null ? ` recog=${stat.avgRecognitionScore.toFixed(1)}` : '';
1107
+ console.log(` ${label}: avg=${stat.avgScore?.toFixed(1) || '--'}${base}${recog} (${stat.totalTests} tests)`);
1108
+ }
1109
+ }
1110
+ }
1111
+ console.log('');
1112
+ break;
1113
+ }
1114
+
1115
+ case 'watch': {
1116
+ // Live-updating scenario×profile grid table
1117
+ const runId = args.find(a => !a.startsWith('--') && a !== 'watch');
1118
+ if (!runId) {
1119
+ console.error('Usage: eval-cli.js watch <runId> [--refresh 2000] [--db]');
1120
+ process.exit(1);
1121
+ }
1122
+
1123
+ const refreshMs = parseInt(getOption('refresh', '2000'), 10);
1124
+ const useDb = getFlag('db');
1125
+
1126
+ console.log(`Watching run: ${runId} (refresh every ${refreshMs}ms, source: ${useDb ? 'SQLite' : 'JSONL'})`);
1127
+ console.log('Press Ctrl+C to stop.\n');
1128
+
1129
// Render the watch grid from the run's JSONL progress log.
// Returns { output, done }: `output` is the rendered grid text (or a
// waiting message when no events exist yet), `done` mirrors the log's
// run-complete flag.
const renderFromJsonl = () => {
  const events = readProgressLog(runId);
  if (!events.length) {
    return { output: 'Waiting for progress data...', done: false };
  }

  const data = buildGridFromEvents(events);

  // A log without a run_start event leaves totalTests at 0 — recover the
  // total from the DB, or as a last resort infer it from the grid itself.
  if (data.totalTests === 0) {
    try {
      const { run } = evaluationRunner.getRunResults(runId);
      data.totalTests = (run.totalScenarios || 1) * (run.totalConfigurations || 1);
    } catch {
      data.totalTests = data.scenarios.length * data.profiles.length || data.completedTests;
    }
  }

  return { output: renderGrid(data), done: data.runDone };
};
1148
+
1149
// Render the watch grid from SQLite instead of the JSONL progress log.
// Returns { output, done }; on any DB error, `output` carries the error
// message and `done` stays false so polling continues.
const renderFromDb = () => {
  try {
    const runData = evaluationRunner.getRunResults(runId);
    const rows = runData.results || [];

    // Label helpers: prefer human-readable names, fall back to id/model.
    const scenarioLabel = (r) => r.scenarioName || r.scenarioId;
    const profileLabel = (r) => r.profileName || `${r.provider}/${r.model}`;

    const scenarios = [...new Set(rows.map(scenarioLabel))];
    const profiles = [...new Set(rows.map(profileLabel))];

    // Build the scenario×profile grid from the DB rows.
    const grid = {};
    for (const row of rows) {
      const s = scenarioLabel(row);
      const p = profileLabel(row);
      if (!grid[s]) grid[s] = {};
      grid[s][p] = {
        score: row.overallScore,
        success: row.success,
        latencyMs: row.latencyMs,
      };
    }

    const run = runData.run;
    const totalTests = (run.totalScenarios || scenarios.length) * (run.totalConfigurations || profiles.length);
    const done = run.status === 'completed';
    // Count unique (scenario, profile) pairs rather than raw rows —
    // rejudging can insert duplicate rows for the same pair.
    const uniqueCompleted = new Set(rows.map(r => `${r.scenarioId}:${r.profileName}`)).size;

    const output = renderGrid({
      scenarios,
      profiles,
      grid,
      completedTests: uniqueCompleted,
      totalTests,
      runDone: done,
      durationMs: null,
    });
    return { output, done };
  } catch (e) {
    return { output: `Error reading DB: ${e.message}`, done: false };
  }
};
1179
+
1180
+ const render = useDb ? renderFromDb : renderFromJsonl;
1181
+
1182
+ // Initial check — if JSONL doesn't exist yet, wait for it
1183
+ if (!useDb) {
1184
+ const logPath = getProgressLogPath(runId);
1185
+ if (!fs.existsSync(logPath)) {
1186
+ console.log(`Waiting for progress log: ${logPath}`);
1187
+ }
1188
+ }
1189
+
1190
+ // Poll loop
1191
+ let lastOutput = '';
1192
// One refresh tick of the watch loop: redraw the grid only when it has
// changed since the previous tick, and exit the process once the run
// reports completion.
const poll = () => {
  const { output, done } = render();

  const changed = output !== lastOutput;
  if (changed) {
    // ANSI: clear screen and move the cursor home before redrawing.
    process.stdout.write('\x1b[2J\x1b[H');
    console.log(`Watch: ${runId} (${new Date().toLocaleTimeString()})`);
    console.log('');
    console.log(output);
    lastOutput = output;
  }

  if (!done) return;
  console.log('\nRun complete. Exiting watch.');
  process.exit(0);
};
1207
+
1208
+ poll();
1209
+ const interval = setInterval(poll, refreshMs);
1210
+
1211
+ // Clean exit on Ctrl+C
1212
+ process.on('SIGINT', () => {
1213
+ clearInterval(interval);
1214
+ console.log('\nStopped watching.');
1215
+ process.exit(0);
1216
+ });
1217
+
1218
+ // Keep process alive
1219
+ await new Promise(() => {});
1220
+ break;
1221
+ }
1222
+
1223
+ case 'transcript': {
1224
+ const runId = args.find(a => !a.startsWith('--') && a !== 'transcript');
1225
+ if (!runId) {
1226
+ console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>]');
1227
+ process.exit(1);
1228
+ }
1229
+
1230
+ const scenarioFilter = getOption('scenario');
1231
+ const results = evaluationStore.getResults(runId, {
1232
+ scenarioId: scenarioFilter || null,
1233
+ });
1234
+
1235
+ if (results.length === 0) {
1236
+ console.log(`\nNo results found for run: ${runId}`);
1237
+ break;
1238
+ }
1239
+
1240
+ console.log(`\nTranscripts for run: ${runId} (${results.length} results)\n`);
1241
+
1242
+ for (const result of results) {
1243
+ console.log('='.repeat(80));
1244
+ console.log(`Scenario: ${result.scenarioName || result.scenarioId}`);
1245
+ console.log(`Profile: ${result.profileName || `${result.provider}/${result.model}`}`);
1246
+ console.log(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : '--'} | Success: ${result.success}`);
1247
+ console.log('-'.repeat(80));
1248
+
1249
+ // Try dialogue log file first
1250
+ let printed = false;
1251
+ if (result.dialogueId) {
1252
+ // Search for the dialogue file (may include date prefix in filename)
1253
+ const files = fs.existsSync(LOGS_DIR)
1254
+ ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
1255
+ : [];
1256
+
1257
+ if (files.length > 0) {
1258
+ try {
1259
+ const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
1260
+ const trace = dialogue.dialogueTrace || [];
1261
+ for (const entry of trace) {
1262
+ console.log(formatTraceEntry(entry));
1263
+ console.log('');
1264
+ }
1265
+ if (trace.length > 0) printed = true;
1266
+ } catch (e) {
1267
+ // Fall through to suggestions
1268
+ }
1269
+ }
1270
+ }
1271
+
1272
+ // Fall back to suggestions / raw response from DB
1273
+ if (!printed) {
1274
+ if (result.suggestions?.length > 0) {
1275
+ console.log('Suggestions:');
1276
+ for (const s of result.suggestions) {
1277
+ const text = typeof s === 'string' ? s : (s.text || s.content || JSON.stringify(s));
1278
+ console.log(` • ${text}`);
1279
+ }
1280
+ console.log('');
1281
+ }
1282
+ if (result.evaluationReasoning) {
1283
+ console.log('Judge reasoning:');
1284
+ console.log(` ${result.evaluationReasoning}`);
1285
+ console.log('');
1286
+ }
1287
+ }
1288
+
1289
+ if (result.errorMessage) {
1290
+ console.log(`ERROR: ${result.errorMessage}`);
1291
+ console.log('');
1292
+ }
1293
+ }
1294
+
1295
+ // Also check for interaction evals
1296
+ const interactionEvals = evaluationStore.listInteractionEvals({ limit: 200 });
1297
+ const runInteractions = interactionEvals.filter(e => e.runId === runId);
1298
+
1299
+ if (runInteractions.length > 0) {
1300
+ console.log('\n' + '='.repeat(80));
1301
+ console.log(' INTERACTION TRANSCRIPTS');
1302
+ console.log('='.repeat(80));
1303
+
1304
+ for (const ie of runInteractions) {
1305
+ const full = evaluationStore.getInteractionEval(ie.evalId);
1306
+ if (!full) continue;
1307
+
1308
+ console.log(`\nScenario: ${full.scenarioName || full.scenarioId}`);
1309
+ console.log(`Tutor: ${full.tutorProfile} | Learner: ${full.learnerProfile}`);
1310
+ console.log(`Turns: ${full.turnCount} | Score: ${full.judgeOverallScore ?? '--'}`);
1311
+ console.log('-'.repeat(80));
1312
+
1313
+ if (full.formattedTranscript) {
1314
+ console.log(full.formattedTranscript);
1315
+ } else if (full.turns?.length > 0) {
1316
+ for (const turn of full.turns) {
1317
+ const speaker = (turn.phase || turn.role || 'unknown').toUpperCase();
1318
+ console.log(`[Turn ${turn.turnNumber || '?'}] ${speaker}:`);
1319
+ console.log(turn.externalMessage || turn.content || '');
1320
+ console.log('');
1321
+ }
1322
+ }
1323
+ }
1324
+ }
1325
+
1326
+ break;
1327
+ }
1328
+
1329
+ case 'cleanup': {
1330
+ const force = getFlag('force');
1331
+ const threshold = parseInt(getOption('older-than', '30'), 10);
1332
+
1333
+ console.log(`\nScanning for stale runs (running > ${threshold} minutes)...`);
1334
+
1335
+ // Dry-run by default; require --force to actually complete
1336
+ const dryRun = !force;
1337
+ if (dryRun) console.log(' (dry run — pass --force to actually complete stale runs)\n');
1338
+
1339
+ const result = evaluationStore.autoCompleteStaleRuns({
1340
+ olderThanMinutes: threshold,
1341
+ dryRun,
1342
+ });
1343
+
1344
+ if (result.found === 0) {
1345
+ console.log('No stale runs found.');
1346
+ } else if (dryRun) {
1347
+ console.log(`Found ${result.found} stale run(s):\n`);
1348
+ for (const run of result.runs) {
1349
+ console.log(` ${run.id} age=${run.ageMinutes}m results=${run.resultsFound} desc="${run.description || ''}"` );
1350
+ }
1351
+ console.log('\nRe-run with --force to mark these as completed.');
1352
+ } else {
1353
+ console.log(`Processed ${result.completed} stale run(s):\n`);
1354
+ for (const run of result.runs) {
1355
+ const status = run.status || (run.alreadyCompleted ? 'already completed' : 'unknown');
1356
+ const partial = run.wasPartial ? ` (partial: ${run.completionRate}%)` : '';
1357
+ console.log(` ${run.runId} → ${status}${partial} results=${run.resultsFound || '--'}`);
1358
+ }
1359
+ }
1360
+
1361
+ console.log('');
1362
+ break;
1363
+ }
1364
+
1365
+ case 'resume': {
1366
+ const runId = args.find(a => !a.startsWith('--') && a !== 'resume');
1367
+ if (!runId) {
1368
+ console.error('Usage: eval-cli.js resume <runId> [--parallelism N] [--verbose] [--force]');
1369
+ process.exit(1);
1370
+ }
1371
+
1372
+ const verbose = getFlag('verbose');
1373
+ const force = getFlag('force');
1374
+ const parallelism = parseInt(getOption('parallelism', '2'), 10);
1375
+
1376
+ const result = await evaluationRunner.resumeEvaluation({
1377
+ runId,
1378
+ parallelism,
1379
+ verbose,
1380
+ force,
1381
+ });
1382
+
1383
+ if (result.alreadyComplete) {
1384
+ break;
1385
+ }
1386
+
1387
+ // Extract unique model aliases (same as `run` command)
1388
// Reduce a "provider.model" identifier to its bare model alias; null for
// falsy input, and dot-free identifiers are returned as-is.
const extractAlias = (raw) => {
  if (!raw) return null;
  const dot = raw.indexOf('.');
  if (dot === -1) return raw;
  return raw.slice(dot + 1);
};
1393
+ const modelAliases = [...new Set(
1394
+ (result.stats || []).flatMap(s => [
1395
+ extractAlias(s.egoModel || s.model),
1396
+ extractAlias(s.superegoModel),
1397
+ ]).filter(Boolean)
1398
+ )];
1399
+
1400
+ console.log('\nResume complete.');
1401
+ if (modelAliases.length > 0) {
1402
+ console.log(`Models: ${modelAliases.join(', ')}`);
1403
+ }
1404
+ console.log(` Total tests (all): ${result.totalTests}`);
1405
+ console.log(` Resumed tests: ${result.resumedTests}`);
1406
+ console.log(` Successful (this run): ${result.successfulTests}`);
1407
+ console.log(JSON.stringify(result, null, 2));
1408
+
1409
+ // Factorial post-analysis (same as `run` command)
1410
+ if (result.runId) {
1411
+ const scoreTypes = [
1412
+ { column: 'overall_score', label: 'Overall Score' },
1413
+ { column: 'base_score', label: 'Base Score' },
1414
+ { column: 'recognition_score', label: 'Recognition Score' },
1415
+ ];
1416
+
1417
+ for (const { column, label } of scoreTypes) {
1418
+ const cellData = evaluationStore.getFactorialCellData(result.runId, { scoreColumn: column });
1419
+ const cellKeys = Object.keys(cellData);
1420
+ const totalSamples = cellKeys.reduce((sum, k) => sum + cellData[k].length, 0);
1421
+
1422
+ if (totalSamples === 0) continue;
1423
+
1424
+ console.log('\n' + '='.repeat(70));
1425
+ console.log(` FACTORIAL ANALYSIS: ${label.toUpperCase()}`);
1426
+ console.log('='.repeat(70));
1427
+
1428
+ for (const key of cellKeys.sort()) {
1429
+ const scores = cellData[key];
1430
+ const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
1431
+ const sd = scores.length > 1
1432
+ ? Math.sqrt(scores.reduce((acc, s) => acc + (s - mean) ** 2, 0) / (scores.length - 1))
1433
+ : 0;
1434
+ const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
1435
+ `Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
1436
+ );
1437
+ console.log(` ${cellLabel.padEnd(52)} mean=${mean.toFixed(1)} sd=${sd.toFixed(1)} n=${scores.length}`);
1438
+ }
1439
+
1440
+ if (totalSamples > 8) {
1441
+ const anovaResult = anovaStats.runThreeWayANOVA(cellData);
1442
+ console.log(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
1443
+ } else {
1444
+ console.log(`\n Need > 8 total samples for ANOVA (have ${totalSamples}). Increase --runs.`);
1445
+ }
1446
+ }
1447
+ }
1448
+ break;
1449
+ }
1450
+
1451
+ case 'revert': {
1452
+ const runId = args.find(a => !a.startsWith('--') && a !== 'revert');
1453
+ if (!runId) {
1454
+ console.error('Usage: eval-cli.js revert <runId>');
1455
+ process.exit(1);
1456
+ }
1457
+
1458
+ const run = evaluationStore.getRun(runId);
1459
+ if (!run) {
1460
+ console.error(`Run not found: ${runId}`);
1461
+ process.exit(1);
1462
+ }
1463
+
1464
+ if (run.status === 'running') {
1465
+ console.log(`Run ${runId} is already in 'running' state.`);
1466
+ break;
1467
+ }
1468
+
1469
+ console.log(`Reverting run ${runId} from '${run.status}' → 'running'...`);
1470
+ evaluationStore.updateRun(runId, { status: 'running' });
1471
+ console.log('Done.');
1472
+ break;
1473
+ }
1474
+
1475
+ case 'chat': {
1476
+ await runChat();
1477
+ break;
1478
+ }
1479
+
1480
+ case 'rejudge': {
1481
+ const runId = args.find(a => !a.startsWith('--') && a !== 'rejudge');
1482
+ if (!runId) {
1483
+ console.error('Usage: eval-cli.js rejudge <runId> [--judge <model>] [--scenario <id>] [--verbose] [--overwrite]');
1484
+ console.error('');
1485
+ console.error('By default, creates new rows (preserves history for inter-judge reliability).');
1486
+ console.error('Use --overwrite to replace existing scores instead.');
1487
+ process.exit(1);
1488
+ }
1489
+
1490
+ // Restore env overrides from run metadata
1491
+ {
1492
+ const runData = evaluationStore.getRun(runId);
1493
+ const meta = typeof runData?.metadata === 'string' ? JSON.parse(runData.metadata) : runData?.metadata;
1494
+ if (meta?.scenariosFile && !process.env.EVAL_SCENARIOS_FILE) {
1495
+ process.env.EVAL_SCENARIOS_FILE = meta.scenariosFile;
1496
+ console.log(`[rejudge] Restored EVAL_SCENARIOS_FILE from run metadata: ${meta.scenariosFile}`);
1497
+ }
1498
+ if (meta?.contentPath && !process.env.EVAL_CONTENT_PATH) {
1499
+ process.env.EVAL_CONTENT_PATH = meta.contentPath;
1500
+ console.log(`[rejudge] Restored EVAL_CONTENT_PATH from run metadata: ${meta.contentPath}`);
1501
+ }
1502
+ }
1503
+
1504
+ const verbose = getFlag('verbose');
1505
+ const overwrite = getFlag('overwrite');
1506
+ const judgeOverride = getOption('judge') || null;
1507
+ const scenarioFilter = getOption('scenario') || null;
1508
+
1509
+ console.log(`\nRejudging run: ${runId}`);
1510
+ if (judgeOverride) console.log(` Judge override: ${judgeOverride}`);
1511
+ if (scenarioFilter) console.log(` Scenario filter: ${scenarioFilter}`);
1512
+ console.log(` Mode: ${overwrite ? 'overwrite (replace existing)' : 'preserve history (add new rows)'}`);
1513
+ console.log('');
1514
+
1515
+ const summary = await evaluationRunner.rejudgeRun(runId, {
1516
+ judgeOverride,
1517
+ verbose,
1518
+ scenarioFilter,
1519
+ overwrite,
1520
+ });
1521
+
1522
+ console.log('\n' + '='.repeat(60));
1523
+ console.log(' REJUDGE SUMMARY');
1524
+ console.log('='.repeat(60));
1525
+ console.log(` Run: ${summary.runId}`);
1526
+ console.log(` Total: ${summary.total}`);
1527
+ console.log(` Succeeded: ${summary.succeeded}`);
1528
+ console.log(` Failed: ${summary.failed}`);
1529
+ console.log(` Old avg: ${summary.oldAvgScore?.toFixed(2) ?? 'N/A'}`);
1530
+ console.log(` New avg: ${summary.newAvgScore?.toFixed(2) ?? 'N/A'}`);
1531
+ if (summary.scoreDelta != null) {
1532
+ const sign = summary.scoreDelta >= 0 ? '+' : '';
1533
+ console.log(` Delta: ${sign}${summary.scoreDelta.toFixed(2)}`);
1534
+ }
1535
+ console.log('');
1536
+ break;
1537
+ }
1538
+
1539
+ case 'export': {
1540
+ const runId = args.find(a => !a.startsWith('--') && a !== 'export');
1541
+ if (!runId) {
1542
+ console.error('Usage: eval-cli.js export <runId> [--scenario <id>] [--profile <name>] [--output <path>]');
1543
+ process.exit(1);
1544
+ }
1545
+
1546
+ const scenarioFilter = getOption('scenario') || null;
1547
+ const profileFilter = getOption('profile') || null;
1548
+ const outputOption = getOption('output') || null;
1549
+
1550
+ const results = evaluationStore.getResults(runId, {
1551
+ scenarioId: scenarioFilter,
1552
+ profileName: profileFilter,
1553
+ });
1554
+
1555
+ if (results.length === 0) {
1556
+ console.log(`\nNo results found for run: ${runId}`);
1557
+ break;
1558
+ }
1559
+
1560
+ // Build output
1561
+ const lines = [];
1562
+ lines.push(`# Evaluation Export — Run ${runId}`);
1563
+ lines.push(`# ${results.length} result(s)`);
1564
+ if (scenarioFilter) lines.push(`# Scenario filter: ${scenarioFilter}`);
1565
+ if (profileFilter) lines.push(`# Profile filter: ${profileFilter}`);
1566
+ lines.push('');
1567
+
1568
+ for (const result of results) {
1569
+ const scenario = getScenario(result.scenarioId);
1570
+
1571
+ lines.push('='.repeat(80));
1572
+ lines.push(`Scenario: ${result.scenarioName || result.scenarioId}`);
1573
+ lines.push(`Profile: ${result.profileName || `${result.provider}/${result.model}`}`);
1574
+ lines.push(`Provider: ${result.provider} Model: ${result.model}`);
1575
+ if (result.egoModel || result.superegoModel) {
1576
+ lines.push(`Ego: ${result.egoModel || 'N/A'} Superego: ${result.superegoModel || 'N/A'}`);
1577
+ }
1578
+ lines.push(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : 'NOT EVALUATED'}`);
1579
+ lines.push('='.repeat(80));
1580
+ lines.push('');
1581
+
1582
+ if (scenario) {
1583
+ if (scenario.learner_context) {
1584
+ lines.push('### Scenario Context');
1585
+ lines.push(scenario.learner_context.trim());
1586
+ lines.push('');
1587
+ }
1588
+ if (scenario.expected_behavior) {
1589
+ lines.push('### Expected Behavior');
1590
+ lines.push(scenario.expected_behavior);
1591
+ lines.push('');
1592
+ }
1593
+ if (scenario.required_elements?.length > 0) {
1594
+ lines.push('### Required Elements');
1595
+ for (const el of scenario.required_elements) lines.push(`- ${el}`);
1596
+ lines.push('');
1597
+ }
1598
+ if (scenario.forbidden_elements?.length > 0) {
1599
+ lines.push('### Forbidden Elements');
1600
+ for (const el of scenario.forbidden_elements) lines.push(`- ${el}`);
1601
+ lines.push('');
1602
+ }
1603
+ }
1604
+
1605
+ // Tutor suggestion(s)
1606
+ if (result.suggestions?.length > 0) {
1607
+ lines.push('### Tutor Suggestion');
1608
+ for (const s of result.suggestions) {
1609
+ if (typeof s === 'string') {
1610
+ lines.push(s);
1611
+ } else {
1612
+ if (s.title) lines.push(`Title: ${s.title}`);
1613
+ if (s.message || s.text || s.content) lines.push(`Message: ${s.message || s.text || s.content}`);
1614
+ if (s.action) lines.push(`Action: ${s.action}${s.actionTarget ? ' → ' + s.actionTarget : ''}`);
1615
+ if (s.reasoning) lines.push(`Reasoning: ${s.reasoning}`);
1616
+ }
1617
+ }
1618
+ lines.push('');
1619
+ }
1620
+
1621
+ // Dialogue trace
1622
+ if (result.dialogueId) {
1623
+ const files = fs.existsSync(LOGS_DIR)
1624
+ ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
1625
+ : [];
1626
+
1627
+ if (files.length > 0) {
1628
+ try {
1629
+ const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
1630
+ const trace = dialogue.dialogueTrace || [];
1631
+ if (trace.length > 0) {
1632
+ lines.push('### Dialogue Trace');
1633
+ for (const entry of trace) {
1634
+ lines.push(formatTraceEntry(entry));
1635
+ }
1636
+ lines.push('');
1637
+ }
1638
+ } catch (e) {
1639
+ // skip
1640
+ }
1641
+ }
1642
+ }
1643
+
1644
+ if (result.errorMessage) {
1645
+ lines.push(`### Error`);
1646
+ lines.push(result.errorMessage);
1647
+ lines.push('');
1648
+ }
1649
+
1650
+ lines.push('');
1651
+ }
1652
+
1653
+ // Determine output path
1654
+ let outputPath = outputOption;
1655
+ if (!outputPath) {
1656
+ const exportsDir = path.resolve(__dirname, '..', 'exports');
1657
+ if (!fs.existsSync(exportsDir)) fs.mkdirSync(exportsDir, { recursive: true });
1658
+ let filename = `eval-${runId}`;
1659
+ if (scenarioFilter) filename += `-${scenarioFilter}`;
1660
+ if (profileFilter) filename += `-${profileFilter}`;
1661
+ filename += '.md';
1662
+ outputPath = path.join(exportsDir, filename);
1663
+ }
1664
+
1665
+ fs.writeFileSync(outputPath, lines.join('\n'), 'utf-8');
1666
+ console.log(`\nExported ${results.length} result(s) to: ${outputPath}`);
1667
+ break;
1668
+ }
1669
+
1670
+ case 'evaluate': {
1671
+ const runId = args.find(a => !a.startsWith('--') && a !== 'evaluate');
1672
+ if (!runId) {
1673
+ console.error('Usage: eval-cli.js evaluate <runId> [--scenario <id>] [--profile <name>] [--model <model>] [--force] [--follow] [--review] [--refresh <ms>] [--verbose]');
1674
+ process.exit(1);
1675
+ }
1676
+
1677
+ const verbose = getFlag('verbose');
1678
+ const force = getFlag('force');
1679
+ const follow = getFlag('follow');
1680
+ const review = getFlag('review');
1681
+ const refreshMs = parseInt(getOption('refresh', '5000'), 10);
1682
+ const scenarioFilter = getOption('scenario') || getOption('scenarios') || null;
1683
+ const profileFilter = getOption('profile') || getOption('profiles') || null;
1684
+ const modelOverride = getOption('model') || null;
1685
+
1686
+ // Restore env overrides from run metadata (e.g. EVAL_SCENARIOS_FILE for domain generalizability runs)
1687
+ {
1688
+ const runData = evaluationStore.getRun(runId);
1689
+ const meta = typeof runData?.metadata === 'string' ? JSON.parse(runData.metadata) : runData?.metadata;
1690
+ if (meta?.scenariosFile && !process.env.EVAL_SCENARIOS_FILE) {
1691
+ process.env.EVAL_SCENARIOS_FILE = meta.scenariosFile;
1692
+ console.log(`[evaluate] Restored EVAL_SCENARIOS_FILE from run metadata: ${meta.scenariosFile}`);
1693
+ }
1694
+ if (meta?.contentPath && !process.env.EVAL_CONTENT_PATH) {
1695
+ process.env.EVAL_CONTENT_PATH = meta.contentPath;
1696
+ console.log(`[evaluate] Restored EVAL_CONTENT_PATH from run metadata: ${meta.contentPath}`);
1697
+ }
1698
+ }
1699
+
1700
// Helper: evaluate a single result via the `claude` CLI acting as judge.
// Builds a rubric prompt (with multi-turn dialogue context when a dialogue
// log is available), pipes it to `claude -p -`, parses the JSON verdict,
// persists the normalized scores, and returns the overall score.
// Returns null on SKIP (missing scenario/suggestion); throws on judge or
// parse failure so the caller's try/catch can count it as FAIL.
async function evaluateOneResult(result, tag) {
  const scenarioId = result.scenarioId;
  const profileName = result.profileName || `${result.provider}/${result.model}`;

  const scenario = getScenario(scenarioId);
  if (!scenario) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... SKIP (scenario not found)`);
    return null;
  }

  const suggestion = result.suggestions?.[0];
  if (!suggestion) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... SKIP (no suggestion)`);
    return null;
  }

  // Load dialogue log for multi-turn context (if available)
  let dialogueContext = null;
  const dialogueId = result.dialogueId;
  if (dialogueId) {
    const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
    try {
      if (fs.existsSync(logPath)) {
        const dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
        if (dialogueLog.isMultiTurn && dialogueLog.dialogueTrace?.length > 0) {
          dialogueContext = {
            consolidatedTrace: dialogueLog.dialogueTrace,
            conversationHistory: (dialogueLog.turnResults || []).map((t, i) => ({
              turnIndex: i,
              turnId: t.turnId,
              suggestion: t.suggestions?.[0],
              learnerAction: t.learnerAction,
              learnerMessage: t.learnerMessage,
            })),
          };
          if (verbose) {
            console.log(`${tag} loaded dialogue transcript (${dialogueLog.dialogueTrace.length} trace entries)`);
          }
        }
      }
    } catch (e) {
      // Missing/corrupt dialogue log is non-fatal: judge without the transcript.
      if (verbose) console.log(`${tag} could not load dialogue log: ${e.message}`);
    }
  }

  const prompt = buildEvaluationPrompt(suggestion, {
    name: scenario.name,
    description: scenario.description,
    expectedBehavior: scenario.expected_behavior,
    learnerContext: scenario.learner_context,
    requiredElements: scenario.required_elements,
    forbiddenElements: scenario.forbidden_elements,
  }, { dialogueContext });

  const claudeArgs = ['-p', '-', '--output-format', 'text'];
  if (modelOverride) {
    claudeArgs.push('--model', modelOverride);
  }

  if (verbose) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... calling claude`);
  }

  // Spawn the claude CLI, feeding the prompt on stdin.
  // NOTE(review): ANTHROPIC_API_KEY is removed from the child env — presumably
  // to force the CLI's own session auth; confirm before changing.
  const stdout = await new Promise((resolve, reject) => {
    const env = { ...process.env };
    delete env.ANTHROPIC_API_KEY;
    const child = spawn('claude', claudeArgs, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    let out = '';
    let err = '';
    child.stdout.on('data', d => { out += d; });
    child.stderr.on('data', d => { err += d; });
    child.on('error', reject);
    child.on('close', code => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });
    child.stdin.write(prompt);
    child.stdin.end();
  });

  // Extract the JSON body: prefer a ```json fence, else the outermost {...}.
  let jsonStr = stdout.trim();
  const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenceMatch) {
    jsonStr = fenceMatch[1].trim();
  } else {
    const firstBrace = jsonStr.indexOf('{');
    const lastBrace = jsonStr.lastIndexOf('}');
    if (firstBrace !== -1 && lastBrace > firstBrace) {
      jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
    }
  }

  const parsed = JSON.parse(jsonStr);

  // Normalize judge dimension names to the store's canonical keys.
  const dimensionMap = {
    relevance: 'relevance',
    specificity: 'specificity',
    pedagogical_soundness: 'pedagogical',
    pedagogical: 'pedagogical',
    personalization: 'personalization',
    actionability: 'actionability',
    tone: 'tone',
  };

  const normalizedScores = {};
  for (const [key, value] of Object.entries(parsed.scores || {})) {
    const normalizedKey = dimensionMap[key] || key;
    if (typeof value === 'object' && value !== null) {
      normalizedScores[normalizedKey] = { score: value.score, reasoning: value.reasoning };
    } else if (typeof value === 'number') {
      normalizedScores[normalizedKey] = { score: value, reasoning: null };
    }
  }

  const overallScore = Object.keys(normalizedScores).length > 0
    ? calculateOverallScore(normalizedScores)
    : parsed.overall_score;
  // FIX: a judge reply with neither per-dimension scores nor overall_score
  // previously crashed below on overallScore.toFixed() (TypeError) after a
  // scoreless evaluation had already been persisted. Fail loudly first.
  if (overallScore == null || Number.isNaN(overallScore)) {
    throw new Error('judge response contained no usable scores');
  }
  const baseScore = calculateBaseScore(normalizedScores);
  const recognitionScore = calculateRecognitionScore(normalizedScores);

  const evaluation = {
    scores: normalizedScores,
    overallScore,
    baseScore,
    recognitionScore,
    passesRequired: parsed.validation?.passes_required ?? true,
    passesForbidden: parsed.validation?.passes_forbidden ?? true,
    requiredMissing: parsed.validation?.required_missing || [],
    forbiddenFound: parsed.validation?.forbidden_found || [],
    summary: parsed.summary,
    judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
  };

  evaluationStore.updateResultScores(result.id, evaluation);

  // Score line
  const dimScores = Object.entries(normalizedScores)
    .map(([k, v]) => `${k}=${v.score}`)
    .join(' ');
  console.log(`${tag} ${scenarioId} / ${profileName} ... ${overallScore.toFixed(1)} (${dimScores})`);

  if (verbose) {
    // Truncated suggestion excerpt
    const suggText = typeof suggestion === 'string'
      ? suggestion
      : (suggestion.message || suggestion.text || suggestion.content || JSON.stringify(suggestion));
    const truncSugg = suggText.length > 200
      ? suggText.slice(0, 200).replace(/\n/g, ' ') + '...'
      : suggText.replace(/\n/g, ' ');
    console.log(` Suggestion: ${truncSugg}`);

    // Judge summary
    if (parsed.summary) {
      const truncSummary = parsed.summary.length > 300
        ? parsed.summary.slice(0, 300).replace(/\n/g, ' ') + '...'
        : parsed.summary.replace(/\n/g, ' ');
      console.log(` Judge: ${truncSummary}`);
    }
    console.log('');
  }

  return overallScore;
}
1867
+
1868
// Print the end-of-evaluation summary: attempt counts plus the mean rubric
// score (the average line is omitted when nothing was scored).
function printEvaluateSummary(succeeded, failed, totalAttempted, scores) {
  const banner = '='.repeat(50);
  console.log('\n' + banner);
  console.log(' EVALUATE SUMMARY');
  console.log(banner);
  console.log(` Total: ${totalAttempted}`);
  console.log(` Succeeded: ${succeeded}`);
  console.log(` Failed: ${failed}`);
  if (scores.length > 0) {
    const mean = scores.reduce((sum, s) => sum + s, 0) / scores.length;
    console.log(` Avg score: ${mean.toFixed(1)}`);
  }
  console.log('');
}
1885
+
1886
// Helper: run holistic dialogue evaluation for multi-turn dialogues.
// Groups evaluated results by dialogueId, reloads each dialogue log, and asks
// the judge to score the interaction as a whole (all turns, not just the final
// response). The holistic score is written back onto the dialogue log file.
async function evaluateHolisticDialogues(evaluatedResults) {
  // Group results by dialogueId to find multi-turn dialogues
  const dialogueGroups = new Map();
  for (const result of evaluatedResults) {
    if (result.dialogueId) {
      if (!dialogueGroups.has(result.dialogueId)) {
        dialogueGroups.set(result.dialogueId, []);
      }
      dialogueGroups.get(result.dialogueId).push(result);
    }
  }

  // Filter to multi-turn dialogues (2+ results sharing a dialogueId)
  const multiTurnDialogues = [...dialogueGroups.entries()].filter(([, results]) => results.length > 1);
  if (multiTurnDialogues.length === 0) return;

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` HOLISTIC DIALOGUE EVALUATION (${multiTurnDialogues.length} dialogue(s))`);
  console.log(`${'─'.repeat(50)}\n`);

  for (const [dialogueId, results] of multiTurnDialogues) {
    const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
    let dialogueLog;
    try {
      if (!fs.existsSync(logPath)) {
        console.log(` ${dialogueId} ... SKIP (dialogue log not found)`);
        continue;
      }
      dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
    } catch (e) {
      console.log(` ${dialogueId} ... SKIP (could not load: ${e.message})`);
      continue;
    }

    if (!dialogueLog.isMultiTurn || !dialogueLog.dialogueTrace?.length) {
      console.log(` ${dialogueId} ... SKIP (not multi-turn or no trace)`);
      continue;
    }

    // Build context from the dialogue log
    const consolidatedTrace = dialogueLog.dialogueTrace;
    const conversationHistory = (dialogueLog.turnResults || []).map((t, i) => ({
      turnIndex: i,
      turnId: t.turnId,
      suggestion: t.suggestions?.[0],
      learnerAction: t.learnerAction,
      learnerMessage: t.learnerMessage,
    }));

    // Use the last turn's suggestion as the focal point
    const lastResult = results[results.length - 1];
    const lastSuggestion = lastResult.suggestions?.[0];
    if (!lastSuggestion) {
      console.log(` ${dialogueId} ... SKIP (no suggestion on last turn)`);
      continue;
    }

    const scenarioId = lastResult.scenarioId;
    const scenario = getScenario(scenarioId);
    if (!scenario) {
      console.log(` ${dialogueId} ... SKIP (scenario ${scenarioId} not found)`);
      continue;
    }

    const prompt = buildEvaluationPrompt(lastSuggestion, {
      name: `${scenario.name} (holistic dialogue)`,
      description: `Holistic evaluation of ${results.length}-turn dialogue. Score the overall quality of the tutoring interaction across all turns, not just this final response.`,
      expectedBehavior: scenario.expected_behavior,
      learnerContext: scenario.learner_context,
      requiredElements: scenario.required_elements,
      forbiddenElements: scenario.forbidden_elements,
    }, {
      dialogueContext: { conversationHistory, consolidatedTrace },
    });

    try {
      const claudeArgs = ['-p', '-', '--output-format', 'text'];
      if (modelOverride) claudeArgs.push('--model', modelOverride);

      // Spawn the claude CLI judge, feeding the prompt on stdin.
      // NOTE(review): ANTHROPIC_API_KEY is removed from the child env —
      // presumably to force the CLI's own session auth; confirm.
      const stdout = await new Promise((resolve, reject) => {
        const env = { ...process.env };
        delete env.ANTHROPIC_API_KEY;
        const child = spawn('claude', claudeArgs, {
          stdio: ['pipe', 'pipe', 'pipe'],
          env,
        });
        let out = '';
        let err = '';
        child.stdout.on('data', d => { out += d; });
        child.stderr.on('data', d => { err += d; });
        child.on('error', reject);
        child.on('close', code => {
          if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
          else resolve(out);
        });
        child.stdin.write(prompt);
        child.stdin.end();
      });

      // Extract the JSON body: prefer a ```json fence, else the outermost {...}.
      let jsonStr = stdout.trim();
      const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
      if (fenceMatch) {
        jsonStr = fenceMatch[1].trim();
      } else {
        const firstBrace = jsonStr.indexOf('{');
        const lastBrace = jsonStr.lastIndexOf('}');
        if (firstBrace !== -1 && lastBrace > firstBrace) {
          jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
        }
      }

      const parsed = JSON.parse(jsonStr);

      // Normalize judge dimension names to the store's canonical keys.
      const normalizedScores = {};
      const dimensionMap = {
        relevance: 'relevance', specificity: 'specificity',
        pedagogical_soundness: 'pedagogical', pedagogical: 'pedagogical',
        personalization: 'personalization', actionability: 'actionability', tone: 'tone',
      };
      for (const [key, value] of Object.entries(parsed.scores || {})) {
        const normalizedKey = dimensionMap[key] || key;
        if (typeof value === 'object' && value !== null) {
          normalizedScores[normalizedKey] = { score: value.score, reasoning: value.reasoning };
        } else if (typeof value === 'number') {
          normalizedScores[normalizedKey] = { score: value, reasoning: null };
        }
      }

      const overallScore = Object.keys(normalizedScores).length > 0
        ? calculateOverallScore(normalizedScores) : parsed.overall_score;
      // FIX: if the judge returned neither per-dimension scores nor an
      // overall_score, fail this dialogue with a clear message instead of
      // crashing below on overallScore.toFixed() (the crash was swallowed
      // into a misleading FAIL line by the catch).
      if (overallScore == null || Number.isNaN(overallScore)) {
        throw new Error('judge response contained no usable scores');
      }
      const baseScore = calculateBaseScore(normalizedScores);
      const recognitionScore = calculateRecognitionScore(normalizedScores);

      const holisticScore = {
        overallScore,
        baseScore,
        recognitionScore,
        scores: normalizedScores,
        summary: parsed.summary,
        judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
      };

      // Save to dialogue log
      dialogueLog.holisticDialogueScore = holisticScore;
      fs.writeFileSync(logPath, JSON.stringify(dialogueLog, null, 2));

      const profileName = lastResult.profileName || `${lastResult.provider}/${lastResult.model}`;
      console.log(` ${scenarioId} / ${profileName} ... holistic=${overallScore.toFixed(1)} (base=${baseScore.toFixed(1)} recog=${recognitionScore.toFixed(1)})`);
      if (verbose && parsed.summary) {
        const truncSummary = parsed.summary.length > 300
          ? parsed.summary.slice(0, 300).replace(/\n/g, ' ') + '...'
          : parsed.summary.replace(/\n/g, ' ');
        console.log(` Judge: ${truncSummary}\n`);
      }
    } catch (err) {
      const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
      console.log(` ${dialogueId} ... FAIL: ${msg}`);
    }
  }
}
2047
+
2048
+ // ── Review mode: show stored reasoning without re-evaluating ──
2049
+ if (review) {
2050
+ const results = evaluationStore.getResults(runId, {
2051
+ scenarioId: scenarioFilter,
2052
+ profileName: profileFilter,
2053
+ });
2054
+
2055
+ if (results.length === 0) {
2056
+ console.error(`No results found for run: ${runId}`);
2057
+ process.exit(1);
2058
+ }
2059
+
2060
+ const evaluated = results.filter(r => r.baseScore != null);
2061
+ if (evaluated.length === 0) {
2062
+ console.log('No evaluated results to review. Run evaluate first.');
2063
+ break;
2064
+ }
2065
+
2066
+ console.log(`\nReviewing ${evaluated.length} evaluated result(s) for run: ${runId}\n`);
2067
+
2068
+ for (let i = 0; i < evaluated.length; i++) {
2069
+ const r = evaluated[i];
2070
+ const profileName = r.profileName || `${r.provider}/${r.model}`;
2071
+
2072
+ // Dimension scores on one line
2073
+ const dimScores = Object.entries(r.scores || {})
2074
+ .filter(([, v]) => v != null)
2075
+ .map(([k, v]) => {
2076
+ const score = typeof v === 'object' ? v.score : v;
2077
+ return `${k}=${score}`;
2078
+ })
2079
+ .join(' ');
2080
+
2081
+ console.log(`[${i + 1}/${evaluated.length}] ${r.scenarioId} / ${profileName} ... ${r.overallScore?.toFixed(1) ?? '--'} (${dimScores})`);
2082
+
2083
+ // Suggestion excerpt
2084
+ const suggestion = r.suggestions?.[0];
2085
+ if (suggestion) {
2086
+ const suggText = typeof suggestion === 'string'
2087
+ ? suggestion
2088
+ : (suggestion.message || suggestion.text || suggestion.content || JSON.stringify(suggestion));
2089
+ const truncSugg = suggText.length > 200
2090
+ ? suggText.slice(0, 200).replace(/\n/g, ' ') + '...'
2091
+ : suggText.replace(/\n/g, ' ');
2092
+ console.log(` Suggestion: ${truncSugg}`);
2093
+ }
2094
+
2095
+ // Judge summary
2096
+ if (r.evaluationReasoning) {
2097
+ const truncReasoning = r.evaluationReasoning.length > 300
2098
+ ? r.evaluationReasoning.slice(0, 300).replace(/\n/g, ' ') + '...'
2099
+ : r.evaluationReasoning.replace(/\n/g, ' ');
2100
+ console.log(` Judge: ${truncReasoning}`);
2101
+ }
2102
+
2103
+ // Per-dimension reasoning (verbose only)
2104
+ if (verbose && r.scores) {
2105
+ for (const [dim, val] of Object.entries(r.scores)) {
2106
+ if (typeof val === 'object' && val?.reasoning) {
2107
+ const truncDim = val.reasoning.length > 150
2108
+ ? val.reasoning.slice(0, 150).replace(/\n/g, ' ') + '...'
2109
+ : val.reasoning.replace(/\n/g, ' ');
2110
+ console.log(` ${dim} (${val.score}): ${truncDim}`);
2111
+ }
2112
+ }
2113
+ }
2114
+ console.log('');
2115
+ }
2116
+
2117
+ // Quick stats
2118
+ const reviewScores = evaluated.map(r => r.overallScore).filter(s => s != null);
2119
+ if (reviewScores.length > 0) {
2120
+ const avg = reviewScores.reduce((a, b) => a + b, 0) / reviewScores.length;
2121
+ const sd = Math.sqrt(reviewScores.reduce((acc, s) => acc + (s - avg) ** 2, 0) / (reviewScores.length - 1));
2122
+ console.log(`Reviewed ${evaluated.length} results: avg=${avg.toFixed(1)} sd=${sd.toFixed(1)}`);
2123
+ }
2124
+ break;
2125
+ }
2126
+
2127
+ let succeeded = 0;
2128
+ let failed = 0;
2129
+ const scores = [];
2130
+
2131
+ if (follow) {
2132
+ // ── Follow mode: poll for new unevaluated results ──
2133
+ // Show initial status
2134
+ const initialResults = evaluationStore.getResults(runId, {
2135
+ scenarioId: scenarioFilter,
2136
+ profileName: profileFilter,
2137
+ });
2138
+ const initialTotal = initialResults.filter(r => r.success).length;
2139
+ const initialUnevaluated = initialResults.filter(r => r.baseScore == null && r.success).length;
2140
+ const initialEvaluated = initialTotal - initialUnevaluated;
2141
+
2142
+ console.log(`\nFollowing run: ${runId}`);
2143
+ console.log(` Already scored: ${initialEvaluated}/${initialTotal}`);
2144
+ console.log(` Need scoring: ${initialUnevaluated}`);
2145
+ if (modelOverride) console.log(` Model: ${modelOverride}`);
2146
+ console.log(` Polling every ${refreshMs}ms for new results...`);
2147
+ console.log('');
2148
+
2149
+ const processedIds = new Set();
2150
+ let evalCounter = 0;
2151
+ let interrupted = false;
2152
+
2153
+ // SIGINT handler: print summary so far and exit
2154
+ const sigintHandler = () => {
2155
+ interrupted = true;
2156
+ console.log('\n\nInterrupted by user.');
2157
+ printEvaluateSummary(succeeded, failed, succeeded + failed, scores);
2158
+ process.exit(0);
2159
+ };
2160
+ process.on('SIGINT', sigintHandler);
2161
+
2162
+ const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
2163
+
2164
+ while (!interrupted) {
2165
+ // Fetch results that have a suggestion but no rubric evaluation
2166
+ const results = evaluationStore.getResults(runId, {
2167
+ scenarioId: scenarioFilter,
2168
+ profileName: profileFilter,
2169
+ });
2170
+
2171
+ const unevaluated = results.filter(r =>
2172
+ r.baseScore == null && r.success && !processedIds.has(r.id)
2173
+ );
2174
+
2175
+ // Total results available so far (for progress display)
2176
+ const totalResults = results.filter(r => r.success).length;
2177
+ const alreadyEvaluated = results.filter(r => r.baseScore != null && r.success).length;
2178
+
2179
+ // Process each new unevaluated result
2180
+ let batchIndex = 0;
2181
+ const batchSize = unevaluated.length;
2182
+ for (const result of unevaluated) {
2183
+ if (interrupted) break;
2184
+ processedIds.add(result.id);
2185
+ evalCounter++;
2186
+ batchIndex++;
2187
+ // Show: [batch progress] (overall: evaluated/total)
2188
+ const tag = `[${batchIndex}/${batchSize}] (${alreadyEvaluated + batchIndex}/${totalResults} scored)`;
2189
+
2190
+ try {
2191
+ const score = await evaluateOneResult(result, tag);
2192
+ if (score != null) {
2193
+ scores.push(score);
2194
+ succeeded++;
2195
+ } else {
2196
+ failed++;
2197
+ }
2198
+ } catch (err) {
2199
+ failed++;
2200
+ const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
2201
+ const profileName = result.profileName || `${result.provider}/${result.model}`;
2202
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... FAIL: ${msg}`);
2203
+ if (verbose) console.error(err);
2204
+ }
2205
+ }
2206
+
2207
+ // Check if run is done and no unevaluated results remain
2208
+ const run = evaluationStore.getRun(runId);
2209
+ const runStatus = run?.status || 'unknown';
2210
+
2211
+ if (runStatus !== 'running' && unevaluated.length === 0) {
2212
+ // Re-check one more time to avoid race condition
2213
+ const finalResults = evaluationStore.getResults(runId, {
2214
+ scenarioId: scenarioFilter,
2215
+ profileName: profileFilter,
2216
+ });
2217
+ const finalUnevaluated = finalResults.filter(r =>
2218
+ r.baseScore == null && r.success && !processedIds.has(r.id)
2219
+ );
2220
+ if (finalUnevaluated.length === 0) {
2221
+ console.log(`\nRun ${runStatus}. All results evaluated.`);
2222
+ break;
2223
+ }
2224
+ }
2225
+
2226
+ // Status line while waiting
2227
+ const evaluatedCount = results.filter(r => r.baseScore != null).length;
2228
+ console.log(`Waiting for new results... (${evaluatedCount} evaluated of ${totalResults} total, run ${runStatus})`);
2229
+
2230
+ await sleep(refreshMs);
2231
+ }
2232
+
2233
+ process.removeListener('SIGINT', sigintHandler);
2234
+ printEvaluateSummary(succeeded, failed, succeeded + failed, scores);
2235
+
2236
+ // Holistic dialogue evaluation for multi-turn dialogues
2237
+ const allResults = evaluationStore.getResults(runId, {
2238
+ scenarioId: scenarioFilter,
2239
+ profileName: profileFilter,
2240
+ }).filter(r => r.success && r.baseScore != null);
2241
+ await evaluateHolisticDialogues(allResults);
2242
+
2243
+ } else {
2244
+ // ── One-shot mode (existing behavior) ──
2245
+
2246
+ // Load results for this run
2247
+ const results = evaluationStore.getResults(runId, {
2248
+ scenarioId: scenarioFilter,
2249
+ profileName: profileFilter,
2250
+ });
2251
+
2252
+ if (results.length === 0) {
2253
+ console.error(`No results found for run: ${runId}`);
2254
+ process.exit(1);
2255
+ }
2256
+
2257
+ // Filter to unevaluated results unless --force
2258
+ // Use baseScore == null to detect skip-rubric results (overallScore=100 but no rubric dims)
2259
+ const toEvaluate = force
2260
+ ? results
2261
+ : results.filter(r => r.baseScore == null && r.success);
2262
+
2263
+ if (toEvaluate.length === 0) {
2264
+ console.log('All results already have rubric scores. Use --review to inspect reasoning, or --force to re-evaluate.');
2265
+ break;
2266
+ }
2267
+
2268
+ console.log(`\nEvaluating ${toEvaluate.length} result(s) for run: ${runId}`);
2269
+ if (modelOverride) console.log(` Model: ${modelOverride}`);
2270
+ console.log('');
2271
+
2272
+ for (let i = 0; i < toEvaluate.length; i++) {
2273
+ const result = toEvaluate[i];
2274
+ const tag = `[${i + 1}/${toEvaluate.length}]`;
2275
+
2276
+ try {
2277
+ const score = await evaluateOneResult(result, tag);
2278
+ if (score != null) {
2279
+ scores.push(score);
2280
+ succeeded++;
2281
+ } else {
2282
+ failed++;
2283
+ }
2284
+ } catch (err) {
2285
+ failed++;
2286
+ const profileName = result.profileName || `${result.provider}/${result.model}`;
2287
+ const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
2288
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... FAIL: ${msg}`);
2289
+ if (verbose) console.error(err);
2290
+ }
2291
+ }
2292
+
2293
+ printEvaluateSummary(succeeded, failed, toEvaluate.length, scores);
2294
+
2295
+ // Holistic dialogue evaluation for multi-turn dialogues
2296
+ await evaluateHolisticDialogues(toEvaluate.filter(r => r.success));
2297
+ }
2298
+ break;
2299
+ }
2300
+
2301
+ case 'evaluate-learner': {
2302
+ // ── Learner-side evaluation: score learner turns from multi-turn dialogues ──
2303
+ //
2304
+ // Data lives in evaluation_results (per-dialogue rows with dialogueId)
2305
+ // and logs/tutor-dialogues/*.json (full dialogue traces with learner turns).
2306
+ //
2307
+ // For each dialogue:
2308
+ // 1. Load the log file to get learner turn messages + deliberation traces
2309
+ // 2. Build a learner evaluation prompt per learner turn (truncated context)
2310
+ // 3. Call Claude as judge
2311
+ // 4. Store per-turn scores as JSON + overall learner score on the result row
2312
+
2313
+ const runId = args.find(a => !a.startsWith('--') && a !== 'evaluate-learner');
2314
+ if (!runId) {
2315
+ console.error('Usage: eval-cli.js evaluate-learner <runId> [--model <model>] [--force] [--verbose] [--arch <architecture>]');
2316
+ console.error(' Scores learner turns from dialogue logs using the learner rubric.');
2317
+ console.error(' Only works on multi-turn runs with learner turns (e.g., bilateral transformation).');
2318
+ console.error(' --arch filters by learner_architecture (e.g., ego_superego_recognition)');
2319
+ process.exit(1);
2320
+ }
2321
+
2322
+ const verbose = getFlag('verbose');
2323
+ const force = getFlag('force');
2324
+ const modelOverride = getOption('model') || null;
2325
+ const profileFilter = getOption('profile') || getOption('profiles') || null;
2326
+ const archFilter = getOption('arch') || null;
2327
+
2328
+ // Load results with dialogue IDs (multi-turn data)
2329
+ const allResults = evaluationStore.getResults(runId, { profileName: profileFilter });
2330
+ let dialogueResults = allResults.filter(r => r.dialogueId && r.success);
2331
+ if (archFilter) {
2332
+ dialogueResults = dialogueResults.filter(r => r.learnerArchitecture === archFilter);
2333
+ }
2334
+
2335
+ if (dialogueResults.length === 0) {
2336
+ console.error(`No multi-turn dialogue results found for run: ${runId}`);
2337
+ console.error('This command only works on runs that produced dialogue log files.');
2338
+ process.exit(1);
2339
+ }
2340
+
2341
+ // Filter to those needing learner evaluation (unless --force)
2342
+ const toEvaluate = force
2343
+ ? dialogueResults
2344
+ : dialogueResults.filter(r => r.learnerOverallScore == null);
2345
+
2346
+ if (toEvaluate.length === 0) {
2347
+ console.log('All dialogue results already have learner scores. Use --force to re-evaluate.');
2348
+ break;
2349
+ }
2350
+
2351
+ console.log(`\nEvaluating learner turns for ${toEvaluate.length} dialogue(s) from run: ${runId}`);
2352
+ if (modelOverride) console.log(` Model: ${modelOverride}`);
2353
+ console.log('');
2354
+
2355
+ let succeeded = 0;
2356
+ let failed = 0;
2357
+ const allScores = [];
2358
+
2359
+ for (let i = 0; i < toEvaluate.length; i++) {
2360
+ const result = toEvaluate[i];
2361
+ const profileName = result.profileName || `${result.provider}/${result.model}`;
2362
+ const tag = `[${i + 1}/${toEvaluate.length}]`;
2363
+
2364
+ // Load dialogue log file
2365
+ const logPath = path.join(LOGS_DIR, `${result.dialogueId}.json`);
2366
+ let dialogueLog;
2367
+ try {
2368
+ if (!fs.existsSync(logPath)) {
2369
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (log file not found)`);
2370
+ failed++;
2371
+ continue;
2372
+ }
2373
+ dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
2374
+ } catch (e) {
2375
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (${e.message})`);
2376
+ failed++;
2377
+ continue;
2378
+ }
2379
+
2380
+ if (!dialogueLog.isMultiTurn) {
2381
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (not multi-turn)`);
2382
+ failed++;
2383
+ continue;
2384
+ }
2385
+
2386
+ const trace = dialogueLog.dialogueTrace || [];
2387
+ const learnerArch = dialogueLog.learnerArchitecture || 'unified';
2388
+ const isMultiAgent = learnerArch.includes('ego_superego') || learnerArch === 'multi_agent' || learnerArch.includes('psychodynamic');
2389
+
2390
+ // Extract learner turns from dialogue trace.
2391
+ // Each learner turn consists of:
2392
+ // - turn_action entry (contextSummary = external message)
2393
+ // - For multi-agent: preceding learner_ego_initial, learner_superego, learner_ego_revision entries
2394
+ const learnerTurns = [];
2395
+ const turnActionEntries = trace.filter(t => t.agent === 'user' && t.action === 'turn_action');
2396
+
2397
+ for (const ta of turnActionEntries) {
2398
+ const turnData = {
2399
+ turnIndex: ta.turnIndex,
2400
+ externalMessage: ta.contextSummary || '',
2401
+ internalDeliberation: [],
2402
+ };
2403
+
2404
+ // Find deliberation entries associated with this turn action
2405
+ // They appear before the turn_action in the trace and after the previous tutor turn
2406
+ if (isMultiAgent) {
2407
+ const taIdx = trace.indexOf(ta);
2408
+ // Walk backward from turn_action to find learner deliberation entries
2409
+ for (let j = taIdx - 1; j >= 0; j--) {
2410
+ const entry = trace[j];
2411
+ if (entry.agent === 'learner_ego_initial' && entry.action === 'deliberation') {
2412
+ turnData.internalDeliberation.unshift({ role: 'ego_initial', content: entry.contextSummary || '' });
2413
+ break; // ego_initial is the first step, stop here
2414
+ } else if (entry.agent === 'learner_superego' && entry.action === 'deliberation') {
2415
+ turnData.internalDeliberation.unshift({ role: 'superego', content: entry.contextSummary || '' });
2416
+ } else if (entry.agent === 'learner_ego_revision' && entry.action === 'deliberation') {
2417
+ turnData.internalDeliberation.unshift({ role: 'ego_revision', content: entry.contextSummary || '' });
2418
+ } else if (entry.agent === 'learner_synthesis' && entry.action === 'response') {
2419
+ // synthesis is the final merged output, skip (same as external message)
2420
+ } else if (entry.agent === 'ego' || entry.agent === 'system') {
2421
+ break; // Reached the tutor's turn, stop
2422
+ }
2423
+ }
2424
+ }
2425
+
2426
+ learnerTurns.push(turnData);
2427
+ }
2428
+
2429
+ if (learnerTurns.length === 0) {
2430
+ console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (no learner turns in trace)`);
2431
+ failed++;
2432
+ continue;
2433
+ }
2434
+
2435
+ // Build a reconstructed turn array for the prompt builder
2436
+ // Interleave tutor suggestions and learner messages
2437
+ const reconstructedTurns = [];
2438
+ const turnResults = dialogueLog.turnResults || [];
2439
+
2440
+ // Turn 0: initial tutor suggestion
2441
+ if (turnResults.length > 0) {
2442
+ const sug = turnResults[0].suggestions?.[0];
2443
+ reconstructedTurns.push({
2444
+ turnNumber: 0,
2445
+ phase: 'tutor',
2446
+ externalMessage: sug?.message || sug?.text || JSON.stringify(sug),
2447
+ });
2448
+ }
2449
+
2450
+ // Subsequent turns: learner → tutor pairs
2451
+ for (let lt = 0; lt < learnerTurns.length; lt++) {
2452
+ reconstructedTurns.push({
2453
+ turnNumber: lt + 1,
2454
+ phase: 'learner',
2455
+ externalMessage: learnerTurns[lt].externalMessage,
2456
+ internalDeliberation: learnerTurns[lt].internalDeliberation,
2457
+ });
2458
+
2459
+ // Add corresponding tutor response (if exists)
2460
+ const tutorTurn = turnResults[lt + 1];
2461
+ if (tutorTurn) {
2462
+ const sug = tutorTurn.suggestions?.[0];
2463
+ reconstructedTurns.push({
2464
+ turnNumber: lt + 1,
2465
+ phase: 'tutor',
2466
+ externalMessage: sug?.message || sug?.text || JSON.stringify(sug),
2467
+ });
2468
+ }
2469
+ }
2470
+
2471
+ // Get scenario info
2472
+ const scenario = getScenario(result.scenarioId);
2473
+ const scenarioName = scenario?.name || result.scenarioId;
2474
+
2475
+ // Use learnerContext from the dialogue log as persona description
2476
+ const personaDescription = dialogueLog.learnerContext || 'No persona description available';
2477
+
2478
+ const turnScores = {};
2479
+ let turnSucceeded = 0;
2480
+
2481
+ // Score each learner turn
2482
+ for (let lt = 0; lt < learnerTurns.length; lt++) {
2483
+ // Find the learner turn's index in reconstructedTurns
2484
+ const targetIdx = reconstructedTurns.findIndex((t, idx) =>
2485
+ t.phase === 'learner' && t.externalMessage === learnerTurns[lt].externalMessage && idx > 0
2486
+ );
2487
+
2488
+ if (targetIdx === -1) continue;
2489
+
2490
+ const turnTag = `${tag} ${result.scenarioId} / ${profileName} learner-turn-${lt + 1}`;
2491
+
2492
+ try {
2493
+ const prompt = buildLearnerEvaluationPrompt({
2494
+ turns: reconstructedTurns,
2495
+ targetTurnIndex: targetIdx,
2496
+ personaId: profileName,
2497
+ personaDescription,
2498
+ learnerArchitecture: isMultiAgent ? 'multi_agent' : 'unified',
2499
+ scenarioName,
2500
+ topic: result.scenarioId,
2501
+ });
2502
+
2503
+ const claudeArgs = ['-p', '-', '--output-format', 'text'];
2504
+ if (modelOverride) {
2505
+ claudeArgs.push('--model', modelOverride);
2506
+ }
2507
+
2508
+ if (verbose) {
2509
+ console.log(`${turnTag} ... calling claude`);
2510
+ }
2511
+
2512
+ const stdout = await new Promise((resolve, reject) => {
2513
+ const env = { ...process.env };
2514
+ delete env.ANTHROPIC_API_KEY;
2515
+ const child = spawn('claude', claudeArgs, {
2516
+ stdio: ['pipe', 'pipe', 'pipe'],
2517
+ env,
2518
+ });
2519
+ let out = '';
2520
+ let err = '';
2521
+ child.stdout.on('data', d => { out += d; });
2522
+ child.stderr.on('data', d => { err += d; });
2523
+ child.on('error', reject);
2524
+ child.on('close', code => {
2525
+ if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
2526
+ else resolve(out);
2527
+ });
2528
+ child.stdin.write(prompt);
2529
+ child.stdin.end();
2530
+ });
2531
+
2532
+ // Parse JSON response
2533
+ let jsonStr = stdout.trim();
2534
+ const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
2535
+ if (fenceMatch) {
2536
+ jsonStr = fenceMatch[1].trim();
2537
+ } else {
2538
+ const firstBrace = jsonStr.indexOf('{');
2539
+ const lastBrace = jsonStr.lastIndexOf('}');
2540
+ if (firstBrace !== -1 && lastBrace > firstBrace) {
2541
+ jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
2542
+ }
2543
+ }
2544
+
2545
+ const parsed = JSON.parse(jsonStr);
2546
+ const turnOverall = calculateLearnerOverallScore(parsed.scores || {}, isMultiAgent);
2547
+
2548
+ turnScores[lt] = {
2549
+ turnIndex: lt + 1,
2550
+ scores: parsed.scores,
2551
+ overallScore: turnOverall,
2552
+ summary: parsed.summary,
2553
+ };
2554
+
2555
+ const dimScores = Object.entries(parsed.scores || {})
2556
+ .map(([k, v]) => `${k}=${typeof v === 'object' ? v.score : v}`)
2557
+ .join(' ');
2558
+ console.log(`${turnTag} ... ${turnOverall.toFixed(1)} (${dimScores})`);
2559
+
2560
+ if (verbose && parsed.summary) {
2561
+ console.log(` Judge: ${parsed.summary}`);
2562
+ }
2563
+
2564
+ turnSucceeded++;
2565
+ } catch (err) {
2566
+ const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
2567
+ console.log(`${turnTag} ... FAIL: ${msg}`);
2568
+ if (verbose) console.error(err);
2569
+ }
2570
+ }
2571
+
2572
+ if (turnSucceeded > 0) {
2573
+ // Calculate dialogue-level learner score (average across turns)
2574
+ const turnOveralls = Object.values(turnScores).map(ts => ts.overallScore);
2575
+ const dialogueLearnerScore = turnOveralls.reduce((a, b) => a + b, 0) / turnOveralls.length;
2576
+
2577
+ // Store in database on the evaluation_results row
2578
+ evaluationStore.updateResultLearnerScores(result.id, {
2579
+ scores: turnScores,
2580
+ overallScore: dialogueLearnerScore,
2581
+ judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
2582
+ });
2583
+
2584
+ allScores.push(dialogueLearnerScore);
2585
+ succeeded++;
2586
+
2587
+ console.log(` → Dialogue learner score: ${dialogueLearnerScore.toFixed(1)} (${turnSucceeded} turns scored)`);
2588
+ console.log('');
2589
+ } else {
2590
+ failed++;
2591
+ }
2592
+ }
2593
+
2594
+ // Summary
2595
+ console.log('\n' + '='.repeat(50));
2596
+ console.log(' EVALUATE-LEARNER SUMMARY');
2597
+ console.log('='.repeat(50));
2598
+ console.log(` Total dialogues: ${toEvaluate.length}`);
2599
+ console.log(` Succeeded: ${succeeded}`);
2600
+ console.log(` Failed: ${failed}`);
2601
+ if (allScores.length > 0) {
2602
+ const avg = allScores.reduce((a, b) => a + b, 0) / allScores.length;
2603
+ const sd = allScores.length > 1
2604
+ ? Math.sqrt(allScores.reduce((acc, s) => acc + (s - avg) ** 2, 0) / (allScores.length - 1))
2605
+ : 0;
2606
+ console.log(` Avg learner score: ${avg.toFixed(1)} (SD=${sd.toFixed(1)})`);
2607
+ }
2608
+ console.log('');
2609
+ break;
2610
+ }
2611
+
2612
+ default:
2613
+ console.error(`Unknown command: ${command}`);
2614
+ console.error('Available commands: list, quick, test, run, runs, report, status, watch, transcript, export, cleanup, resume, revert, rejudge, evaluate, evaluate-learner, chat');
2615
+ process.exit(1);
2616
+ }
2617
+ } catch (error) {
2618
+ console.error(`\nError: ${error.message}`);
2619
+ if (getFlag('verbose')) {
2620
+ console.error(error.stack);
2621
+ }
2622
+ process.exit(1);
2623
+ }
2624
+ }
2625
+
2626
+ main();