@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,3002 @@
1
+ /**
2
+ * Evaluation API Routes
3
+ *
4
+ * Endpoints for testing and evaluating AI tutor performance.
5
+ * Mirrors CLI /eval functionality for web/API access.
6
+ *
7
+ * Note: Prompt recommendations are read-only via API.
8
+ * Prompts can be viewed but not written to disk.
9
+ */
10
+
11
+ import { Router } from 'express';
12
+ import * as evaluationRunner from '../services/evaluationRunner.js';
13
+ import * as evaluationStore from '../services/evaluationStore.js';
14
+ import * as learnerConfigLoader from '../services/learnerConfigLoader.js';
15
+ import * as promptRecommendationService from '../services/promptRecommendationService.js';
16
+ import interactionEngine from '../services/learnerTutorInteractionEngine.js';
17
+ // Import core tutor services from @machinespirits/tutor-core
18
+ import {
19
+ tutorApiService as tutorApi,
20
+ dialogueLogService,
21
+ monitoringService,
22
+ aiConfigService,
23
+ writingPadService
24
+ } from '@machinespirits/tutor-core';
25
+ const { getApiKey, getDefaultModel } = aiConfigService;
26
+ const { clearConscious, getWritingPad } = writingPadService;
27
+ import fs from 'fs';
28
+ import path from 'path';
29
+
30
+ const router = Router();
31
+
32
// ============================================================================
// CRASH PROTECTION: Track active evaluation streams
// ============================================================================
// Map of streamId -> { res, keepAlive, timeoutTimer, streamId, startedAt, maxDuration }
// Used by cleanupAllStreams() on shutdown and by the periodic hung-stream check.
const activeEvalStreams = new Map();
// Monotonic counter combined with Date.now() to build unique stream IDs.
let streamIdCounter = 0;

// Configuration
const MAX_STREAM_DURATION_MS = 2 * 60 * 60 * 1000; // 2 hours
const TIMEOUT_WARNING_MS = 30 * 60 * 1000; // Warn at 30 minutes before timeout
41
+
42
/**
 * Close every registered SSE evaluation stream (used on server shutdown).
 * Clears each stream's keep-alive interval and timeout timer, notifies the
 * client with an SSE error event, and empties the registry.
 */
export function cleanupAllStreams() {
  if (activeEvalStreams.size === 0) return;

  console.log(`[EvalRoutes] Cleaning up ${activeEvalStreams.size} active streams...`);
  for (const { res, keepAlive, timeoutTimer, streamId } of activeEvalStreams.values()) {
    try {
      if (keepAlive) clearInterval(keepAlive);
      if (timeoutTimer) clearTimeout(timeoutTimer);
      if (res && !res.writableEnded) {
        res.write('event: error\ndata: {"error": "Server restarting"}\n\n');
        res.end();
      }
    } catch (e) {
      console.error(`[EvalRoutes] Error cleaning stream ${streamId}:`, e.message);
    }
  }
  activeEvalStreams.clear();
}
61
+
62
/**
 * Register an SSE stream in the crash-protection registry.
 * Arms a timeout that force-closes the stream after `maxDuration` so a hung
 * evaluation cannot hold the connection open indefinitely.
 *
 * @param {import('http').ServerResponse} res - SSE response being tracked
 * @param {NodeJS.Timeout} keepAlive - keep-alive interval owned by the route
 * @param {{maxDuration?: number}} [options] - override for the default cap
 * @returns {string} unique stream id used to unregister later
 */
function registerStream(res, keepAlive, options = {}) {
  const id = `eval-stream-${++streamIdCounter}-${Date.now()}`;
  const maxDuration = options.maxDuration || MAX_STREAM_DURATION_MS;

  // Fired only if the stream outlives its allowed duration.
  const onTimeout = () => {
    console.warn(`[EvalRoutes] Stream ${id} exceeded max duration (${maxDuration}ms), forcing cleanup`);
    try {
      if (res && !res.writableEnded) {
        res.write('event: error\ndata: {"error": "Evaluation timeout - exceeded maximum duration", "timeout": true}\n\n');
        res.end();
      }
    } catch (e) {
      console.error(`[EvalRoutes] Error sending timeout to ${id}:`, e.message);
    }
    unregisterStream(id);
  };

  activeEvalStreams.set(id, {
    res,
    keepAlive,
    timeoutTimer: setTimeout(onTimeout, maxDuration),
    streamId: id,
    startedAt: Date.now(),
    maxDuration,
  });

  console.log(`[EvalRoutes] Stream registered: ${id} (Timeout: ${maxDuration}ms, Total active: ${activeEvalStreams.size})`);
  return id;
}
94
+
95
/**
 * Remove a stream from the registry and release its timers.
 * Safe to call more than once; unknown ids are ignored.
 *
 * @param {string} streamId - id returned by registerStream()
 */
function unregisterStream(streamId) {
  const entry = activeEvalStreams.get(streamId);
  if (!entry) return;

  const { keepAlive, timeoutTimer, startedAt } = entry;
  if (keepAlive) clearInterval(keepAlive);
  if (timeoutTimer) clearTimeout(timeoutTimer);
  activeEvalStreams.delete(streamId);

  const duration = Math.round((Date.now() - startedAt) / 1000);
  console.log(`[EvalRoutes] Stream closed: ${streamId} (Duration: ${duration}s, Remaining: ${activeEvalStreams.size})`);
}
106
+
107
// Periodic check for hung streams (runs every 5 minutes).
// Emits a one-time SSE "warning" event when a stream is within
// TIMEOUT_WARNING_MS of its forced timeout so clients can react.
const hungStreamMonitor = setInterval(() => {
  const now = Date.now();
  activeEvalStreams.forEach((stream, streamId) => {
    const age = now - stream.startedAt;

    // Warn if approaching timeout (at most once per stream)
    if (age > (stream.maxDuration - TIMEOUT_WARNING_MS) && !stream.warningShown) {
      const remaining = Math.round((stream.maxDuration - age) / 1000 / 60);
      console.warn(`[EvalRoutes] Stream ${streamId} will timeout in ${remaining} minutes`);
      try {
        if (stream.res && !stream.res.writableEnded) {
          stream.res.write(`event: warning\ndata: {"message": "Evaluation will timeout in ${remaining} minutes", "remainingMs": ${stream.maxDuration - age}}\n\n`);
        }
      } catch (e) {
        // Ignore write errors
      }
      stream.warningShown = true;
    }
  });
}, 5 * 60 * 1000); // Check every 5 minutes

// Fix: the anonymous module-level interval kept the Node event loop alive,
// blocking clean process exit. unref() lets the process exit when only this
// timer remains (optional-chained for non-Node environments).
hungStreamMonitor.unref?.();
128
+
129
// Path to prompts directory. Resolved against process.cwd(), so it depends on
// the directory the server was launched from. Per the file header, prompts are
// read-only via this API (viewed, never written to disk).
const PROMPTS_DIR = path.join(process.cwd(), 'prompts');
131
+
132
+ // ============================================================================
133
+ // Configuration Endpoints
134
+ // ============================================================================
135
+
136
/**
 * List available scenarios
 * GET /api/eval/scenarios
 */
router.get('/scenarios', (req, res) => {
  try {
    res.json({ success: true, scenarios: tutorApi.listScenarios() });
  } catch (error) {
    console.error('[EvalRoutes] List scenarios error:', error);
    res.status(500).json({ error: 'Failed to list scenarios' });
  }
});
149
+
150
/**
 * Get scenario details
 * GET /api/eval/scenarios/:id
 */
router.get('/scenarios/:id', (req, res) => {
  try {
    const scenario = tutorApi.getScenario(req.params.id);
    if (scenario) {
      res.json({ success: true, scenario });
    } else {
      res.status(404).json({ error: 'Scenario not found' });
    }
  } catch (error) {
    console.error('[EvalRoutes] Get scenario error:', error);
    res.status(500).json({ error: 'Failed to get scenario' });
  }
});
166
+
167
/**
 * List available tutor profiles
 * GET /api/eval/profiles
 */
router.get('/profiles', (req, res) => {
  try {
    res.json({ success: true, profiles: tutorApi.listProfiles() });
  } catch (error) {
    console.error('[EvalRoutes] List profiles error:', error);
    res.status(500).json({ error: 'Failed to list profiles' });
  }
});
180
+
181
/**
 * List available learner profiles (for interaction evaluations)
 * GET /api/eval/learner-profiles
 */
router.get('/learner-profiles', (req, res) => {
  try {
    res.json({
      success: true,
      profiles: learnerConfigLoader.listProfiles(),
      personas: learnerConfigLoader.listPersonas(),
    });
  } catch (error) {
    console.error('[EvalRoutes] List learner profiles error:', error);
    res.status(500).json({ error: 'Failed to list learner profiles' });
  }
});
195
+
196
/**
 * List model configurations
 * GET /api/eval/configurations
 */
router.get('/configurations', (req, res) => {
  try {
    res.json({ success: true, configurations: tutorApi.listConfigurations() });
  } catch (error) {
    console.error('[EvalRoutes] List configurations error:', error);
    res.status(500).json({ error: 'Failed to list configurations' });
  }
});
209
+
210
+ // ============================================================================
211
+ // Quick Test Endpoints
212
+ // ============================================================================
213
+
214
/**
 * Run a quick evaluation test
 * POST /api/eval/quick
 *
 * Body: {
 *   profile: "budget", // Profile name or config string
 *   scenario: "new_user_first_visit", // Scenario ID (optional)
 *   skipRubric: true // Skip AI judge evaluation (optional)
 * }
 */
router.post('/quick', async (req, res) => {
  // Hoisted out of the try so the catch block can mark the run as failed.
  // Previously a quickTest() error left the created run stuck in 'running'.
  let run = null;
  try {
    const { profile = 'budget', scenario = 'new_user_first_visit', skipRubric = false } = req.body;

    // Build config
    const config = { profileName: profile };

    // Get scenario name for description
    const scenarioDetails = tutorApi.getScenario(scenario);
    const scenarioName = scenarioDetails?.name || scenario;

    // Create a run to persist result to history
    run = evaluationStore.createRun({
      description: scenarioName,
      totalScenarios: 1,
      totalConfigurations: 1,
      metadata: {
        runType: 'quick',
        profiles: [profile],
        scenarios: [scenario],
        scenarioNames: [scenarioName],
      },
    });

    const result = await evaluationRunner.quickTest(config, {
      scenarioId: scenario,
      skipRubricEval: skipRubric,
      verbose: false,
    });

    // Store result to history
    evaluationStore.storeResult(run.id, result);

    // Mark run as completed
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests: 1,
      completedAt: new Date().toISOString(),
    });

    res.json({
      success: true,
      runId: run.id,
      result: {
        runId: run.id,
        scenarioId: result.scenarioId,
        scenarioName: result.scenarioName,
        profile: result.profileName,
        provider: result.provider,
        model: result.model,
        passed: result.success,
        overallScore: result.overallScore,
        latencyMs: result.latencyMs,
        scores: result.scoresWithReasoning || result.scores, // Prefer detailed scores
        validation: {
          passesRequired: result.passesRequired,
          passesForbidden: result.passesForbidden,
          requiredMissing: result.requiredMissing,
          forbiddenFound: result.forbiddenFound,
        },
        suggestions: result.suggestions,
        // Token usage
        inputTokens: result.inputTokens,
        outputTokens: result.outputTokens,
        totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
        apiCalls: result.apiCalls,
        dialogueRounds: result.dialogueRounds,
        // Evaluator reasoning
        evaluationReasoning: result.evaluationReasoning,
        evaluatorModel: result.evaluatorModel,
        // Scenario context for display (original user request)
        scenarioContext: scenarioDetails ? {
          description: scenarioDetails.description,
          expectedBehavior: scenarioDetails.expected_behavior,
          learnerContext: scenarioDetails.learner_context,
        } : null,
      },
    });
  } catch (error) {
    console.error('[EvalRoutes] Quick test error:', error);
    // Don't leave an orphaned run in 'running' state in history.
    if (run) {
      try {
        evaluationStore.updateRun(run.id, {
          status: 'failed',
          completedAt: new Date().toISOString(),
        });
      } catch (e) {
        console.error('[EvalRoutes] Failed to mark run as failed:', e.message);
      }
    }
    res.status(500).json({ error: 'Failed to run quick test', details: error.message });
  }
});
307
+
308
/**
 * Run a quick test with SSE streaming for real-time logs
 * GET /api/eval/stream/quick
 * Query params: profile, scenario, skipRubric
 */
router.get('/stream/quick', async (req, res) => {
  // Set up SSE
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    Connection: 'keep-alive',
  });

  const sendEvent = (type, data) => {
    // Use named events for addEventListener compatibility
    res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
  };

  // Keep-alive to prevent connection timeout
  const keepAlive = setInterval(() => {
    res.write(': keepalive\n\n');
  }, 15000);

  // Register stream for crash protection
  const streamId = registerStream(res, keepAlive);

  // Clean up on close (unregisterStream is idempotent)
  req.on('close', () => {
    clearInterval(keepAlive);
    unregisterStream(streamId);
  });

  // Hoisted so the catch block can mark the run as failed. Previously an
  // error after createRun() left the run stuck in 'running' status.
  let run = null;
  try {
    const profile = req.query.profile || 'budget';
    const scenario = req.query.scenario || 'new_user_first_visit';
    const skipRubric = req.query.skipRubric === 'true';
    const outputSize = req.query.outputSize || 'normal'; // compact, normal, expanded

    // Get scenario name for description
    const scenarioDetails = tutorApi.getScenario(scenario);
    const scenarioName = scenarioDetails?.name || scenario;

    // Create a run to persist result to history (status: 'running')
    run = evaluationStore.createRun({
      description: scenarioName,
      totalScenarios: 1,
      totalConfigurations: 1,
      metadata: {
        runType: 'quick',
        profiles: [profile],
        scenarios: [scenario],
        scenarioNames: [scenarioName],
      },
    });

    sendEvent('start', {
      profile,
      scenario,
      skipRubric,
      outputSize,
      runId: run.id,
      timestamp: new Date().toISOString(),
    });

    sendEvent('log', { message: `Starting quick test: ${profile} / ${scenario}`, level: 'info' });
    sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });
    sendEvent('log', { message: `Skip rubric evaluation: ${skipRubric}`, level: 'info' });
    sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });

    const config = { profileName: profile };

    // Create a log callback to stream logs
    const onLog = (message, level = 'info') => {
      sendEvent('log', { message, level, timestamp: new Date().toISOString() });
    };

    sendEvent('log', { message: 'Building learner context...', level: 'info' });
    sendEvent('progress', { stage: 'context', message: 'Building learner context' });

    const result = await evaluationRunner.quickTest(config, {
      scenarioId: scenario,
      skipRubricEval: skipRubric,
      outputSize, // compact, normal, expanded - affects response length
      verbose: true,
      onLog, // Pass log callback
    });

    // Store result to history
    evaluationStore.storeResult(run.id, result);

    // Mark run as completed
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests: 1,
      completedAt: new Date().toISOString(),
    });

    sendEvent('log', { message: `Test completed: score=${result.overallScore?.toFixed(1) || 'N/A'}`, level: 'success' });
    sendEvent('log', { message: `Saved to history: ${run.id}`, level: 'info' });

    sendEvent('result', {
      runId: run.id,
      scenarioId: result.scenarioId,
      scenarioName: result.scenarioName,
      profile: result.profileName,
      provider: result.provider,
      model: result.model,
      passed: result.success,
      overallScore: result.overallScore,
      latencyMs: result.latencyMs,
      scores: result.scoresWithReasoning || result.scores, // Prefer detailed scores
      validation: {
        passesRequired: result.passesRequired,
        passesForbidden: result.passesForbidden,
        requiredMissing: result.requiredMissing,
        forbiddenFound: result.forbiddenFound,
      },
      suggestions: result.suggestions,
      inputTokens: result.inputTokens,
      outputTokens: result.outputTokens,
      totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
      apiCalls: result.apiCalls,
      dialogueRounds: result.dialogueRounds,
      dialogueId: result.dialogueId,
      // Evaluator reasoning
      evaluationReasoning: result.evaluationReasoning,
      evaluatorModel: result.evaluatorModel,
      // Scenario context for display (original user request)
      scenarioContext: scenarioDetails ? {
        description: scenarioDetails.description,
        expectedBehavior: scenarioDetails.expected_behavior,
        learnerContext: scenarioDetails.learner_context,
      } : null,
    });

    sendEvent('complete', { success: true, runId: run.id });
    // Fix: unregister explicitly so the registry's timeout timer is cleared
    // now instead of lingering until the socket 'close' event fires.
    // (This also clears keepAlive, which the old code cleared directly.)
    unregisterStream(streamId);
    res.end();
  } catch (error) {
    sendEvent('log', { message: `Error: ${error.message}`, level: 'error' });
    sendEvent('error', { error: error.message });
    // Don't leave an orphaned run in 'running' state in history.
    if (run) {
      try {
        evaluationStore.updateRun(run.id, {
          status: 'failed',
          completedAt: new Date().toISOString(),
        });
      } catch (e) {
        console.error('[EvalRoutes] Failed to mark run as failed:', e.message);
      }
    }
    unregisterStream(streamId);
    res.end();
  }
});
453
+
454
+ // ============================================================================
455
+ // Full Evaluation Endpoints
456
+ // ============================================================================
457
+
458
/**
 * Run a full evaluation
 * POST /api/eval/run
 *
 * Body: {
 *   profiles: ["budget", "fast"], // Profiles to test
 *   scenarios: ["new_user_first_visit", "struggling_learner"], // Scenarios (or "all")
 *   runsPerConfig: 1, // Repetitions
 *   skipRubric: false // Use AI judge
 * }
 */
router.post('/run', async (req, res) => {
  try {
    const {
      profiles = ['budget'],
      scenarios = 'all',
      runsPerConfig = 1,
      skipRubric = false,
      description
    } = req.body;

    // Each requested profile becomes one configuration to evaluate.
    const configurations = profiles.map((name) => ({ profileName: name, label: name }));

    const evaluation = await evaluationRunner.runEvaluation({
      scenarios,
      configurations,
      runsPerConfig,
      skipRubricEval: skipRubric,
      description,
      verbose: false,
    });

    res.json({
      success: true,
      runId: evaluation.runId,
      totalTests: evaluation.totalTests,
      successfulTests: evaluation.successfulTests,
      stats: evaluation.stats,
      scenarioStats: evaluation.scenarioStats,
    });
  } catch (error) {
    console.error('[EvalRoutes] Run evaluation error:', error);
    res.status(500).json({ error: 'Failed to run evaluation', details: error.message });
  }
});
504
+
505
/**
 * Compare multiple configurations
 * POST /api/eval/compare
 *
 * Body: {
 *   profiles: ["budget", "fast", "quality"],
 *   scenarios: "all",
 *   runsPerConfig: 1
 * }
 */
router.post('/compare', async (req, res) => {
  try {
    const { profiles, scenarios = 'all', runsPerConfig = 1 } = req.body;

    // A comparison needs at least two configurations to rank.
    if (!profiles || profiles.length < 2) {
      return res.status(400).json({ error: 'At least 2 profiles required for comparison' });
    }

    const comparison = await evaluationRunner.compareConfigurations(
      profiles.map((name) => ({ profileName: name, label: name })),
      {
        scenarios,
        runsPerConfig,
        verbose: false,
      }
    );

    res.json({
      success: true,
      runId: comparison.runId,
      rankings: comparison.rankings,
      scenarioBreakdown: comparison.scenarioBreakdown,
    });
  } catch (error) {
    console.error('[EvalRoutes] Compare error:', error);
    res.status(500).json({ error: 'Failed to compare configurations', details: error.message });
  }
});
542
+
543
/**
 * Matrix comparison of multiple profiles with dimension breakdowns
 * POST /api/eval/matrix
 *
 * Body: {
 *   profiles: ["budget", "default", "experimental"], // Profiles to test
 *   scenarios: "all", // Scenarios to run (or array of IDs)
 *   skipRubric: true // Skip AI judge evaluation (faster)
 * }
 *
 * Returns dimension scores and overall rankings for each profile.
 */
router.post('/matrix', async (req, res) => {
  // Hoisted so the catch block can mark the run as failed. Previously an
  // exception escaping the per-scenario try (e.g. from storeResult or
  // updateRun) left the created run stuck in 'running' status.
  let run = null;
  try {
    let { profiles = [], scenarios = 'all', skipRubric = false } = req.body;

    // Default profiles if none specified
    const allProfiles = tutorApi.listProfiles();
    if (profiles.length === 0) {
      profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
        allProfiles.some(ap => ap.name === p)
      );
    }

    // Validate profiles exist
    const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
    const invalidProfiles = profiles.filter(p => !allProfiles.some(ap => ap.name === p));

    if (validProfiles.length === 0) {
      return res.status(400).json({
        error: 'No valid profiles specified',
        available: allProfiles.map(p => p.name),
      });
    }

    // Get scenarios
    const allScenarios = tutorApi.listScenarios();
    const scenariosToRun = scenarios === 'all'
      ? allScenarios
      : allScenarios.filter(s => scenarios.includes(s.id));

    // Create a run to persist results to history
    run = evaluationStore.createRun({
      description: `${validProfiles.length} profiles × ${scenariosToRun.length} scenarios`,
      totalScenarios: scenariosToRun.length,
      totalConfigurations: validProfiles.length,
      metadata: {
        runType: 'matrix',
        profiles: validProfiles,
        scenarios: scenariosToRun.map(s => s.id),
        scenarioNames: scenariosToRun.map(s => s.name),
        skipRubric,
      },
    });

    // Run evaluations
    const results = {};
    const dimensionScores = {};
    let totalTests = 0;

    for (const profileName of validProfiles) {
      results[profileName] = [];
      dimensionScores[profileName] = {};

      for (const scenario of scenariosToRun) {
        try {
          const config = { profileName, label: profileName };
          const result = await evaluationRunner.quickTest(config, {
            scenarioId: scenario.id,
            verbose: false,
            skipRubricEval: skipRubric,
            debug: false,
          });

          results[profileName].push(result);
          totalTests++;

          // Save result to database
          evaluationStore.storeResult(run.id, {
            ...result,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            profileName,
          });

          // Collect dimension scores (numeric values only)
          if (result.scores) {
            for (const [dim, score] of Object.entries(result.scores)) {
              if (!dimensionScores[profileName][dim]) {
                dimensionScores[profileName][dim] = [];
              }
              if (typeof score === 'number') {
                dimensionScores[profileName][dim].push(score);
              }
            }
          }
        } catch (e) {
          // A single failed test is recorded and the matrix keeps going.
          const errorResult = {
            success: false,
            errorMessage: e.message,
            scenarioId: scenario.id,
          };
          results[profileName].push(errorResult);
          totalTests++;

          // Save error to database
          evaluationStore.storeResult(run.id, {
            ...errorResult,
            scenarioName: scenario.name,
            profileName,
            provider: 'unknown',
            model: 'unknown',
          });
        }
      }
    }

    // Update run as completed
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests,
      completedAt: new Date().toISOString(),
    });

    // Build dimension averages
    const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
    const dimensionAverages = {};
    for (const profile of validProfiles) {
      dimensionAverages[profile] = {};
      for (const dim of dimensions) {
        const scores = dimensionScores[profile]?.[dim] || [];
        dimensionAverages[profile][dim] = scores.length > 0
          ? scores.reduce((a, b) => a + b, 0) / scores.length
          : null;
      }
    }

    // Build rankings (sorted by average score, best first)
    const rankings = validProfiles.map(profile => {
      const profileResults = results[profile] || [];
      const successCount = profileResults.filter(r => r.success !== false).length;
      const scores = profileResults.filter(r => r.overallScore != null).map(r => r.overallScore);
      const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
      const latencies = profileResults.filter(r => r.latencyMs != null).map(r => r.latencyMs);
      const avgLatency = latencies.length > 0 ? latencies.reduce((a, b) => a + b, 0) / latencies.length : null;

      return {
        profile,
        tests: profileResults.length,
        successes: successCount,
        avgScore,
        avgLatency,
      };
    }).sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));

    res.json({
      success: true,
      runId: run.id, // Include run ID so frontend can navigate to history
      profiles: validProfiles,
      invalidProfiles: invalidProfiles.length > 0 ? invalidProfiles : undefined,
      scenariosRun: scenariosToRun.length,
      dimensionAverages,
      rankings,
      results, // Full results for detailed analysis
    });
  } catch (error) {
    console.error('[EvalRoutes] Matrix error:', error);
    // Don't leave an orphaned run in 'running' state in history.
    if (run) {
      try {
        evaluationStore.updateRun(run.id, {
          status: 'failed',
          completedAt: new Date().toISOString(),
        });
      } catch (e) {
        console.error('[EvalRoutes] Failed to mark run as failed:', e.message);
      }
    }
    res.status(500).json({ error: 'Failed to run matrix comparison', details: error.message });
  }
});
713
+
714
+ /**
715
+ * Run matrix comparison with SSE streaming for real-time logs
716
+ * GET /api/eval/stream/matrix
717
+ * Query params: profiles, scenarios, skipRubric
718
+ */
719
+ router.get('/stream/matrix', async (req, res) => {
720
+ // Set up SSE
721
+ res.writeHead(200, {
722
+ 'Content-Type': 'text/event-stream',
723
+ 'Cache-Control': 'no-cache',
724
+ Connection: 'keep-alive',
725
+ });
726
+
727
+ const sendEvent = (type, data) => {
728
+ // Use named events for addEventListener compatibility
729
+ res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
730
+ };
731
+
732
+ // Keep-alive to prevent connection timeout
733
+ const keepAlive = setInterval(() => {
734
+ res.write(': keepalive\n\n');
735
+ }, 15000);
736
+
737
+ // Register stream for crash protection
738
+ const streamId = registerStream(res, keepAlive);
739
+
740
+ // Clean up on close
741
+ req.on('close', () => {
742
+ clearInterval(keepAlive);
743
+ unregisterStream(streamId);
744
+ });
745
+
746
+ try {
747
+ const profilesParam = req.query.profiles || '';
748
+ let profiles = profilesParam ? profilesParam.split(',') : [];
749
+ const scenariosParam = req.query.scenarios || 'all';
750
+ const scenarios = scenariosParam === 'all' ? 'all' : scenariosParam.split(',');
751
+ const skipRubric = req.query.skipRubric === 'true';
752
+ const outputSize = req.query.outputSize || 'normal';
753
+
754
+ // Get all available profiles
755
+ const allProfiles = tutorApi.listProfiles();
756
+ if (profiles.length === 0) {
757
+ profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
758
+ allProfiles.some(ap => ap.name === p)
759
+ );
760
+ }
761
+
762
+ // Validate profiles
763
+ const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
764
+ if (validProfiles.length === 0) {
765
+ sendEvent('error', { error: 'No valid profiles specified' });
766
+ return res.end();
767
+ }
768
+
769
+ // Get scenarios
770
+ const allScenarios = tutorApi.listScenarios();
771
+ const scenariosToRun = scenarios === 'all'
772
+ ? allScenarios
773
+ : allScenarios.filter(s => scenarios.includes(s.id));
774
+
775
+ const totalTests = validProfiles.length * scenariosToRun.length;
776
+
777
+ sendEvent('start', {
778
+ profiles: validProfiles,
779
+ scenarioCount: scenariosToRun.length,
780
+ totalTests,
781
+ skipRubric,
782
+ outputSize,
783
+ timestamp: new Date().toISOString(),
784
+ });
785
+
786
+ sendEvent('log', { message: `Starting matrix: ${validProfiles.length} profiles × ${scenariosToRun.length} scenarios = ${totalTests} tests`, level: 'info' });
787
+ sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });
788
+
789
+ // Create a run to persist results
790
+ const run = evaluationStore.createRun({
791
+ description: `${validProfiles.length} profiles × ${scenariosToRun.length} scenarios`,
792
+ totalScenarios: scenariosToRun.length,
793
+ totalConfigurations: validProfiles.length,
794
+ metadata: {
795
+ runType: 'matrix',
796
+ profiles: validProfiles,
797
+ scenarios: scenariosToRun.map(s => s.id),
798
+ scenarioNames: scenariosToRun.map(s => s.name),
799
+ skipRubric,
800
+ },
801
+ });
802
+
803
+ sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });
804
+
805
+ // Run evaluations
806
+ const results = {};
807
+ const dimensionScores = {};
808
+ let completedTests = 0;
809
+
810
+ for (const profileName of validProfiles) {
811
+ results[profileName] = [];
812
+ dimensionScores[profileName] = {};
813
+
814
+ sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });
815
+
816
+ for (const scenario of scenariosToRun) {
817
+ completedTests++;
818
+
819
+ sendEvent('progress', {
820
+ current: completedTests,
821
+ total: totalTests,
822
+ profile: profileName,
823
+ scenario: scenario.name,
824
+ percentage: Math.round((completedTests / totalTests) * 100),
825
+ });
826
+
827
+ sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });
828
+
829
+ try {
830
+ const config = { profileName, label: profileName };
831
+
832
+ // Create log callback for this test
833
+ const onLog = (message, level = 'info') => {
834
+ sendEvent('log', { message: ` ${message}`, level, timestamp: new Date().toISOString() });
835
+ };
836
+
837
+ const result = await evaluationRunner.quickTest(config, {
838
+ scenarioId: scenario.id,
839
+ verbose: false,
840
+ skipRubricEval: skipRubric,
841
+ outputSize,
842
+ onLog,
843
+ });
844
+
845
+ results[profileName].push(result);
846
+
847
+ // Save result to database
848
+ evaluationStore.storeResult(run.id, {
849
+ ...result,
850
+ scenarioId: scenario.id,
851
+ scenarioName: scenario.name,
852
+ profileName,
853
+ });
854
+
855
+ // Collect dimension scores
856
+ if (result.scores) {
857
+ for (const [dim, score] of Object.entries(result.scores)) {
858
+ if (!dimensionScores[profileName][dim]) {
859
+ dimensionScores[profileName][dim] = [];
860
+ }
861
+ if (typeof score === 'number') {
862
+ dimensionScores[profileName][dim].push(score);
863
+ }
864
+ }
865
+ }
866
+
867
+ const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
868
+ const status = result.success !== false ? '✓' : '✗';
869
+ sendEvent('log', { message: ` ${status} Score: ${scoreStr} (${result.latencyMs}ms)`, level: result.success !== false ? 'success' : 'warning' });
870
+
871
+ sendEvent('result', {
872
+ profile: profileName,
873
+ scenarioId: scenario.id,
874
+ scenarioName: scenario.name,
875
+ passed: result.success !== false,
876
+ score: result.overallScore,
877
+ latencyMs: result.latencyMs,
878
+ inputTokens: result.inputTokens,
879
+ outputTokens: result.outputTokens,
880
+ });
881
+
882
+ } catch (e) {
883
+ sendEvent('log', { message: ` ✗ Error: ${e.message}`, level: 'error' });
884
+
885
+ const errorResult = {
886
+ success: false,
887
+ errorMessage: e.message,
888
+ scenarioId: scenario.id,
889
+ };
890
+ results[profileName].push(errorResult);
891
+
892
+ evaluationStore.storeResult(run.id, {
893
+ ...errorResult,
894
+ scenarioName: scenario.name,
895
+ profileName,
896
+ provider: 'unknown',
897
+ model: 'unknown',
898
+ });
899
+ }
900
+ }
901
+ }
902
+
903
+ // Update run as completed
904
+ evaluationStore.updateRun(run.id, {
905
+ status: 'completed',
906
+ totalTests: completedTests,
907
+ completedAt: new Date().toISOString(),
908
+ });
909
+
910
+ // Build dimension averages
911
+ const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
912
+ const dimensionAverages = {};
913
+ for (const profile of validProfiles) {
914
+ dimensionAverages[profile] = {};
915
+ for (const dim of dimensions) {
916
+ const scores = dimensionScores[profile]?.[dim] || [];
917
+ dimensionAverages[profile][dim] = scores.length > 0
918
+ ? scores.reduce((a, b) => a + b, 0) / scores.length
919
+ : null;
920
+ }
921
+ }
922
+
923
+ // Build rankings
924
+ const rankings = validProfiles.map(profile => {
925
+ const profileResults = results[profile] || [];
926
+ const successCount = profileResults.filter(r => r.success !== false).length;
927
+ const scores = profileResults.filter(r => r.overallScore != null).map(r => r.overallScore);
928
+ const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
929
+ const latencies = profileResults.filter(r => r.latencyMs != null).map(r => r.latencyMs);
930
+ const avgLatency = latencies.length > 0 ? latencies.reduce((a, b) => a + b, 0) / latencies.length : null;
931
+
932
+ return {
933
+ profile,
934
+ tests: profileResults.length,
935
+ successes: successCount,
936
+ avgScore,
937
+ avgLatency,
938
+ };
939
+ }).sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
940
+
941
+ sendEvent('log', { message: `\n=== Matrix Complete ===`, level: 'success' });
942
+ sendEvent('log', { message: `Total tests: ${completedTests}`, level: 'info' });
943
+
944
+ // Send final complete event with full results
945
+ sendEvent('complete', {
946
+ success: true,
947
+ runId: run.id,
948
+ profiles: validProfiles,
949
+ scenariosRun: scenariosToRun.length,
950
+ dimensionAverages,
951
+ rankings,
952
+ results,
953
+ });
954
+
955
+ unregisterStream(streamId);
956
+ res.end();
957
+ } catch (error) {
958
+ sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
959
+ sendEvent('error', { error: error.message });
960
+ unregisterStream(streamId);
961
+ res.end();
962
+ }
963
+ });
964
+
965
+ /**
966
+ * Run learner-tutor interaction evaluation with SSE streaming
967
+ * GET /api/eval/stream/interact
968
+ * Query params:
969
+ * - persona: learner persona ID (default: confused_novice)
970
+ * - profile: tutor profile name (default: budget)
971
+ * - turns: number of dialogue turns (default: 5)
972
+ * - dialogueEnabled: whether tutor uses multi-agent dialogue (default: true)
973
+ * - topic: topic for discussion (default: "Hegel's concept of recognition")
974
+ * - runJudge: whether to run AI judge evaluation (default: true)
975
+ */
976
+ router.get('/stream/interact', async (req, res) => {
977
+ // Set up SSE
978
+ res.writeHead(200, {
979
+ 'Content-Type': 'text/event-stream',
980
+ 'Cache-Control': 'no-cache',
981
+ Connection: 'keep-alive',
982
+ });
983
+
984
+ const sendEvent = (type, data) => {
985
+ res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
986
+ };
987
+
988
+ // Keep-alive to prevent connection timeout
989
+ const keepAlive = setInterval(() => {
990
+ res.write(': keepalive\n\n');
991
+ }, 15000);
992
+
993
+ // Register stream for crash protection (interaction evals can take a while)
994
+ const streamId = registerStream(res, keepAlive, { maxDuration: 30 * 60 * 1000 }); // 30 min timeout
995
+
996
+ // Clean up on close
997
+ req.on('close', () => {
998
+ clearInterval(keepAlive);
999
+ unregisterStream(streamId);
1000
+ });
1001
+
1002
+ try {
1003
+ const persona = req.query.persona || 'confused_novice';
1004
+ const tutorProfile = req.query.profile || 'budget';
1005
+ const maxTurns = parseInt(req.query.turns) || 5;
1006
+ const dialogueEnabled = req.query.dialogueEnabled !== 'false';
1007
+ const topic = req.query.topic || "Hegel's concept of recognition";
1008
+ const runJudge = req.query.runJudge !== 'false';
1009
+
1010
+ sendEvent('start', {
1011
+ persona,
1012
+ tutorProfile,
1013
+ maxTurns,
1014
+ dialogueEnabled,
1015
+ topic,
1016
+ runJudge,
1017
+ timestamp: new Date().toISOString(),
1018
+ });
1019
+
1020
+ sendEvent('log', { message: `Starting interaction evaluation`, level: 'info' });
1021
+ sendEvent('log', { message: `Learner persona: ${persona}`, level: 'info' });
1022
+ sendEvent('log', { message: `Tutor profile: ${tutorProfile}`, level: 'info' });
1023
+ sendEvent('log', { message: `Max turns: ${maxTurns}`, level: 'info' });
1024
+ sendEvent('log', { message: `Dialogue enabled: ${dialogueEnabled}`, level: 'info' });
1025
+ sendEvent('log', { message: `Topic: ${topic}`, level: 'info' });
1026
+
1027
+ // Set up LLM call function using available providers
1028
+ let llmClient = null;
1029
+ let llmProvider = null;
1030
+
1031
+ // Try providers in order of preference
1032
+ const openrouterKey = getApiKey('openrouter');
1033
+ const geminiKey = getApiKey('gemini');
1034
+ const anthropicKey = getApiKey('claude');
1035
+ const openaiKey = getApiKey('openai');
1036
+
1037
+ if (openrouterKey) {
1038
+ llmProvider = 'openrouter';
1039
+ const OpenAI = (await import('openai')).default;
1040
+ llmClient = new OpenAI({
1041
+ apiKey: openrouterKey,
1042
+ baseURL: 'https://openrouter.ai/api/v1',
1043
+ });
1044
+ sendEvent('log', { message: `Using OpenRouter for LLM calls`, level: 'info' });
1045
+ } else if (geminiKey) {
1046
+ llmProvider = 'gemini';
1047
+ const { GoogleGenAI } = await import('@google/genai');
1048
+ llmClient = new GoogleGenAI({ apiKey: geminiKey });
1049
+ sendEvent('log', { message: `Using Gemini for LLM calls`, level: 'info' });
1050
+ } else if (anthropicKey) {
1051
+ llmProvider = 'anthropic';
1052
+ const Anthropic = (await import('@anthropic-ai/sdk')).default;
1053
+ llmClient = new Anthropic({ apiKey: anthropicKey });
1054
+ sendEvent('log', { message: `Using Anthropic for LLM calls`, level: 'info' });
1055
+ } else if (openaiKey) {
1056
+ llmProvider = 'openai';
1057
+ const OpenAI = (await import('openai')).default;
1058
+ llmClient = new OpenAI({ apiKey: openaiKey });
1059
+ sendEvent('log', { message: `Using OpenAI for LLM calls`, level: 'info' });
1060
+ } else {
1061
+ throw new Error('No LLM API key configured. Set OPENROUTER_API_KEY, GEMINI_API_KEY, ANTHROPIC_API_KEY, or OPENAI_API_KEY.');
1062
+ }
1063
+
1064
+ // Create the llmCall function matching the expected signature
1065
+ const llmCall = async (requestedModel, systemPrompt, messages, options = {}) => {
1066
+ const { temperature = 0.7, maxTokens = 1000 } = options;
1067
+ const model = requestedModel || getDefaultModel(llmProvider === 'anthropic' ? 'claude' : llmProvider) || 'deepseek/deepseek-chat';
1068
+
1069
+ try {
1070
+ if (llmProvider === 'openrouter') {
1071
+ const response = await llmClient.chat.completions.create({
1072
+ model,
1073
+ temperature,
1074
+ max_tokens: maxTokens,
1075
+ messages: [
1076
+ { role: 'system', content: systemPrompt },
1077
+ ...messages.map(m => ({
1078
+ role: m.role === 'user' ? 'user' : 'assistant',
1079
+ content: m.content,
1080
+ })),
1081
+ ],
1082
+ });
1083
+ return {
1084
+ content: response.choices[0]?.message?.content || '',
1085
+ usage: {
1086
+ inputTokens: response.usage?.prompt_tokens || 0,
1087
+ outputTokens: response.usage?.completion_tokens || 0,
1088
+ },
1089
+ };
1090
+ } else if (llmProvider === 'gemini') {
1091
+ const userMessages = messages.map(m => m.content).join('\n\n');
1092
+ const response = await llmClient.models.generateContent({
1093
+ model,
1094
+ contents: [{ role: 'user', parts: [{ text: `${systemPrompt}\n\n${userMessages}` }] }],
1095
+ generationConfig: { temperature, maxOutputTokens: maxTokens },
1096
+ });
1097
+ const text = response.text || response.candidates?.[0]?.content?.parts?.[0]?.text || '';
1098
+ return {
1099
+ content: text,
1100
+ usage: {
1101
+ inputTokens: Math.ceil((systemPrompt.length + userMessages.length) / 4),
1102
+ outputTokens: Math.ceil(text.length / 4),
1103
+ },
1104
+ };
1105
+ } else if (llmProvider === 'anthropic') {
1106
+ const response = await llmClient.messages.create({
1107
+ model: model || 'claude-3-5-haiku-20241022',
1108
+ max_tokens: maxTokens,
1109
+ system: systemPrompt,
1110
+ messages: messages.map(m => ({
1111
+ role: m.role === 'user' ? 'user' : 'assistant',
1112
+ content: m.content,
1113
+ })),
1114
+ });
1115
+ return {
1116
+ content: response.content[0]?.text || '',
1117
+ usage: {
1118
+ inputTokens: response.usage?.input_tokens || 0,
1119
+ outputTokens: response.usage?.output_tokens || 0,
1120
+ },
1121
+ };
1122
+ } else if (llmProvider === 'openai') {
1123
+ const response = await llmClient.chat.completions.create({
1124
+ model: model || 'gpt-4o-mini',
1125
+ temperature,
1126
+ max_tokens: maxTokens,
1127
+ messages: [
1128
+ { role: 'system', content: systemPrompt },
1129
+ ...messages.map(m => ({
1130
+ role: m.role === 'user' ? 'user' : 'assistant',
1131
+ content: m.content,
1132
+ })),
1133
+ ],
1134
+ });
1135
+ return {
1136
+ content: response.choices[0]?.message?.content || '',
1137
+ usage: {
1138
+ inputTokens: response.usage?.prompt_tokens || 0,
1139
+ outputTokens: response.usage?.completion_tokens || 0,
1140
+ },
1141
+ };
1142
+ }
1143
+ } catch (error) {
1144
+ console.error(`[InteractStream] LLM call error:`, error.message);
1145
+ throw error;
1146
+ }
1147
+ };
1148
+
1149
+ // Generate unique learner ID for this eval
1150
+ const learnerId = `eval-learner-${persona}-${Date.now()}`;
1151
+ const evalId = `short-interact-${Date.now()}`;
1152
+ const sessionId = `session-${Date.now()}`;
1153
+
1154
+ sendEvent('log', { message: `Eval ID: ${evalId}`, level: 'info' });
1155
+ sendEvent('progress', { stage: 'setup', message: 'Initializing interaction' });
1156
+
1157
+ // Run the interaction
1158
+ sendEvent('log', { message: `\nStarting ${maxTurns}-turn interaction...`, level: 'info' });
1159
+ sendEvent('progress', { stage: 'interaction', message: 'Running learner-tutor dialogue' });
1160
+
1161
+ const interactionTrace = await interactionEngine.runInteraction(
1162
+ {
1163
+ learnerId,
1164
+ personaId: persona,
1165
+ tutorProfile,
1166
+ learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
1167
+ topic,
1168
+ scenario: {
1169
+ name: `Interactive Evaluation - ${persona}`,
1170
+ },
1171
+ sessionId,
1172
+ },
1173
+ llmCall,
1174
+ {
1175
+ maxTurns,
1176
+ trace: true,
1177
+ observeInternals: true,
1178
+ }
1179
+ );
1180
+
1181
+ sendEvent('log', { message: `Interaction completed: ${interactionTrace.turns.length} turns`, level: 'success' });
1182
+
1183
+ // Generate sequence diagram and transcript
1184
+ const generateSequenceDiagram = (trace) => {
1185
+ const lines = ['sequenceDiagram'];
1186
+ lines.push(' participant L as Learner');
1187
+ lines.push(' participant T as Tutor');
1188
+
1189
+ for (const turn of trace.turns || []) {
1190
+ const speaker = turn.phase === 'learner' ? 'L' : 'T';
1191
+ const target = turn.phase === 'learner' ? 'T' : 'L';
1192
+ const msg = (turn.externalMessage || '').slice(0, 50).replace(/"/g, "'").replace(/\n/g, ' ');
1193
+ lines.push(` ${speaker}->>+${target}: ${msg}${msg.length >= 50 ? '...' : ''}`);
1194
+ }
1195
+
1196
+ return lines.join('\n');
1197
+ };
1198
+
1199
+ const generateTranscript = (trace) => {
1200
+ const lines = [];
1201
+ for (const turn of trace.turns || []) {
1202
+ const speaker = turn.phase === 'learner' ? 'LEARNER' : 'TUTOR';
1203
+ lines.push(`[Turn ${turn.turnNumber}] ${speaker}:`);
1204
+ lines.push(turn.externalMessage || '');
1205
+ lines.push('');
1206
+ }
1207
+ return lines.join('\n');
1208
+ };
1209
+
1210
+ // Compile result
1211
+ const result = {
1212
+ evalId,
1213
+ scenarioId: `interact-${persona}`,
1214
+ scenarioName: `Interactive Evaluation - ${persona}`,
1215
+ type: 'short_term',
1216
+ learnerId,
1217
+ personaId: persona,
1218
+ tutorProfile,
1219
+ learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
1220
+ learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
1221
+ topic,
1222
+ interaction: interactionTrace,
1223
+ turnCount: interactionTrace.turns.length,
1224
+ turns: interactionTrace.turns,
1225
+ sequenceDiagram: generateSequenceDiagram(interactionTrace),
1226
+ formattedTranscript: generateTranscript(interactionTrace),
1227
+ skipJudge: !runJudge,
1228
+ metrics: {
1229
+ turnCount: interactionTrace.turns.length,
1230
+ totalTokens: (interactionTrace.metrics?.learnerInputTokens || 0) +
1231
+ (interactionTrace.metrics?.learnerOutputTokens || 0) +
1232
+ (interactionTrace.metrics?.tutorInputTokens || 0) +
1233
+ (interactionTrace.metrics?.tutorOutputTokens || 0),
1234
+ learnerTokens: (interactionTrace.metrics?.learnerInputTokens || 0) +
1235
+ (interactionTrace.metrics?.learnerOutputTokens || 0),
1236
+ tutorTokens: (interactionTrace.metrics?.tutorInputTokens || 0) +
1237
+ (interactionTrace.metrics?.tutorOutputTokens || 0),
1238
+ totalLatencyMs: interactionTrace.metrics?.totalLatencyMs || 0,
1239
+ },
1240
+ timestamp: new Date().toISOString(),
1241
+ };
1242
+
1243
+ // Store in database
1244
+ sendEvent('progress', { stage: 'storing', message: 'Saving results' });
1245
+
1246
+ // First create a run entry so it appears in History with "Interact" filter
1247
+ let runId = null;
1248
+ try {
1249
+ const runData = evaluationStore.createRun({
1250
+ description: `Interact: ${persona} → ${tutorProfile}`,
1251
+ totalScenarios: 1,
1252
+ metadata: {
1253
+ runType: 'interaction',
1254
+ profiles: [tutorProfile],
1255
+ personaId: persona,
1256
+ learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
1257
+ topic,
1258
+ fastMode: !runJudge,
1259
+ },
1260
+ });
1261
+ runId = runData.id;
1262
+ sendEvent('log', { message: `Created run entry: ${runId}`, level: 'info' });
1263
+ } catch (e) {
1264
+ sendEvent('log', { message: `Run entry warning: ${e.message}`, level: 'warning' });
1265
+ }
1266
+
1267
+ // Now store the interaction evaluation details
1268
+ try {
1269
+ result.runId = runId;
1270
+ evaluationStore.storeInteractionEval(result);
1271
+ sendEvent('log', { message: `Stored in database: ${evalId}`, level: 'success' });
1272
+
1273
+ // Mark the run as completed (don't use completeRun which checks evaluation_results table)
1274
+ if (runId) {
1275
+ evaluationStore.updateRun(runId, {
1276
+ status: 'completed',
1277
+ totalTests: result.metrics?.turnCount || 1,
1278
+ completedAt: new Date().toISOString(),
1279
+ });
1280
+ }
1281
+ } catch (e) {
1282
+ sendEvent('log', { message: `Database storage warning: ${e.message}`, level: 'warning' });
1283
+ }
1284
+
1285
+ // Send turn-by-turn summary
1286
+ for (let i = 0; i < interactionTrace.turns.length; i++) {
1287
+ const turn = interactionTrace.turns[i];
1288
+ sendEvent('turn', {
1289
+ turnNumber: turn.turnNumber,
1290
+ phase: turn.phase,
1291
+ message: turn.externalMessage?.slice(0, 100) + (turn.externalMessage?.length > 100 ? '...' : ''),
1292
+ });
1293
+ }
1294
+
1295
+ sendEvent('log', { message: `\n=== Interaction Complete ===`, level: 'success' });
1296
+ sendEvent('log', { message: `Total turns: ${result.metrics.turnCount}`, level: 'info' });
1297
+ sendEvent('log', { message: `Total tokens: ${result.metrics.totalTokens}`, level: 'info' });
1298
+
1299
+ // Send final result
1300
+ sendEvent('result', {
1301
+ evalId: result.evalId,
1302
+ scenarioName: result.scenarioName,
1303
+ persona: result.personaId,
1304
+ tutorProfile: result.tutorProfile,
1305
+ learnerArchitecture: result.learnerArchitecture,
1306
+ turnCount: result.metrics.turnCount,
1307
+ totalTokens: result.metrics.totalTokens,
1308
+ learnerTokens: result.metrics.learnerTokens,
1309
+ tutorTokens: result.metrics.tutorTokens,
1310
+ latencyMs: result.metrics.totalLatencyMs,
1311
+ passed: true, // No judge score yet
1312
+ overallScore: null,
1313
+ });
1314
+
1315
+ sendEvent('complete', {
1316
+ success: true,
1317
+ evalId: result.evalId,
1318
+ });
1319
+
1320
+ unregisterStream(streamId);
1321
+ res.end();
1322
+ } catch (error) {
1323
+ console.error('[InteractStream] Error:', error);
1324
+ sendEvent('log', { message: `Error: ${error.message}`, level: 'error' });
1325
+ sendEvent('error', { error: error.message });
1326
+ unregisterStream(streamId);
1327
+ res.end();
1328
+ }
1329
+ });
1330
+
1331
+ // ============================================================================
1332
+ // Results Endpoints
1333
+ // ============================================================================
1334
+
1335
+ /**
1336
+ * List previous evaluation runs
1337
+ * GET /api/eval/runs
1338
+ * Query params: limit (default 20)
1339
+ */
1340
+ router.get('/runs', (req, res) => {
1341
+ try {
1342
+ const limit = parseInt(req.query.limit) || 20;
1343
+ const runs = evaluationStore.listRuns({ limit });
1344
+
1345
+ // Also include interaction evals in the runs list
1346
+ const interactionEvals = evaluationStore.listInteractionEvals({ limit });
1347
+ const interactionRuns = interactionEvals.map(e => ({
1348
+ id: e.evalId,
1349
+ description: e.scenarioName || 'Interaction Evaluation',
1350
+ status: 'completed',
1351
+ createdAt: e.createdAt,
1352
+ totalScenarios: 1,
1353
+ totalTests: e.turnCount || 1,
1354
+ type: 'interaction',
1355
+ metadata: JSON.stringify({
1356
+ runType: 'interaction',
1357
+ profiles: [e.tutorProfile || 'default'],
1358
+ scenarioNames: [e.scenarioName],
1359
+ learnerProfile: e.learnerProfile,
1360
+ personaId: e.personaId,
1361
+ }),
1362
+ }));
1363
+
1364
+ // Merge and sort by createdAt descending
1365
+ const allRuns = [...runs, ...interactionRuns].sort((a, b) =>
1366
+ new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime()
1367
+ ).slice(0, limit);
1368
+
1369
+ res.json({ success: true, runs: allRuns });
1370
+ } catch (error) {
1371
+ console.error('[EvalRoutes] List runs error:', error);
1372
+ res.status(500).json({ error: 'Failed to list runs' });
1373
+ }
1374
+ });
1375
+
1376
+ /**
1377
+ * Find incomplete (stuck) evaluation runs
1378
+ * GET /api/eval/runs-incomplete
1379
+ * Query params: olderThanMinutes (default 30)
1380
+ */
1381
+ router.get('/runs-incomplete', (req, res) => {
1382
+ try {
1383
+ const olderThanMinutes = parseInt(req.query.olderThanMinutes) || 30;
1384
+ const runs = evaluationStore.findIncompleteRuns({ olderThanMinutes });
1385
+ res.json({ success: true, runs, found: runs.length });
1386
+ } catch (error) {
1387
+ console.error('[EvalRoutes] Find incomplete runs error:', error);
1388
+ res.status(500).json({ error: 'Failed to find incomplete runs' });
1389
+ }
1390
+ });
1391
+
1392
+ /**
1393
+ * Auto-complete all stale runs
1394
+ * POST /api/eval/runs-auto-complete
1395
+ * Body: { olderThanMinutes: 30, dryRun: false }
1396
+ */
1397
+ router.post('/runs-auto-complete', (req, res) => {
1398
+ try {
1399
+ const { olderThanMinutes = 30, dryRun = false } = req.body;
1400
+ const result = evaluationStore.autoCompleteStaleRuns({ olderThanMinutes, dryRun });
1401
+ res.json({ success: true, ...result });
1402
+ } catch (error) {
1403
+ console.error('[EvalRoutes] Auto-complete runs error:', error);
1404
+ res.status(500).json({ error: 'Failed to auto-complete runs', details: error.message });
1405
+ }
1406
+ });
1407
+
1408
+ /**
1409
+ * Get results for a specific run
1410
+ * GET /api/eval/runs/:runId
1411
+ */
1412
+ router.get('/runs/:runId', (req, res) => {
1413
+ try {
1414
+ const { runId } = req.params;
1415
+
1416
+ // Check if this is an interaction eval
1417
+ if (runId.startsWith('short-') || runId.startsWith('long-')) {
1418
+ const evalData = evaluationStore.getInteractionEval(runId);
1419
+ if (!evalData) {
1420
+ return res.status(404).json({ error: 'Interaction evaluation not found' });
1421
+ }
1422
+
1423
+ // Format as a run with results for the existing frontend
1424
+ return res.json({
1425
+ success: true,
1426
+ type: 'interaction',
1427
+ run: {
1428
+ id: evalData.evalId,
1429
+ description: evalData.scenarioName || 'Interaction Evaluation',
1430
+ status: 'completed',
1431
+ createdAt: evalData.createdAt,
1432
+ },
1433
+ stats: {
1434
+ totalTests: 1,
1435
+ avgScore: evalData.judgeOverallScore,
1436
+ },
1437
+ results: [{
1438
+ scenarioId: evalData.scenarioId,
1439
+ scenarioName: evalData.scenarioName,
1440
+ profileName: evalData.tutorProfile || 'default',
1441
+ tutorProfile: evalData.tutorProfile || 'default',
1442
+ model: `${evalData.turnCount} turns`,
1443
+ passed: evalData.judgeOverallScore >= 3,
1444
+ overallScore: evalData.judgeOverallScore,
1445
+ overall_score: evalData.judgeOverallScore,
1446
+ inputTokens: evalData.learnerTokens || 0,
1447
+ outputTokens: evalData.tutorTokens || 0,
1448
+ latencyMs: evalData.latencyMs || 0,
1449
+ latency_ms: evalData.latencyMs || 0,
1450
+ isInteraction: true,
1451
+ interactionEvalId: evalData.evalId,
1452
+ // dialogueId links to the dialogue log viewer
1453
+ dialogueId: evalData.evalId,
1454
+ // Include judgeEvaluation for dimension score extraction in History tab
1455
+ judgeEvaluation: evalData.judgeEvaluation,
1456
+ }],
1457
+ // Include full interaction data for display
1458
+ interaction: {
1459
+ evalId: evalData.evalId,
1460
+ scenarioName: evalData.scenarioName,
1461
+ turnCount: evalData.turnCount,
1462
+ turns: evalData.turns,
1463
+ sequenceDiagram: evalData.sequenceDiagram,
1464
+ formattedTranscript: evalData.formattedTranscript,
1465
+ totalTokens: evalData.totalTokens,
1466
+ learnerTokens: evalData.learnerTokens,
1467
+ tutorTokens: evalData.tutorTokens,
1468
+ latencyMs: evalData.latencyMs,
1469
+ judgeOverallScore: evalData.judgeOverallScore,
1470
+ judgeEvaluation: evalData.judgeEvaluation,
1471
+ },
1472
+ status: 'completed',
1473
+ description: evalData.scenarioName,
1474
+ scenarioNames: [evalData.scenarioName],
1475
+ });
1476
+ }
1477
+
1478
+ // Regular run
1479
+ const result = evaluationRunner.getRunResults(runId);
1480
+
1481
+ // Check if this is an interaction run (created from Interact tab)
1482
+ const runMetadata = result.run?.metadata ?
1483
+ (typeof result.run.metadata === 'string' ? JSON.parse(result.run.metadata) : result.run.metadata)
1484
+ : {};
1485
+
1486
+ if (runMetadata.runType === 'interaction') {
1487
+ // Look up the interaction eval data by runId
1488
+ const interactionEval = evaluationStore.getInteractionEvalByRunId(runId);
1489
+ if (interactionEval) {
1490
+ return res.json({
1491
+ success: true,
1492
+ type: 'interaction',
1493
+ run: result.run,
1494
+ stats: {
1495
+ totalTests: 1,
1496
+ avgScore: interactionEval.judgeOverallScore,
1497
+ },
1498
+ results: [{
1499
+ scenarioId: interactionEval.scenarioId,
1500
+ scenarioName: interactionEval.scenarioName,
1501
+ profileName: interactionEval.tutorProfile || 'default',
1502
+ tutorProfile: interactionEval.tutorProfile || 'default',
1503
+ model: `${interactionEval.turnCount} turns`,
1504
+ passed: interactionEval.judgeOverallScore >= 3,
1505
+ overallScore: interactionEval.judgeOverallScore,
1506
+ overall_score: interactionEval.judgeOverallScore,
1507
+ inputTokens: interactionEval.learnerTokens || 0,
1508
+ outputTokens: interactionEval.tutorTokens || 0,
1509
+ latencyMs: interactionEval.latencyMs || 0,
1510
+ latency_ms: interactionEval.latencyMs || 0,
1511
+ isInteraction: true,
1512
+ interactionEvalId: interactionEval.evalId,
1513
+ dialogueId: interactionEval.evalId,
1514
+ judgeEvaluation: interactionEval.judgeEvaluation,
1515
+ }],
1516
+ interaction: {
1517
+ evalId: interactionEval.evalId,
1518
+ scenarioName: interactionEval.scenarioName,
1519
+ turnCount: interactionEval.turnCount,
1520
+ turns: interactionEval.turns,
1521
+ sequenceDiagram: interactionEval.sequenceDiagram,
1522
+ formattedTranscript: interactionEval.formattedTranscript,
1523
+ totalTokens: interactionEval.totalTokens,
1524
+ learnerTokens: interactionEval.learnerTokens,
1525
+ tutorTokens: interactionEval.tutorTokens,
1526
+ latencyMs: interactionEval.latencyMs,
1527
+ judgeOverallScore: interactionEval.judgeOverallScore,
1528
+ judgeEvaluation: interactionEval.judgeEvaluation,
1529
+ },
1530
+ status: 'completed',
1531
+ description: result.run?.description || interactionEval.scenarioName,
1532
+ scenarioNames: [interactionEval.scenarioName],
1533
+ metadata: runMetadata,
1534
+ });
1535
+ }
1536
+ }
1537
+
1538
+ // Extract scenario names from results for display
1539
+ const scenarioNames = [...new Set(
1540
+ (result.results || [])
1541
+ .map((r) => r.scenarioName)
1542
+ .filter(Boolean)
1543
+ )].sort();
1544
+
1545
+ // Include key run properties at top level for easier frontend access
1546
+ res.json({
1547
+ success: true,
1548
+ ...result,
1549
+ // Flatten these for easier access in UI
1550
+ status: result.run?.status,
1551
+ description: result.run?.description,
1552
+ scenarioNames,
1553
+ });
1554
+ } catch (error) {
1555
+ console.error('[EvalRoutes] Get run error:', error);
1556
+ res.status(500).json({ error: 'Failed to get run results', details: error.message });
1557
+ }
1558
+ });
1559
+
1560
+ /**
1561
+ * Get report for a run
1562
+ * GET /api/eval/runs/:runId/report
1563
+ */
1564
+ router.get('/runs/:runId/report', (req, res) => {
1565
+ try {
1566
+ const report = evaluationRunner.generateReport(req.params.runId);
1567
+
1568
+ // Check if client wants plain text
1569
+ if (req.accepts('text/plain')) {
1570
+ res.type('text/plain').send(report);
1571
+ } else {
1572
+ res.json({ success: true, report });
1573
+ }
1574
+ } catch (error) {
1575
+ console.error('[EvalRoutes] Get report error:', error);
1576
+ res.status(500).json({ error: 'Failed to generate report', details: error.message });
1577
+ }
1578
+ });
1579
+
1580
+ // ============================================================================
1581
+ // Dialogue Log Endpoints
1582
+ // ============================================================================
1583
+
1584
+ /**
1585
+ * List available log dates
1586
+ * GET /api/eval/logs/dates
1587
+ */
1588
+ router.get('/logs/dates', (req, res) => {
1589
+ try {
1590
+ const dates = dialogueLogService.listLogDates();
1591
+ res.json({ success: true, dates });
1592
+ } catch (error) {
1593
+ console.error('[EvalRoutes] List log dates error:', error);
1594
+ res.status(500).json({ error: 'Failed to list log dates' });
1595
+ }
1596
+ });
1597
+
1598
+ /**
1599
+ * Get dialogues for a specific date
1600
+ * GET /api/eval/logs/:date
1601
+ * Query params: limit (default 10), offset (default 0)
1602
+ */
1603
+ router.get('/logs/:date', (req, res) => {
1604
+ try {
1605
+ const { date } = req.params;
1606
+ const limit = parseInt(req.query.limit) || 10;
1607
+ const offset = parseInt(req.query.offset) || 0;
1608
+
1609
+ const result = dialogueLogService.getDialogues({ date, limit, offset });
1610
+ res.json({ success: true, ...result });
1611
+ } catch (error) {
1612
+ console.error('[EvalRoutes] Get dialogues error:', error);
1613
+ res.status(500).json({ error: 'Failed to get dialogues' });
1614
+ }
1615
+ });
1616
+
1617
/**
 * Get a specific dialogue by dialogueId
 * GET /api/eval/logs/dialogue/:dialogueId
 *
 * Interaction-eval dialogues (ids prefixed "short-"/"long-") are stored in
 * evaluationStore rather than the dialogue log; they are re-shaped here into
 * the entry format the DialogueFlowDiagram component expects. All other ids
 * fall through to a regular dialogueLogService lookup.
 */
router.get('/logs/dialogue/:dialogueId', (req, res) => {
  try {
    const { dialogueId } = req.params;

    // Check if this is an interaction eval dialogue (starts with short- or long-)
    if (dialogueId.startsWith('short-') || dialogueId.startsWith('long-')) {
      const interactionEval = evaluationStore.getInteractionEval(dialogueId);
      if (interactionEval) {
        // Format interaction eval as entries for DialogueFlowDiagram
        // Expand each turn into action-based entries the diagram expects:
        // internal deliberation steps (ego/superego) first, then the
        // external message for the turn.
        const entries = [];
        let entryIndex = 0;

        for (const turn of interactionEval.turns || []) {
          const isLearner = turn.phase === 'learner';

          // Add internal deliberation steps if present
          if (turn.internalDeliberation && turn.internalDeliberation.length > 0) {
            for (const delib of turn.internalDeliberation) {
              if (delib.role === 'ego') {
                entries.push({
                  index: entryIndex++,
                  action: isLearner ? 'learner_ego_thought' : 'tutor_ego_thought',
                  agent: isLearner ? 'ego' : 'tutor_ego',
                  phase: turn.phase,
                  message: delib.content,
                  timestamp: turn.timestamp,
                });
              } else if (delib.role === 'superego') {
                entries.push({
                  index: entryIndex++,
                  action: isLearner ? 'learner_superego_critique' : 'tutor_superego_critique',
                  agent: isLearner ? 'superego' : 'tutor_superego',
                  phase: turn.phase,
                  message: delib.content,
                  timestamp: turn.timestamp,
                });
              }
              // NOTE(review): deliberation entries with any other role are
              // silently dropped — confirm that is intended.
            }
          }

          // Add the external message entry
          entries.push({
            index: entryIndex++,
            action: isLearner ? 'learner_input' : 'tutor_response',
            agent: isLearner ? 'ego' : 'tutor_ego',
            phase: turn.phase,
            message: turn.externalMessage,
            timestamp: turn.timestamp,
            turnNumber: turn.turnNumber,
          });
        }

        // Calculate summary stats
        const learnerTurns = (interactionEval.turns || []).filter(t => t.phase === 'learner').length;
        const tutorTurns = (interactionEval.turns || []).filter(t => t.phase === 'tutor').length;
        // NOTE(review): tutorTurns is computed but never used below; the
        // summary maps egoCount←learnerTurns and userCount←turnCount.
        // Confirm whether tutorTurns was meant to feed one of these fields.

        return res.json({
          success: true,
          dialogueId,
          dialogue: {
            dialogueId,
            entries,
            startTime: interactionEval.createdAt,
            isInteractionEval: true,
            scenarioName: interactionEval.scenarioName,
            personaId: interactionEval.personaId,
            judgeEvaluation: interactionEval.judgeEvaluation,
            summary: {
              totalTurns: interactionEval.turnCount,
              egoCount: learnerTurns,
              userCount: interactionEval.turnCount,
              superegoCount: 0,
              totalLatencyMs: interactionEval.latencyMs || 0,
              // Token split is an approximation: half in, half out
              // (ceil/floor so the two halves always sum to totalTokens).
              totalInputTokens: Math.floor((interactionEval.totalTokens || 0) / 2),
              totalOutputTokens: Math.ceil((interactionEval.totalTokens || 0) / 2),
              totalCost: 0,
            },
            sequenceDiagram: interactionEval.sequenceDiagram,
            formattedTranscript: interactionEval.formattedTranscript,
            isInteraction: true,
          },
        });
      }
      // Fall through to the regular lookup if no interaction eval was found.
    }

    // Regular dialogue lookup
    const dialogue = dialogueLogService.getDialogueById(dialogueId);

    if (!dialogue) {
      return res.status(404).json({ error: 'Dialogue not found' });
    }

    res.json({ success: true, dialogue, dialogueId });
  } catch (error) {
    console.error('[EvalRoutes] Get dialogue by ID error:', error);
    res.status(500).json({ error: 'Failed to get dialogue' });
  }
});
1720
+
1721
/**
 * Get a specific dialogue by index
 * GET /api/eval/logs/:date/:index
 *
 * @param date  - log date (path param), passed through to the log service
 * @param index - numeric position of the dialogue within that date's log
 *
 * Responds 400 for a non-numeric index, 404 when no dialogue exists at
 * that position, 500 on service failure.
 */
router.get('/logs/:date/:index', (req, res) => {
  try {
    const { date, index } = req.params;

    // Fix: previous code called parseInt without a radix and passed NaN
    // through to the service for non-numeric indexes; validate up front.
    const idx = Number.parseInt(index, 10);
    if (Number.isNaN(idx)) {
      return res.status(400).json({ error: 'Invalid dialogue index' });
    }

    const dialogue = dialogueLogService.getDialogueByIndex(date, idx);

    if (!dialogue) {
      return res.status(404).json({ error: 'Dialogue not found' });
    }

    res.json({ success: true, dialogue });
  } catch (error) {
    console.error('[EvalRoutes] Get dialogue error:', error);
    res.status(500).json({ error: 'Failed to get dialogue' });
  }
});
1740
+
1741
/**
 * Get log statistics
 * GET /api/eval/logs-stats
 * (Doc fix: the route is "logs-stats", not "logs/stats" as previously
 * documented — presumably hyphenated so it is not captured by the
 * parameterized /logs/:date route; confirm.)
 * Query params: startDate, endDate
 */
router.get('/logs-stats', (req, res) => {
  try {
    const { startDate, endDate } = req.query;
    const stats = dialogueLogService.getLogStatistics({ startDate, endDate });
    res.json({ success: true, ...stats });
  } catch (error) {
    console.error('[EvalRoutes] Get log stats error:', error);
    res.status(500).json({ error: 'Failed to get log statistics' });
  }
});
1756
+
1757
+ // ============================================================================
1758
+ // Prompt Endpoints (Read-Only)
1759
+ // ============================================================================
1760
+
1761
/**
 * List available prompts
 * GET /api/eval/prompts
 *
 * Scans PROMPTS_DIR for markdown files and returns name/size/mtime for
 * each; returns an empty list when the directory does not exist.
 */
router.get('/prompts', (req, res) => {
  try {
    if (!fs.existsSync(PROMPTS_DIR)) {
      return res.json({ success: true, prompts: [] });
    }

    const prompts = [];
    for (const filename of fs.readdirSync(PROMPTS_DIR)) {
      if (!filename.endsWith('.md')) continue;
      const stat = fs.statSync(path.join(PROMPTS_DIR, filename));
      prompts.push({
        name: filename.replace('.md', ''),
        filename,
        size: stat.size,
        modified: stat.mtime.toISOString(),
      });
    }

    res.json({ success: true, prompts });
  } catch (error) {
    console.error('[EvalRoutes] List prompts error:', error);
    res.status(500).json({ error: 'Failed to list prompts' });
  }
});
1790
+
1791
/**
 * Get prompt content (read-only)
 * GET /api/eval/prompts/:name
 *
 * Security fix: :name is user-controlled (Express decodes %2f, so it can
 * contain "/" and ".." segments). The old code joined it straight into
 * PROMPTS_DIR, allowing reads outside the prompts directory. Names that
 * are not a plain basename are now rejected with 404.
 */
router.get('/prompts/:name', (req, res) => {
  try {
    const filename = req.params.name.endsWith('.md')
      ? req.params.name
      : `${req.params.name}.md`;

    // Reject anything containing path separators / traversal segments.
    if (path.basename(filename) !== filename) {
      return res.status(404).json({ error: 'Prompt not found' });
    }

    const filePath = path.join(PROMPTS_DIR, filename);

    if (!fs.existsSync(filePath)) {
      return res.status(404).json({ error: 'Prompt not found' });
    }

    const content = fs.readFileSync(filePath, 'utf8');
    const stats = fs.statSync(filePath);

    res.json({
      success: true,
      prompt: {
        name: req.params.name,
        filename,
        content,
        size: stats.size,
        modified: stats.mtime.toISOString(),
      },
    });
  } catch (error) {
    console.error('[EvalRoutes] Get prompt error:', error);
    res.status(500).json({ error: 'Failed to get prompt' });
  }
});
1824
+
1825
/**
 * Generate prompt improvement recommendations (read-only)
 * POST /api/eval/prompts/recommend
 *
 * Body: {
 *   runId: "run-123",    // Get results from a run
 *   profile: "budget",   // Or run fresh tests with this profile
 *   scenarios: "all"     // Scenarios to test (if running fresh)
 * }
 *
 * Returns recommendations for prompt improvements.
 * Does NOT write to disk - web clients can display these for review.
 *
 * Fix: per-scenario failures during a fresh run were swallowed by an
 * empty catch; they are now logged so skipped tests are diagnosable.
 */
router.post('/prompts/recommend', async (req, res) => {
  try {
    const { runId, profile, scenarios = 'all' } = req.body;

    let results = [];
    let profileName = profile || 'unknown';

    if (runId) {
      // Get results from existing run
      const runResults = evaluationStore.getResults(runId);
      if (!runResults || runResults.length === 0) {
        return res.status(404).json({ error: 'Run not found or has no results' });
      }
      results = runResults;
      profileName = runResults[0]?.profileName || profileName;
    } else if (profile) {
      // Run fresh evaluations
      const allScenarios = tutorApi.listScenarios();
      const scenariosToRun = scenarios === 'all'
        ? allScenarios
        : allScenarios.filter(s => scenarios.includes(s.id));

      for (const scenario of scenariosToRun) {
        try {
          const config = { profileName: profile, label: profile };
          const result = await evaluationRunner.quickTest(config, {
            scenarioId: scenario.id,
            verbose: false,
            skipRubricEval: false, // Need rubric for recommendations
          });
          results.push(result);
        } catch (e) {
          // Best-effort: one failed scenario shouldn't abort the batch,
          // but don't swallow the failure silently either.
          console.warn(`[EvalRoutes] Recommend: scenario ${scenario.id} failed, skipping:`, e.message);
        }
      }
    } else {
      return res.status(400).json({ error: 'Either runId or profile is required' });
    }

    if (results.length === 0) {
      return res.status(400).json({ error: 'No evaluation results available' });
    }

    // Generate recommendations
    const recommendations = await promptRecommendationService.generateRecommendations({
      results,
      profileName,
    });

    res.json({
      success: true,
      ...recommendations,
      // Explicitly note this is read-only
      readOnly: true,
      note: 'Recommendations are for review only. Use CLI to apply changes.',
    });
  } catch (error) {
    console.error('[EvalRoutes] Recommend prompts error:', error);
    res.status(500).json({ error: 'Failed to generate recommendations', details: error.message });
  }
});
1899
+
1900
+ // ============================================================================
1901
+ // Streaming Evaluation Endpoints
1902
+ // ============================================================================
1903
+
1904
/**
 * Run evaluation with SSE streaming for real-time progress
 * GET /api/eval/stream/run
 * Query params: profiles, scenarios, skipRubric
 *
 * Emits SSE events: start, log, progress, result, error, complete.
 * A keep-alive comment is written every 15s to defeat proxy timeouts.
 */
router.get('/stream/run', async (req, res) => {
  // Set up SSE
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    Connection: 'keep-alive',
  });

  // Serialize one SSE event frame (event: <type> / data: <json>).
  const sendEvent = (type, data) => {
    res.write(`event: ${type}\n`);
    res.write(`data: ${JSON.stringify(data)}\n\n`);
  };

  // Keep-alive to prevent connection timeout
  const keepAlive = setInterval(() => {
    res.write(': keepalive\n\n');
  }, 15000);

  // Register stream for crash protection
  const streamId = registerStream(res, keepAlive);

  // Clean up on close
  // NOTE(review): on client disconnect only the timer/registration are torn
  // down — the test loop below keeps running and keeps calling res.write on
  // the closed response. Confirm whether the loop should also abort.
  req.on('close', () => {
    clearInterval(keepAlive);
    unregisterStream(streamId);
  });

  try {
    // Parse query params; defaults: profiles=[budget], scenarios=all.
    const profiles = req.query.profiles
      ? req.query.profiles.split(',')
      : ['budget'];
    const scenarios = req.query.scenarios === 'all' || !req.query.scenarios
      ? 'all'
      : req.query.scenarios.split(',');
    const skipRubric = req.query.skipRubric === 'true';
    const outputSize = req.query.outputSize || 'normal';

    // Get all scenarios to run
    const allScenarios = tutorApi.listScenarios();
    const scenariosToRun = scenarios === 'all'
      ? allScenarios
      : allScenarios.filter(s => scenarios.includes(s.id));

    const totalTests = profiles.length * scenariosToRun.length;
    let completedTests = 0;

    sendEvent('start', {
      profiles,
      scenarioCount: scenariosToRun.length,
      totalTests,
      skipRubric,
      outputSize,
      timestamp: new Date().toISOString(),
    });

    sendEvent('log', { message: `Starting batch run: ${profiles.length} profiles × ${scenariosToRun.length} scenarios = ${totalTests} tests`, level: 'info' });
    sendEvent('log', { message: `Fast mode (skip rubric): ${skipRubric}`, level: 'info' });
    sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });

    const results = [];

    // Tests run strictly sequentially: profile by profile, scenario by
    // scenario, so progress events arrive in a deterministic order.
    for (const profileName of profiles) {
      sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });

      for (const scenario of scenariosToRun) {
        completedTests++;

        sendEvent('progress', {
          current: completedTests,
          total: totalTests,
          profile: profileName,
          scenario: scenario.name,
          percentage: Math.round((completedTests / totalTests) * 100),
        });

        sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });

        try {
          const config = { profileName, label: profileName };

          // Create log callback for this test (forwards runner output
          // to the client, indented under the test's header line).
          const onLog = (message, level = 'info') => {
            sendEvent('log', { message: `  ${message}`, level, timestamp: new Date().toISOString() });
          };

          const result = await evaluationRunner.quickTest(config, {
            scenarioId: scenario.id,
            skipRubricEval: skipRubric,
            outputSize,
            verbose: false,
            onLog,
          });

          results.push(result);

          // success !== false (rather than === true) treats an absent
          // success flag as a pass.
          const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
          const status = result.success !== false ? '✓' : '✗';
          sendEvent('log', { message: `  ${status} Score: ${scoreStr} (${result.latencyMs}ms)`, level: result.success !== false ? 'success' : 'warning' });

          sendEvent('result', {
            profile: profileName,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            passed: result.success,
            score: result.overallScore,
            latencyMs: result.latencyMs,
            inputTokens: result.inputTokens,
            outputTokens: result.outputTokens,
            totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
          });
        } catch (e) {
          // One failed test is reported but does not stop the batch.
          sendEvent('log', { message: `  ✗ Error: ${e.message}`, level: 'error' });
          sendEvent('error', {
            profile: profileName,
            scenarioId: scenario.id,
            error: e.message,
          });
        }
      }
    }

    // Calculate summary
    const successCount = results.filter(r => r.success !== false).length;
    const scores = results.filter(r => r.overallScore != null).map(r => r.overallScore);
    const avgScore = scores.length > 0
      ? scores.reduce((a, b) => a + b, 0) / scores.length
      : null;

    sendEvent('log', { message: `\n=== Batch Complete ===`, level: 'success' });
    sendEvent('log', { message: `Total: ${totalTests}, Passed: ${successCount}, Avg Score: ${avgScore?.toFixed(1) || 'N/A'}`, level: 'info' });

    sendEvent('complete', {
      totalTests,
      successfulTests: successCount,
      averageScore: avgScore,
    });

    unregisterStream(streamId);
    res.end();
  } catch (error) {
    // Fatal (pre-loop or summary) failure: report it on the stream and close.
    sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
    sendEvent('error', { error: error.message });
    unregisterStream(streamId);
    res.end();
  }
});
2055
+
2056
+ // ============================================================================
2057
+ // Trajectory and Improvement Cycle Endpoints
2058
+ // ============================================================================
2059
+
2060
/**
 * Get improvement trajectory for a profile
 * GET /api/eval/trajectory/:profile
 * Query params: last (number of cycles, default 5), all (boolean)
 *
 * Reads data/improvement-trajectories/<profile>.json; responds with an
 * empty cycle list (not 404) when no history file exists.
 *
 * Fix: parseInt now passes an explicit radix.
 */
router.get('/trajectory/:profile', (req, res) => {
  try {
    const { profile } = req.params;
    // NaN (missing/invalid "last") and 0 both fall back to 5 via ||.
    const last = Number.parseInt(req.query.last, 10) || 5;
    const all = req.query.all === 'true';

    const trajectoryDir = path.join(process.cwd(), 'data', 'improvement-trajectories');
    const trajectoryFile = path.join(trajectoryDir, `${profile}.json`);

    if (!fs.existsSync(trajectoryFile)) {
      return res.json({
        success: true,
        profile,
        cycles: [],
        message: 'No improvement history found for this profile',
      });
    }

    const data = JSON.parse(fs.readFileSync(trajectoryFile, 'utf8'));
    // Either the full history or just the most recent `last` cycles.
    const cycles = all ? data.cycles : data.cycles.slice(-last);

    res.json({
      success: true,
      profile,
      startedAt: data.startedAt,
      lastUpdated: data.lastUpdated,
      totalCycles: data.cycles.length,
      cycles,
    });
  } catch (error) {
    console.error('[EvalRoutes] Get trajectory error:', error);
    res.status(500).json({ error: 'Failed to get trajectory', details: error.message });
  }
});
2099
+
2100
/**
 * Compare two evaluation runs
 * GET /api/eval/compare-runs/:runId1/:runId2
 *
 * Computes per-dimension and overall averages for each run, then the
 * deltas run2 - run1 (so positive deltas mean run2 improved).
 */
router.get('/compare-runs/:runId1/:runId2', (req, res) => {
  try {
    const { runId1, runId2 } = req.params;

    const results1 = evaluationStore.getResults(runId1);
    const results2 = evaluationStore.getResults(runId2);

    if (!results1 || results1.length === 0) {
      return res.status(404).json({ error: `Run ${runId1} not found` });
    }
    if (!results2 || results2.length === 0) {
      return res.status(404).json({ error: `Run ${runId2} not found` });
    }

    // Calculate averages for each run.
    // NOTE(review): this reads snake_case row fields (overall_score,
    // score_<dim>) while the /trends endpoint reads camelCase
    // (overallScore, scores[dim]) from the same store — confirm both
    // field layouts actually exist on result rows.
    const calcAverages = (results) => {
      const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
      const dimScores = {};
      dims.forEach(d => { dimScores[d] = []; });

      let totalScore = 0;
      let scoreCount = 0;

      // Collect overall and per-dimension scores, ignoring null/undefined
      // entries so unscored (fast-mode) tests don't drag averages down.
      results.forEach(r => {
        if (r.overall_score != null) {
          totalScore += r.overall_score;
          scoreCount++;
        }
        dims.forEach(d => {
          const score = r[`score_${d}`];
          if (score != null) {
            dimScores[d].push(score);
          }
        });
      });

      // Average each dimension; null when no test had that dimension scored.
      const dimAverages = {};
      dims.forEach(d => {
        const scores = dimScores[d];
        dimAverages[d] = scores.length > 0
          ? scores.reduce((a, b) => a + b, 0) / scores.length
          : null;
      });

      return {
        overallScore: scoreCount > 0 ? totalScore / scoreCount : null,
        dimensions: dimAverages,
        testCount: results.length,
        successCount: results.filter(r => r.success).length,
      };
    };

    const avg1 = calcAverages(results1);
    const avg2 = calcAverages(results2);

    // Calculate deltas (run2 minus run1); null when either side is unscored.
    const deltas = {
      overallScore: avg2.overallScore != null && avg1.overallScore != null
        ? avg2.overallScore - avg1.overallScore
        : null,
      dimensions: {},
    };

    Object.keys(avg1.dimensions).forEach(dim => {
      if (avg1.dimensions[dim] != null && avg2.dimensions[dim] != null) {
        deltas.dimensions[dim] = avg2.dimensions[dim] - avg1.dimensions[dim];
      } else {
        deltas.dimensions[dim] = null;
      }
    });

    res.json({
      success: true,
      run1: { id: runId1, ...avg1 },
      run2: { id: runId2, ...avg2 },
      deltas,
      improved: deltas.overallScore != null && deltas.overallScore > 0,
    });
  } catch (error) {
    console.error('[EvalRoutes] Compare runs error:', error);
    res.status(500).json({ error: 'Failed to compare runs', details: error.message });
  }
});
2187
+
2188
/**
 * Get dimension statistics across all runs for trend analysis
 * GET /api/eval/trends
 * Query params: profile, limit (default 50 individual results)
 *
 * Returns individual test results (not aggregated per run) for accurate trend visualization.
 * Each point represents a single evaluation, not an averaged run.
 */
router.get('/trends', (req, res) => {
  try {
    const { profile } = req.query;
    const limit = parseInt(req.query.limit) || 50;

    // Get recent runs (fetch 3x the limit to account for fast-mode runs being filtered)
    // Many runs may be --fast (no AI scoring), so we need to fetch more to get enough scored results
    const runs = evaluationStore.listRuns({ limit: limit * 3 });

    // Helper to extract numeric score from potentially complex score objects:
    // accepts either a bare number or an object of shape { score: number };
    // anything else (including NaN) normalizes to null.
    const extractNumericScore = (scoreVal) => {
      if (scoreVal == null) return null;
      if (typeof scoreVal === 'number') return isNaN(scoreVal) ? null : scoreVal;
      if (typeof scoreVal === 'object' && scoreVal.score != null) {
        const s = scoreVal.score;
        return typeof s === 'number' && !isNaN(s) ? s : null;
      }
      return null;
    };

    // Collect individual results from all runs
    const allResults = [];
    const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];

    for (const run of runs) {
      const results = evaluationStore.getResults(run.id);

      // Use metadata.runType if available, fallback to parsing description
      // keywords (matrix / auto-improve / compare / quick) for older runs.
      const metadata = run.metadata || {};
      let runType = metadata.runType || 'eval';
      if (runType === 'eval' && run.description) {
        const desc = run.description.toLowerCase();
        if (desc.includes('matrix')) runType = 'matrix';
        else if (desc.includes('auto-improve')) runType = 'auto';
        else if (desc.includes('compare')) runType = 'compare';
        else if (desc.includes('quick')) runType = 'quick';
      }

      for (const r of results) {
        // Filter by profile if specified
        if (profile && r.profileName !== profile) continue;

        // Extract dimension scores
        const dimScores = {};
        dims.forEach(d => {
          dimScores[d] = extractNumericScore(r.scores?.[d]);
        });

        allResults.push({
          runId: run.id,
          resultId: r.id,
          createdAt: r.createdAt || run.createdAt,
          description: run.description,
          runType,
          profileName: r.profileName,
          scenarioName: r.scenarioName,
          overallScore: extractNumericScore(r.overallScore),
          dimensions: dimScores,
          // Include testCount for the table display (how many tests in this run)
          testCount: results.length,
          // Include profiles array for compatibility with table display
          profiles: [r.profileName].filter(Boolean),
        });
      }
    }

    // Sort by createdAt (oldest first for charting) and limit:
    // slice(-limit) keeps the NEWEST `limit` points while preserving
    // chronological order for the chart.
    allResults.sort((a, b) => new Date(a.createdAt).getTime() - new Date(b.createdAt).getTime());
    const trends = allResults.slice(-limit);

    res.json({
      success: true,
      profile: profile || 'all',
      trends,
      totalResults: allResults.length,
    });
  } catch (error) {
    console.error('[EvalRoutes] Get trends error:', error);
    res.status(500).json({ error: 'Failed to get trends', details: error.message });
  }
});
2277
+
2278
// ============================================================================
// Documentation Endpoints
// ============================================================================

// Path to evaluation documentation directory.
// Both paths resolve against process.cwd(), so they depend on where the
// server is launched from, not on this file's location.
const EVAL_DOCS_DIR = path.join(process.cwd(), 'markdown', 'eval');
// Path to research documentation directory (reachable from /docs/:name
// via the "research:" prefix).
const RESEARCH_DOCS_DIR = path.join(process.cwd(), 'docs', 'research');
2286
+
2287
/**
 * List available evaluation documentation files
 * GET /api/eval/docs
 *
 * Scans EVAL_DOCS_DIR for markdown files; derives a display title from
 * each filename (hyphens → spaces, words capitalized) and returns the
 * list sorted by that title.
 */
router.get('/docs', (req, res) => {
  try {
    if (!fs.existsSync(EVAL_DOCS_DIR)) {
      return res.json({ success: true, docs: [] });
    }

    const docs = [];
    for (const filename of fs.readdirSync(EVAL_DOCS_DIR)) {
      if (!filename.endsWith('.md')) continue;
      const stat = fs.statSync(path.join(EVAL_DOCS_DIR, filename));
      // Turn "my-doc-name" into "My Doc Name" for display.
      const name = filename.replace('.md', '');
      const title = name
        .replace(/-/g, ' ')
        .replace(/\b\w/g, l => l.toUpperCase());
      docs.push({
        name,
        filename,
        title,
        size: stat.size,
        modified: stat.mtime.toISOString(),
      });
    }
    docs.sort((a, b) => a.title.localeCompare(b.title));

    res.json({ success: true, docs });
  } catch (error) {
    console.error('[EvalRoutes] List docs error:', error);
    res.status(500).json({ error: 'Failed to list docs' });
  }
});
2323
+
2324
/**
 * Get documentation file content
 * GET /api/eval/docs/:name
 *
 * Supports "research:" prefix to load from docs/research/ directory
 * e.g., /api/eval/docs/research:PAPER-DRAFT-RECOGNITION-TUTORING
 *
 * Security fix: :name is user-controlled (Express decodes %2f, so it can
 * carry "/" and ".." after decoding). The old code joined it straight
 * into the docs directory, allowing arbitrary file reads. Names that are
 * not a plain basename are now rejected with 404.
 */
router.get('/docs/:name', (req, res) => {
  try {
    let docName = req.params.name;
    let docsDir = EVAL_DOCS_DIR;

    // Check for research: prefix to load from docs/research/
    if (docName.startsWith('research:')) {
      docName = docName.substring('research:'.length);
      docsDir = RESEARCH_DOCS_DIR;
    }

    const filename = docName.endsWith('.md')
      ? docName
      : `${docName}.md`;

    // Reject anything containing path separators / traversal segments.
    if (path.basename(filename) !== filename) {
      return res.status(404).json({ error: 'Documentation not found' });
    }

    const filePath = path.join(docsDir, filename);

    if (!fs.existsSync(filePath)) {
      return res.status(404).json({ error: 'Documentation not found' });
    }

    const content = fs.readFileSync(filePath, 'utf8');
    const stats = fs.statSync(filePath);

    // Extract title from first heading or filename
    const titleMatch = content.match(/^#\s+(.+)$/m);
    const title = titleMatch
      ? titleMatch[1]
      : docName.replace(/-/g, ' ').replace(/\b\w/g, l => l.toUpperCase());

    res.json({
      success: true,
      doc: {
        name: req.params.name,
        filename,
        title,
        content,
        size: stats.size,
        modified: stats.mtime.toISOString(),
      },
    });
  } catch (error) {
    console.error('[EvalRoutes] Get doc error:', error);
    res.status(500).json({ error: 'Failed to get documentation' });
  }
});
2376
+
2377
+ // ============================================================================
2378
+ // Monitoring Endpoints
2379
+ // ============================================================================
2380
+
2381
/**
 * Get monitoring summary
 * GET /api/eval/monitor/summary
 */
router.get('/monitor/summary', (req, res) => {
  try {
    res.json({ success: true, ...monitoringService.getMonitoringSummary() });
  } catch (error) {
    console.error('[EvalRoutes] Monitor summary error:', error);
    res.status(500).json({ error: 'Failed to get monitoring summary' });
  }
});

/**
 * Get active sessions
 * GET /api/eval/monitor/sessions
 *
 * Returns the live session list alongside aggregate metrics.
 */
router.get('/monitor/sessions', (req, res) => {
  try {
    res.json({
      success: true,
      sessions: monitoringService.getActiveSessions(),
      aggregate: monitoringService.getAggregateMetrics(),
    });
  } catch (error) {
    console.error('[EvalRoutes] Monitor sessions error:', error);
    res.status(500).json({ error: 'Failed to get active sessions' });
  }
});

/**
 * Get specific session details
 * GET /api/eval/monitor/sessions/:id
 */
router.get('/monitor/sessions/:id', (req, res) => {
  try {
    const session = monitoringService.getSession(req.params.id);
    if (!session) {
      return res.status(404).json({ error: 'Session not found' });
    }
    res.json({ success: true, session });
  } catch (error) {
    console.error('[EvalRoutes] Get session error:', error);
    res.status(500).json({ error: 'Failed to get session' });
  }
});

/**
 * Get alerts
 * GET /api/eval/monitor/alerts
 * Query params: severity, acknowledged, limit
 */
router.get('/monitor/alerts', (req, res) => {
  try {
    const { severity, acknowledged, limit } = req.query;
    // Only forward filters that were actually supplied.
    const options = {};
    if (severity) {
      options.severity = severity;
    }
    if (acknowledged !== undefined) {
      options.acknowledged = acknowledged === 'true';
    }
    if (limit) {
      options.limit = parseInt(limit, 10);
    }
    res.json({ success: true, alerts: monitoringService.getAlerts(options) });
  } catch (error) {
    console.error('[EvalRoutes] Get alerts error:', error);
    res.status(500).json({ error: 'Failed to get alerts' });
  }
});

/**
 * Acknowledge an alert
 * POST /api/eval/monitor/alerts/:id/acknowledge
 */
router.post('/monitor/alerts/:id/acknowledge', (req, res) => {
  try {
    const alert = monitoringService.acknowledgeAlert(req.params.id);
    if (!alert) {
      return res.status(404).json({ error: 'Alert not found' });
    }
    res.json({ success: true, alert });
  } catch (error) {
    console.error('[EvalRoutes] Acknowledge alert error:', error);
    res.status(500).json({ error: 'Failed to acknowledge alert' });
  }
});
2464
+
2465
+ // ============================================================================
2466
+ // Run Completion & Recovery Endpoints
2467
+ // ============================================================================
2468
+
2469
/**
 * Complete an incomplete evaluation run
 * POST /api/eval/runs/:runId/complete
 *
 * Marks a stuck/interrupted run as completed with whatever results exist.
 */
router.post('/runs/:runId/complete', (req, res) => {
  try {
    const outcome = evaluationStore.completeRun(req.params.runId);
    res.json({ success: true, ...outcome });
  } catch (error) {
    console.error('[EvalRoutes] Complete run error:', error);
    res.status(500).json({ error: 'Failed to complete run', details: error.message });
  }
});
2484
+
2485
/**
 * Get resumption status for an incomplete run
 * GET /api/eval/runs/:runId/resume-status
 *
 * Returns which tests have been completed and which remain,
 * enabling resumption of interrupted evaluations.
 *
 * Query params: profiles (comma-separated), scenarios (comma-separated or "all")
 *
 * Fix: corrupt run.metadata previously made JSON.parse throw, turning a
 * recoverable situation into an opaque 500; it now falls back to query
 * params (and the usual 400 when profiles are missing).
 */
router.get('/runs/:runId/resume-status', (req, res) => {
  try {
    const { runId } = req.params;
    const run = evaluationStore.getRun(runId);

    if (!run) {
      return res.status(404).json({ error: 'Run not found' });
    }

    // Get profiles and scenarios from query or run metadata.
    let metadata = {};
    if (run.metadata) {
      try {
        metadata = JSON.parse(run.metadata);
      } catch (parseErr) {
        console.warn(`[EvalRoutes] Run ${runId} has unparseable metadata:`, parseErr.message);
      }
    }
    let profiles = req.query.profiles ? req.query.profiles.split(',') : metadata.profiles || [];
    let scenariosParam = req.query.scenarios || metadata.scenarios || 'all';

    if (profiles.length === 0) {
      return res.status(400).json({
        error: 'Profiles not specified',
        hint: 'Provide profiles as query param or ensure run metadata contains profiles',
      });
    }

    // Resolve the scenario list ("all" or an explicit id subset).
    const allScenarios = tutorApi.listScenarios();
    const scenarios = scenariosParam === 'all'
      ? allScenarios
      : allScenarios.filter(s => scenariosParam.includes(s.id));

    // Get incomplete tests
    const status = evaluationStore.getIncompleteTests(runId, profiles, scenarios);

    res.json({
      success: true,
      ...status,
      runMetadata: {
        description: run.description,
        createdAt: run.createdAt,
        totalScenarios: run.totalScenarios,
        totalConfigurations: run.totalConfigurations,
      },
    });
  } catch (error) {
    console.error('[EvalRoutes] Resume status error:', error);
    res.status(500).json({ error: 'Failed to get resume status', details: error.message });
  }
});
2539
+
2540
+ // ============================================================================
2541
+ // Interaction Evaluation Endpoints (Learner-Tutor Dialogues)
2542
+ // ============================================================================
2543
+
2544
+ /**
2545
+ * List interaction evaluations
2546
+ * GET /api/eval/interactions
2547
+ * Query params: limit (default 50), scenarioId
2548
+ */
2549
+ router.get('/interactions', (req, res) => {
2550
+ try {
2551
+ const limit = parseInt(req.query.limit) || 50;
2552
+ const scenarioId = req.query.scenarioId || null;
2553
+
2554
+ const evals = evaluationStore.listInteractionEvals({ limit, scenarioId });
2555
+ res.json({ success: true, evals, count: evals.length });
2556
+ } catch (error) {
2557
+ console.error('[EvalRoutes] List interactions error:', error);
2558
+ res.status(500).json({ error: 'Failed to list interaction evaluations' });
2559
+ }
2560
+ });
2561
+
2562
+ /**
2563
+ * Get a specific interaction evaluation
2564
+ * GET /api/eval/interactions/:evalId
2565
+ */
2566
+ router.get('/interactions/:evalId', (req, res) => {
2567
+ try {
2568
+ const { evalId } = req.params;
2569
+ const evalData = evaluationStore.getInteractionEval(evalId);
2570
+
2571
+ if (!evalData) {
2572
+ return res.status(404).json({ error: 'Interaction evaluation not found' });
2573
+ }
2574
+
2575
+ res.json({ success: true, ...evalData });
2576
+ } catch (error) {
2577
+ console.error('[EvalRoutes] Get interaction error:', error);
2578
+ res.status(500).json({ error: 'Failed to get interaction evaluation' });
2579
+ }
2580
+ });
2581
/**
 * Get mermaid sequence diagram for an interaction evaluation
 * GET /api/eval/interactions/:evalId/diagram
 *
 * Returns the stored diagram as text/plain; a placeholder string is sent
 * when the evaluation exists but has no diagram attached.
 */
router.get('/interactions/:evalId/diagram', (req, res) => {
  try {
    const record = evaluationStore.getInteractionEval(req.params.evalId);

    if (!record) {
      res.status(404).json({ error: 'Interaction evaluation not found' });
      return;
    }

    const body = record.sequenceDiagram || 'No diagram available';
    res.type('text/plain').send(body);
  } catch (error) {
    console.error('[EvalRoutes] Get diagram error:', error);
    res.status(500).json({ error: 'Failed to get diagram' });
  }
});
2601
/**
 * Get formatted transcript for an interaction evaluation
 * GET /api/eval/interactions/:evalId/transcript
 *
 * Returns the stored transcript as text/plain; a placeholder string is
 * sent when the evaluation exists but has no transcript attached.
 */
router.get('/interactions/:evalId/transcript', (req, res) => {
  try {
    const record = evaluationStore.getInteractionEval(req.params.evalId);

    if (!record) {
      res.status(404).json({ error: 'Interaction evaluation not found' });
      return;
    }

    const body = record.formattedTranscript || 'No transcript available';
    res.type('text/plain').send(body);
  } catch (error) {
    console.error('[EvalRoutes] Get transcript error:', error);
    res.status(500).json({ error: 'Failed to get transcript' });
  }
});
2621
// ============================================================================
// Recognition A/B Comparison Endpoint
// ============================================================================

/**
 * Run Recognition A/B comparison with SSE streaming
 * GET /api/eval/stream/recognition-ab
 *
 * Compares baseline (no recognition) vs recognition (with recognition) profiles
 * using only recognition_test: true scenarios.
 *
 * Query params:
 *   - skipRubric: 'true' to skip rubric evaluation
 *   - outputSize: passed through to the runner (default 'normal')
 *
 * Returns (as SSE events: start/log/progress/result/complete/error):
 *   - Per-profile results with dimension scores
 *   - Recognition metrics for recognition profile
 *   - Delta analysis with statistical significance indicators
 *   - Winner badges per dimension
 */
router.get('/stream/recognition-ab', async (req, res) => {
  // Set up SSE
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    Connection: 'keep-alive',
  });

  const sendEvent = (type, data) => {
    res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
  };

  // Keep-alive comment frames to prevent proxy/connection timeout.
  const keepAlive = setInterval(() => {
    res.write(': keepalive\n\n');
  }, 15000);

  // Register stream for crash protection
  const streamId = registerStream(res, keepAlive);

  // Clean up on client disconnect.
  req.on('close', () => {
    clearInterval(keepAlive);
    unregisterStream(streamId);
  });

  try {
    // Fixed profiles for A/B comparison
    const profiles = ['baseline', 'recognition'];
    const skipRubric = req.query.skipRubric === 'true';
    const outputSize = req.query.outputSize || 'normal';

    // Validate profiles exist
    const allProfiles = tutorApi.listProfiles();
    const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));

    if (validProfiles.length !== 2) {
      sendEvent('error', {
        error: 'Recognition A/B requires both baseline and recognition profiles',
        found: validProfiles,
        available: allProfiles.map(p => p.name),
      });
      clearInterval(keepAlive);
      return res.end();
    }

    // Get only recognition_test scenarios
    const allScenarios = tutorApi.listScenarios();
    const recognitionScenarios = allScenarios.filter(s => s.recognition_test === true);

    if (recognitionScenarios.length === 0) {
      sendEvent('error', { error: 'No recognition_test scenarios found in config' });
      clearInterval(keepAlive);
      return res.end();
    }

    const totalTests = validProfiles.length * recognitionScenarios.length;
    const testLearnerId = `eval-recognition-ab-${Date.now()}`;

    sendEvent('start', {
      profiles: validProfiles,
      scenarioCount: recognitionScenarios.length,
      scenarioIds: recognitionScenarios.map(s => s.id),
      totalTests,
      skipRubric,
      outputSize,
      testLearnerId,
      timestamp: new Date().toISOString(),
    });

    sendEvent('log', {
      message: `Recognition A/B: baseline vs recognition × ${recognitionScenarios.length} scenarios`,
      level: 'info',
    });

    // Create a run to persist results
    const run = evaluationStore.createRun({
      description: `Recognition A/B: baseline vs recognition × ${recognitionScenarios.length} scenarios`,
      totalScenarios: recognitionScenarios.length,
      totalConfigurations: 2,
      metadata: {
        runType: 'recognition-ab',
        profiles: validProfiles,
        scenarios: recognitionScenarios.map(s => s.id),
        scenarioNames: recognitionScenarios.map(s => s.name),
        skipRubric,
        testLearnerId,
      },
    });

    sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });

    // Run evaluations
    const results = { baseline: [], recognition: [] };
    const dimensionScores = { baseline: {}, recognition: {} };
    const recognitionMetrics = {
      momentsGenerated: 0,
      dialecticalDepth: [],
      synthesisStrategies: {
        ghost_dominates: 0,
        learner_dominates: 0,
        dialectical_synthesis: 0,
      },
    };
    let completedTests = 0;

    for (const profileName of validProfiles) {
      sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });

      // Clear writing pad before each profile run for clean comparison
      try {
        clearConscious(testLearnerId);
        sendEvent('log', { message: `  Cleared writing pad for ${testLearnerId}`, level: 'info' });
      } catch (e) {
        // Pad may not exist yet, that's fine
      }

      for (const scenario of recognitionScenarios) {
        completedTests++;

        sendEvent('progress', {
          current: completedTests,
          total: totalTests,
          profile: profileName,
          scenario: scenario.name,
          percentage: Math.round((completedTests / totalTests) * 100),
        });

        sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });

        try {
          const config = { profileName, label: profileName };

          // Create log callback for this test
          const onLog = (message, level = 'info') => {
            sendEvent('log', { message: `  ${message}`, level, timestamp: new Date().toISOString() });
          };

          const result = await evaluationRunner.quickTest(config, {
            scenarioId: scenario.id,
            verbose: false,
            skipRubricEval: skipRubric,
            outputSize,
            onLog,
            learnerId: testLearnerId,
          });

          results[profileName].push(result);

          // Save result to database
          evaluationStore.storeResult(run.id, {
            ...result,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            profileName,
          });

          // Collect dimension scores (numeric values only)
          if (result.scores) {
            for (const [dim, score] of Object.entries(result.scores)) {
              if (!dimensionScores[profileName][dim]) {
                dimensionScores[profileName][dim] = [];
              }
              if (typeof score === 'number') {
                dimensionScores[profileName][dim].push(score);
              }
            }
          }

          // For recognition profile, collect recognition-specific metrics
          if (profileName === 'recognition') {
            try {
              const pad = getWritingPad(testLearnerId);
              if (pad) {
                // Assignment (not +=) because totalRecognitionMoments appears to
                // be a running total on the pad — TODO confirm against pad impl.
                recognitionMetrics.momentsGenerated = pad.totalRecognitionMoments || 0;
                if (pad.dialecticalDepth) {
                  recognitionMetrics.dialecticalDepth.push(pad.dialecticalDepth);
                }
                // Aggregate synthesis strategies from pad stats.
                // NOTE(review): if pad.statistics is cumulative across scenarios,
                // this += per scenario double-counts earlier scenarios — verify.
                const stats = pad.statistics || {};
                if (stats.synthesisStrategies) {
                  recognitionMetrics.synthesisStrategies.ghost_dominates += stats.synthesisStrategies.ghost_dominates || 0;
                  recognitionMetrics.synthesisStrategies.learner_dominates += stats.synthesisStrategies.learner_dominates || 0;
                  recognitionMetrics.synthesisStrategies.dialectical_synthesis += stats.synthesisStrategies.dialectical_synthesis || 0;
                }
              }
            } catch (e) {
              // Recognition metrics collection failed silently
            }
          }

          const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
          const status = result.success !== false ? '✓' : '✗';
          sendEvent('log', {
            message: `  ${status} Score: ${scoreStr} (${result.latencyMs}ms)`,
            level: result.success !== false ? 'success' : 'warning',
          });

          sendEvent('result', {
            profile: profileName,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            passed: result.success !== false,
            score: result.overallScore,
            latencyMs: result.latencyMs,
            inputTokens: result.inputTokens,
            outputTokens: result.outputTokens,
          });

        } catch (e) {
          sendEvent('log', { message: `  ✗ Error: ${e.message}`, level: 'error' });

          const errorResult = {
            success: false,
            errorMessage: e.message,
            scenarioId: scenario.id,
          };
          results[profileName].push(errorResult);

          evaluationStore.storeResult(run.id, {
            ...errorResult,
            scenarioName: scenario.name,
            profileName,
            provider: 'unknown',
            model: 'unknown',
          });
        }
      }
    }

    // Update run as completed
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests: completedTests,
      completedAt: new Date().toISOString(),
    });

    // Build dimension averages
    const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
    const dimensionAverages = { baseline: {}, recognition: {} };

    for (const profile of validProfiles) {
      for (const dim of dimensions) {
        const scores = dimensionScores[profile]?.[dim] || [];
        dimensionAverages[profile][dim] = scores.length > 0
          ? scores.reduce((a, b) => a + b, 0) / scores.length
          : null;
      }
    }

    // Build delta analysis with winner indicators
    const deltaAnalysis = [];
    for (const dim of dimensions) {
      const baselineAvg = dimensionAverages.baseline[dim];
      const recognitionAvg = dimensionAverages.recognition[dim];

      if (baselineAvg != null && recognitionAvg != null) {
        const delta = recognitionAvg - baselineAvg;
        const deltaPercent = baselineAvg > 0 ? (delta / baselineAvg) * 100 : 0;

        // Significance thresholds (on 5-point scale)
        // *  = >5% improvement (delta > 0.25)
        // ** = >10% improvement (delta > 0.5)
        let significance = '';
        let winner = null;

        if (Math.abs(delta) > 0.5) {
          significance = '**';
          winner = delta > 0 ? 'recognition' : 'baseline';
        } else if (Math.abs(delta) > 0.25) {
          significance = '*';
          winner = delta > 0 ? 'recognition' : 'baseline';
        }

        deltaAnalysis.push({
          dimension: dim,
          baseline: baselineAvg,
          recognition: recognitionAvg,
          delta,
          deltaPercent,
          significance,
          winner,
        });
      }
    }

    // Calculate overall scores and winner
    const baselineResults = results.baseline || [];
    const recognitionResults = results.recognition || [];

    const baselineScores = baselineResults.filter(r => r.overallScore != null).map(r => r.overallScore);
    const recognitionScores = recognitionResults.filter(r => r.overallScore != null).map(r => r.overallScore);

    const baselineAvgScore = baselineScores.length > 0
      ? baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length
      : null;
    const recognitionAvgScore = recognitionScores.length > 0
      ? recognitionScores.reduce((a, b) => a + b, 0) / recognitionScores.length
      : null;

    let overallWinner = null;
    let overallDelta = null;
    let overallSignificance = '';

    if (baselineAvgScore != null && recognitionAvgScore != null) {
      overallDelta = recognitionAvgScore - baselineAvgScore;

      // Overall winner thresholds: * for |delta| > 5, ** for |delta| > 10.
      if (Math.abs(overallDelta) > 10) {
        overallSignificance = '**';
        overallWinner = overallDelta > 0 ? 'recognition' : 'baseline';
      } else if (Math.abs(overallDelta) > 5) {
        overallSignificance = '*';
        overallWinner = overallDelta > 0 ? 'recognition' : 'baseline';
      }
    }

    // Calculate average dialectical depth
    const avgDialecticalDepth = recognitionMetrics.dialecticalDepth.length > 0
      ? recognitionMetrics.dialecticalDepth.reduce((a, b) => a + b, 0) / recognitionMetrics.dialecticalDepth.length
      : 0;

    sendEvent('log', { message: `\n=== Recognition A/B Complete ===`, level: 'success' });
    sendEvent('log', { message: `Total tests: ${completedTests}`, level: 'info' });
    sendEvent('log', {
      message: `Baseline avg: ${baselineAvgScore?.toFixed(1) || 'N/A'} | Recognition avg: ${recognitionAvgScore?.toFixed(1) || 'N/A'}`,
      level: 'info',
    });
    if (overallWinner) {
      sendEvent('log', { message: `Winner: ${overallWinner.toUpperCase()} (${overallSignificance})`, level: 'success' });
    }

    // Send final complete event with full results
    sendEvent('complete', {
      success: true,
      runId: run.id,
      profiles: validProfiles,
      scenariosRun: recognitionScenarios.length,
      dimensionAverages,
      deltaAnalysis,
      overallScores: {
        baseline: baselineAvgScore,
        recognition: recognitionAvgScore,
        delta: overallDelta,
        significance: overallSignificance,
        winner: overallWinner,
      },
      recognitionMetrics: {
        momentsGenerated: recognitionMetrics.momentsGenerated,
        avgDialecticalDepth,
        synthesisStrategies: recognitionMetrics.synthesisStrategies,
      },
      results,
    });

    // Stop the keep-alive BEFORE ending the response; otherwise the interval
    // keeps firing res.write() against an ended stream until 'close' fires.
    clearInterval(keepAlive);
    unregisterStream(streamId);
    res.end();
  } catch (error) {
    sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
    sendEvent('error', { error: error.message });
    clearInterval(keepAlive);
    unregisterStream(streamId);
    res.end();
  }
});

export default router;