@machinespirits/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
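The largest addition is `package/routes/evalRoutes.js`, which exposes REST and Server-Sent-Events endpoints for running evaluations. As a point of reference only (this sketch is not part of the package), a minimal browser client for the streaming quick-test endpoint might look like the following, assuming the router is mounted at `/api/eval` as the route comments indicate:

```js
// Illustrative client sketch: consume the SSE quick-test endpoint.
// Assumes a browser environment (EventSource) and the /api/eval mount point.
const params = new URLSearchParams({
  profile: 'budget',
  scenario: 'new_user_first_visit',
  skipRubric: 'true',
});
const source = new EventSource(`/api/eval/stream/quick?${params}`);

// The route emits named events: start, log, progress, result, complete, error.
source.addEventListener('log', (e) => {
  const { message, level } = JSON.parse(e.data);
  console.log(`[${level}] ${message}`);
});
source.addEventListener('result', (e) => {
  console.log('Result:', JSON.parse(e.data));
});
source.addEventListener('complete', () => source.close());
source.addEventListener('error', (e) => {
  // Browser-level connection errors carry no data payload.
  if (e.data) console.error('Eval error:', JSON.parse(e.data));
  source.close();
});
```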
|
@@ -0,0 +1,3002 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation API Routes
|
|
3
|
+
*
|
|
4
|
+
* Endpoints for testing and evaluating AI tutor performance.
|
|
5
|
+
* Mirrors CLI /eval functionality for web/API access.
|
|
6
|
+
*
|
|
7
|
+
* Note: Prompt recommendations are read-only via API.
|
|
8
|
+
* Prompts can be viewed but not written to disk.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { Router } from 'express';
|
|
12
|
+
import * as evaluationRunner from '../services/evaluationRunner.js';
|
|
13
|
+
import * as evaluationStore from '../services/evaluationStore.js';
|
|
14
|
+
import * as learnerConfigLoader from '../services/learnerConfigLoader.js';
|
|
15
|
+
import * as promptRecommendationService from '../services/promptRecommendationService.js';
|
|
16
|
+
import interactionEngine from '../services/learnerTutorInteractionEngine.js';
|
|
17
|
+
// Import core tutor services from @machinespirits/tutor-core
|
|
18
|
+
import {
|
|
19
|
+
tutorApiService as tutorApi,
|
|
20
|
+
dialogueLogService,
|
|
21
|
+
monitoringService,
|
|
22
|
+
aiConfigService,
|
|
23
|
+
writingPadService
|
|
24
|
+
} from '@machinespirits/tutor-core';
|
|
25
|
+
const { getApiKey, getDefaultModel } = aiConfigService;
|
|
26
|
+
const { clearConscious, getWritingPad } = writingPadService;
|
|
27
|
+
import fs from 'fs';
|
|
28
|
+
import path from 'path';
|
|
29
|
+
|
|
30
|
+
const router = Router();
|
|
31
|
+
|
|
32
|
+
// ============================================================================
|
|
33
|
+
// CRASH PROTECTION: Track active evaluation streams
|
|
34
|
+
// ============================================================================
|
|
35
|
+
const activeEvalStreams = new Map();
|
|
36
|
+
let streamIdCounter = 0;
|
|
37
|
+
|
|
38
|
+
// Configuration
|
|
39
|
+
const MAX_STREAM_DURATION_MS = 2 * 60 * 60 * 1000; // 2 hours
|
|
40
|
+
const TIMEOUT_WARNING_MS = 30 * 60 * 1000; // Warn at 30 minutes before timeout
|
|
41
|
+
|
|
42
|
+
// Cleanup function for orphaned streams
|
|
43
|
+
export function cleanupAllStreams() {
|
|
44
|
+
if (activeEvalStreams.size > 0) {
|
|
45
|
+
console.log(`[EvalRoutes] Cleaning up ${activeEvalStreams.size} active streams...`);
|
|
46
|
+
activeEvalStreams.forEach(({ res, keepAlive, timeoutTimer, streamId }) => {
|
|
47
|
+
try {
|
|
48
|
+
if (keepAlive) clearInterval(keepAlive);
|
|
49
|
+
if (timeoutTimer) clearTimeout(timeoutTimer);
|
|
50
|
+
if (res && !res.writableEnded) {
|
|
51
|
+
res.write('event: error\ndata: {"error": "Server restarting"}\n\n');
|
|
52
|
+
res.end();
|
|
53
|
+
}
|
|
54
|
+
} catch (e) {
|
|
55
|
+
console.error(`[EvalRoutes] Error cleaning stream ${streamId}:`, e.message);
|
|
56
|
+
}
|
|
57
|
+
});
|
|
58
|
+
activeEvalStreams.clear();
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// Helper to register a new stream with timeout protection
|
|
63
|
+
function registerStream(res, keepAlive, options = {}) {
|
|
64
|
+
const streamId = `eval-stream-${++streamIdCounter}-${Date.now()}`;
|
|
65
|
+
const maxDuration = options.maxDuration || MAX_STREAM_DURATION_MS;
|
|
66
|
+
const startedAt = Date.now();
|
|
67
|
+
|
|
68
|
+
// Set up timeout handler
|
|
69
|
+
const timeoutTimer = setTimeout(() => {
|
|
70
|
+
console.warn(`[EvalRoutes] Stream ${streamId} exceeded max duration (${maxDuration}ms), forcing cleanup`);
|
|
71
|
+
try {
|
|
72
|
+
if (res && !res.writableEnded) {
|
|
73
|
+
res.write('event: error\ndata: {"error": "Evaluation timeout - exceeded maximum duration", "timeout": true}\n\n');
|
|
74
|
+
res.end();
|
|
75
|
+
}
|
|
76
|
+
} catch (e) {
|
|
77
|
+
console.error(`[EvalRoutes] Error sending timeout to ${streamId}:`, e.message);
|
|
78
|
+
}
|
|
79
|
+
unregisterStream(streamId);
|
|
80
|
+
}, maxDuration);
|
|
81
|
+
|
|
82
|
+
activeEvalStreams.set(streamId, {
|
|
83
|
+
res,
|
|
84
|
+
keepAlive,
|
|
85
|
+
timeoutTimer,
|
|
86
|
+
streamId,
|
|
87
|
+
startedAt,
|
|
88
|
+
maxDuration
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
console.log(`[EvalRoutes] Stream registered: ${streamId} (Timeout: ${maxDuration}ms, Total active: ${activeEvalStreams.size})`);
|
|
92
|
+
return streamId;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// Helper to unregister a stream
|
|
96
|
+
function unregisterStream(streamId) {
|
|
97
|
+
const stream = activeEvalStreams.get(streamId);
|
|
98
|
+
if (stream) {
|
|
99
|
+
if (stream.keepAlive) clearInterval(stream.keepAlive);
|
|
100
|
+
if (stream.timeoutTimer) clearTimeout(stream.timeoutTimer);
|
|
101
|
+
activeEvalStreams.delete(streamId);
|
|
102
|
+
const duration = Math.round((Date.now() - stream.startedAt) / 1000);
|
|
103
|
+
console.log(`[EvalRoutes] Stream closed: ${streamId} (Duration: ${duration}s, Remaining: ${activeEvalStreams.size})`);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// Periodic check for hung streams (runs every 5 minutes)
|
|
108
|
+
setInterval(() => {
|
|
109
|
+
const now = Date.now();
|
|
110
|
+
activeEvalStreams.forEach((stream, streamId) => {
|
|
111
|
+
const age = now - stream.startedAt;
|
|
112
|
+
|
|
113
|
+
// Warn if approaching timeout
|
|
114
|
+
if (age > (stream.maxDuration - TIMEOUT_WARNING_MS) && !stream.warningShown) {
|
|
115
|
+
const remaining = Math.round((stream.maxDuration - age) / 1000 / 60);
|
|
116
|
+
console.warn(`[EvalRoutes] Stream ${streamId} will timeout in ${remaining} minutes`);
|
|
117
|
+
try {
|
|
118
|
+
if (stream.res && !stream.res.writableEnded) {
|
|
119
|
+
stream.res.write(`event: warning\ndata: {"message": "Evaluation will timeout in ${remaining} minutes", "remainingMs": ${stream.maxDuration - age}}\n\n`);
|
|
120
|
+
}
|
|
121
|
+
} catch (e) {
|
|
122
|
+
// Ignore write errors
|
|
123
|
+
}
|
|
124
|
+
stream.warningShown = true;
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
}, 5 * 60 * 1000); // Check every 5 minutes
|
|
128
|
+
|
|
129
|
+
// Path to prompts directory
|
|
130
|
+
const PROMPTS_DIR = path.join(process.cwd(), 'prompts');
|
|
131
|
+
|
|
132
|
+
// ============================================================================
|
|
133
|
+
// Configuration Endpoints
|
|
134
|
+
// ============================================================================
|
|
135
|
+
|
|
136
|
+
/**
|
|
137
|
+
* List available scenarios
|
|
138
|
+
* GET /api/eval/scenarios
|
|
139
|
+
*/
|
|
140
|
+
router.get('/scenarios', (req, res) => {
|
|
141
|
+
try {
|
|
142
|
+
const scenarios = tutorApi.listScenarios();
|
|
143
|
+
res.json({ success: true, scenarios });
|
|
144
|
+
} catch (error) {
|
|
145
|
+
console.error('[EvalRoutes] List scenarios error:', error);
|
|
146
|
+
res.status(500).json({ error: 'Failed to list scenarios' });
|
|
147
|
+
}
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Get scenario details
|
|
152
|
+
* GET /api/eval/scenarios/:id
|
|
153
|
+
*/
|
|
154
|
+
router.get('/scenarios/:id', (req, res) => {
|
|
155
|
+
try {
|
|
156
|
+
const scenario = tutorApi.getScenario(req.params.id);
|
|
157
|
+
if (!scenario) {
|
|
158
|
+
return res.status(404).json({ error: 'Scenario not found' });
|
|
159
|
+
}
|
|
160
|
+
res.json({ success: true, scenario });
|
|
161
|
+
} catch (error) {
|
|
162
|
+
console.error('[EvalRoutes] Get scenario error:', error);
|
|
163
|
+
res.status(500).json({ error: 'Failed to get scenario' });
|
|
164
|
+
}
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* List available tutor profiles
|
|
169
|
+
* GET /api/eval/profiles
|
|
170
|
+
*/
|
|
171
|
+
router.get('/profiles', (req, res) => {
|
|
172
|
+
try {
|
|
173
|
+
const profiles = tutorApi.listProfiles();
|
|
174
|
+
res.json({ success: true, profiles });
|
|
175
|
+
} catch (error) {
|
|
176
|
+
console.error('[EvalRoutes] List profiles error:', error);
|
|
177
|
+
res.status(500).json({ error: 'Failed to list profiles' });
|
|
178
|
+
}
|
|
179
|
+
});
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* List available learner profiles (for interaction evaluations)
|
|
183
|
+
* GET /api/eval/learner-profiles
|
|
184
|
+
*/
|
|
185
|
+
router.get('/learner-profiles', (req, res) => {
|
|
186
|
+
try {
|
|
187
|
+
const profiles = learnerConfigLoader.listProfiles();
|
|
188
|
+
const personas = learnerConfigLoader.listPersonas();
|
|
189
|
+
res.json({ success: true, profiles, personas });
|
|
190
|
+
} catch (error) {
|
|
191
|
+
console.error('[EvalRoutes] List learner profiles error:', error);
|
|
192
|
+
res.status(500).json({ error: 'Failed to list learner profiles' });
|
|
193
|
+
}
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* List model configurations
|
|
198
|
+
* GET /api/eval/configurations
|
|
199
|
+
*/
|
|
200
|
+
router.get('/configurations', (req, res) => {
|
|
201
|
+
try {
|
|
202
|
+
const configurations = tutorApi.listConfigurations();
|
|
203
|
+
res.json({ success: true, configurations });
|
|
204
|
+
} catch (error) {
|
|
205
|
+
console.error('[EvalRoutes] List configurations error:', error);
|
|
206
|
+
res.status(500).json({ error: 'Failed to list configurations' });
|
|
207
|
+
}
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
// ============================================================================
|
|
211
|
+
// Quick Test Endpoints
|
|
212
|
+
// ============================================================================
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* Run a quick evaluation test
|
|
216
|
+
* POST /api/eval/quick
|
|
217
|
+
*
|
|
218
|
+
* Body: {
|
|
219
|
+
* profile: "budget", // Profile name or config string
|
|
220
|
+
* scenario: "new_user_first_visit", // Scenario ID (optional)
|
|
221
|
+
* skipRubric: true // Skip AI judge evaluation (optional)
|
|
222
|
+
* }
|
|
223
|
+
*/
|
|
224
|
+
router.post('/quick', async (req, res) => {
|
|
225
|
+
try {
|
|
226
|
+
const { profile = 'budget', scenario = 'new_user_first_visit', skipRubric = false } = req.body;
|
|
227
|
+
|
|
228
|
+
// Build config
|
|
229
|
+
const config = { profileName: profile };
|
|
230
|
+
|
|
231
|
+
// Get scenario name for description
|
|
232
|
+
const scenarioDetails = tutorApi.getScenario(scenario);
|
|
233
|
+
const scenarioName = scenarioDetails?.name || scenario;
|
|
234
|
+
|
|
235
|
+
// Create a run to persist result to history
|
|
236
|
+
const run = evaluationStore.createRun({
|
|
237
|
+
description: scenarioName,
|
|
238
|
+
totalScenarios: 1,
|
|
239
|
+
totalConfigurations: 1,
|
|
240
|
+
metadata: {
|
|
241
|
+
runType: 'quick',
|
|
242
|
+
profiles: [profile],
|
|
243
|
+
scenarios: [scenario],
|
|
244
|
+
scenarioNames: [scenarioName],
|
|
245
|
+
},
|
|
246
|
+
});
|
|
247
|
+
|
|
248
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
249
|
+
scenarioId: scenario,
|
|
250
|
+
skipRubricEval: skipRubric,
|
|
251
|
+
verbose: false,
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
// Store result to history
|
|
255
|
+
evaluationStore.storeResult(run.id, result);
|
|
256
|
+
|
|
257
|
+
// Mark run as completed
|
|
258
|
+
evaluationStore.updateRun(run.id, {
|
|
259
|
+
status: 'completed',
|
|
260
|
+
totalTests: 1,
|
|
261
|
+
completedAt: new Date().toISOString(),
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
res.json({
|
|
265
|
+
success: true,
|
|
266
|
+
runId: run.id,
|
|
267
|
+
result: {
|
|
268
|
+
runId: run.id,
|
|
269
|
+
scenarioId: result.scenarioId,
|
|
270
|
+
scenarioName: result.scenarioName,
|
|
271
|
+
profile: result.profileName,
|
|
272
|
+
provider: result.provider,
|
|
273
|
+
model: result.model,
|
|
274
|
+
passed: result.success,
|
|
275
|
+
overallScore: result.overallScore,
|
|
276
|
+
latencyMs: result.latencyMs,
|
|
277
|
+
scores: result.scoresWithReasoning || result.scores, // Prefer detailed scores
|
|
278
|
+
validation: {
|
|
279
|
+
passesRequired: result.passesRequired,
|
|
280
|
+
passesForbidden: result.passesForbidden,
|
|
281
|
+
requiredMissing: result.requiredMissing,
|
|
282
|
+
forbiddenFound: result.forbiddenFound,
|
|
283
|
+
},
|
|
284
|
+
suggestions: result.suggestions,
|
|
285
|
+
// Token usage
|
|
286
|
+
inputTokens: result.inputTokens,
|
|
287
|
+
outputTokens: result.outputTokens,
|
|
288
|
+
totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
|
|
289
|
+
apiCalls: result.apiCalls,
|
|
290
|
+
dialogueRounds: result.dialogueRounds,
|
|
291
|
+
// Evaluator reasoning
|
|
292
|
+
evaluationReasoning: result.evaluationReasoning,
|
|
293
|
+
evaluatorModel: result.evaluatorModel,
|
|
294
|
+
// Scenario context for display (original user request)
|
|
295
|
+
scenarioContext: scenarioDetails ? {
|
|
296
|
+
description: scenarioDetails.description,
|
|
297
|
+
expectedBehavior: scenarioDetails.expected_behavior,
|
|
298
|
+
learnerContext: scenarioDetails.learner_context,
|
|
299
|
+
} : null,
|
|
300
|
+
},
|
|
301
|
+
});
|
|
302
|
+
} catch (error) {
|
|
303
|
+
console.error('[EvalRoutes] Quick test error:', error);
|
|
304
|
+
res.status(500).json({ error: 'Failed to run quick test', details: error.message });
|
|
305
|
+
}
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Run a quick test with SSE streaming for real-time logs
|
|
310
|
+
* GET /api/eval/stream/quick
|
|
311
|
+
* Query params: profile, scenario, skipRubric
|
|
312
|
+
*/
|
|
313
|
+
router.get('/stream/quick', async (req, res) => {
|
|
314
|
+
// Set up SSE
|
|
315
|
+
res.writeHead(200, {
|
|
316
|
+
'Content-Type': 'text/event-stream',
|
|
317
|
+
'Cache-Control': 'no-cache',
|
|
318
|
+
Connection: 'keep-alive',
|
|
319
|
+
});
|
|
320
|
+
|
|
321
|
+
const sendEvent = (type, data) => {
|
|
322
|
+
// Use named events for addEventListener compatibility
|
|
323
|
+
res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
|
|
324
|
+
};
|
|
325
|
+
|
|
326
|
+
// Keep-alive to prevent connection timeout
|
|
327
|
+
const keepAlive = setInterval(() => {
|
|
328
|
+
res.write(': keepalive\n\n');
|
|
329
|
+
}, 15000);
|
|
330
|
+
|
|
331
|
+
// Register stream for crash protection
|
|
332
|
+
const streamId = registerStream(res, keepAlive);
|
|
333
|
+
|
|
334
|
+
// Clean up on close
|
|
335
|
+
req.on('close', () => {
|
|
336
|
+
clearInterval(keepAlive);
|
|
337
|
+
unregisterStream(streamId);
|
|
338
|
+
});
|
|
339
|
+
|
|
340
|
+
try {
|
|
341
|
+
const profile = req.query.profile || 'budget';
|
|
342
|
+
const scenario = req.query.scenario || 'new_user_first_visit';
|
|
343
|
+
const skipRubric = req.query.skipRubric === 'true';
|
|
344
|
+
const outputSize = req.query.outputSize || 'normal'; // compact, normal, expanded
|
|
345
|
+
|
|
346
|
+
// Get scenario name for description
|
|
347
|
+
const scenarioDetails = tutorApi.getScenario(scenario);
|
|
348
|
+
const scenarioName = scenarioDetails?.name || scenario;
|
|
349
|
+
|
|
350
|
+
// Create a run to persist result to history (status: 'running')
|
|
351
|
+
const run = evaluationStore.createRun({
|
|
352
|
+
description: scenarioName,
|
|
353
|
+
totalScenarios: 1,
|
|
354
|
+
totalConfigurations: 1,
|
|
355
|
+
metadata: {
|
|
356
|
+
runType: 'quick',
|
|
357
|
+
profiles: [profile],
|
|
358
|
+
scenarios: [scenario],
|
|
359
|
+
scenarioNames: [scenarioName],
|
|
360
|
+
},
|
|
361
|
+
});
|
|
362
|
+
|
|
363
|
+
sendEvent('start', {
|
|
364
|
+
profile,
|
|
365
|
+
scenario,
|
|
366
|
+
skipRubric,
|
|
367
|
+
outputSize,
|
|
368
|
+
runId: run.id,
|
|
369
|
+
timestamp: new Date().toISOString(),
|
|
370
|
+
});
|
|
371
|
+
|
|
372
|
+
sendEvent('log', { message: `Starting quick test: ${profile} / ${scenario}`, level: 'info' });
|
|
373
|
+
sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });
|
|
374
|
+
sendEvent('log', { message: `Skip rubric evaluation: ${skipRubric}`, level: 'info' });
|
|
375
|
+
sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });
|
|
376
|
+
|
|
377
|
+
const config = { profileName: profile };
|
|
378
|
+
|
|
379
|
+
// Create a log callback to stream logs
|
|
380
|
+
const onLog = (message, level = 'info') => {
|
|
381
|
+
sendEvent('log', { message, level, timestamp: new Date().toISOString() });
|
|
382
|
+
};
|
|
383
|
+
|
|
384
|
+
sendEvent('log', { message: 'Building learner context...', level: 'info' });
|
|
385
|
+
sendEvent('progress', { stage: 'context', message: 'Building learner context' });
|
|
386
|
+
|
|
387
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
388
|
+
scenarioId: scenario,
|
|
389
|
+
skipRubricEval: skipRubric,
|
|
390
|
+
outputSize, // compact, normal, expanded - affects response length
|
|
391
|
+
verbose: true,
|
|
392
|
+
onLog, // Pass log callback
|
|
393
|
+
});
|
|
394
|
+
|
|
395
|
+
// Store result to history
|
|
396
|
+
evaluationStore.storeResult(run.id, result);
|
|
397
|
+
|
|
398
|
+
// Mark run as completed
|
|
399
|
+
evaluationStore.updateRun(run.id, {
|
|
400
|
+
status: 'completed',
|
|
401
|
+
totalTests: 1,
|
|
402
|
+
completedAt: new Date().toISOString(),
|
|
403
|
+
});
|
|
404
|
+
|
|
405
|
+
sendEvent('log', { message: `Test completed: score=${result.overallScore?.toFixed(1) || 'N/A'}`, level: 'success' });
|
|
406
|
+
sendEvent('log', { message: `Saved to history: ${run.id}`, level: 'info' });
|
|
407
|
+
|
|
408
|
+
sendEvent('result', {
|
|
409
|
+
runId: run.id,
|
|
410
|
+
scenarioId: result.scenarioId,
|
|
411
|
+
scenarioName: result.scenarioName,
|
|
412
|
+
profile: result.profileName,
|
|
413
|
+
provider: result.provider,
|
|
414
|
+
model: result.model,
|
|
415
|
+
passed: result.success,
|
|
416
|
+
overallScore: result.overallScore,
|
|
417
|
+
latencyMs: result.latencyMs,
|
|
418
|
+
scores: result.scoresWithReasoning || result.scores, // Prefer detailed scores
|
|
419
|
+
validation: {
|
|
420
|
+
passesRequired: result.passesRequired,
|
|
421
|
+
passesForbidden: result.passesForbidden,
|
|
422
|
+
requiredMissing: result.requiredMissing,
|
|
423
|
+
forbiddenFound: result.forbiddenFound,
|
|
424
|
+
},
|
|
425
|
+
suggestions: result.suggestions,
|
|
426
|
+
inputTokens: result.inputTokens,
|
|
427
|
+
outputTokens: result.outputTokens,
|
|
428
|
+
totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
|
|
429
|
+
apiCalls: result.apiCalls,
|
|
430
|
+
dialogueRounds: result.dialogueRounds,
|
|
431
|
+
dialogueId: result.dialogueId,
|
|
432
|
+
// Evaluator reasoning
|
|
433
|
+
evaluationReasoning: result.evaluationReasoning,
|
|
434
|
+
evaluatorModel: result.evaluatorModel,
|
|
435
|
+
// Scenario context for display (original user request)
|
|
436
|
+
scenarioContext: scenarioDetails ? {
|
|
437
|
+
description: scenarioDetails.description,
|
|
438
|
+
expectedBehavior: scenarioDetails.expected_behavior,
|
|
439
|
+
learnerContext: scenarioDetails.learner_context,
|
|
440
|
+
} : null,
|
|
441
|
+
});
|
|
442
|
+
|
|
443
|
+
sendEvent('complete', { success: true, runId: run.id });
|
|
444
|
+
clearInterval(keepAlive);
|
|
445
|
+
res.end();
|
|
446
|
+
} catch (error) {
|
|
447
|
+
sendEvent('log', { message: `Error: ${error.message}`, level: 'error' });
|
|
448
|
+
sendEvent('error', { error: error.message });
|
|
449
|
+
clearInterval(keepAlive);
|
|
450
|
+
res.end();
|
|
451
|
+
}
|
|
452
|
+
});
|
|
453
|
+
|
|
454
|
+
// ============================================================================
|
|
455
|
+
// Full Evaluation Endpoints
|
|
456
|
+
// ============================================================================
|
|
457
|
+
|
|
458
|
+
/**
|
|
459
|
+
* Run a full evaluation
|
|
460
|
+
* POST /api/eval/run
|
|
461
|
+
*
|
|
462
|
+
* Body: {
|
|
463
|
+
* profiles: ["budget", "fast"], // Profiles to test
|
|
464
|
+
* scenarios: ["new_user_first_visit", "struggling_learner"], // Scenarios (or "all")
|
|
465
|
+
* runsPerConfig: 1, // Repetitions
|
|
466
|
+
* skipRubric: false // Use AI judge
|
|
467
|
+
* }
|
|
468
|
+
*/
|
|
469
|
+
router.post('/run', async (req, res) => {
|
|
470
|
+
try {
|
|
471
|
+
const {
|
|
472
|
+
profiles = ['budget'],
|
|
473
|
+
scenarios = 'all',
|
|
474
|
+
runsPerConfig = 1,
|
|
475
|
+
skipRubric = false,
|
|
476
|
+
description
|
|
477
|
+
} = req.body;
|
|
478
|
+
|
|
479
|
+
// Build configurations from profiles
|
|
480
|
+
const configurations = profiles.map(p => ({ profileName: p, label: p }));
|
|
481
|
+
|
|
482
|
+
const result = await evaluationRunner.runEvaluation({
|
|
483
|
+
scenarios,
|
|
484
|
+
configurations,
|
|
485
|
+
runsPerConfig,
|
|
486
|
+
skipRubricEval: skipRubric,
|
|
487
|
+
description,
|
|
488
|
+
verbose: false,
|
|
489
|
+
});
|
|
490
|
+
|
|
491
|
+
res.json({
|
|
492
|
+
success: true,
|
|
493
|
+
runId: result.runId,
|
|
494
|
+
totalTests: result.totalTests,
|
|
495
|
+
successfulTests: result.successfulTests,
|
|
496
|
+
stats: result.stats,
|
|
497
|
+
scenarioStats: result.scenarioStats,
|
|
498
|
+
});
|
|
499
|
+
} catch (error) {
|
|
500
|
+
console.error('[EvalRoutes] Run evaluation error:', error);
|
|
501
|
+
res.status(500).json({ error: 'Failed to run evaluation', details: error.message });
|
|
502
|
+
}
|
|
503
|
+
});
|
|
504
|
+
|
|
505
|
+
/**
|
|
506
|
+
* Compare multiple configurations
|
|
507
|
+
* POST /api/eval/compare
|
|
508
|
+
*
|
|
509
|
+
* Body: {
|
|
510
|
+
* profiles: ["budget", "fast", "quality"],
|
|
511
|
+
* scenarios: "all",
|
|
512
|
+
* runsPerConfig: 1
|
|
513
|
+
* }
|
|
514
|
+
*/
|
|
515
|
+
router.post('/compare', async (req, res) => {
|
|
516
|
+
try {
|
|
517
|
+
const { profiles, scenarios = 'all', runsPerConfig = 1 } = req.body;
|
|
518
|
+
|
|
519
|
+
if (!profiles || profiles.length < 2) {
|
|
520
|
+
return res.status(400).json({ error: 'At least 2 profiles required for comparison' });
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
const configs = profiles.map(p => ({ profileName: p, label: p }));
|
|
524
|
+
|
|
525
|
+
const result = await evaluationRunner.compareConfigurations(configs, {
|
|
526
|
+
scenarios,
|
|
527
|
+
runsPerConfig,
|
|
528
|
+
verbose: false,
|
|
529
|
+
});
|
|
530
|
+
|
|
531
|
+
res.json({
|
|
532
|
+
success: true,
|
|
533
|
+
runId: result.runId,
|
|
534
|
+
rankings: result.rankings,
|
|
535
|
+
scenarioBreakdown: result.scenarioBreakdown,
|
|
536
|
+
});
|
|
537
|
+
} catch (error) {
|
|
538
|
+
console.error('[EvalRoutes] Compare error:', error);
|
|
539
|
+
res.status(500).json({ error: 'Failed to compare configurations', details: error.message });
|
|
540
|
+
}
|
|
541
|
+
});
|
|
542
|
+
|
|
543
|
+
/**
|
|
544
|
+
* Matrix comparison of multiple profiles with dimension breakdowns
|
|
545
|
+
* POST /api/eval/matrix
|
|
546
|
+
*
|
|
547
|
+
* Body: {
|
|
548
|
+
* profiles: ["budget", "default", "experimental"], // Profiles to test
|
|
549
|
+
* scenarios: "all", // Scenarios to run (or array of IDs)
|
|
550
|
+
* skipRubric: true // Skip AI judge evaluation (faster)
|
|
551
|
+
* }
|
|
552
|
+
*
|
|
553
|
+
* Returns dimension scores and overall rankings for each profile.
|
|
554
|
+
*/
|
|
555
|
+
router.post('/matrix', async (req, res) => {
|
|
556
|
+
try {
|
|
557
|
+
let { profiles = [], scenarios = 'all', skipRubric = false } = req.body;
|
|
558
|
+
|
|
559
|
+
// Default profiles if none specified
|
|
560
|
+
const allProfiles = tutorApi.listProfiles();
|
|
561
|
+
if (profiles.length === 0) {
|
|
562
|
+
profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
|
|
563
|
+
allProfiles.some(ap => ap.name === p)
|
|
564
|
+
);
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
// Validate profiles exist
|
|
568
|
+
const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
|
|
569
|
+
const invalidProfiles = profiles.filter(p => !allProfiles.some(ap => ap.name === p));
|
|
570
|
+
|
|
571
|
+
if (validProfiles.length === 0) {
|
|
572
|
+
return res.status(400).json({
|
|
573
|
+
error: 'No valid profiles specified',
|
|
574
|
+
available: allProfiles.map(p => p.name),
|
|
575
|
+
});
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
// Get scenarios
|
|
579
|
+
const allScenarios = tutorApi.listScenarios();
|
|
580
|
+
const scenariosToRun = scenarios === 'all'
|
|
581
|
+
? allScenarios
|
|
582
|
+
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
583
|
+
|
|
584
|
+
// Create a run to persist results to history
|
|
585
|
+
const run = evaluationStore.createRun({
|
|
586
|
+
description: `${validProfiles.length} profiles × ${scenariosToRun.length} scenarios`,
|
|
587
|
+
totalScenarios: scenariosToRun.length,
|
|
588
|
+
totalConfigurations: validProfiles.length,
|
|
589
|
+
metadata: {
|
|
590
|
+
runType: 'matrix',
|
|
591
|
+
profiles: validProfiles,
|
|
592
|
+
scenarios: scenariosToRun.map(s => s.id),
|
|
593
|
+
scenarioNames: scenariosToRun.map(s => s.name),
|
|
594
|
+
skipRubric,
|
|
595
|
+
},
|
|
596
|
+
});
|
|
597
|
+
|
|
598
|
+
// Run evaluations
|
|
599
|
+
const results = {};
|
|
600
|
+
const dimensionScores = {};
|
|
601
|
+
let totalTests = 0;
|
|
602
|
+
|
|
603
|
+
for (const profileName of validProfiles) {
|
|
604
|
+
results[profileName] = [];
|
|
605
|
+
dimensionScores[profileName] = {};
|
|
606
|
+
|
|
607
|
+
for (const scenario of scenariosToRun) {
|
|
608
|
+
try {
|
|
609
|
+
const config = { profileName, label: profileName };
|
|
610
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
611
|
+
scenarioId: scenario.id,
|
|
612
|
+
verbose: false,
|
|
613
|
+
skipRubricEval: skipRubric,
|
|
614
|
+
debug: false,
|
|
615
|
+
});
|
|
616
|
+
|
|
617
|
+
results[profileName].push(result);
|
|
618
|
+
totalTests++;
|
|
619
|
+
|
|
620
|
+
// Save result to database
|
|
621
|
+
evaluationStore.storeResult(run.id, {
|
|
622
|
+
...result,
|
|
623
|
+
scenarioId: scenario.id,
|
|
624
|
+
scenarioName: scenario.name,
|
|
625
|
+
profileName,
|
|
626
|
+
});
|
|
627
|
+
|
|
628
|
+
// Collect dimension scores
|
|
629
|
+
if (result.scores) {
|
|
630
|
+
for (const [dim, score] of Object.entries(result.scores)) {
|
|
631
|
+
if (!dimensionScores[profileName][dim]) {
|
|
632
|
+
dimensionScores[profileName][dim] = [];
|
|
633
|
+
}
|
|
634
|
+
if (typeof score === 'number') {
|
|
635
|
+
dimensionScores[profileName][dim].push(score);
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
} catch (e) {
|
|
640
|
+
const errorResult = {
|
|
641
|
+
success: false,
|
|
642
|
+
errorMessage: e.message,
|
|
643
|
+
scenarioId: scenario.id,
|
|
644
|
+
};
|
|
645
|
+
results[profileName].push(errorResult);
|
|
646
|
+
totalTests++;
|
|
647
|
+
|
|
648
|
+
// Save error to database
|
|
649
|
+
evaluationStore.storeResult(run.id, {
|
|
650
|
+
...errorResult,
|
|
651
|
+
scenarioName: scenario.name,
|
|
652
|
+
profileName,
|
|
653
|
+
provider: 'unknown',
|
|
654
|
+
model: 'unknown',
|
|
655
|
+
});
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
// Update run as completed
|
|
661
|
+
evaluationStore.updateRun(run.id, {
|
|
662
|
+
status: 'completed',
|
|
663
|
+
totalTests,
|
|
664
|
+
completedAt: new Date().toISOString(),
|
|
665
|
+
});
|
|
666
|
+
|
|
667
|
+
// Build dimension averages
|
|
668
|
+
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
669
|
+
const dimensionAverages = {};
|
|
670
|
+
for (const profile of validProfiles) {
|
|
671
|
+
dimensionAverages[profile] = {};
|
|
672
|
+
for (const dim of dimensions) {
|
|
673
|
+
const scores = dimensionScores[profile]?.[dim] || [];
|
|
674
|
+
dimensionAverages[profile][dim] = scores.length > 0
|
|
675
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
676
|
+
: null;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
// Build rankings
|
|
681
|
+
const rankings = validProfiles.map(profile => {
|
|
682
|
+
const profileResults = results[profile] || [];
|
|
683
|
+
const successCount = profileResults.filter(r => r.success !== false).length;
|
|
684
|
+
const scores = profileResults.filter(r => r.overallScore != null).map(r => r.overallScore);
|
|
685
|
+
const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
|
|
686
|
+
const latencies = profileResults.filter(r => r.latencyMs != null).map(r => r.latencyMs);
|
|
687
|
+
const avgLatency = latencies.length > 0 ? latencies.reduce((a, b) => a + b, 0) / latencies.length : null;
|
|
688
|
+
|
|
689
|
+
return {
|
|
690
|
+
profile,
|
|
691
|
+
tests: profileResults.length,
|
|
692
|
+
successes: successCount,
|
|
693
|
+
avgScore,
|
|
694
|
+
avgLatency,
|
|
695
|
+
};
|
|
696
|
+
}).sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
|
|
697
|
+
|
|
698
|
+
res.json({
|
|
699
|
+
success: true,
|
|
700
|
+
runId: run.id, // Include run ID so frontend can navigate to history
|
|
701
|
+
profiles: validProfiles,
|
|
702
|
+
invalidProfiles: invalidProfiles.length > 0 ? invalidProfiles : undefined,
|
|
703
|
+
scenariosRun: scenariosToRun.length,
|
|
704
|
+
dimensionAverages,
|
|
705
|
+
rankings,
|
|
706
|
+
results, // Full results for detailed analysis
|
|
707
|
+
});
|
|
708
|
+
} catch (error) {
|
|
709
|
+
console.error('[EvalRoutes] Matrix error:', error);
|
|
710
|
+
res.status(500).json({ error: 'Failed to run matrix comparison', details: error.message });
|
|
711
|
+
}
|
|
712
|
+
});
|
|
713
|
+
|
|
714
|
+
/**
|
|
715
|
+
* Run matrix comparison with SSE streaming for real-time logs
|
|
716
|
+
* GET /api/eval/stream/matrix
|
|
717
|
+
* Query params: profiles, scenarios, skipRubric
|
|
718
|
+
*/
|
|
719
|
+
router.get('/stream/matrix', async (req, res) => {
|
|
720
|
+
// Set up SSE
|
|
721
|
+
res.writeHead(200, {
|
|
722
|
+
'Content-Type': 'text/event-stream',
|
|
723
|
+
'Cache-Control': 'no-cache',
|
|
724
|
+
Connection: 'keep-alive',
|
|
725
|
+
});
|
|
726
|
+
|
|
727
|
+
const sendEvent = (type, data) => {
|
|
728
|
+
// Use named events for addEventListener compatibility
|
|
729
|
+
res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
|
|
730
|
+
};
|
|
731
|
+
|
|
732
|
+
// Keep-alive to prevent connection timeout
|
|
733
|
+
const keepAlive = setInterval(() => {
|
|
734
|
+
res.write(': keepalive\n\n');
|
|
735
|
+
}, 15000);
|
|
736
|
+
|
|
737
|
+
// Register stream for crash protection
|
|
738
|
+
const streamId = registerStream(res, keepAlive);
|
|
739
|
+
|
|
740
|
+
// Clean up on close
|
|
741
|
+
req.on('close', () => {
|
|
742
|
+
clearInterval(keepAlive);
|
|
743
|
+
unregisterStream(streamId);
|
|
744
|
+
});
|
|
745
|
+
|
|
746
|
+
try {
|
|
747
|
+
const profilesParam = req.query.profiles || '';
|
|
748
|
+
let profiles = profilesParam ? profilesParam.split(',') : [];
|
|
749
|
+
const scenariosParam = req.query.scenarios || 'all';
|
|
750
|
+
const scenarios = scenariosParam === 'all' ? 'all' : scenariosParam.split(',');
|
|
751
|
+
const skipRubric = req.query.skipRubric === 'true';
|
|
752
|
+
const outputSize = req.query.outputSize || 'normal';
|
|
753
|
+
|
|
754
|
+
// Get all available profiles
|
|
755
|
+
const allProfiles = tutorApi.listProfiles();
|
|
756
|
+
if (profiles.length === 0) {
|
|
757
|
+
profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
|
|
758
|
+
allProfiles.some(ap => ap.name === p)
|
|
759
|
+
);
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
// Validate profiles
|
|
763
|
+
const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
|
|
764
|
+
if (validProfiles.length === 0) {
|
|
765
|
+
sendEvent('error', { error: 'No valid profiles specified' });
|
|
766
|
+
return res.end();
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
// Get scenarios
|
|
770
|
+
const allScenarios = tutorApi.listScenarios();
|
|
771
|
+
const scenariosToRun = scenarios === 'all'
|
|
772
|
+
? allScenarios
|
|
773
|
+
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
774
|
+
|
|
775
|
+
const totalTests = validProfiles.length * scenariosToRun.length;
|
|
776
|
+
|
|
777
|
+
sendEvent('start', {
|
|
778
|
+
profiles: validProfiles,
|
|
779
|
+
scenarioCount: scenariosToRun.length,
|
|
780
|
+
totalTests,
|
|
781
|
+
skipRubric,
|
|
782
|
+
outputSize,
|
|
783
|
+
timestamp: new Date().toISOString(),
|
|
784
|
+
});
|
|
785
|
+
|
|
786
|
+
sendEvent('log', { message: `Starting matrix: ${validProfiles.length} profiles × ${scenariosToRun.length} scenarios = ${totalTests} tests`, level: 'info' });
|
|
787
|
+
sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });
|
|
788
|
+
|
|
789
|
+
// Create a run to persist results
|
|
790
|
+
const run = evaluationStore.createRun({
|
|
791
|
+
description: `${validProfiles.length} profiles × ${scenariosToRun.length} scenarios`,
|
|
792
|
+
totalScenarios: scenariosToRun.length,
|
|
793
|
+
totalConfigurations: validProfiles.length,
|
|
794
|
+
metadata: {
|
|
795
|
+
runType: 'matrix',
|
|
796
|
+
profiles: validProfiles,
|
|
797
|
+
scenarios: scenariosToRun.map(s => s.id),
|
|
798
|
+
scenarioNames: scenariosToRun.map(s => s.name),
|
|
799
|
+
skipRubric,
|
|
800
|
+
},
|
|
801
|
+
});
|
|
802
|
+
|
|
803
|
+
sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });
|
|
804
|
+
|
|
805
|
+
// Run evaluations
|
|
806
|
+
const results = {};
|
|
807
|
+
const dimensionScores = {};
|
|
808
|
+
let completedTests = 0;
|
|
809
|
+
|
|
810
|
+
for (const profileName of validProfiles) {
|
|
811
|
+
results[profileName] = [];
|
|
812
|
+
dimensionScores[profileName] = {};
|
|
813
|
+
|
|
814
|
+
sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });
|
|
815
|
+
|
|
816
|
+
for (const scenario of scenariosToRun) {
|
|
817
|
+
completedTests++;
|
|
818
|
+
|
|
819
|
+
sendEvent('progress', {
|
|
820
|
+
current: completedTests,
|
|
821
|
+
total: totalTests,
|
|
822
|
+
profile: profileName,
|
|
823
|
+
scenario: scenario.name,
|
|
824
|
+
percentage: Math.round((completedTests / totalTests) * 100),
|
|
825
|
+
});
|
|
826
|
+
|
|
827
|
+
sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });
|
|
828
|
+
|
|
829
|
+
try {
|
|
830
|
+
const config = { profileName, label: profileName };
|
|
831
|
+
|
|
832
|
+
// Create log callback for this test
|
|
833
|
+
const onLog = (message, level = 'info') => {
|
|
834
|
+
sendEvent('log', { message: ` ${message}`, level, timestamp: new Date().toISOString() });
|
|
835
|
+
};
|
|
836
|
+
|
|
837
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
838
|
+
scenarioId: scenario.id,
|
|
839
|
+
verbose: false,
|
|
840
|
+
skipRubricEval: skipRubric,
|
|
841
|
+
outputSize,
|
|
842
|
+
onLog,
|
|
843
|
+
});
|
|
844
|
+
|
|
845
|
+
results[profileName].push(result);
|
|
846
|
+
|
|
847
|
+
// Save result to database
|
|
848
|
+
evaluationStore.storeResult(run.id, {
|
|
849
|
+
...result,
|
|
850
|
+
scenarioId: scenario.id,
|
|
851
|
+
scenarioName: scenario.name,
|
|
852
|
+
profileName,
|
|
853
|
+
});
|
|
854
|
+
|
|
855
|
+
// Collect dimension scores
|
|
856
|
+
if (result.scores) {
|
|
857
|
+
for (const [dim, score] of Object.entries(result.scores)) {
|
|
858
|
+
if (!dimensionScores[profileName][dim]) {
|
|
859
|
+
dimensionScores[profileName][dim] = [];
|
|
860
|
+
}
|
|
861
|
+
if (typeof score === 'number') {
|
|
862
|
+
dimensionScores[profileName][dim].push(score);
|
|
863
|
+
}
|
|
864
|
+
}
|
|
865
|
+
}
|
|
866
|
+
|
|
867
|
+
const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
|
|
868
|
+
const status = result.success !== false ? '✓' : '✗';
|
|
869
|
+
sendEvent('log', { message: ` ${status} Score: ${scoreStr} (${result.latencyMs}ms)`, level: result.success !== false ? 'success' : 'warning' });
|
|
870
|
+
|
|
871
|
+
sendEvent('result', {
|
|
872
|
+
profile: profileName,
|
|
873
|
+
scenarioId: scenario.id,
|
|
874
|
+
scenarioName: scenario.name,
|
|
875
|
+
passed: result.success !== false,
|
|
876
|
+
score: result.overallScore,
|
|
877
|
+
latencyMs: result.latencyMs,
|
|
878
|
+
inputTokens: result.inputTokens,
|
|
879
|
+
outputTokens: result.outputTokens,
|
|
880
|
+
});
|
|
881
|
+
|
|
882
|
+
} catch (e) {
|
|
883
|
+
sendEvent('log', { message: ` ✗ Error: ${e.message}`, level: 'error' });
|
|
884
|
+
|
|
885
|
+
const errorResult = {
|
|
886
|
+
success: false,
|
|
887
|
+
errorMessage: e.message,
|
|
888
|
+
scenarioId: scenario.id,
|
|
889
|
+
};
|
|
890
|
+
results[profileName].push(errorResult);
|
|
891
|
+
|
|
892
|
+
evaluationStore.storeResult(run.id, {
|
|
893
|
+
...errorResult,
|
|
894
|
+
scenarioName: scenario.name,
|
|
895
|
+
profileName,
|
|
896
|
+
provider: 'unknown',
|
|
897
|
+
model: 'unknown',
|
|
898
|
+
});
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
// Update run as completed
|
|
904
|
+
evaluationStore.updateRun(run.id, {
|
|
905
|
+
status: 'completed',
|
|
906
|
+
totalTests: completedTests,
|
|
907
|
+
completedAt: new Date().toISOString(),
|
|
908
|
+
});
|
|
909
|
+
|
|
910
|
+
// Build dimension averages
|
|
911
|
+
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
912
|
+
const dimensionAverages = {};
|
|
913
|
+
for (const profile of validProfiles) {
|
|
914
|
+
dimensionAverages[profile] = {};
|
|
915
|
+
for (const dim of dimensions) {
|
|
916
|
+
const scores = dimensionScores[profile]?.[dim] || [];
|
|
917
|
+
dimensionAverages[profile][dim] = scores.length > 0
|
|
918
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
919
|
+
: null;
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
// Build rankings
|
|
924
|
+
const rankings = validProfiles.map(profile => {
|
|
925
|
+
const profileResults = results[profile] || [];
|
|
926
|
+
const successCount = profileResults.filter(r => r.success !== false).length;
|
|
927
|
+
const scores = profileResults.filter(r => r.overallScore != null).map(r => r.overallScore);
|
|
928
|
+
const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
|
|
929
|
+
const latencies = profileResults.filter(r => r.latencyMs != null).map(r => r.latencyMs);
|
|
930
|
+
const avgLatency = latencies.length > 0 ? latencies.reduce((a, b) => a + b, 0) / latencies.length : null;
|
|
931
|
+
|
|
932
|
+
return {
|
|
933
|
+
profile,
|
|
934
|
+
tests: profileResults.length,
|
|
935
|
+
successes: successCount,
|
|
936
|
+
avgScore,
|
|
937
|
+
avgLatency,
|
|
938
|
+
};
|
|
939
|
+
}).sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
|
|
940
|
+
|
|
941
|
+
sendEvent('log', { message: `\n=== Matrix Complete ===`, level: 'success' });
|
|
942
|
+
sendEvent('log', { message: `Total tests: ${completedTests}`, level: 'info' });
|
|
943
|
+
|
|
944
|
+
// Send final complete event with full results
|
|
945
|
+
sendEvent('complete', {
|
|
946
|
+
success: true,
|
|
947
|
+
runId: run.id,
|
|
948
|
+
profiles: validProfiles,
|
|
949
|
+
scenariosRun: scenariosToRun.length,
|
|
950
|
+
dimensionAverages,
|
|
951
|
+
rankings,
|
|
952
|
+
results,
|
|
953
|
+
});
|
|
954
|
+
|
|
955
|
+
unregisterStream(streamId);
|
|
956
|
+
res.end();
|
|
957
|
+
} catch (error) {
|
|
958
|
+
sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
|
|
959
|
+
sendEvent('error', { error: error.message });
|
|
960
|
+
unregisterStream(streamId);
|
|
961
|
+
res.end();
|
|
962
|
+
}
|
|
963
|
+
});
|
|
964
|
+
|
|
965
|
+
/**
|
|
966
|
+
* Run learner-tutor interaction evaluation with SSE streaming
|
|
967
|
+
* GET /api/eval/stream/interact
|
|
968
|
+
* Query params:
|
|
969
|
+
* - persona: learner persona ID (default: confused_novice)
|
|
970
|
+
* - profile: tutor profile name (default: budget)
|
|
971
|
+
* - turns: number of dialogue turns (default: 5)
|
|
972
|
+
* - dialogueEnabled: whether tutor uses multi-agent dialogue (default: true)
|
|
973
|
+
* - topic: topic for discussion (default: "Hegel's concept of recognition")
|
|
974
|
+
* - runJudge: whether to run AI judge evaluation (default: true)
|
|
975
|
+
*/
|
|
976
|
+
router.get('/stream/interact', async (req, res) => {
|
|
977
|
+
// Set up SSE
|
|
978
|
+
res.writeHead(200, {
|
|
979
|
+
'Content-Type': 'text/event-stream',
|
|
980
|
+
'Cache-Control': 'no-cache',
|
|
981
|
+
Connection: 'keep-alive',
|
|
982
|
+
});
|
|
983
|
+
|
|
984
|
+
const sendEvent = (type, data) => {
|
|
985
|
+
res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
|
|
986
|
+
};
|
|
987
|
+
|
|
988
|
+
// Keep-alive to prevent connection timeout
|
|
989
|
+
const keepAlive = setInterval(() => {
|
|
990
|
+
res.write(': keepalive\n\n');
|
|
991
|
+
}, 15000);
|
|
992
|
+
|
|
993
|
+
// Register stream for crash protection (interaction evals can take a while)
|
|
994
|
+
const streamId = registerStream(res, keepAlive, { maxDuration: 30 * 60 * 1000 }); // 30 min timeout
|
|
995
|
+
|
|
996
|
+
// Clean up on close
|
|
997
|
+
req.on('close', () => {
|
|
998
|
+
clearInterval(keepAlive);
|
|
999
|
+
unregisterStream(streamId);
|
|
1000
|
+
});
|
|
1001
|
+
|
|
1002
|
+
try {
|
|
1003
|
+
const persona = req.query.persona || 'confused_novice';
|
|
1004
|
+
const tutorProfile = req.query.profile || 'budget';
|
|
1005
|
+
const maxTurns = parseInt(req.query.turns) || 5;
|
|
1006
|
+
const dialogueEnabled = req.query.dialogueEnabled !== 'false';
|
|
1007
|
+
const topic = req.query.topic || "Hegel's concept of recognition";
|
|
1008
|
+
const runJudge = req.query.runJudge !== 'false';
|
|
1009
|
+
|
|
1010
|
+
sendEvent('start', {
|
|
1011
|
+
persona,
|
|
1012
|
+
tutorProfile,
|
|
1013
|
+
maxTurns,
|
|
1014
|
+
dialogueEnabled,
|
|
1015
|
+
topic,
|
|
1016
|
+
runJudge,
|
|
1017
|
+
timestamp: new Date().toISOString(),
|
|
1018
|
+
});
|
|
1019
|
+
|
|
1020
|
+
sendEvent('log', { message: `Starting interaction evaluation`, level: 'info' });
|
|
1021
|
+
sendEvent('log', { message: `Learner persona: ${persona}`, level: 'info' });
|
|
1022
|
+
sendEvent('log', { message: `Tutor profile: ${tutorProfile}`, level: 'info' });
|
|
1023
|
+
sendEvent('log', { message: `Max turns: ${maxTurns}`, level: 'info' });
|
|
1024
|
+
sendEvent('log', { message: `Dialogue enabled: ${dialogueEnabled}`, level: 'info' });
|
|
1025
|
+
sendEvent('log', { message: `Topic: ${topic}`, level: 'info' });
|
|
1026
|
+
|
|
1027
|
+
// Set up LLM call function using available providers
|
|
1028
|
+
let llmClient = null;
|
|
1029
|
+
let llmProvider = null;
|
|
1030
|
+
|
|
1031
|
+
// Try providers in order of preference
|
|
1032
|
+
const openrouterKey = getApiKey('openrouter');
|
|
1033
|
+
const geminiKey = getApiKey('gemini');
|
|
1034
|
+
const anthropicKey = getApiKey('claude');
|
|
1035
|
+
const openaiKey = getApiKey('openai');
|
|
1036
|
+
|
|
1037
|
+
if (openrouterKey) {
|
|
1038
|
+
llmProvider = 'openrouter';
|
|
1039
|
+
const OpenAI = (await import('openai')).default;
|
|
1040
|
+
llmClient = new OpenAI({
|
|
1041
|
+
apiKey: openrouterKey,
|
|
1042
|
+
baseURL: 'https://openrouter.ai/api/v1',
|
|
1043
|
+
});
|
|
1044
|
+
sendEvent('log', { message: `Using OpenRouter for LLM calls`, level: 'info' });
|
|
1045
|
+
} else if (geminiKey) {
|
|
1046
|
+
llmProvider = 'gemini';
|
|
1047
|
+
const { GoogleGenAI } = await import('@google/genai');
|
|
1048
|
+
llmClient = new GoogleGenAI({ apiKey: geminiKey });
|
|
1049
|
+
sendEvent('log', { message: `Using Gemini for LLM calls`, level: 'info' });
|
|
1050
|
+
} else if (anthropicKey) {
|
|
1051
|
+
llmProvider = 'anthropic';
|
|
1052
|
+
const Anthropic = (await import('@anthropic-ai/sdk')).default;
|
|
1053
|
+
llmClient = new Anthropic({ apiKey: anthropicKey });
|
|
1054
|
+
sendEvent('log', { message: `Using Anthropic for LLM calls`, level: 'info' });
|
|
1055
|
+
} else if (openaiKey) {
|
|
1056
|
+
llmProvider = 'openai';
|
|
1057
|
+
const OpenAI = (await import('openai')).default;
|
|
1058
|
+
llmClient = new OpenAI({ apiKey: openaiKey });
|
|
1059
|
+
sendEvent('log', { message: `Using OpenAI for LLM calls`, level: 'info' });
|
|
1060
|
+
} else {
|
|
1061
|
+
throw new Error('No LLM API key configured. Set OPENROUTER_API_KEY, GEMINI_API_KEY, ANTHROPIC_API_KEY, or OPENAI_API_KEY.');
|
|
1062
|
+
}
|
|
1063
|
+
|
|
1064
|
+
// Create the llmCall function matching the expected signature
|
|
1065
|
+
const llmCall = async (requestedModel, systemPrompt, messages, options = {}) => {
|
|
1066
|
+
const { temperature = 0.7, maxTokens = 1000 } = options;
|
|
1067
|
+
const model = requestedModel || getDefaultModel(llmProvider === 'anthropic' ? 'claude' : llmProvider) || 'deepseek/deepseek-chat';
|
|
1068
|
+
|
|
1069
|
+
try {
|
|
1070
|
+
if (llmProvider === 'openrouter') {
|
|
1071
|
+
const response = await llmClient.chat.completions.create({
|
|
1072
|
+
model,
|
|
1073
|
+
temperature,
|
|
1074
|
+
max_tokens: maxTokens,
|
|
1075
|
+
messages: [
|
|
1076
|
+
{ role: 'system', content: systemPrompt },
|
|
1077
|
+
...messages.map(m => ({
|
|
1078
|
+
role: m.role === 'user' ? 'user' : 'assistant',
|
|
1079
|
+
content: m.content,
|
|
1080
|
+
})),
|
|
1081
|
+
],
|
|
1082
|
+
});
|
|
1083
|
+
return {
|
|
1084
|
+
content: response.choices[0]?.message?.content || '',
|
|
1085
|
+
usage: {
|
|
1086
|
+
inputTokens: response.usage?.prompt_tokens || 0,
|
|
1087
|
+
outputTokens: response.usage?.completion_tokens || 0,
|
|
1088
|
+
},
|
|
1089
|
+
};
|
|
1090
|
+
} else if (llmProvider === 'gemini') {
|
|
1091
|
+
const userMessages = messages.map(m => m.content).join('\n\n');
|
|
1092
|
+
const response = await llmClient.models.generateContent({
|
|
1093
|
+
model,
|
|
1094
|
+
contents: [{ role: 'user', parts: [{ text: `${systemPrompt}\n\n${userMessages}` }] }],
|
|
1095
|
+
generationConfig: { temperature, maxOutputTokens: maxTokens },
|
|
1096
|
+
});
|
|
1097
|
+
const text = response.text || response.candidates?.[0]?.content?.parts?.[0]?.text || '';
|
|
1098
|
+
return {
|
|
1099
|
+
content: text,
|
|
1100
|
+
usage: {
|
|
1101
|
+
inputTokens: Math.ceil((systemPrompt.length + userMessages.length) / 4),
|
|
1102
|
+
outputTokens: Math.ceil(text.length / 4),
|
|
1103
|
+
},
|
|
1104
|
+
};
|
|
1105
|
+
} else if (llmProvider === 'anthropic') {
|
|
1106
|
+
const response = await llmClient.messages.create({
|
|
1107
|
+
model: model || 'claude-3-5-haiku-20241022',
|
|
1108
|
+
max_tokens: maxTokens,
|
|
1109
|
+
system: systemPrompt,
|
|
1110
|
+
messages: messages.map(m => ({
|
|
1111
|
+
role: m.role === 'user' ? 'user' : 'assistant',
|
|
1112
|
+
content: m.content,
|
|
1113
|
+
})),
|
|
1114
|
+
});
|
|
1115
|
+
return {
|
|
1116
|
+
content: response.content[0]?.text || '',
|
|
1117
|
+
usage: {
|
|
1118
|
+
inputTokens: response.usage?.input_tokens || 0,
|
|
1119
|
+
outputTokens: response.usage?.output_tokens || 0,
|
|
1120
|
+
},
|
|
1121
|
+
};
|
|
1122
|
+
} else if (llmProvider === 'openai') {
|
|
1123
|
+
const response = await llmClient.chat.completions.create({
|
|
1124
|
+
model: model || 'gpt-4o-mini',
|
|
1125
|
+
temperature,
|
|
1126
|
+
max_tokens: maxTokens,
|
|
1127
|
+
messages: [
|
|
1128
|
+
{ role: 'system', content: systemPrompt },
|
|
1129
|
+
...messages.map(m => ({
|
|
1130
|
+
role: m.role === 'user' ? 'user' : 'assistant',
|
|
1131
|
+
content: m.content,
|
|
1132
|
+
})),
|
|
1133
|
+
],
|
|
1134
|
+
});
|
|
1135
|
+
return {
|
|
1136
|
+
content: response.choices[0]?.message?.content || '',
|
|
1137
|
+
usage: {
|
|
1138
|
+
inputTokens: response.usage?.prompt_tokens || 0,
|
|
1139
|
+
outputTokens: response.usage?.completion_tokens || 0,
|
|
1140
|
+
},
|
|
1141
|
+
};
|
|
1142
|
+
}
|
|
1143
|
+
} catch (error) {
|
|
1144
|
+
console.error(`[InteractStream] LLM call error:`, error.message);
|
|
1145
|
+
throw error;
|
|
1146
|
+
}
|
|
1147
|
+
};
|
|
1148
|
+
|
|
1149
|
+
// Generate unique learner ID for this eval
|
|
1150
|
+
const learnerId = `eval-learner-${persona}-${Date.now()}`;
|
|
1151
|
+
const evalId = `short-interact-${Date.now()}`;
|
|
1152
|
+
const sessionId = `session-${Date.now()}`;
|
|
1153
|
+
|
|
1154
|
+
sendEvent('log', { message: `Eval ID: ${evalId}`, level: 'info' });
|
|
1155
|
+
sendEvent('progress', { stage: 'setup', message: 'Initializing interaction' });
|
|
1156
|
+
|
|
1157
|
+
// Run the interaction
|
|
1158
|
+
sendEvent('log', { message: `\nStarting ${maxTurns}-turn interaction...`, level: 'info' });
|
|
1159
|
+
sendEvent('progress', { stage: 'interaction', message: 'Running learner-tutor dialogue' });
|
|
1160
|
+
|
|
1161
|
+
const interactionTrace = await interactionEngine.runInteraction(
|
|
1162
|
+
{
|
|
1163
|
+
learnerId,
|
|
1164
|
+
personaId: persona,
|
|
1165
|
+
tutorProfile,
|
|
1166
|
+
learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
|
|
1167
|
+
topic,
|
|
1168
|
+
scenario: {
|
|
1169
|
+
name: `Interactive Evaluation - ${persona}`,
|
|
1170
|
+
},
|
|
1171
|
+
sessionId,
|
|
1172
|
+
},
|
|
1173
|
+
llmCall,
|
|
1174
|
+
{
|
|
1175
|
+
maxTurns,
|
|
1176
|
+
trace: true,
|
|
1177
|
+
observeInternals: true,
|
|
1178
|
+
}
|
|
1179
|
+
);
|
|
1180
|
+
|
|
1181
|
+
sendEvent('log', { message: `Interaction completed: ${interactionTrace.turns.length} turns`, level: 'success' });
|
|
1182
|
+
|
|
1183
|
+
// Generate sequence diagram and transcript
|
|
1184
|
+
const generateSequenceDiagram = (trace) => {
|
|
1185
|
+
const lines = ['sequenceDiagram'];
|
|
1186
|
+
lines.push(' participant L as Learner');
|
|
1187
|
+
lines.push(' participant T as Tutor');
|
|
1188
|
+
|
|
1189
|
+
for (const turn of trace.turns || []) {
|
|
1190
|
+
const speaker = turn.phase === 'learner' ? 'L' : 'T';
|
|
1191
|
+
const target = turn.phase === 'learner' ? 'T' : 'L';
|
|
1192
|
+
const msg = (turn.externalMessage || '').slice(0, 50).replace(/"/g, "'").replace(/\n/g, ' ');
|
|
1193
|
+
lines.push(` ${speaker}->>+${target}: ${msg}${msg.length >= 50 ? '...' : ''}`);
|
|
1194
|
+
}
|
|
1195
|
+
|
|
1196
|
+
return lines.join('\n');
|
|
1197
|
+
};
|
|
1198
|
+
|
|
1199
|
+
const generateTranscript = (trace) => {
|
|
1200
|
+
const lines = [];
|
|
1201
|
+
for (const turn of trace.turns || []) {
|
|
1202
|
+
const speaker = turn.phase === 'learner' ? 'LEARNER' : 'TUTOR';
|
|
1203
|
+
lines.push(`[Turn ${turn.turnNumber}] ${speaker}:`);
|
|
1204
|
+
lines.push(turn.externalMessage || '');
|
|
1205
|
+
lines.push('');
|
|
1206
|
+
}
|
|
1207
|
+
return lines.join('\n');
|
|
1208
|
+
};
|
|
1209
|
+
|
|
1210
|
+
// Compile result
|
|
1211
|
+
const result = {
|
|
1212
|
+
evalId,
|
|
1213
|
+
scenarioId: `interact-${persona}`,
|
|
1214
|
+
scenarioName: `Interactive Evaluation - ${persona}`,
|
|
1215
|
+
type: 'short_term',
|
|
1216
|
+
learnerId,
|
|
1217
|
+
personaId: persona,
|
|
1218
|
+
tutorProfile,
|
|
1219
|
+
learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
|
|
1220
|
+
learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
|
|
1221
|
+
topic,
|
|
1222
|
+
interaction: interactionTrace,
|
|
1223
|
+
turnCount: interactionTrace.turns.length,
|
|
1224
|
+
turns: interactionTrace.turns,
|
|
1225
|
+
sequenceDiagram: generateSequenceDiagram(interactionTrace),
|
|
1226
|
+
formattedTranscript: generateTranscript(interactionTrace),
|
|
1227
|
+
skipJudge: !runJudge,
|
|
1228
|
+
metrics: {
|
|
1229
|
+
turnCount: interactionTrace.turns.length,
|
|
1230
|
+
totalTokens: (interactionTrace.metrics?.learnerInputTokens || 0) +
|
|
1231
|
+
(interactionTrace.metrics?.learnerOutputTokens || 0) +
|
|
1232
|
+
(interactionTrace.metrics?.tutorInputTokens || 0) +
|
|
1233
|
+
(interactionTrace.metrics?.tutorOutputTokens || 0),
|
|
1234
|
+
learnerTokens: (interactionTrace.metrics?.learnerInputTokens || 0) +
|
|
1235
|
+
(interactionTrace.metrics?.learnerOutputTokens || 0),
|
|
1236
|
+
tutorTokens: (interactionTrace.metrics?.tutorInputTokens || 0) +
|
|
1237
|
+
(interactionTrace.metrics?.tutorOutputTokens || 0),
|
|
1238
|
+
totalLatencyMs: interactionTrace.metrics?.totalLatencyMs || 0,
|
|
1239
|
+
},
|
|
1240
|
+
timestamp: new Date().toISOString(),
|
|
1241
|
+
};
|
|
1242
|
+
|
|
1243
|
+
// Store in database
|
|
1244
|
+
sendEvent('progress', { stage: 'storing', message: 'Saving results' });
|
|
1245
|
+
|
|
1246
|
+
// First create a run entry so it appears in History with "Interact" filter
|
|
1247
|
+
let runId = null;
|
|
1248
|
+
try {
|
|
1249
|
+
const runData = evaluationStore.createRun({
|
|
1250
|
+
description: `Interact: ${persona} → ${tutorProfile}`,
|
|
1251
|
+
totalScenarios: 1,
|
|
1252
|
+
metadata: {
|
|
1253
|
+
runType: 'interaction',
|
|
1254
|
+
profiles: [tutorProfile],
|
|
1255
|
+
personaId: persona,
|
|
1256
|
+
learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
|
|
1257
|
+
topic,
|
|
1258
|
+
fastMode: !runJudge,
|
|
1259
|
+
},
|
|
1260
|
+
});
|
|
1261
|
+
runId = runData.id;
|
|
1262
|
+
sendEvent('log', { message: `Created run entry: ${runId}`, level: 'info' });
|
|
1263
|
+
} catch (e) {
|
|
1264
|
+
sendEvent('log', { message: `Run entry warning: ${e.message}`, level: 'warning' });
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
// Now store the interaction evaluation details
|
|
1268
|
+
try {
|
|
1269
|
+
result.runId = runId;
|
|
1270
|
+
evaluationStore.storeInteractionEval(result);
|
|
1271
|
+
sendEvent('log', { message: `Stored in database: ${evalId}`, level: 'success' });
|
|
1272
|
+
|
|
1273
|
+
// Mark the run as completed (don't use completeRun which checks evaluation_results table)
|
|
1274
|
+
if (runId) {
|
|
1275
|
+
evaluationStore.updateRun(runId, {
|
|
1276
|
+
status: 'completed',
|
|
1277
|
+
totalTests: result.metrics?.turnCount || 1,
|
|
1278
|
+
completedAt: new Date().toISOString(),
|
|
1279
|
+
});
|
|
1280
|
+
}
|
|
1281
|
+
} catch (e) {
|
|
1282
|
+
sendEvent('log', { message: `Database storage warning: ${e.message}`, level: 'warning' });
|
|
1283
|
+
}
|
|
1284
|
+
|
|
1285
|
+
// Send turn-by-turn summary
|
|
1286
|
+
for (let i = 0; i < interactionTrace.turns.length; i++) {
|
|
1287
|
+
const turn = interactionTrace.turns[i];
|
|
1288
|
+
sendEvent('turn', {
|
|
1289
|
+
turnNumber: turn.turnNumber,
|
|
1290
|
+
phase: turn.phase,
|
|
1291
|
+
message: turn.externalMessage?.slice(0, 100) + (turn.externalMessage?.length > 100 ? '...' : ''),
|
|
1292
|
+
});
|
|
1293
|
+
}
|
|
1294
|
+
|
|
1295
|
+
sendEvent('log', { message: `\n=== Interaction Complete ===`, level: 'success' });
|
|
1296
|
+
sendEvent('log', { message: `Total turns: ${result.metrics.turnCount}`, level: 'info' });
|
|
1297
|
+
sendEvent('log', { message: `Total tokens: ${result.metrics.totalTokens}`, level: 'info' });
|
|
1298
|
+
|
|
1299
|
+
// Send final result
|
|
1300
|
+
sendEvent('result', {
|
|
1301
|
+
evalId: result.evalId,
|
|
1302
|
+
scenarioName: result.scenarioName,
|
|
1303
|
+
persona: result.personaId,
|
|
1304
|
+
tutorProfile: result.tutorProfile,
|
|
1305
|
+
learnerArchitecture: result.learnerArchitecture,
|
|
1306
|
+
turnCount: result.metrics.turnCount,
|
|
1307
|
+
totalTokens: result.metrics.totalTokens,
|
|
1308
|
+
learnerTokens: result.metrics.learnerTokens,
|
|
1309
|
+
tutorTokens: result.metrics.tutorTokens,
|
|
1310
|
+
latencyMs: result.metrics.totalLatencyMs,
|
|
1311
|
+
passed: true, // No judge score yet
|
|
1312
|
+
overallScore: null,
|
|
1313
|
+
});
|
|
1314
|
+
|
|
1315
|
+
sendEvent('complete', {
|
|
1316
|
+
success: true,
|
|
1317
|
+
evalId: result.evalId,
|
|
1318
|
+
});
|
|
1319
|
+
|
|
1320
|
+
unregisterStream(streamId);
|
|
1321
|
+
res.end();
|
|
1322
|
+
} catch (error) {
|
|
1323
|
+
console.error('[InteractStream] Error:', error);
|
|
1324
|
+
sendEvent('log', { message: `Error: ${error.message}`, level: 'error' });
|
|
1325
|
+
sendEvent('error', { error: error.message });
|
|
1326
|
+
unregisterStream(streamId);
|
|
1327
|
+
res.end();
|
|
1328
|
+
}
|
|
1329
|
+
});
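// Hedged usage sketch, not wired up anywhere: a client for the interact streaming
// route above. The URL is an assumption (the route path and its query parameters
// are registered earlier in this file); the event names ('log', 'progress', 'turn',
// 'result', 'complete', 'error') match the sendEvent calls in the handler.
// EventSource is a browser global (Node needs a recent version or a polyfill).
function exampleInteractStreamClient() {
  const es = new EventSource('/api/eval/stream/interact'); // path assumed
  es.addEventListener('log', (e) => console.log(JSON.parse(e.data).message));
  es.addEventListener('turn', (e) => {
    const { turnNumber, phase, message } = JSON.parse(e.data);
    console.log(`[turn ${turnNumber}] ${phase}: ${message}`);
  });
  es.addEventListener('result', (e) => console.log('result', JSON.parse(e.data)));
  es.addEventListener('complete', () => es.close());
  es.addEventListener('error', () => es.close());
}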
|
|
1330
|
+
|
|
1331
|
+
// ============================================================================
|
|
1332
|
+
// Results Endpoints
|
|
1333
|
+
// ============================================================================
|
|
1334
|
+
|
|
1335
|
+
/**
|
|
1336
|
+
* List previous evaluation runs
|
|
1337
|
+
* GET /api/eval/runs
|
|
1338
|
+
* Query params: limit (default 20)
|
|
1339
|
+
*/
|
|
1340
|
+
router.get('/runs', (req, res) => {
|
|
1341
|
+
try {
|
|
1342
|
+
const limit = parseInt(req.query.limit) || 20;
|
|
1343
|
+
const runs = evaluationStore.listRuns({ limit });
|
|
1344
|
+
|
|
1345
|
+
// Also include interaction evals in the runs list
|
|
1346
|
+
const interactionEvals = evaluationStore.listInteractionEvals({ limit });
|
|
1347
|
+
const interactionRuns = interactionEvals.map(e => ({
|
|
1348
|
+
id: e.evalId,
|
|
1349
|
+
description: e.scenarioName || 'Interaction Evaluation',
|
|
1350
|
+
status: 'completed',
|
|
1351
|
+
createdAt: e.createdAt,
|
|
1352
|
+
totalScenarios: 1,
|
|
1353
|
+
totalTests: e.turnCount || 1,
|
|
1354
|
+
type: 'interaction',
|
|
1355
|
+
metadata: JSON.stringify({
|
|
1356
|
+
runType: 'interaction',
|
|
1357
|
+
profiles: [e.tutorProfile || 'default'],
|
|
1358
|
+
scenarioNames: [e.scenarioName],
|
|
1359
|
+
learnerProfile: e.learnerProfile,
|
|
1360
|
+
personaId: e.personaId,
|
|
1361
|
+
}),
|
|
1362
|
+
}));
|
|
1363
|
+
|
|
1364
|
+
// Merge and sort by createdAt descending
|
|
1365
|
+
const allRuns = [...runs, ...interactionRuns].sort((a, b) =>
|
|
1366
|
+
new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime()
|
|
1367
|
+
).slice(0, limit);
|
|
1368
|
+
|
|
1369
|
+
res.json({ success: true, runs: allRuns });
|
|
1370
|
+
} catch (error) {
|
|
1371
|
+
console.error('[EvalRoutes] List runs error:', error);
|
|
1372
|
+
res.status(500).json({ error: 'Failed to list runs' });
|
|
1373
|
+
}
|
|
1374
|
+
});
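// Hedged usage sketch, not wired up anywhere: listing recent runs. The path follows
// the JSDoc above (the router is assumed to be mounted under /api/eval); the fields
// printed follow the interaction-run mapping in this handler, and regular runs are
// assumed to carry the same basic fields.
async function exampleListRuns() {
  const { runs } = await fetch('/api/eval/runs?limit=10').then(r => r.json());
  for (const run of runs) {
    console.log(run.createdAt, run.id, run.status, run.description);
  }
}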
|
|
1375
|
+
|
|
1376
|
+
/**
|
|
1377
|
+
* Find incomplete (stuck) evaluation runs
|
|
1378
|
+
* GET /api/eval/runs-incomplete
|
|
1379
|
+
* Query params: olderThanMinutes (default 30)
|
|
1380
|
+
*/
|
|
1381
|
+
router.get('/runs-incomplete', (req, res) => {
|
|
1382
|
+
try {
|
|
1383
|
+
const olderThanMinutes = parseInt(req.query.olderThanMinutes) || 30;
|
|
1384
|
+
const runs = evaluationStore.findIncompleteRuns({ olderThanMinutes });
|
|
1385
|
+
res.json({ success: true, runs, found: runs.length });
|
|
1386
|
+
} catch (error) {
|
|
1387
|
+
console.error('[EvalRoutes] Find incomplete runs error:', error);
|
|
1388
|
+
res.status(500).json({ error: 'Failed to find incomplete runs' });
|
|
1389
|
+
}
|
|
1390
|
+
});
|
|
1391
|
+
|
|
1392
|
+
/**
|
|
1393
|
+
* Auto-complete all stale runs
|
|
1394
|
+
* POST /api/eval/runs-auto-complete
|
|
1395
|
+
* Body: { olderThanMinutes: 30, dryRun: false }
|
|
1396
|
+
*/
|
|
1397
|
+
router.post('/runs-auto-complete', (req, res) => {
|
|
1398
|
+
try {
|
|
1399
|
+
const { olderThanMinutes = 30, dryRun = false } = req.body;
|
|
1400
|
+
const result = evaluationStore.autoCompleteStaleRuns({ olderThanMinutes, dryRun });
|
|
1401
|
+
res.json({ success: true, ...result });
|
|
1402
|
+
} catch (error) {
|
|
1403
|
+
console.error('[EvalRoutes] Auto-complete runs error:', error);
|
|
1404
|
+
res.status(500).json({ error: 'Failed to auto-complete runs', details: error.message });
|
|
1405
|
+
}
|
|
1406
|
+
});
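// Hedged usage sketch: previewing stale-run cleanup via the endpoint above before
// applying it. Body fields (olderThanMinutes, dryRun) come from the handler; the
// shape of the spread result depends on evaluationStore.autoCompleteStaleRuns and
// is not assumed here.
async function exampleAutoCompleteStaleRuns() {
  const preview = await fetch('/api/eval/runs-auto-complete', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ olderThanMinutes: 60, dryRun: true }),
  }).then(r => r.json());
  console.log('dry run result:', preview);
}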
|
|
1407
|
+
|
|
1408
|
+
/**
|
|
1409
|
+
* Get results for a specific run
|
|
1410
|
+
* GET /api/eval/runs/:runId
|
|
1411
|
+
*/
|
|
1412
|
+
router.get('/runs/:runId', (req, res) => {
|
|
1413
|
+
try {
|
|
1414
|
+
const { runId } = req.params;
|
|
1415
|
+
|
|
1416
|
+
// Check if this is an interaction eval
|
|
1417
|
+
if (runId.startsWith('short-') || runId.startsWith('long-')) {
|
|
1418
|
+
const evalData = evaluationStore.getInteractionEval(runId);
|
|
1419
|
+
if (!evalData) {
|
|
1420
|
+
return res.status(404).json({ error: 'Interaction evaluation not found' });
|
|
1421
|
+
}
|
|
1422
|
+
|
|
1423
|
+
// Format as a run with results for the existing frontend
|
|
1424
|
+
return res.json({
|
|
1425
|
+
success: true,
|
|
1426
|
+
type: 'interaction',
|
|
1427
|
+
run: {
|
|
1428
|
+
id: evalData.evalId,
|
|
1429
|
+
description: evalData.scenarioName || 'Interaction Evaluation',
|
|
1430
|
+
status: 'completed',
|
|
1431
|
+
createdAt: evalData.createdAt,
|
|
1432
|
+
},
|
|
1433
|
+
stats: {
|
|
1434
|
+
totalTests: 1,
|
|
1435
|
+
avgScore: evalData.judgeOverallScore,
|
|
1436
|
+
},
|
|
1437
|
+
results: [{
|
|
1438
|
+
scenarioId: evalData.scenarioId,
|
|
1439
|
+
scenarioName: evalData.scenarioName,
|
|
1440
|
+
profileName: evalData.tutorProfile || 'default',
|
|
1441
|
+
tutorProfile: evalData.tutorProfile || 'default',
|
|
1442
|
+
model: `${evalData.turnCount} turns`,
|
|
1443
|
+
passed: evalData.judgeOverallScore >= 3,
|
|
1444
|
+
overallScore: evalData.judgeOverallScore,
|
|
1445
|
+
overall_score: evalData.judgeOverallScore,
|
|
1446
|
+
inputTokens: evalData.learnerTokens || 0,
|
|
1447
|
+
outputTokens: evalData.tutorTokens || 0,
|
|
1448
|
+
latencyMs: evalData.latencyMs || 0,
|
|
1449
|
+
latency_ms: evalData.latencyMs || 0,
|
|
1450
|
+
isInteraction: true,
|
|
1451
|
+
interactionEvalId: evalData.evalId,
|
|
1452
|
+
// dialogueId links to the dialogue log viewer
|
|
1453
|
+
dialogueId: evalData.evalId,
|
|
1454
|
+
// Include judgeEvaluation for dimension score extraction in History tab
|
|
1455
|
+
judgeEvaluation: evalData.judgeEvaluation,
|
|
1456
|
+
}],
|
|
1457
|
+
// Include full interaction data for display
|
|
1458
|
+
interaction: {
|
|
1459
|
+
evalId: evalData.evalId,
|
|
1460
|
+
scenarioName: evalData.scenarioName,
|
|
1461
|
+
turnCount: evalData.turnCount,
|
|
1462
|
+
turns: evalData.turns,
|
|
1463
|
+
sequenceDiagram: evalData.sequenceDiagram,
|
|
1464
|
+
formattedTranscript: evalData.formattedTranscript,
|
|
1465
|
+
totalTokens: evalData.totalTokens,
|
|
1466
|
+
learnerTokens: evalData.learnerTokens,
|
|
1467
|
+
tutorTokens: evalData.tutorTokens,
|
|
1468
|
+
latencyMs: evalData.latencyMs,
|
|
1469
|
+
judgeOverallScore: evalData.judgeOverallScore,
|
|
1470
|
+
judgeEvaluation: evalData.judgeEvaluation,
|
|
1471
|
+
},
|
|
1472
|
+
status: 'completed',
|
|
1473
|
+
description: evalData.scenarioName,
|
|
1474
|
+
scenarioNames: [evalData.scenarioName],
|
|
1475
|
+
});
|
|
1476
|
+
}
|
|
1477
|
+
|
|
1478
|
+
// Regular run
|
|
1479
|
+
const result = evaluationRunner.getRunResults(runId);
|
|
1480
|
+
|
|
1481
|
+
// Check if this is an interaction run (created from Interact tab)
|
|
1482
|
+
const runMetadata = result.run?.metadata ?
|
|
1483
|
+
(typeof result.run.metadata === 'string' ? JSON.parse(result.run.metadata) : result.run.metadata)
|
|
1484
|
+
: {};
|
|
1485
|
+
|
|
1486
|
+
if (runMetadata.runType === 'interaction') {
|
|
1487
|
+
// Look up the interaction eval data by runId
|
|
1488
|
+
const interactionEval = evaluationStore.getInteractionEvalByRunId(runId);
|
|
1489
|
+
if (interactionEval) {
|
|
1490
|
+
return res.json({
|
|
1491
|
+
success: true,
|
|
1492
|
+
type: 'interaction',
|
|
1493
|
+
run: result.run,
|
|
1494
|
+
stats: {
|
|
1495
|
+
totalTests: 1,
|
|
1496
|
+
avgScore: interactionEval.judgeOverallScore,
|
|
1497
|
+
},
|
|
1498
|
+
results: [{
|
|
1499
|
+
scenarioId: interactionEval.scenarioId,
|
|
1500
|
+
scenarioName: interactionEval.scenarioName,
|
|
1501
|
+
profileName: interactionEval.tutorProfile || 'default',
|
|
1502
|
+
tutorProfile: interactionEval.tutorProfile || 'default',
|
|
1503
|
+
model: `${interactionEval.turnCount} turns`,
|
|
1504
|
+
passed: interactionEval.judgeOverallScore >= 3,
|
|
1505
|
+
overallScore: interactionEval.judgeOverallScore,
|
|
1506
|
+
overall_score: interactionEval.judgeOverallScore,
|
|
1507
|
+
inputTokens: interactionEval.learnerTokens || 0,
|
|
1508
|
+
outputTokens: interactionEval.tutorTokens || 0,
|
|
1509
|
+
latencyMs: interactionEval.latencyMs || 0,
|
|
1510
|
+
latency_ms: interactionEval.latencyMs || 0,
|
|
1511
|
+
isInteraction: true,
|
|
1512
|
+
interactionEvalId: interactionEval.evalId,
|
|
1513
|
+
dialogueId: interactionEval.evalId,
|
|
1514
|
+
judgeEvaluation: interactionEval.judgeEvaluation,
|
|
1515
|
+
}],
|
|
1516
|
+
interaction: {
|
|
1517
|
+
evalId: interactionEval.evalId,
|
|
1518
|
+
scenarioName: interactionEval.scenarioName,
|
|
1519
|
+
turnCount: interactionEval.turnCount,
|
|
1520
|
+
turns: interactionEval.turns,
|
|
1521
|
+
sequenceDiagram: interactionEval.sequenceDiagram,
|
|
1522
|
+
formattedTranscript: interactionEval.formattedTranscript,
|
|
1523
|
+
totalTokens: interactionEval.totalTokens,
|
|
1524
|
+
learnerTokens: interactionEval.learnerTokens,
|
|
1525
|
+
tutorTokens: interactionEval.tutorTokens,
|
|
1526
|
+
latencyMs: interactionEval.latencyMs,
|
|
1527
|
+
judgeOverallScore: interactionEval.judgeOverallScore,
|
|
1528
|
+
judgeEvaluation: interactionEval.judgeEvaluation,
|
|
1529
|
+
},
|
|
1530
|
+
status: 'completed',
|
|
1531
|
+
description: result.run?.description || interactionEval.scenarioName,
|
|
1532
|
+
scenarioNames: [interactionEval.scenarioName],
|
|
1533
|
+
metadata: runMetadata,
|
|
1534
|
+
});
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
// Extract scenario names from results for display
|
|
1539
|
+
const scenarioNames = [...new Set(
|
|
1540
|
+
(result.results || [])
|
|
1541
|
+
.map((r) => r.scenarioName)
|
|
1542
|
+
.filter(Boolean)
|
|
1543
|
+
)].sort();
|
|
1544
|
+
|
|
1545
|
+
// Include key run properties at top level for easier frontend access
|
|
1546
|
+
res.json({
|
|
1547
|
+
success: true,
|
|
1548
|
+
...result,
|
|
1549
|
+
// Flatten these for easier access in UI
|
|
1550
|
+
status: result.run?.status,
|
|
1551
|
+
description: result.run?.description,
|
|
1552
|
+
scenarioNames,
|
|
1553
|
+
});
|
|
1554
|
+
} catch (error) {
|
|
1555
|
+
console.error('[EvalRoutes] Get run error:', error);
|
|
1556
|
+
res.status(500).json({ error: 'Failed to get run results', details: error.message });
|
|
1557
|
+
}
|
|
1558
|
+
});
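// Hedged usage sketch: fetching a single run and branching on the interaction shape
// produced above. Field names (type, stats, interaction, description, status,
// scenarioNames) mirror the response bodies constructed in this handler; runId is a
// placeholder.
async function exampleGetRun(runId) {
  const data = await fetch(`/api/eval/runs/${runId}`).then(r => r.json());
  if (data.type === 'interaction') {
    console.log(`${data.interaction.turnCount} turns, avg score:`, data.stats.avgScore);
  } else {
    console.log(data.description, data.status, data.scenarioNames);
  }
}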
|
|
1559
|
+
|
|
1560
|
+
/**
|
|
1561
|
+
* Get report for a run
|
|
1562
|
+
* GET /api/eval/runs/:runId/report
|
|
1563
|
+
*/
|
|
1564
|
+
router.get('/runs/:runId/report', (req, res) => {
|
|
1565
|
+
try {
|
|
1566
|
+
const report = evaluationRunner.generateReport(req.params.runId);
|
|
1567
|
+
|
|
1568
|
+
// Check if client wants plain text
|
|
1569
|
+
if (req.accepts('text/plain')) {
|
|
1570
|
+
res.type('text/plain').send(report);
|
|
1571
|
+
} else {
|
|
1572
|
+
res.json({ success: true, report });
|
|
1573
|
+
}
|
|
1574
|
+
} catch (error) {
|
|
1575
|
+
console.error('[EvalRoutes] Get report error:', error);
|
|
1576
|
+
res.status(500).json({ error: 'Failed to generate report', details: error.message });
|
|
1577
|
+
}
|
|
1578
|
+
});
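// Hedged usage sketch: the report route above content-negotiates via req.accepts,
// so an explicit Accept header picks the branch; send Accept: application/json to
// get the { success, report } wrapper instead.
async function exampleGetReportText(runId) {
  return fetch(`/api/eval/runs/${runId}/report`, {
    headers: { Accept: 'text/plain' },
  }).then(r => r.text());
}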
|
|
1579
|
+
|
|
1580
|
+
// ============================================================================
|
|
1581
|
+
// Dialogue Log Endpoints
|
|
1582
|
+
// ============================================================================
|
|
1583
|
+
|
|
1584
|
+
/**
|
|
1585
|
+
* List available log dates
|
|
1586
|
+
* GET /api/eval/logs/dates
|
|
1587
|
+
*/
|
|
1588
|
+
router.get('/logs/dates', (req, res) => {
|
|
1589
|
+
try {
|
|
1590
|
+
const dates = dialogueLogService.listLogDates();
|
|
1591
|
+
res.json({ success: true, dates });
|
|
1592
|
+
} catch (error) {
|
|
1593
|
+
console.error('[EvalRoutes] List log dates error:', error);
|
|
1594
|
+
res.status(500).json({ error: 'Failed to list log dates' });
|
|
1595
|
+
}
|
|
1596
|
+
});
|
|
1597
|
+
|
|
1598
|
+
/**
|
|
1599
|
+
* Get dialogues for a specific date
|
|
1600
|
+
* GET /api/eval/logs/:date
|
|
1601
|
+
* Query params: limit (default 10), offset (default 0)
|
|
1602
|
+
*/
|
|
1603
|
+
router.get('/logs/:date', (req, res) => {
|
|
1604
|
+
try {
|
|
1605
|
+
const { date } = req.params;
|
|
1606
|
+
const limit = parseInt(req.query.limit) || 10;
|
|
1607
|
+
const offset = parseInt(req.query.offset) || 0;
|
|
1608
|
+
|
|
1609
|
+
const result = dialogueLogService.getDialogues({ date, limit, offset });
|
|
1610
|
+
res.json({ success: true, ...result });
|
|
1611
|
+
} catch (error) {
|
|
1612
|
+
console.error('[EvalRoutes] Get dialogues error:', error);
|
|
1613
|
+
res.status(500).json({ error: 'Failed to get dialogues' });
|
|
1614
|
+
}
|
|
1615
|
+
});
|
|
1616
|
+
|
|
1617
|
+
/**
|
|
1618
|
+
* Get a specific dialogue by dialogueId
|
|
1619
|
+
* GET /api/eval/logs/dialogue/:dialogueId
|
|
1620
|
+
*/
|
|
1621
|
+
router.get('/logs/dialogue/:dialogueId', (req, res) => {
|
|
1622
|
+
try {
|
|
1623
|
+
const { dialogueId } = req.params;
|
|
1624
|
+
|
|
1625
|
+
// Check if this is an interaction eval dialogue (starts with short- or long-)
|
|
1626
|
+
if (dialogueId.startsWith('short-') || dialogueId.startsWith('long-')) {
|
|
1627
|
+
const interactionEval = evaluationStore.getInteractionEval(dialogueId);
|
|
1628
|
+
if (interactionEval) {
|
|
1629
|
+
// Format interaction eval as entries for DialogueFlowDiagram
|
|
1630
|
+
// Expand each turn into action-based entries the diagram expects
|
|
1631
|
+
const entries = [];
|
|
1632
|
+
let entryIndex = 0;
|
|
1633
|
+
|
|
1634
|
+
for (const turn of interactionEval.turns || []) {
|
|
1635
|
+
const isLearner = turn.phase === 'learner';
|
|
1636
|
+
|
|
1637
|
+
// Add internal deliberation steps if present
|
|
1638
|
+
if (turn.internalDeliberation && turn.internalDeliberation.length > 0) {
|
|
1639
|
+
for (const delib of turn.internalDeliberation) {
|
|
1640
|
+
if (delib.role === 'ego') {
|
|
1641
|
+
entries.push({
|
|
1642
|
+
index: entryIndex++,
|
|
1643
|
+
action: isLearner ? 'learner_ego_thought' : 'tutor_ego_thought',
|
|
1644
|
+
agent: isLearner ? 'ego' : 'tutor_ego',
|
|
1645
|
+
phase: turn.phase,
|
|
1646
|
+
message: delib.content,
|
|
1647
|
+
timestamp: turn.timestamp,
|
|
1648
|
+
});
|
|
1649
|
+
} else if (delib.role === 'superego') {
|
|
1650
|
+
entries.push({
|
|
1651
|
+
index: entryIndex++,
|
|
1652
|
+
action: isLearner ? 'learner_superego_critique' : 'tutor_superego_critique',
|
|
1653
|
+
agent: isLearner ? 'superego' : 'tutor_superego',
|
|
1654
|
+
phase: turn.phase,
|
|
1655
|
+
message: delib.content,
|
|
1656
|
+
timestamp: turn.timestamp,
|
|
1657
|
+
});
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1660
|
+
}
|
|
1661
|
+
|
|
1662
|
+
// Add the external message entry
|
|
1663
|
+
entries.push({
|
|
1664
|
+
index: entryIndex++,
|
|
1665
|
+
action: isLearner ? 'learner_input' : 'tutor_response',
|
|
1666
|
+
agent: isLearner ? 'ego' : 'tutor_ego',
|
|
1667
|
+
phase: turn.phase,
|
|
1668
|
+
message: turn.externalMessage,
|
|
1669
|
+
timestamp: turn.timestamp,
|
|
1670
|
+
turnNumber: turn.turnNumber,
|
|
1671
|
+
});
|
|
1672
|
+
}
|
|
1673
|
+
|
|
1674
|
+
// Calculate summary stats
|
|
1675
|
+
const learnerTurns = (interactionEval.turns || []).filter(t => t.phase === 'learner').length;
|
|
1676
|
+
const tutorTurns = (interactionEval.turns || []).filter(t => t.phase === 'tutor').length;
|
|
1677
|
+
|
|
1678
|
+
return res.json({
|
|
1679
|
+
success: true,
|
|
1680
|
+
dialogueId,
|
|
1681
|
+
dialogue: {
|
|
1682
|
+
dialogueId,
|
|
1683
|
+
entries,
|
|
1684
|
+
startTime: interactionEval.createdAt,
|
|
1685
|
+
isInteractionEval: true,
|
|
1686
|
+
scenarioName: interactionEval.scenarioName,
|
|
1687
|
+
personaId: interactionEval.personaId,
|
|
1688
|
+
judgeEvaluation: interactionEval.judgeEvaluation,
|
|
1689
|
+
summary: {
|
|
1690
|
+
totalTurns: interactionEval.turnCount,
|
|
1691
|
+
egoCount: learnerTurns,
|
|
1692
|
+
userCount: interactionEval.turnCount,
|
|
1693
|
+
superegoCount: 0,
|
|
1694
|
+
totalLatencyMs: interactionEval.latencyMs || 0,
|
|
1695
|
+
totalInputTokens: Math.floor((interactionEval.totalTokens || 0) / 2),
|
|
1696
|
+
totalOutputTokens: Math.ceil((interactionEval.totalTokens || 0) / 2),
|
|
1697
|
+
totalCost: 0,
|
|
1698
|
+
},
|
|
1699
|
+
sequenceDiagram: interactionEval.sequenceDiagram,
|
|
1700
|
+
formattedTranscript: interactionEval.formattedTranscript,
|
|
1701
|
+
isInteraction: true,
|
|
1702
|
+
},
|
|
1703
|
+
});
|
|
1704
|
+
}
|
|
1705
|
+
}
|
|
1706
|
+
|
|
1707
|
+
// Regular dialogue lookup
|
|
1708
|
+
const dialogue = dialogueLogService.getDialogueById(dialogueId);
|
|
1709
|
+
|
|
1710
|
+
if (!dialogue) {
|
|
1711
|
+
return res.status(404).json({ error: 'Dialogue not found' });
|
|
1712
|
+
}
|
|
1713
|
+
|
|
1714
|
+
res.json({ success: true, dialogue, dialogueId });
|
|
1715
|
+
} catch (error) {
|
|
1716
|
+
console.error('[EvalRoutes] Get dialogue by ID error:', error);
|
|
1717
|
+
res.status(500).json({ error: 'Failed to get dialogue' });
|
|
1718
|
+
}
|
|
1719
|
+
});
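// Hedged usage sketch: reading the expanded, action-based entries produced above for
// an interaction-eval dialogue. The action suffixes ('_thought', '_critique') and the
// agent/message fields match what this handler assigns; dialogueId is a placeholder.
async function exampleGetDialogue(dialogueId) {
  const { dialogue } = await fetch(`/api/eval/logs/dialogue/${dialogueId}`).then(r => r.json());
  for (const entry of dialogue.entries) {
    const internal = entry.action.endsWith('_thought') || entry.action.endsWith('_critique');
    console.log(`${internal ? '(internal) ' : ''}[${entry.agent}] ${entry.message}`);
  }
}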
|
|
1720
|
+
|
|
1721
|
+
/**
|
|
1722
|
+
* Get a specific dialogue by index
|
|
1723
|
+
* GET /api/eval/logs/:date/:index
|
|
1724
|
+
*/
|
|
1725
|
+
router.get('/logs/:date/:index', (req, res) => {
|
|
1726
|
+
try {
|
|
1727
|
+
const { date, index } = req.params;
|
|
1728
|
+
const dialogue = dialogueLogService.getDialogueByIndex(date, parseInt(index));
|
|
1729
|
+
|
|
1730
|
+
if (!dialogue) {
|
|
1731
|
+
return res.status(404).json({ error: 'Dialogue not found' });
|
|
1732
|
+
}
|
|
1733
|
+
|
|
1734
|
+
res.json({ success: true, dialogue });
|
|
1735
|
+
} catch (error) {
|
|
1736
|
+
console.error('[EvalRoutes] Get dialogue error:', error);
|
|
1737
|
+
res.status(500).json({ error: 'Failed to get dialogue' });
|
|
1738
|
+
}
|
|
1739
|
+
});
|
|
1740
|
+
|
|
1741
|
+
/**
|
|
1742
|
+
* Get log statistics
|
|
1743
|
+
* GET /api/eval/logs-stats
|
|
1744
|
+
* Query params: startDate, endDate
|
|
1745
|
+
*/
|
|
1746
|
+
router.get('/logs-stats', (req, res) => {
|
|
1747
|
+
try {
|
|
1748
|
+
const { startDate, endDate } = req.query;
|
|
1749
|
+
const stats = dialogueLogService.getLogStatistics({ startDate, endDate });
|
|
1750
|
+
res.json({ success: true, ...stats });
|
|
1751
|
+
} catch (error) {
|
|
1752
|
+
console.error('[EvalRoutes] Get log stats error:', error);
|
|
1753
|
+
res.status(500).json({ error: 'Failed to get log statistics' });
|
|
1754
|
+
}
|
|
1755
|
+
});
|
|
1756
|
+
|
|
1757
|
+
// ============================================================================
|
|
1758
|
+
// Prompt Endpoints (Read-Only)
|
|
1759
|
+
// ============================================================================
|
|
1760
|
+
|
|
1761
|
+
/**
|
|
1762
|
+
* List available prompts
|
|
1763
|
+
* GET /api/eval/prompts
|
|
1764
|
+
*/
|
|
1765
|
+
router.get('/prompts', (req, res) => {
|
|
1766
|
+
try {
|
|
1767
|
+
if (!fs.existsSync(PROMPTS_DIR)) {
|
|
1768
|
+
return res.json({ success: true, prompts: [] });
|
|
1769
|
+
}
|
|
1770
|
+
|
|
1771
|
+
const files = fs.readdirSync(PROMPTS_DIR)
|
|
1772
|
+
.filter(f => f.endsWith('.md'))
|
|
1773
|
+
.map(f => {
|
|
1774
|
+
const filePath = path.join(PROMPTS_DIR, f);
|
|
1775
|
+
const stats = fs.statSync(filePath);
|
|
1776
|
+
return {
|
|
1777
|
+
name: f.replace('.md', ''),
|
|
1778
|
+
filename: f,
|
|
1779
|
+
size: stats.size,
|
|
1780
|
+
modified: stats.mtime.toISOString(),
|
|
1781
|
+
};
|
|
1782
|
+
});
|
|
1783
|
+
|
|
1784
|
+
res.json({ success: true, prompts: files });
|
|
1785
|
+
} catch (error) {
|
|
1786
|
+
console.error('[EvalRoutes] List prompts error:', error);
|
|
1787
|
+
res.status(500).json({ error: 'Failed to list prompts' });
|
|
1788
|
+
}
|
|
1789
|
+
});
|
|
1790
|
+
|
|
1791
|
+
/**
|
|
1792
|
+
* Get prompt content (read-only)
|
|
1793
|
+
* GET /api/eval/prompts/:name
|
|
1794
|
+
*/
|
|
1795
|
+
router.get('/prompts/:name', (req, res) => {
|
|
1796
|
+
try {
|
|
1797
|
+
const filename = req.params.name.endsWith('.md')
|
|
1798
|
+
? req.params.name
|
|
1799
|
+
: `${req.params.name}.md`;
|
|
1800
|
+
const filePath = path.join(PROMPTS_DIR, filename);
|
|
1801
|
+
|
|
1802
|
+
if (!fs.existsSync(filePath)) {
|
|
1803
|
+
return res.status(404).json({ error: 'Prompt not found' });
|
|
1804
|
+
}
|
|
1805
|
+
|
|
1806
|
+
const content = fs.readFileSync(filePath, 'utf8');
|
|
1807
|
+
const stats = fs.statSync(filePath);
|
|
1808
|
+
|
|
1809
|
+
res.json({
|
|
1810
|
+
success: true,
|
|
1811
|
+
prompt: {
|
|
1812
|
+
name: req.params.name,
|
|
1813
|
+
filename,
|
|
1814
|
+
content,
|
|
1815
|
+
size: stats.size,
|
|
1816
|
+
modified: stats.mtime.toISOString(),
|
|
1817
|
+
},
|
|
1818
|
+
});
|
|
1819
|
+
} catch (error) {
|
|
1820
|
+
console.error('[EvalRoutes] Get prompt error:', error);
|
|
1821
|
+
res.status(500).json({ error: 'Failed to get prompt' });
|
|
1822
|
+
}
|
|
1823
|
+
});
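// Hedged usage sketch: reading one prompt file through the read-only endpoints above.
// The prompt name is a placeholder; response fields mirror this handler.
async function exampleReadPrompt(name) {
  const { prompt } = await fetch(`/api/eval/prompts/${name}`).then(r => r.json());
  console.log(prompt.filename, `${prompt.size} bytes`, prompt.modified);
  return prompt.content;
}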
|
|
1824
|
+
|
|
1825
|
+
/**
|
|
1826
|
+
* Generate prompt improvement recommendations (read-only)
|
|
1827
|
+
* POST /api/eval/prompts/recommend
|
|
1828
|
+
*
|
|
1829
|
+
* Body: {
|
|
1830
|
+
* runId: "run-123", // Get results from a run
|
|
1831
|
+
* profile: "budget", // Or run fresh tests with this profile
|
|
1832
|
+
* scenarios: "all" // Scenarios to test (if running fresh)
|
|
1833
|
+
* }
|
|
1834
|
+
*
|
|
1835
|
+
* Returns recommendations for prompt improvements.
|
|
1836
|
+
* Does NOT write to disk - web clients can display these for review.
|
|
1837
|
+
*/
|
|
1838
|
+
router.post('/prompts/recommend', async (req, res) => {
|
|
1839
|
+
try {
|
|
1840
|
+
const { runId, profile, scenarios = 'all' } = req.body;
|
|
1841
|
+
|
|
1842
|
+
let results = [];
|
|
1843
|
+
let profileName = profile || 'unknown';
|
|
1844
|
+
|
|
1845
|
+
if (runId) {
|
|
1846
|
+
// Get results from existing run
|
|
1847
|
+
const runResults = evaluationStore.getResults(runId);
|
|
1848
|
+
if (!runResults || runResults.length === 0) {
|
|
1849
|
+
return res.status(404).json({ error: 'Run not found or has no results' });
|
|
1850
|
+
}
|
|
1851
|
+
results = runResults;
|
|
1852
|
+
profileName = runResults[0]?.profileName || profileName;
|
|
1853
|
+
} else if (profile) {
|
|
1854
|
+
// Run fresh evaluations
|
|
1855
|
+
const allScenarios = tutorApi.listScenarios();
|
|
1856
|
+
const scenariosToRun = scenarios === 'all'
|
|
1857
|
+
? allScenarios
|
|
1858
|
+
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
1859
|
+
|
|
1860
|
+
for (const scenario of scenariosToRun) {
|
|
1861
|
+
try {
|
|
1862
|
+
const config = { profileName: profile, label: profile };
|
|
1863
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
1864
|
+
scenarioId: scenario.id,
|
|
1865
|
+
verbose: false,
|
|
1866
|
+
skipRubricEval: false, // Need rubric for recommendations
|
|
1867
|
+
});
|
|
1868
|
+
results.push(result);
|
|
1869
|
+
} catch (e) {
|
|
1870
|
+
// Skip failed tests
|
|
1871
|
+
}
|
|
1872
|
+
}
|
|
1873
|
+
} else {
|
|
1874
|
+
return res.status(400).json({ error: 'Either runId or profile is required' });
|
|
1875
|
+
}
|
|
1876
|
+
|
|
1877
|
+
if (results.length === 0) {
|
|
1878
|
+
return res.status(400).json({ error: 'No evaluation results available' });
|
|
1879
|
+
}
|
|
1880
|
+
|
|
1881
|
+
// Generate recommendations
|
|
1882
|
+
const recommendations = await promptRecommendationService.generateRecommendations({
|
|
1883
|
+
results,
|
|
1884
|
+
profileName,
|
|
1885
|
+
});
|
|
1886
|
+
|
|
1887
|
+
res.json({
|
|
1888
|
+
success: true,
|
|
1889
|
+
...recommendations,
|
|
1890
|
+
// Explicitly note this is read-only
|
|
1891
|
+
readOnly: true,
|
|
1892
|
+
note: 'Recommendations are for review only. Use CLI to apply changes.',
|
|
1893
|
+
});
|
|
1894
|
+
} catch (error) {
|
|
1895
|
+
console.error('[EvalRoutes] Recommend prompts error:', error);
|
|
1896
|
+
res.status(500).json({ error: 'Failed to generate recommendations', details: error.message });
|
|
1897
|
+
}
|
|
1898
|
+
});
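// Hedged usage sketch: requesting prompt recommendations for an existing run. Either
// runId or profile must be supplied, per the validation above; the recommendation
// fields themselves come from promptRecommendationService and are not assumed here.
async function exampleRecommendPrompts(runId) {
  const data = await fetch('/api/eval/prompts/recommend', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ runId }),
  }).then(r => r.json());
  console.log(data.readOnly, '-', data.note);
  return data;
}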
|
|
1899
|
+
|
|
1900
|
+
// ============================================================================
|
|
1901
|
+
// Streaming Evaluation Endpoints
|
|
1902
|
+
// ============================================================================
|
|
1903
|
+
|
|
1904
|
+
/**
|
|
1905
|
+
* Run evaluation with SSE streaming for real-time progress
|
|
1906
|
+
* GET /api/eval/stream/run
|
|
1907
|
+
* Query params: profiles, scenarios, skipRubric
|
|
1908
|
+
*/
|
|
1909
|
+
router.get('/stream/run', async (req, res) => {
|
|
1910
|
+
// Set up SSE
|
|
1911
|
+
res.writeHead(200, {
|
|
1912
|
+
'Content-Type': 'text/event-stream',
|
|
1913
|
+
'Cache-Control': 'no-cache',
|
|
1914
|
+
Connection: 'keep-alive',
|
|
1915
|
+
});
|
|
1916
|
+
|
|
1917
|
+
const sendEvent = (type, data) => {
|
|
1918
|
+
res.write(`event: ${type}\n`);
|
|
1919
|
+
res.write(`data: ${JSON.stringify(data)}\n\n`);
|
|
1920
|
+
};
|
|
1921
|
+
|
|
1922
|
+
// Keep-alive to prevent connection timeout
|
|
1923
|
+
const keepAlive = setInterval(() => {
|
|
1924
|
+
res.write(': keepalive\n\n');
|
|
1925
|
+
}, 15000);
|
|
1926
|
+
|
|
1927
|
+
// Register stream for crash protection
|
|
1928
|
+
const streamId = registerStream(res, keepAlive);
|
|
1929
|
+
|
|
1930
|
+
// Clean up on close
|
|
1931
|
+
req.on('close', () => {
|
|
1932
|
+
clearInterval(keepAlive);
|
|
1933
|
+
unregisterStream(streamId);
|
|
1934
|
+
});
|
|
1935
|
+
|
|
1936
|
+
try {
|
|
1937
|
+
const profiles = req.query.profiles
|
|
1938
|
+
? req.query.profiles.split(',')
|
|
1939
|
+
: ['budget'];
|
|
1940
|
+
const scenarios = req.query.scenarios === 'all' || !req.query.scenarios
|
|
1941
|
+
? 'all'
|
|
1942
|
+
: req.query.scenarios.split(',');
|
|
1943
|
+
const skipRubric = req.query.skipRubric === 'true';
|
|
1944
|
+
const outputSize = req.query.outputSize || 'normal';
|
|
1945
|
+
|
|
1946
|
+
// Get all scenarios to run
|
|
1947
|
+
const allScenarios = tutorApi.listScenarios();
|
|
1948
|
+
const scenariosToRun = scenarios === 'all'
|
|
1949
|
+
? allScenarios
|
|
1950
|
+
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
1951
|
+
|
|
1952
|
+
const totalTests = profiles.length * scenariosToRun.length;
|
|
1953
|
+
let completedTests = 0;
|
|
1954
|
+
|
|
1955
|
+
sendEvent('start', {
|
|
1956
|
+
profiles,
|
|
1957
|
+
scenarioCount: scenariosToRun.length,
|
|
1958
|
+
totalTests,
|
|
1959
|
+
skipRubric,
|
|
1960
|
+
outputSize,
|
|
1961
|
+
timestamp: new Date().toISOString(),
|
|
1962
|
+
});
|
|
1963
|
+
|
|
1964
|
+
sendEvent('log', { message: `Starting batch run: ${profiles.length} profiles × ${scenariosToRun.length} scenarios = ${totalTests} tests`, level: 'info' });
|
|
1965
|
+
sendEvent('log', { message: `Fast mode (skip rubric): ${skipRubric}`, level: 'info' });
|
|
1966
|
+
sendEvent('log', { message: `Output size: ${outputSize}`, level: 'info' });
|
|
1967
|
+
|
|
1968
|
+
const results = [];
|
|
1969
|
+
|
|
1970
|
+
for (const profileName of profiles) {
|
|
1971
|
+
sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });
|
|
1972
|
+
|
|
1973
|
+
for (const scenario of scenariosToRun) {
|
|
1974
|
+
completedTests++;
|
|
1975
|
+
|
|
1976
|
+
sendEvent('progress', {
|
|
1977
|
+
current: completedTests,
|
|
1978
|
+
total: totalTests,
|
|
1979
|
+
profile: profileName,
|
|
1980
|
+
scenario: scenario.name,
|
|
1981
|
+
percentage: Math.round((completedTests / totalTests) * 100),
|
|
1982
|
+
});
|
|
1983
|
+
|
|
1984
|
+
sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });
|
|
1985
|
+
|
|
1986
|
+
try {
|
|
1987
|
+
const config = { profileName, label: profileName };
|
|
1988
|
+
|
|
1989
|
+
// Create log callback for this test
|
|
1990
|
+
const onLog = (message, level = 'info') => {
|
|
1991
|
+
sendEvent('log', { message: ` ${message}`, level, timestamp: new Date().toISOString() });
|
|
1992
|
+
};
|
|
1993
|
+
|
|
1994
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
1995
|
+
scenarioId: scenario.id,
|
|
1996
|
+
skipRubricEval: skipRubric,
|
|
1997
|
+
outputSize,
|
|
1998
|
+
verbose: false,
|
|
1999
|
+
onLog,
|
|
2000
|
+
});
|
|
2001
|
+
|
|
2002
|
+
results.push(result);
|
|
2003
|
+
|
|
2004
|
+
const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
|
|
2005
|
+
const status = result.success !== false ? '✓' : '✗';
|
|
2006
|
+
sendEvent('log', { message: ` ${status} Score: ${scoreStr} (${result.latencyMs}ms)`, level: result.success !== false ? 'success' : 'warning' });
|
|
2007
|
+
|
|
2008
|
+
sendEvent('result', {
|
|
2009
|
+
profile: profileName,
|
|
2010
|
+
scenarioId: scenario.id,
|
|
2011
|
+
scenarioName: scenario.name,
|
|
2012
|
+
passed: result.success,
|
|
2013
|
+
score: result.overallScore,
|
|
2014
|
+
latencyMs: result.latencyMs,
|
|
2015
|
+
inputTokens: result.inputTokens,
|
|
2016
|
+
outputTokens: result.outputTokens,
|
|
2017
|
+
totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
|
|
2018
|
+
});
|
|
2019
|
+
} catch (e) {
|
|
2020
|
+
sendEvent('log', { message: ` ✗ Error: ${e.message}`, level: 'error' });
|
|
2021
|
+
sendEvent('error', {
|
|
2022
|
+
profile: profileName,
|
|
2023
|
+
scenarioId: scenario.id,
|
|
2024
|
+
error: e.message,
|
|
2025
|
+
});
|
|
2026
|
+
}
|
|
2027
|
+
}
|
|
2028
|
+
}
|
|
2029
|
+
|
|
2030
|
+
// Calculate summary
|
|
2031
|
+
const successCount = results.filter(r => r.success !== false).length;
|
|
2032
|
+
const scores = results.filter(r => r.overallScore != null).map(r => r.overallScore);
|
|
2033
|
+
const avgScore = scores.length > 0
|
|
2034
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
2035
|
+
: null;
|
|
2036
|
+
|
|
2037
|
+
sendEvent('log', { message: `\n=== Batch Complete ===`, level: 'success' });
|
|
2038
|
+
sendEvent('log', { message: `Total: ${totalTests}, Passed: ${successCount}, Avg Score: ${avgScore?.toFixed(1) || 'N/A'}`, level: 'info' });
|
|
2039
|
+
|
|
2040
|
+
sendEvent('complete', {
|
|
2041
|
+
totalTests,
|
|
2042
|
+
successfulTests: successCount,
|
|
2043
|
+
averageScore: avgScore,
|
|
2044
|
+
});
|
|
2045
|
+
|
|
2046
|
+
unregisterStream(streamId);
|
|
2047
|
+
res.end();
|
|
2048
|
+
} catch (error) {
|
|
2049
|
+
sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
|
|
2050
|
+
sendEvent('error', { error: error.message });
|
|
2051
|
+
unregisterStream(streamId);
|
|
2052
|
+
res.end();
|
|
2053
|
+
}
|
|
2054
|
+
});
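// Hedged usage sketch: driving the batch streaming route above from a client. Query
// params (profiles, scenarios, skipRubric, outputSize) and event names ('start',
// 'progress', 'result', 'complete') match the handler; 'budget' is the handler's
// default profile, and EventSource needs a browser or a polyfill.
function exampleBatchStreamClient() {
  const qs = new URLSearchParams({ profiles: 'budget', scenarios: 'all', skipRubric: 'false' });
  const es = new EventSource(`/api/eval/stream/run?${qs}`);
  es.addEventListener('progress', (e) => {
    const { current, total, percentage } = JSON.parse(e.data);
    console.log(`${current}/${total} (${percentage}%)`);
  });
  es.addEventListener('result', (e) => console.log(JSON.parse(e.data)));
  es.addEventListener('complete', () => es.close());
}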
|
|
2055
|
+
|
|
2056
|
+
// ============================================================================
|
|
2057
|
+
// Trajectory and Improvement Cycle Endpoints
|
|
2058
|
+
// ============================================================================
|
|
2059
|
+
|
|
2060
|
+
/**
|
|
2061
|
+
* Get improvement trajectory for a profile
|
|
2062
|
+
* GET /api/eval/trajectory/:profile
|
|
2063
|
+
* Query params: last (number of cycles), all (boolean)
|
|
2064
|
+
*/
|
|
2065
|
+
router.get('/trajectory/:profile', (req, res) => {
|
|
2066
|
+
try {
|
|
2067
|
+
const { profile } = req.params;
|
|
2068
|
+
const last = parseInt(req.query.last) || 5;
|
|
2069
|
+
const all = req.query.all === 'true';
|
|
2070
|
+
|
|
2071
|
+
const trajectoryDir = path.join(process.cwd(), 'data', 'improvement-trajectories');
|
|
2072
|
+
const trajectoryFile = path.join(trajectoryDir, `${profile}.json`);
|
|
2073
|
+
|
|
2074
|
+
if (!fs.existsSync(trajectoryFile)) {
|
|
2075
|
+
return res.json({
|
|
2076
|
+
success: true,
|
|
2077
|
+
profile,
|
|
2078
|
+
cycles: [],
|
|
2079
|
+
message: 'No improvement history found for this profile',
|
|
2080
|
+
});
|
|
2081
|
+
}
|
|
2082
|
+
|
|
2083
|
+
const data = JSON.parse(fs.readFileSync(trajectoryFile, 'utf8'));
|
|
2084
|
+
const cycles = all ? data.cycles : data.cycles.slice(-last);
|
|
2085
|
+
|
|
2086
|
+
res.json({
|
|
2087
|
+
success: true,
|
|
2088
|
+
profile,
|
|
2089
|
+
startedAt: data.startedAt,
|
|
2090
|
+
lastUpdated: data.lastUpdated,
|
|
2091
|
+
totalCycles: data.cycles.length,
|
|
2092
|
+
cycles,
|
|
2093
|
+
});
|
|
2094
|
+
} catch (error) {
|
|
2095
|
+
console.error('[EvalRoutes] Get trajectory error:', error);
|
|
2096
|
+
res.status(500).json({ error: 'Failed to get trajectory', details: error.message });
|
|
2097
|
+
}
|
|
2098
|
+
});
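// Hedged usage sketch: reading the last few improvement cycles for a profile. The
// profile name is a placeholder; top-level fields (totalCycles, cycles, startedAt,
// lastUpdated) mirror the response built above, while the shape of each cycle
// depends on the trajectory files on disk and is not assumed.
async function exampleGetTrajectory(profile) {
  const data = await fetch(`/api/eval/trajectory/${profile}?last=10`).then(r => r.json());
  console.log(`${data.totalCycles} cycles, last updated ${data.lastUpdated}`);
  return data.cycles;
}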
|
|
2099
|
+
|
|
2100
|
+
/**
|
|
2101
|
+
* Compare two evaluation runs
|
|
2102
|
+
* GET /api/eval/compare-runs/:runId1/:runId2
|
|
2103
|
+
*/
|
|
2104
|
+
router.get('/compare-runs/:runId1/:runId2', (req, res) => {
|
|
2105
|
+
try {
|
|
2106
|
+
const { runId1, runId2 } = req.params;
|
|
2107
|
+
|
|
2108
|
+
const results1 = evaluationStore.getResults(runId1);
|
|
2109
|
+
const results2 = evaluationStore.getResults(runId2);
|
|
2110
|
+
|
|
2111
|
+
if (!results1 || results1.length === 0) {
|
|
2112
|
+
return res.status(404).json({ error: `Run ${runId1} not found` });
|
|
2113
|
+
}
|
|
2114
|
+
if (!results2 || results2.length === 0) {
|
|
2115
|
+
return res.status(404).json({ error: `Run ${runId2} not found` });
|
|
2116
|
+
}
|
|
2117
|
+
|
|
2118
|
+
// Calculate averages for each run
|
|
2119
|
+
const calcAverages = (results) => {
|
|
2120
|
+
const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
2121
|
+
const dimScores = {};
|
|
2122
|
+
dims.forEach(d => { dimScores[d] = []; });
|
|
2123
|
+
|
|
2124
|
+
let totalScore = 0;
|
|
2125
|
+
let scoreCount = 0;
|
|
2126
|
+
|
|
2127
|
+
results.forEach(r => {
|
|
2128
|
+
if (r.overall_score != null) {
|
|
2129
|
+
totalScore += r.overall_score;
|
|
2130
|
+
scoreCount++;
|
|
2131
|
+
}
|
|
2132
|
+
dims.forEach(d => {
|
|
2133
|
+
const score = r[`score_${d}`];
|
|
2134
|
+
if (score != null) {
|
|
2135
|
+
dimScores[d].push(score);
|
|
2136
|
+
}
|
|
2137
|
+
});
|
|
2138
|
+
});
|
|
2139
|
+
|
|
2140
|
+
const dimAverages = {};
|
|
2141
|
+
dims.forEach(d => {
|
|
2142
|
+
const scores = dimScores[d];
|
|
2143
|
+
dimAverages[d] = scores.length > 0
|
|
2144
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
2145
|
+
: null;
|
|
2146
|
+
});
|
|
2147
|
+
|
|
2148
|
+
return {
|
|
2149
|
+
overallScore: scoreCount > 0 ? totalScore / scoreCount : null,
|
|
2150
|
+
dimensions: dimAverages,
|
|
2151
|
+
testCount: results.length,
|
|
2152
|
+
successCount: results.filter(r => r.success).length,
|
|
2153
|
+
};
|
|
2154
|
+
};
|
|
2155
|
+
|
|
2156
|
+
const avg1 = calcAverages(results1);
|
|
2157
|
+
const avg2 = calcAverages(results2);
|
|
2158
|
+
|
|
2159
|
+
// Calculate deltas
|
|
2160
|
+
const deltas = {
|
|
2161
|
+
overallScore: avg2.overallScore != null && avg1.overallScore != null
|
|
2162
|
+
? avg2.overallScore - avg1.overallScore
|
|
2163
|
+
: null,
|
|
2164
|
+
dimensions: {},
|
|
2165
|
+
};
|
|
2166
|
+
|
|
2167
|
+
Object.keys(avg1.dimensions).forEach(dim => {
|
|
2168
|
+
if (avg1.dimensions[dim] != null && avg2.dimensions[dim] != null) {
|
|
2169
|
+
deltas.dimensions[dim] = avg2.dimensions[dim] - avg1.dimensions[dim];
|
|
2170
|
+
} else {
|
|
2171
|
+
deltas.dimensions[dim] = null;
|
|
2172
|
+
}
|
|
2173
|
+
});
|
|
2174
|
+
|
|
2175
|
+
res.json({
|
|
2176
|
+
success: true,
|
|
2177
|
+
run1: { id: runId1, ...avg1 },
|
|
2178
|
+
run2: { id: runId2, ...avg2 },
|
|
2179
|
+
deltas,
|
|
2180
|
+
improved: deltas.overallScore != null && deltas.overallScore > 0,
|
|
2181
|
+
});
|
|
2182
|
+
} catch (error) {
|
|
2183
|
+
console.error('[EvalRoutes] Compare runs error:', error);
|
|
2184
|
+
res.status(500).json({ error: 'Failed to compare runs', details: error.message });
|
|
2185
|
+
}
|
|
2186
|
+
});
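// Hedged usage sketch: comparing two runs and reporting per-dimension movement.
// Field names (run1, run2, deltas, improved) and the dimension keys mirror the
// response assembled above; the run IDs are placeholders.
async function exampleCompareRuns(runId1, runId2) {
  const data = await fetch(`/api/eval/compare-runs/${runId1}/${runId2}`).then(r => r.json());
  console.log(data.improved ? 'overall improvement' : 'no overall improvement');
  for (const [dim, delta] of Object.entries(data.deltas.dimensions)) {
    if (delta != null) console.log(`${dim}: ${delta >= 0 ? '+' : ''}${delta.toFixed(2)}`);
  }
}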
|
|
2187
|
+
|
|
2188
|
+
/**
|
|
2189
|
+
* Get dimension statistics across all runs for trend analysis
|
|
2190
|
+
* GET /api/eval/trends
|
|
2191
|
+
* Query params: profile, limit (default 50 individual results)
|
|
2192
|
+
*
|
|
2193
|
+
* Returns individual test results (not aggregated per run) for accurate trend visualization.
|
|
2194
|
+
* Each point represents a single evaluation, not an averaged run.
|
|
2195
|
+
*/
|
|
2196
|
+
router.get('/trends', (req, res) => {
|
|
2197
|
+
try {
|
|
2198
|
+
const { profile } = req.query;
|
|
2199
|
+
const limit = parseInt(req.query.limit) || 50;
|
|
2200
|
+
|
|
2201
|
+
// Get recent runs (fetch 3x the limit to account for fast-mode runs being filtered)
|
|
2202
|
+
// Many runs may be --fast (no AI scoring), so we need to fetch more to get enough scored results
|
|
2203
|
+
const runs = evaluationStore.listRuns({ limit: limit * 3 });
|
|
2204
|
+
|
|
2205
|
+
    // Helper to extract numeric score from potentially complex score objects
    const extractNumericScore = (scoreVal) => {
      if (scoreVal == null) return null;
      if (typeof scoreVal === 'number') return isNaN(scoreVal) ? null : scoreVal;
      if (typeof scoreVal === 'object' && scoreVal.score != null) {
        const s = scoreVal.score;
        return typeof s === 'number' && !isNaN(s) ? s : null;
      }
      return null;
    };
|
|
2215
|
+
|
|
2216
|
+
// Collect individual results from all runs
|
|
2217
|
+
const allResults = [];
|
|
2218
|
+
const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
2219
|
+
|
|
2220
|
+
for (const run of runs) {
|
|
2221
|
+
const results = evaluationStore.getResults(run.id);
|
|
2222
|
+
|
|
2223
|
+
// Use metadata.runType if available, fallback to parsing description
|
|
2224
|
+
const metadata = run.metadata || {};
|
|
2225
|
+
let runType = metadata.runType || 'eval';
|
|
2226
|
+
if (runType === 'eval' && run.description) {
|
|
2227
|
+
const desc = run.description.toLowerCase();
|
|
2228
|
+
if (desc.includes('matrix')) runType = 'matrix';
|
|
2229
|
+
else if (desc.includes('auto-improve')) runType = 'auto';
|
|
2230
|
+
else if (desc.includes('compare')) runType = 'compare';
|
|
2231
|
+
else if (desc.includes('quick')) runType = 'quick';
|
|
2232
|
+
}
|
|
2233
|
+
|
|
2234
|
+
for (const r of results) {
|
|
2235
|
+
// Filter by profile if specified
|
|
2236
|
+
if (profile && r.profileName !== profile) continue;
|
|
2237
|
+
|
|
2238
|
+
// Extract dimension scores
|
|
2239
|
+
const dimScores = {};
|
|
2240
|
+
dims.forEach(d => {
|
|
2241
|
+
dimScores[d] = extractNumericScore(r.scores?.[d]);
|
|
2242
|
+
});
|
|
2243
|
+
|
|
2244
|
+
allResults.push({
|
|
2245
|
+
runId: run.id,
|
|
2246
|
+
resultId: r.id,
|
|
2247
|
+
createdAt: r.createdAt || run.createdAt,
|
|
2248
|
+
description: run.description,
|
|
2249
|
+
runType,
|
|
2250
|
+
profileName: r.profileName,
|
|
2251
|
+
scenarioName: r.scenarioName,
|
|
2252
|
+
overallScore: extractNumericScore(r.overallScore),
|
|
2253
|
+
dimensions: dimScores,
|
|
2254
|
+
// Include testCount for the table display (how many tests in this run)
|
|
2255
|
+
testCount: results.length,
|
|
2256
|
+
// Include profiles array for compatibility with table display
|
|
2257
|
+
profiles: [r.profileName].filter(Boolean),
|
|
2258
|
+
});
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2261
|
+
|
|
2262
|
+
// Sort by createdAt (oldest first for charting) and limit
|
|
2263
|
+
allResults.sort((a, b) => new Date(a.createdAt).getTime() - new Date(b.createdAt).getTime());
|
|
2264
|
+
const trends = allResults.slice(-limit);
|
|
2265
|
+
|
|
2266
|
+
res.json({
|
|
2267
|
+
success: true,
|
|
2268
|
+
profile: profile || 'all',
|
|
2269
|
+
trends,
|
|
2270
|
+
totalResults: allResults.length,
|
|
2271
|
+
});
|
|
2272
|
+
} catch (error) {
|
|
2273
|
+
console.error('[EvalRoutes] Get trends error:', error);
|
|
2274
|
+
res.status(500).json({ error: 'Failed to get trends', details: error.message });
|
|
2275
|
+
}
|
|
2276
|
+
});
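// Hedged usage sketch: pulling trend points for one profile. Each point is an
// individual test result rather than a per-run average (see the JSDoc above), so a
// chart can plot overallScore against createdAt directly; the profile name is a
// placeholder.
async function exampleGetTrends(profile) {
  const { trends } = await fetch(`/api/eval/trends?profile=${profile}&limit=100`).then(r => r.json());
  return trends.map(t => ({ x: t.createdAt, y: t.overallScore, scenario: t.scenarioName }));
}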
|
|
2277
|
+
|
|
2278
|
+
// ============================================================================
|
|
2279
|
+
// Documentation Endpoints
|
|
2280
|
+
// ============================================================================
|
|
2281
|
+
|
|
2282
|
+
// Path to evaluation documentation directory
|
|
2283
|
+
const EVAL_DOCS_DIR = path.join(process.cwd(), 'markdown', 'eval');
|
|
2284
|
+
// Path to research documentation directory
|
|
2285
|
+
const RESEARCH_DOCS_DIR = path.join(process.cwd(), 'docs', 'research');
|
|
2286
|
+
|
|
2287
|
+
/**
|
|
2288
|
+
* List available evaluation documentation files
|
|
2289
|
+
* GET /api/eval/docs
|
|
2290
|
+
*/
|
|
2291
|
+
router.get('/docs', (req, res) => {
|
|
2292
|
+
try {
|
|
2293
|
+
if (!fs.existsSync(EVAL_DOCS_DIR)) {
|
|
2294
|
+
return res.json({ success: true, docs: [] });
|
|
2295
|
+
}
|
|
2296
|
+
|
|
2297
|
+
const files = fs.readdirSync(EVAL_DOCS_DIR)
|
|
2298
|
+
.filter(f => f.endsWith('.md'))
|
|
2299
|
+
.map(f => {
|
|
2300
|
+
const filePath = path.join(EVAL_DOCS_DIR, f);
|
|
2301
|
+
const stats = fs.statSync(filePath);
|
|
2302
|
+
// Extract a friendly title from filename
|
|
2303
|
+
const name = f.replace('.md', '');
|
|
2304
|
+
const title = name
|
|
2305
|
+
.replace(/-/g, ' ')
|
|
2306
|
+
.replace(/\b\w/g, l => l.toUpperCase());
|
|
2307
|
+
return {
|
|
2308
|
+
name,
|
|
2309
|
+
filename: f,
|
|
2310
|
+
title,
|
|
2311
|
+
size: stats.size,
|
|
2312
|
+
modified: stats.mtime.toISOString(),
|
|
2313
|
+
};
|
|
2314
|
+
})
|
|
2315
|
+
.sort((a, b) => a.title.localeCompare(b.title));
|
|
2316
|
+
|
|
2317
|
+
res.json({ success: true, docs: files });
|
|
2318
|
+
} catch (error) {
|
|
2319
|
+
console.error('[EvalRoutes] List docs error:', error);
|
|
2320
|
+
res.status(500).json({ error: 'Failed to list docs' });
|
|
2321
|
+
}
|
|
2322
|
+
});
|
|
2323
|
+
|
|
2324
|
+
/**
|
|
2325
|
+
* Get documentation file content
|
|
2326
|
+
* GET /api/eval/docs/:name
|
|
2327
|
+
*
|
|
2328
|
+
* Supports "research:" prefix to load from docs/research/ directory
|
|
2329
|
+
* e.g., /api/eval/docs/research:PAPER-DRAFT-RECOGNITION-TUTORING
|
|
2330
|
+
*/
|
|
2331
|
+
router.get('/docs/:name', (req, res) => {
|
|
2332
|
+
try {
|
|
2333
|
+
let docName = req.params.name;
|
|
2334
|
+
let docsDir = EVAL_DOCS_DIR;
|
|
2335
|
+
|
|
2336
|
+
// Check for research: prefix to load from docs/research/
|
|
2337
|
+
if (docName.startsWith('research:')) {
|
|
2338
|
+
docName = docName.substring('research:'.length);
|
|
2339
|
+
docsDir = RESEARCH_DOCS_DIR;
|
|
2340
|
+
}
|
|
2341
|
+
|
|
2342
|
+
const filename = docName.endsWith('.md')
|
|
2343
|
+
? docName
|
|
2344
|
+
: `${docName}.md`;
|
|
2345
|
+
const filePath = path.join(docsDir, filename);
|
|
2346
|
+
|
|
2347
|
+
if (!fs.existsSync(filePath)) {
|
|
2348
|
+
return res.status(404).json({ error: 'Documentation not found' });
|
|
2349
|
+
}
|
|
2350
|
+
|
|
2351
|
+
const content = fs.readFileSync(filePath, 'utf8');
|
|
2352
|
+
const stats = fs.statSync(filePath);
|
|
2353
|
+
|
|
2354
|
+
// Extract title from first heading or filename
|
|
2355
|
+
const titleMatch = content.match(/^#\s+(.+)$/m);
|
|
2356
|
+
const title = titleMatch
|
|
2357
|
+
? titleMatch[1]
|
|
2358
|
+
: docName.replace(/-/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
|
|
2359
|
+
|
|
2360
|
+
res.json({
|
|
2361
|
+
success: true,
|
|
2362
|
+
doc: {
|
|
2363
|
+
name: req.params.name,
|
|
2364
|
+
filename,
|
|
2365
|
+
title,
|
|
2366
|
+
content,
|
|
2367
|
+
size: stats.size,
|
|
2368
|
+
modified: stats.mtime.toISOString(),
|
|
2369
|
+
},
|
|
2370
|
+
});
|
|
2371
|
+
} catch (error) {
|
|
2372
|
+
console.error('[EvalRoutes] Get doc error:', error);
|
|
2373
|
+
res.status(500).json({ error: 'Failed to get documentation' });
|
|
2374
|
+
}
|
|
2375
|
+
});
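// Hedged usage sketch: the "research:" prefix handled above switches the lookup from
// the eval docs directory to docs/research. The document name is a placeholder.
async function exampleGetResearchDoc(name) {
  const { doc } = await fetch(`/api/eval/docs/research:${name}`).then(r => r.json());
  console.log(doc.title);
  return doc.content;
}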
|
|
2376
|
+
|
|
2377
|
+
// ============================================================================
|
|
2378
|
+
// Monitoring Endpoints
|
|
2379
|
+
// ============================================================================
|
|
2380
|
+
|
|
2381
|
+
/**
|
|
2382
|
+
* Get monitoring summary
|
|
2383
|
+
* GET /api/eval/monitor/summary
|
|
2384
|
+
*/
|
|
2385
|
+
router.get('/monitor/summary', (req, res) => {
|
|
2386
|
+
try {
|
|
2387
|
+
const summary = monitoringService.getMonitoringSummary();
|
|
2388
|
+
res.json({ success: true, ...summary });
|
|
2389
|
+
} catch (error) {
|
|
2390
|
+
console.error('[EvalRoutes] Monitor summary error:', error);
|
|
2391
|
+
res.status(500).json({ error: 'Failed to get monitoring summary' });
|
|
2392
|
+
}
|
|
2393
|
+
});
|
|
2394
|
+
|
|
2395
|
+
/**
|
|
2396
|
+
* Get active sessions
|
|
2397
|
+
* GET /api/eval/monitor/sessions
|
|
2398
|
+
*/
|
|
2399
|
+
router.get('/monitor/sessions', (req, res) => {
|
|
2400
|
+
try {
|
|
2401
|
+
const sessions = monitoringService.getActiveSessions();
|
|
2402
|
+
const aggregate = monitoringService.getAggregateMetrics();
|
|
2403
|
+
res.json({ success: true, sessions, aggregate });
|
|
2404
|
+
} catch (error) {
|
|
2405
|
+
console.error('[EvalRoutes] Monitor sessions error:', error);
|
|
2406
|
+
res.status(500).json({ error: 'Failed to get active sessions' });
|
|
2407
|
+
}
|
|
2408
|
+
});
|
|
2409
|
+
|
|
2410
|
+
/**
|
|
2411
|
+
* Get specific session details
|
|
2412
|
+
* GET /api/eval/monitor/sessions/:id
|
|
2413
|
+
*/
|
|
2414
|
+
router.get('/monitor/sessions/:id', (req, res) => {
|
|
2415
|
+
try {
|
|
2416
|
+
const session = monitoringService.getSession(req.params.id);
|
|
2417
|
+
if (!session) {
|
|
2418
|
+
return res.status(404).json({ error: 'Session not found' });
|
|
2419
|
+
}
|
|
2420
|
+
res.json({ success: true, session });
|
|
2421
|
+
} catch (error) {
|
|
2422
|
+
console.error('[EvalRoutes] Get session error:', error);
|
|
2423
|
+
res.status(500).json({ error: 'Failed to get session' });
|
|
2424
|
+
}
|
|
2425
|
+
});
|
|
2426
|
+
|
|
2427
|
+
/**
|
|
2428
|
+
* Get alerts
|
|
2429
|
+
* GET /api/eval/monitor/alerts
|
|
2430
|
+
* Query params: severity, acknowledged, limit
|
|
2431
|
+
*/
|
|
2432
|
+
router.get('/monitor/alerts', (req, res) => {
|
|
2433
|
+
try {
|
|
2434
|
+
const { severity, acknowledged, limit } = req.query;
|
|
2435
|
+
const options = {};
|
|
2436
|
+
if (severity) options.severity = severity;
|
|
2437
|
+
if (acknowledged !== undefined) options.acknowledged = acknowledged === 'true';
|
|
2438
|
+
if (limit) options.limit = parseInt(limit, 10);
|
|
2439
|
+
|
|
2440
|
+
const alerts = monitoringService.getAlerts(options);
|
|
2441
|
+
res.json({ success: true, alerts });
|
|
2442
|
+
} catch (error) {
|
|
2443
|
+
console.error('[EvalRoutes] Get alerts error:', error);
|
|
2444
|
+
res.status(500).json({ error: 'Failed to get alerts' });
|
|
2445
|
+
}
|
|
2446
|
+
});
|
|
2447
|
+
|
|
2448
|
+
/**
|
|
2449
|
+
* Acknowledge an alert
|
|
2450
|
+
* POST /api/eval/monitor/alerts/:id/acknowledge
|
|
2451
|
+
*/
|
|
2452
|
+
router.post('/monitor/alerts/:id/acknowledge', (req, res) => {
|
|
2453
|
+
try {
|
|
2454
|
+
const alert = monitoringService.acknowledgeAlert(req.params.id);
|
|
2455
|
+
if (!alert) {
|
|
2456
|
+
return res.status(404).json({ error: 'Alert not found' });
|
|
2457
|
+
}
|
|
2458
|
+
res.json({ success: true, alert });
|
|
2459
|
+
} catch (error) {
|
|
2460
|
+
console.error('[EvalRoutes] Acknowledge alert error:', error);
|
|
2461
|
+
res.status(500).json({ error: 'Failed to acknowledge alert' });
|
|
2462
|
+
}
|
|
2463
|
+
});
|
|
2464
|
+
|
|
2465
|
+
// ============================================================================
|
|
2466
|
+
// Run Completion & Recovery Endpoints
|
|
2467
|
+
// ============================================================================
|
|
2468
|
+
|
|
2469
|
+
/**
|
|
2470
|
+
* Complete an incomplete evaluation run
|
|
2471
|
+
* POST /api/eval/runs/:runId/complete
|
|
2472
|
+
*
|
|
2473
|
+
* Marks a stuck/interrupted run as completed with whatever results exist.
|
|
2474
|
+
*/
|
|
2475
|
+
router.post('/runs/:runId/complete', (req, res) => {
|
|
2476
|
+
try {
|
|
2477
|
+
const result = evaluationStore.completeRun(req.params.runId);
|
|
2478
|
+
res.json({ success: true, ...result });
|
|
2479
|
+
} catch (error) {
|
|
2480
|
+
console.error('[EvalRoutes] Complete run error:', error);
|
|
2481
|
+
res.status(500).json({ error: 'Failed to complete run', details: error.message });
|
|
2482
|
+
}
|
|
2483
|
+
});
|
|
2484
|
+
|
|
2485
|
+
/**
|
|
2486
|
+
* Get resumption status for an incomplete run
|
|
2487
|
+
* GET /api/eval/runs/:runId/resume-status
|
|
2488
|
+
*
|
|
2489
|
+
* Returns which tests have been completed and which remain,
|
|
2490
|
+
* enabling resumption of interrupted evaluations.
|
|
2491
|
+
*
|
|
2492
|
+
* Query params: profiles (comma-separated), scenarios (comma-separated or "all")
|
|
2493
|
+
*/
|
|
2494
|
+
router.get('/runs/:runId/resume-status', (req, res) => {
|
|
2495
|
+
try {
|
|
2496
|
+
const { runId } = req.params;
|
|
2497
|
+
const run = evaluationStore.getRun(runId);
|
|
2498
|
+
|
|
2499
|
+
if (!run) {
|
|
2500
|
+
return res.status(404).json({ error: 'Run not found' });
|
|
2501
|
+
}
|
|
2502
|
+
|
|
2503
|
+
// Get profiles and scenarios from query or run metadata
|
|
2504
|
+
const metadata = run.metadata ? JSON.parse(run.metadata) : {};
|
|
2505
|
+
let profiles = req.query.profiles ? req.query.profiles.split(',') : metadata.profiles || [];
|
|
2506
|
+
let scenariosParam = req.query.scenarios || metadata.scenarios || 'all';
|
|
2507
|
+
|
|
2508
|
+
if (profiles.length === 0) {
|
|
2509
|
+
return res.status(400).json({
|
|
2510
|
+
error: 'Profiles not specified',
|
|
2511
|
+
hint: 'Provide profiles as query param or ensure run metadata contains profiles',
|
|
2512
|
+
});
|
|
2513
|
+
}
|
|
2514
|
+
|
|
2515
|
+
// Get scenarios
|
|
2516
|
+
const allScenarios = tutorApi.listScenarios();
|
|
2517
|
+
const scenarios = scenariosParam === 'all'
|
|
2518
|
+
? allScenarios
|
|
2519
|
+
: allScenarios.filter(s => scenariosParam.includes(s.id));
|
|
2520
|
+
|
|
2521
|
+
// Get incomplete tests
|
|
2522
|
+
const status = evaluationStore.getIncompleteTests(runId, profiles, scenarios);
|
|
2523
|
+
|
|
2524
|
+
res.json({
|
|
2525
|
+
success: true,
|
|
2526
|
+
...status,
|
|
2527
|
+
runMetadata: {
|
|
2528
|
+
description: run.description,
|
|
2529
|
+
createdAt: run.createdAt,
|
|
2530
|
+
totalScenarios: run.totalScenarios,
|
|
2531
|
+
totalConfigurations: run.totalConfigurations,
|
|
2532
|
+
},
|
|
2533
|
+
});
|
|
2534
|
+
} catch (error) {
|
|
2535
|
+
console.error('[EvalRoutes] Resume status error:', error);
|
|
2536
|
+
res.status(500).json({ error: 'Failed to get resume status', details: error.message });
|
|
2537
|
+
}
|
|
2538
|
+
});
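// Hedged usage sketch: checking what remains of an interrupted run before resuming
// it. Profiles can be passed explicitly when the run metadata lacks them, per the
// validation above; runId and the profile name are placeholders, and the fields
// spread from evaluationStore.getIncompleteTests are not assumed here.
async function exampleResumeStatus(runId) {
  const status = await fetch(`/api/eval/runs/${runId}/resume-status?profiles=budget&scenarios=all`)
    .then(r => r.json());
  console.log('run metadata:', status.runMetadata);
  return status;
}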
|
|
2539
|
+
|
|
2540
|
+
// ============================================================================
|
|
2541
|
+
// Interaction Evaluation Endpoints (Learner-Tutor Dialogues)
|
|
2542
|
+
// ============================================================================
|
|
2543
|
+
|
|
2544
|
+
/**
|
|
2545
|
+
* List interaction evaluations
|
|
2546
|
+
* GET /api/eval/interactions
|
|
2547
|
+
* Query params: limit (default 50), scenarioId
|
|
2548
|
+
*/
|
|
2549
|
+
router.get('/interactions', (req, res) => {
|
|
2550
|
+
try {
|
|
2551
|
+
const limit = parseInt(req.query.limit) || 50;
|
|
2552
|
+
const scenarioId = req.query.scenarioId || null;
|
|
2553
|
+
|
|
2554
|
+
const evals = evaluationStore.listInteractionEvals({ limit, scenarioId });
|
|
2555
|
+
res.json({ success: true, evals, count: evals.length });
|
|
2556
|
+
} catch (error) {
|
|
2557
|
+
console.error('[EvalRoutes] List interactions error:', error);
|
|
2558
|
+
res.status(500).json({ error: 'Failed to list interaction evaluations' });
|
|
2559
|
+
}
|
|
2560
|
+
});
|
|
2561
|
+
|
|
2562
|
+
/**
|
|
2563
|
+
* Get a specific interaction evaluation
|
|
2564
|
+
* GET /api/eval/interactions/:evalId
|
|
2565
|
+
*/
|
|
2566
|
+
router.get('/interactions/:evalId', (req, res) => {
|
|
2567
|
+
try {
|
|
2568
|
+
const { evalId } = req.params;
|
|
2569
|
+
const evalData = evaluationStore.getInteractionEval(evalId);
|
|
2570
|
+
|
|
2571
|
+
if (!evalData) {
|
|
2572
|
+
return res.status(404).json({ error: 'Interaction evaluation not found' });
|
|
2573
|
+
}
|
|
2574
|
+
|
|
2575
|
+
res.json({ success: true, ...evalData });
|
|
2576
|
+
} catch (error) {
|
|
2577
|
+
console.error('[EvalRoutes] Get interaction error:', error);
|
|
2578
|
+
res.status(500).json({ error: 'Failed to get interaction evaluation' });
|
|
2579
|
+
}
|
|
2580
|
+
});
|
|
2581
|
+
|
|
2582
|
+
/**
|
|
2583
|
+
* Get mermaid sequence diagram for an interaction evaluation
|
|
2584
|
+
* GET /api/eval/interactions/:evalId/diagram
|
|
2585
|
+
*/
|
|
2586
|
+
router.get('/interactions/:evalId/diagram', (req, res) => {
|
|
2587
|
+
try {
|
|
2588
|
+
const { evalId } = req.params;
|
|
2589
|
+
const evalData = evaluationStore.getInteractionEval(evalId);
|
|
2590
|
+
|
|
2591
|
+
if (!evalData) {
|
|
2592
|
+
return res.status(404).json({ error: 'Interaction evaluation not found' });
|
|
2593
|
+
}
|
|
2594
|
+
|
|
2595
|
+
res.type('text/plain').send(evalData.sequenceDiagram || 'No diagram available');
|
|
2596
|
+
} catch (error) {
|
|
2597
|
+
console.error('[EvalRoutes] Get diagram error:', error);
|
|
2598
|
+
res.status(500).json({ error: 'Failed to get diagram' });
|
|
2599
|
+
}
|
|
2600
|
+
});
|
|
2601
|
+
|
|
2602
|
+
/**
|
|
2603
|
+
* Get formatted transcript for an interaction evaluation
|
|
2604
|
+
* GET /api/eval/interactions/:evalId/transcript
|
|
2605
|
+
*/
|
|
2606
|
+
router.get('/interactions/:evalId/transcript', (req, res) => {
|
|
2607
|
+
try {
|
|
2608
|
+
const { evalId } = req.params;
|
|
2609
|
+
const evalData = evaluationStore.getInteractionEval(evalId);
|
|
2610
|
+
|
|
2611
|
+
if (!evalData) {
|
|
2612
|
+
return res.status(404).json({ error: 'Interaction evaluation not found' });
|
|
2613
|
+
}
|
|
2614
|
+
|
|
2615
|
+
res.type('text/plain').send(evalData.formattedTranscript || 'No transcript available');
|
|
2616
|
+
} catch (error) {
|
|
2617
|
+
console.error('[EvalRoutes] Get transcript error:', error);
|
|
2618
|
+
res.status(500).json({ error: 'Failed to get transcript' });
|
|
2619
|
+
}
|
|
2620
|
+
});
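// Hedged usage sketch: the diagram and transcript endpoints above return plain text,
// so read them with res.text() rather than res.json(); evalId is a placeholder.
async function exampleGetInteractionTranscript(evalId) {
  return fetch(`/api/eval/interactions/${evalId}/transcript`).then(r => r.text());
}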

// ============================================================================
// Recognition A/B Comparison Endpoint
// ============================================================================

/**
 * Run Recognition A/B comparison with SSE streaming
 * GET /api/eval/stream/recognition-ab
 *
 * Compares baseline (no recognition) vs recognition (with recognition) profiles
 * using only recognition_test: true scenarios.
 *
 * Returns:
 * - Per-profile results with dimension scores
 * - Recognition metrics for recognition profile
 * - Delta analysis with statistical significance indicators
 * - Winner badges per dimension
 */
router.get('/stream/recognition-ab', async (req, res) => {
  // Set up SSE
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    Connection: 'keep-alive',
  });

  const sendEvent = (type, data) => {
    res.write(`event: ${type}\ndata: ${JSON.stringify(data)}\n\n`);
  };
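
  // For illustration only: each sendEvent call writes one SSE frame, so
  // sendEvent('progress', { current: 1, total: 6 }) would produce (values made up):
  //
  //   event: progress
  //   data: {"current":1,"total":6}
  //
  // The blank line terminates the frame, per the text/event-stream format.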

  // Keep-alive to prevent connection timeout
  const keepAlive = setInterval(() => {
    res.write(': keepalive\n\n');
  }, 15000);

  // Register stream for crash protection
  const streamId = registerStream(res, keepAlive);

  // Clean up on close
  req.on('close', () => {
    clearInterval(keepAlive);
    unregisterStream(streamId);
  });

  try {
    // Fixed profiles for A/B comparison
    const profiles = ['baseline', 'recognition'];
    const skipRubric = req.query.skipRubric === 'true';
    const outputSize = req.query.outputSize || 'normal';

    // Validate profiles exist
    const allProfiles = tutorApi.listProfiles();
    const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));

    if (validProfiles.length !== 2) {
      sendEvent('error', {
        error: 'Recognition A/B requires both baseline and recognition profiles',
        found: validProfiles,
        available: allProfiles.map(p => p.name),
      });
      return res.end();
    }

    // Get only recognition_test scenarios
    const allScenarios = tutorApi.listScenarios();
    const recognitionScenarios = allScenarios.filter(s => s.recognition_test === true);

    if (recognitionScenarios.length === 0) {
      sendEvent('error', { error: 'No recognition_test scenarios found in config' });
      return res.end();
    }

    const totalTests = validProfiles.length * recognitionScenarios.length;
    const testLearnerId = `eval-recognition-ab-${Date.now()}`;

    sendEvent('start', {
      profiles: validProfiles,
      scenarioCount: recognitionScenarios.length,
      scenarioIds: recognitionScenarios.map(s => s.id),
      totalTests,
      skipRubric,
      outputSize,
      testLearnerId,
      timestamp: new Date().toISOString(),
    });

    sendEvent('log', {
      message: `Recognition A/B: baseline vs recognition × ${recognitionScenarios.length} scenarios`,
      level: 'info',
    });

    // Create a run to persist results
    const run = evaluationStore.createRun({
      description: `Recognition A/B: baseline vs recognition × ${recognitionScenarios.length} scenarios`,
      totalScenarios: recognitionScenarios.length,
      totalConfigurations: 2,
      metadata: {
        runType: 'recognition-ab',
        profiles: validProfiles,
        scenarios: recognitionScenarios.map(s => s.id),
        scenarioNames: recognitionScenarios.map(s => s.name),
        skipRubric,
        testLearnerId,
      },
    });

    sendEvent('log', { message: `Run ID: ${run.id}`, level: 'info' });

    // Run evaluations
    const results = { baseline: [], recognition: [] };
    const dimensionScores = { baseline: {}, recognition: {} };
    const recognitionMetrics = {
      momentsGenerated: 0,
      dialecticalDepth: [],
      synthesisStrategies: {
        ghost_dominates: 0,
        learner_dominates: 0,
        dialectical_synthesis: 0,
      },
    };
    let completedTests = 0;

    for (const profileName of validProfiles) {
      sendEvent('log', { message: `\n=== Profile: ${profileName} ===`, level: 'info' });

      // Clear writing pad before each profile run for clean comparison
      try {
        clearConscious(testLearnerId);
        sendEvent('log', { message: `  Cleared writing pad for ${testLearnerId}`, level: 'info' });
      } catch (e) {
        // Pad may not exist yet, that's fine
      }

      for (const scenario of recognitionScenarios) {
        completedTests++;

        sendEvent('progress', {
          current: completedTests,
          total: totalTests,
          profile: profileName,
          scenario: scenario.name,
          percentage: Math.round((completedTests / totalTests) * 100),
        });

        sendEvent('log', { message: `[${completedTests}/${totalTests}] ${scenario.name}...`, level: 'info' });

        try {
          const config = { profileName, label: profileName };

          // Create log callback for this test
          const onLog = (message, level = 'info') => {
            sendEvent('log', { message: `    ${message}`, level, timestamp: new Date().toISOString() });
          };

          const result = await evaluationRunner.quickTest(config, {
            scenarioId: scenario.id,
            verbose: false,
            skipRubricEval: skipRubric,
            outputSize,
            onLog,
            learnerId: testLearnerId,
          });
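
          // Illustrative shape of `result` as consumed below (field names taken
          // from the reads in this handler; the values are made up):
          //   { success: true, overallScore: 4.2, latencyMs: 1830,
          //     inputTokens: 2100, outputTokens: 640,
          //     scores: { relevance: 4, specificity: 4.5, ... } }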

          results[profileName].push(result);

          // Save result to database
          evaluationStore.storeResult(run.id, {
            ...result,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            profileName,
          });

          // Collect dimension scores
          if (result.scores) {
            for (const [dim, score] of Object.entries(result.scores)) {
              if (!dimensionScores[profileName][dim]) {
                dimensionScores[profileName][dim] = [];
              }
              if (typeof score === 'number') {
                dimensionScores[profileName][dim].push(score);
              }
            }
          }

          // For recognition profile, collect recognition-specific metrics
          if (profileName === 'recognition') {
            try {
              const pad = getWritingPad(testLearnerId);
              if (pad) {
                recognitionMetrics.momentsGenerated = pad.totalRecognitionMoments || 0;
                if (pad.dialecticalDepth) {
                  recognitionMetrics.dialecticalDepth.push(pad.dialecticalDepth);
                }
                // Aggregate synthesis strategies from pad stats
                const stats = pad.statistics || {};
                if (stats.synthesisStrategies) {
                  recognitionMetrics.synthesisStrategies.ghost_dominates += stats.synthesisStrategies.ghost_dominates || 0;
                  recognitionMetrics.synthesisStrategies.learner_dominates += stats.synthesisStrategies.learner_dominates || 0;
                  recognitionMetrics.synthesisStrategies.dialectical_synthesis += stats.synthesisStrategies.dialectical_synthesis || 0;
                }
              }
            } catch (e) {
              // Recognition metrics collection failed silently
            }
          }

          const scoreStr = result.overallScore != null ? result.overallScore.toFixed(1) : 'N/A';
          const status = result.success !== false ? '✓' : '✗';
          sendEvent('log', {
            message: `  ${status} Score: ${scoreStr} (${result.latencyMs}ms)`,
            level: result.success !== false ? 'success' : 'warning',
          });

          sendEvent('result', {
            profile: profileName,
            scenarioId: scenario.id,
            scenarioName: scenario.name,
            passed: result.success !== false,
            score: result.overallScore,
            latencyMs: result.latencyMs,
            inputTokens: result.inputTokens,
            outputTokens: result.outputTokens,
          });

        } catch (e) {
          sendEvent('log', { message: `  ✗ Error: ${e.message}`, level: 'error' });

          const errorResult = {
            success: false,
            errorMessage: e.message,
            scenarioId: scenario.id,
          };
          results[profileName].push(errorResult);

          evaluationStore.storeResult(run.id, {
            ...errorResult,
            scenarioName: scenario.name,
            profileName,
            provider: 'unknown',
            model: 'unknown',
          });
        }
      }
    }

    // Update run as completed
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests: completedTests,
      completedAt: new Date().toISOString(),
    });

    // Build dimension averages
    const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
    const dimensionAverages = { baseline: {}, recognition: {} };

    for (const profile of validProfiles) {
      for (const dim of dimensions) {
        const scores = dimensionScores[profile]?.[dim] || [];
        dimensionAverages[profile][dim] = scores.length > 0
          ? scores.reduce((a, b) => a + b, 0) / scores.length
          : null;
      }
    }

    // Build delta analysis with winner indicators
    const deltaAnalysis = [];
    for (const dim of dimensions) {
      const baselineAvg = dimensionAverages.baseline[dim];
      const recognitionAvg = dimensionAverages.recognition[dim];

      if (baselineAvg != null && recognitionAvg != null) {
        const delta = recognitionAvg - baselineAvg;
        const deltaPercent = baselineAvg > 0 ? (delta / baselineAvg) * 100 : 0;

        // Significance thresholds (on 5-point scale)
        // * = >5% improvement (delta > 0.25)
        // ** = >10% improvement (delta > 0.5)
        let significance = '';
        let winner = null;

        if (Math.abs(delta) > 0.5) {
          significance = '**';
          winner = delta > 0 ? 'recognition' : 'baseline';
        } else if (Math.abs(delta) > 0.25) {
          significance = '*';
          winner = delta > 0 ? 'recognition' : 'baseline';
        }

        deltaAnalysis.push({
          dimension: dim,
          baseline: baselineAvg,
          recognition: recognitionAvg,
          delta,
          deltaPercent,
          significance,
          winner,
        });
      }
    }
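
    // Illustration only: a single deltaAnalysis entry ends up shaped like
    //   { dimension: 'relevance', baseline: 3.8, recognition: 4.3,
    //     delta: 0.5, deltaPercent: 13.2, significance: '*', winner: 'recognition' }
    // (values are made up; significance follows the thresholds above)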

    // Calculate overall scores and winner
    const baselineResults = results.baseline || [];
    const recognitionResults = results.recognition || [];

    const baselineScores = baselineResults.filter(r => r.overallScore != null).map(r => r.overallScore);
    const recognitionScores = recognitionResults.filter(r => r.overallScore != null).map(r => r.overallScore);

    const baselineAvgScore = baselineScores.length > 0
      ? baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length
      : null;
    const recognitionAvgScore = recognitionScores.length > 0
      ? recognitionScores.reduce((a, b) => a + b, 0) / recognitionScores.length
      : null;

    let overallWinner = null;
    let overallDelta = null;
    let overallSignificance = '';

    if (baselineAvgScore != null && recognitionAvgScore != null) {
      overallDelta = recognitionAvgScore - baselineAvgScore;

      // Overall winner based on score delta > 5 points
      if (Math.abs(overallDelta) > 10) {
        overallSignificance = '**';
        overallWinner = overallDelta > 0 ? 'recognition' : 'baseline';
      } else if (Math.abs(overallDelta) > 5) {
        overallSignificance = '*';
        overallWinner = overallDelta > 0 ? 'recognition' : 'baseline';
      }
    }

    // Calculate average dialectical depth
    const avgDialecticalDepth = recognitionMetrics.dialecticalDepth.length > 0
      ? recognitionMetrics.dialecticalDepth.reduce((a, b) => a + b, 0) / recognitionMetrics.dialecticalDepth.length
      : 0;

    sendEvent('log', { message: `\n=== Recognition A/B Complete ===`, level: 'success' });
    sendEvent('log', { message: `Total tests: ${completedTests}`, level: 'info' });
    sendEvent('log', {
      message: `Baseline avg: ${baselineAvgScore?.toFixed(1) || 'N/A'} | Recognition avg: ${recognitionAvgScore?.toFixed(1) || 'N/A'}`,
      level: 'info',
    });
    if (overallWinner) {
      sendEvent('log', { message: `Winner: ${overallWinner.toUpperCase()} (${overallSignificance})`, level: 'success' });
    }

    // Send final complete event with full results
    sendEvent('complete', {
      success: true,
      runId: run.id,
      profiles: validProfiles,
      scenariosRun: recognitionScenarios.length,
      dimensionAverages,
      deltaAnalysis,
      overallScores: {
        baseline: baselineAvgScore,
        recognition: recognitionAvgScore,
        delta: overallDelta,
        significance: overallSignificance,
        winner: overallWinner,
      },
      recognitionMetrics: {
        momentsGenerated: recognitionMetrics.momentsGenerated,
        avgDialecticalDepth,
        synthesisStrategies: recognitionMetrics.synthesisStrategies,
      },
      results,
    });

    unregisterStream(streamId);
    res.end();
  } catch (error) {
    sendEvent('log', { message: `Fatal error: ${error.message}`, level: 'error' });
    sendEvent('error', { error: error.message });
    unregisterStream(streamId);
    res.end();
  }
});
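
// Illustration only (not part of this module): a browser client might consume the
// recognition A/B stream above with EventSource. Event names match the sendEvent
// calls in the handler; the path prefix follows the JSDoc and skipRubric is optional.
//
//   const es = new EventSource('/api/eval/stream/recognition-ab?skipRubric=true');
//   es.addEventListener('progress', e => console.log(JSON.parse(e.data).percentage));
//   es.addEventListener('complete', e => { console.log(JSON.parse(e.data)); es.close(); });
//   es.addEventListener('error', () => es.close());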

export default router;