@machinespirits/eval 0.1.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
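The largest addition is package/services/evaluationRunner.js, reproduced in full in the diff below. As orientation, here is a minimal usage sketch of its exported runEvaluation entry point. It is illustrative only and not part of the package contents; the deep import path is an assumption, since the diff does not show how index.js re-exports the service, and the option names and defaults are taken from the runEvaluation signature in the diff.

// Illustrative sketch only; not part of the published package.
// The deep import path below is an assumption -- adjust to match the package's index.js exports.
import evaluationRunner from '@machinespirits/eval/services/evaluationRunner.js';

// Evaluate every configured tutor profile against every scenario, one run each,
// with the AI rubric judge enabled (skipRubricEval: false).
const summary = await evaluationRunner.runEvaluation({
  scenarios: 'all',
  configurations: 'profiles',
  runsPerConfig: 1,
  skipRubricEval: false,
  verbose: true,
});

console.log(`Run ${summary.runId}: ${summary.successfulTests}/${summary.totalTests} tests succeeded`);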
package/services/evaluationRunner.js
@@ -0,0 +1,739 @@
/**
 * Evaluation Runner Service
 *
 * Orchestrates the evaluation of AI tutor configurations across
 * test scenarios with rubric-based scoring.
 */

import { tutorApiService as tutorApi, monitoringService } from '@machinespirits/tutor-core';
import * as rubricEvaluator from './rubricEvaluator.js';
import * as evaluationStore from './evaluationStore.js';

// Rate limiting settings
const DEFAULT_PARALLELISM = 2;
const REQUEST_DELAY_MS = 500;
const MAX_RETRIES = 3;
const INITIAL_RETRY_DELAY_MS = 2000; // Start with 2 seconds

// Debug logging helper - suppressed in transcript mode for clean output
function debugLog(...args) {
  if (process.env.TUTOR_TRANSCRIPT !== 'true') {
    console.log(...args);
  }
}

/**
 * Sleep utility
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Retry wrapper for API calls with exponential backoff
 * Handles 429 rate limit errors from OpenRouter free tier
 */
async function retryWithBackoff(fn, context = {}, maxRetries = MAX_RETRIES) {
  let lastError;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;

      // Check if it's a rate limit error (429)
      const is429 = error?.message?.includes('429') ||
                    error?.message?.includes('rate limit') ||
                    error?.message?.includes('Rate limit');

      // Don't retry on last attempt or non-429 errors
      if (attempt === maxRetries || !is429) {
        throw error;
      }

      // Calculate exponential backoff delay: 2s, 4s, 8s
      const delayMs = INITIAL_RETRY_DELAY_MS * Math.pow(2, attempt);

      debugLog(`[Retry ${attempt + 1}/${maxRetries}] Rate limit hit, waiting ${delayMs}ms before retry...`);
      if (context.log) {
        context.log(`Rate limit exceeded, retrying in ${delayMs / 1000}s (attempt ${attempt + 1}/${maxRetries})`, 'warning');
      }

      await sleep(delayMs);
    }
  }

  // Should never reach here, but throw last error just in case
  throw lastError;
}

/**
 * Run a complete evaluation across configurations and scenarios
 *
 * @param {Object} options - Evaluation options
 * @returns {Promise<Object>} Evaluation run results
 */
export async function runEvaluation(options = {}) {
  const {
    scenarios = 'all', // Which scenarios to run ('all' or array of IDs)
    configurations = 'all', // Which configs to test ('all', 'profiles', or array)
    runsPerConfig = 1, // Repetitions for statistical significance
    parallelism = DEFAULT_PARALLELISM,
    skipRubricEval = false, // Skip AI-based rubric evaluation (faster)
    description = null,
    verbose = false,
  } = options;

  const log = verbose ? console.log : () => {};

  // Resolve scenarios
  const allScenarios = tutorApi.listScenarios();
  const targetScenarios = scenarios === 'all'
    ? allScenarios
    : allScenarios.filter(s => scenarios.includes(s.id));

  if (targetScenarios.length === 0) {
    throw new Error('No scenarios to run');
  }

  // Resolve configurations
  let targetConfigs = [];
  if (configurations === 'all') {
    targetConfigs = tutorApi.listConfigurations();
  } else if (configurations === 'profiles') {
    const profiles = tutorApi.listProfiles();
    targetConfigs = profiles.map(p => ({
      provider: null,
      model: null,
      profileName: p.name,
      label: p.name,
    }));
  } else if (Array.isArray(configurations)) {
    targetConfigs = configurations;
  }

  if (targetConfigs.length === 0) {
    throw new Error('No configurations to test');
  }

  log(`\nStarting evaluation:`);
  log(` Scenarios: ${targetScenarios.length}`);
  log(` Configurations: ${targetConfigs.length}`);
  log(` Runs per config: ${runsPerConfig}`);
  log(` Total tests: ${targetScenarios.length * targetConfigs.length * runsPerConfig}`);

  // Create evaluation run record
  const run = evaluationStore.createRun({
    description: description || `Evaluation: ${targetConfigs.length} configs x ${targetScenarios.length} scenarios`,
    totalScenarios: targetScenarios.length,
    totalConfigurations: targetConfigs.length,
    metadata: {
      runsPerConfig,
      skipRubricEval,
    },
  });

  log(`\nRun ID: ${run.id}\n`);

  // Register with monitoring service for realtime tracking
  monitoringService.startSession(run.id, {
    userId: 'eval-runner',
    profileName: `${targetConfigs.length} configs`,
    modelId: 'evaluation-batch',
  });

  const results = [];
  let completedTests = 0;
  const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;

  // Run evaluations
  for (const config of targetConfigs) {
    log(`\nConfiguration: ${config.label || `${config.provider}/${config.model}`}`);
    log('='.repeat(60));

    for (const scenario of targetScenarios) {
      for (let runNum = 0; runNum < runsPerConfig; runNum++) {
        try {
          const result = await runSingleTest(scenario, config, {
            skipRubricEval,
            verbose,
          });

          // Store result
          evaluationStore.storeResult(run.id, result);
          results.push(result);

          completedTests++;
          log(` [${completedTests}/${totalTests}] ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);

          // Update monitoring session with progress
          monitoringService.recordEvent(run.id, {
            type: 'evaluation_test',
            inputTokens: result.inputTokens || 0,
            outputTokens: result.outputTokens || 0,
            latencyMs: result.latencyMs || 0,
            round: completedTests,
            approved: result.success,
          });

          // Rate limiting
          await sleep(REQUEST_DELAY_MS);
        } catch (error) {
          log(` [${completedTests}/${totalTests}] ${scenario.id}: ERROR - ${error.message}`);
          completedTests++;

          // Record error in monitoring
          monitoringService.recordEvent(run.id, {
            type: 'evaluation_error',
            round: completedTests,
            error: error.message,
          });
        }
      }
    }
  }

  // Update run status
  evaluationStore.updateRun(run.id, {
    status: 'completed',
    totalTests: results.length,
    completedAt: new Date().toISOString(),
  });

  // End monitoring session
  monitoringService.endSession(run.id);

  // Get aggregated stats
  const stats = evaluationStore.getRunStats(run.id);
  const scenarioStats = evaluationStore.getScenarioStats(run.id);

  log('\n' + '='.repeat(60));
  log('EVALUATION COMPLETE');
  log('='.repeat(60));
  log(`Run ID: ${run.id}`);
  log(`Total tests: ${results.length}`);
  log(`Successful: ${results.filter(r => r.success).length}`);

  return {
    runId: run.id,
    totalTests: results.length,
    successfulTests: results.filter(r => r.success).length,
    stats,
    scenarioStats,
  };
}

/**
 * Run a single test (scenario + config combination)
 * Handles both single-turn and multi-turn scenarios
 */
async function runSingleTest(scenario, config, options = {}) {
  const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null } = options;

  // Create a log function that calls both console and onLog callback
  const log = (message, level = 'info') => {
    if (verbose) console.log(message);
    if (onLog) onLog(message, level);
  };

  const fullScenario = tutorApi.getScenario(scenario.id);
  if (!fullScenario) {
    throw new Error(`Scenario not found: ${scenario.id}`);
  }

  log(`Running scenario: ${scenario.name}`, 'info');

  // Check if this is a multi-turn scenario
  const isMultiTurn = tutorApi.isMultiTurnScenario(scenario.id);

  if (isMultiTurn) {
    log('Detected multi-turn scenario', 'info');
    return runMultiTurnTest(scenario, config, fullScenario, { ...options, log });
  }

  // Single-turn evaluation (original logic)
  return runSingleTurnTest(scenario, config, fullScenario, { ...options, log });
}

/**
 * Run a single-turn test
 */
async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
  const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null } = options;

  // Build context
  log('Building learner context...', 'info');
  const context = tutorApi.buildContext(fullScenario.learner_context);
  context.isNewUser = fullScenario.is_new_user;

  // Generate suggestions
  log(`Generating suggestions with profile: ${config.profileName}`, 'info');
  log(`Provider: ${config.provider || 'from profile'}, Model: ${config.model || 'from profile'}`, 'info');
  if (config.egoModel) {
    log(`Ego model override: ${config.egoModel}`, 'info');
  }

  // Wrap API call with retry logic for rate limit handling
  const genResult = await retryWithBackoff(
    () => tutorApi.generateSuggestions(context, {
      provider: config.provider,
      model: config.model,
      egoModel: config.egoModel, // Override ego model for benchmarking
      profileName: config.profileName,
      hyperparameters: config.hyperparameters || {},
      trace: true, // Always capture trace for tension analysis
      superegoStrategy, // Pass through superego intervention strategy
      outputSize, // compact, normal, expanded - affects response length
    }),
    { log }
  );

  if (!genResult.success) {
    log(`Generation failed: ${genResult.error}`, 'error');
    return {
      scenarioId: scenario.id,
      scenarioName: scenario.name,
      provider: config.provider || genResult.metadata?.provider,
      model: config.model || genResult.metadata?.model,
      profileName: config.profileName,
      success: false,
      errorMessage: genResult.error,
      latencyMs: genResult.metadata?.latencyMs,
    };
  }

  const suggestionCount = genResult.suggestions?.length || 0;
  log(`Generated ${suggestionCount} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');

  if (genResult.metadata?.dialogueRounds) {
    log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
  }

  // Quick validation (rule-based)
  log('Running validation checks...', 'info');
  const suggestion = genResult.suggestions?.[0];
  const validation = suggestion
    ? rubricEvaluator.quickValidate(suggestion, {
        requiredElements: fullScenario.required_elements,
        forbiddenElements: fullScenario.forbidden_elements,
      })
    : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };

  log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validation.passesRequired && validation.passesForbidden ? 'success' : 'warning');

  let rubricResult = null;
  if (!skipRubricEval && suggestion) {
    // Full rubric evaluation with AI judge
    log('Running AI rubric evaluation...', 'info');
    debugLog(`[evaluationRunner] Running rubric evaluation for ${scenario.id}...`);
    rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
      name: fullScenario.name,
      description: fullScenario.description,
      expectedBehavior: fullScenario.expected_behavior,
      learnerContext: fullScenario.learner_context,
      requiredElements: fullScenario.required_elements,
      forbiddenElements: fullScenario.forbidden_elements,
    }, {});

    // Log rubric result summary
    if (rubricResult) {
      debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
        `overallScore=${rubricResult.overallScore}, ` +
        `scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
        `error=${rubricResult.error || 'none'}`);
      if (rubricResult.success) {
        log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
      } else {
        log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
      }
    }
  } else if (skipRubricEval) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
    log('Skipping AI rubric evaluation (fast mode)', 'info');
  } else if (!suggestion) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
    log('Skipping rubric evaluation (no suggestion generated)', 'warning');
  }

  // Calculate overall score
  let overallScore = null;
  if (rubricResult?.success) {
    overallScore = rubricResult.overallScore;
  } else if (suggestion) {
    // Fallback: simple validation-based score
    overallScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
  }

  return {
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    provider: config.provider || genResult.metadata?.provider,
    model: config.model || genResult.metadata?.model,
    profileName: config.profileName,
    hyperparameters: config.hyperparameters,
    suggestions: genResult.suggestions,
    success: true,
    latencyMs: genResult.metadata?.latencyMs,
    inputTokens: genResult.metadata?.inputTokens,
    outputTokens: genResult.metadata?.outputTokens,
    dialogueRounds: genResult.metadata?.dialogueRounds,
    apiCalls: genResult.metadata?.apiCalls,
    cost: genResult.metadata?.totalCost, // OpenRouter API cost in USD
    dialogueId: genResult.metadata?.dialogueId, // For linking to logs
    scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
      relevance: rubricResult.scores.relevance?.score,
      specificity: rubricResult.scores.specificity?.score,
      pedagogical: rubricResult.scores.pedagogical?.score,
      personalization: rubricResult.scores.personalization?.score,
      actionability: rubricResult.scores.actionability?.score,
      tone: rubricResult.scores.tone?.score,
    } : null,
    // Include full scores with reasoning for detailed analysis
    scoresWithReasoning: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0
      ? rubricResult.scores
      : null,
    overallScore,
    passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
    passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
    requiredMissing: rubricResult?.requiredMissing || validation.requiredMissing,
    forbiddenFound: rubricResult?.forbiddenFound || validation.forbiddenFound,
    evaluatorModel: rubricResult?.evaluatorModel,
    evaluationReasoning: rubricResult?.summary,
    // Include dialogueResult for tension analysis
    dialogueResult: {
      dialogueTrace: genResult.dialogueTrace,
      dialogueRounds: genResult.metadata?.dialogueRounds,
      converged: genResult.metadata?.converged,
      dialogueId: genResult.metadata?.dialogueId,
    },
  };
}

/**
 * Run a multi-turn test
 * Evaluates each turn and aggregates scores
 */
async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
  const { skipRubricEval = false, verbose = false } = options;
  const log = verbose ? console.log : () => {};

  log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);

  const turns = fullScenario.turns || [];
  const turnResults = [];
  let totalLatencyMs = 0;
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  let totalApiCalls = 0;
  let totalCost = 0;

  // Run the multi-turn scenario through tutorApi (with retry for rate limits)
  const multiTurnResult = await retryWithBackoff(
    () => tutorApi.runMultiTurnScenario(scenario.id, {
      provider: config.provider,
      model: config.model,
      profileName: config.profileName,
      hyperparameters: config.hyperparameters || {},
      trace: verbose,
    }),
    { log }
  );

  // Validate that we got results
  if (!multiTurnResult.turnResults || multiTurnResult.turnResults.length === 0) {
    const errorMsg = `Multi-turn scenario returned no results (expected ${fullScenario.turns?.length + 1 || 1} turns)`;
    log(errorMsg, 'error');
    throw new Error(errorMsg);
  }

  // Evaluate each turn
  for (const turnResult of multiTurnResult.turnResults) {
    const suggestion = turnResult.suggestions?.[0];

    // Quick validation for this turn
    const validation = suggestion
      ? rubricEvaluator.quickValidate(suggestion, {
          requiredElements: turnResult.requiredElements,
          forbiddenElements: turnResult.forbiddenElements,
        })
      : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };

    let rubricResult = null;
    if (!skipRubricEval && suggestion) {
      log(`[evaluationRunner] Running rubric evaluation for turn ${turnResult.turnIndex}...`);
      rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
        name: `${fullScenario.name} - Turn ${turnResult.turnIndex}`,
        description: turnResult.turnId === 'initial' ? fullScenario.description : `Turn: ${turnResult.learnerAction}`,
        expectedBehavior: turnResult.expectedBehavior,
        learnerContext: turnResult.context,
        requiredElements: turnResult.requiredElements,
        forbiddenElements: turnResult.forbiddenElements,
      }, {});
    }

    // Calculate turn score
    let turnScore = null;
    if (rubricResult?.success) {
      turnScore = rubricResult.overallScore;
    } else if (suggestion) {
      turnScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
    }

    turnResults.push({
      turnIndex: turnResult.turnIndex,
      turnId: turnResult.turnId,
      learnerAction: turnResult.learnerAction,
      expectedBehavior: turnResult.expectedBehavior,
      suggestion: suggestion,
      scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
        relevance: rubricResult.scores.relevance?.score,
        specificity: rubricResult.scores.specificity?.score,
        pedagogical: rubricResult.scores.pedagogical?.score,
        personalization: rubricResult.scores.personalization?.score,
        actionability: rubricResult.scores.actionability?.score,
        tone: rubricResult.scores.tone?.score,
      } : null,
      turnScore,
      passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
      passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
      requiredMissing: validation.requiredMissing,
      forbiddenFound: validation.forbiddenFound,
      minAcceptableScore: turnResult.minAcceptableScore || fullScenario.min_acceptable_score,
    });

    // Aggregate metrics
    totalLatencyMs += turnResult.metadata?.latencyMs || 0;
    totalInputTokens += turnResult.metadata?.inputTokens || 0;
    totalOutputTokens += turnResult.metadata?.outputTokens || 0;
    totalApiCalls += turnResult.metadata?.apiCalls || 0;
    totalCost += turnResult.metadata?.totalCost || 0;
  }

  // Calculate aggregate scores
  const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
  const overallScore = validTurnScores.length > 0
    ? validTurnScores.reduce((sum, s) => sum + s, 0) / validTurnScores.length
    : null;

  // Aggregate dimension scores
  const aggregateDimensions = {};
  const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
  for (const dim of dims) {
    const dimScores = turnResults
      .filter(t => t.scores?.[dim] !== undefined)
      .map(t => t.scores[dim]);
    if (dimScores.length > 0) {
      aggregateDimensions[dim] = dimScores.reduce((sum, s) => sum + s, 0) / dimScores.length;
    }
  }

  // Check if all turns pass their thresholds
  const allTurnsPassed = turnResults.every(t => {
    if (t.turnScore === null) return false;
    const threshold = t.minAcceptableScore || fullScenario.min_acceptable_score || 0;
    return t.turnScore >= threshold;
  });

  log(`[evaluationRunner] Multi-turn complete: ${turnResults.length} turns, avgScore=${overallScore?.toFixed(1)}`);

  return {
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    isMultiTurn: true,
    totalTurns: turnResults.length,
    provider: config.provider || multiTurnResult.turnResults[0]?.metadata?.provider,
    model: config.model || multiTurnResult.turnResults[0]?.metadata?.model,
    profileName: config.profileName,
    hyperparameters: config.hyperparameters,
    suggestions: multiTurnResult.turnResults.map(t => t.suggestions?.[0]).filter(Boolean),
    success: true,
    latencyMs: totalLatencyMs,
    inputTokens: totalInputTokens,
    outputTokens: totalOutputTokens,
    apiCalls: totalApiCalls,
    cost: totalCost, // OpenRouter API cost in USD
    dialogueId: multiTurnResult.dialogueId, // Single continuous dialogue ID for all turns
    dialogueRounds: multiTurnResult.turnResults.reduce((sum, t) => sum + (t.metadata?.dialogueRounds || 0), 0), // Total across all turns
    scores: Object.keys(aggregateDimensions).length > 0 ? aggregateDimensions : null,
    overallScore,
    turnResults,
    allTurnsPassed,
    passesRequired: turnResults.every(t => t.passesRequired),
    passesForbidden: turnResults.every(t => t.passesForbidden),
  };
}

/**
 * Compare two or more configurations
 */
export async function compareConfigurations(configs, options = {}) {
  const {
    scenarios = 'all',
    runsPerConfig = 1,
    verbose = false,
  } = options;

  // Run evaluation with specified configs
  const result = await runEvaluation({
    scenarios,
    configurations: configs,
    runsPerConfig,
    verbose,
    description: `Comparison: ${configs.map(c => c.label || c.profileName || `${c.provider}/${c.model}`).join(' vs ')}`,
  });

  // Build comparison
  const comparison = {
    runId: result.runId,
    configurations: configs,
    rankings: result.stats.sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0)).map((stat, i) => ({
      rank: i + 1,
      provider: stat.provider,
      model: stat.model,
      avgScore: stat.avgScore,
      successRate: stat.successRate,
      avgLatencyMs: stat.avgLatencyMs,
    })),
    scenarioBreakdown: result.scenarioStats,
  };

  return comparison;
}

/**
 * Quick test of a single configuration
 */
export async function quickTest(config, options = {}) {
  const {
    scenarioId = 'new_user_first_visit',
    verbose = true,
    skipRubricEval = false,
    outputSize = 'normal', // compact, normal, expanded
    onLog,
    superegoStrategy = null, // Superego intervention strategy
  } = options;

  const scenarios = [tutorApi.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
  if (scenarios.length === 0) {
    throw new Error(`Scenario not found: ${scenarioId}`);
  }

  const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy });
  return result;
}

/**
 * List available scenarios and configurations
 */
export function listOptions() {
  return {
    scenarios: tutorApi.listScenarios(),
    configurations: tutorApi.listConfigurations(),
    profiles: tutorApi.listProfiles(),
  };
}

/**
 * Get previous run results
 */
export function getRunResults(runId) {
  const run = evaluationStore.getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  return {
    run,
    stats: evaluationStore.getRunStats(runId),
    scenarioStats: evaluationStore.getScenarioStats(runId),
    results: evaluationStore.getResults(runId),
  };
}

/**
 * Generate a text report for a run
 */
export function generateReport(runId) {
  const run = evaluationStore.getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  const stats = evaluationStore.getRunStats(runId);
  const scenarioStats = evaluationStore.getScenarioStats(runId);

  const lines = [];

  lines.push('='.repeat(80));
  lines.push(`TUTOR EVALUATION REPORT: ${runId}`);
  lines.push('='.repeat(80));
  lines.push('');
  lines.push(`Run Date: ${run.createdAt}`);
  lines.push(`Description: ${run.description || 'N/A'}`);
  lines.push(`Total Tests: ${run.totalTests}`);
  lines.push(`Status: ${run.status}`);
  lines.push('');

  // Rankings table
  lines.push('CONFIGURATION RANKINGS (by average score)');
  lines.push('-'.repeat(80));
  lines.push('| Rank | Configuration | Avg Score | Latency | Pass Rate |');
  lines.push('|------|----------------------------------|-----------|---------|-----------|');

  stats.forEach((stat, i) => {
    const label = `${stat.provider}/${stat.model}`.substring(0, 32).padEnd(32);
    const score = stat.avgScore ? stat.avgScore.toFixed(1).padStart(9) : ' N/A';
    const latency = stat.avgLatencyMs ? `${stat.avgLatencyMs.toFixed(0)}ms`.padStart(7) : ' N/A';
    const passRate = `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(9);
    lines.push(`| ${(i + 1).toString().padStart(4)} | ${label} | ${score} | ${latency} | ${passRate} |`);
  });

  lines.push('');

  // Dimension breakdown
  if (stats.length > 0 && stats[0].dimensions) {
    lines.push('DIMENSION BREAKDOWN');
    lines.push('-'.repeat(80));

    const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
    const header = '| Dimension |' + stats.map(s => ` ${s.model.substring(0, 12).padEnd(12)} |`).join('');
    lines.push(header);
    lines.push('|-----------------|' + stats.map(() => '--------------|').join(''));

    for (const dim of dims) {
      const row = `| ${dim.padEnd(15)} |` + stats.map(s => {
        const score = s.dimensions?.[dim];
        return ` ${score ? score.toFixed(2).padStart(12) : ' N/A'} |`;
      }).join('');
      lines.push(row);
    }
    lines.push('');
  }

  // Scenario breakdown
  lines.push('SCENARIO PERFORMANCE');
  lines.push('-'.repeat(80));

  for (const scenario of scenarioStats) {
    lines.push(`\n${scenario.scenarioName} (${scenario.scenarioId})`);
    for (const config of scenario.configurations) {
      const status = config.passesValidation ? 'PASS' : 'FAIL';
      lines.push(` ${config.provider}/${config.model}: ${config.avgScore?.toFixed(1) || 'N/A'} [${status}]`);
    }
  }

  lines.push('');
  lines.push('='.repeat(80));

  return lines.join('\n');
}

export default {
  runEvaluation,
  compareConfigurations,
  quickTest,
  listOptions,
  getRunResults,
  generateReport,
};
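For a quick smoke test of a single configuration and a plain-text report afterwards, the other exports in this file can be combined roughly as in the following sketch. Again this is illustrative only: the profile name is a placeholder and the deep import path is an assumption, while the option and field names come from the quickTest and generateReport definitions above.

// Illustrative sketch only; 'balanced-tutor' is a placeholder profile name.
import { quickTest, generateReport } from '@machinespirits/eval/services/evaluationRunner.js';

// Run the default scenario against one profile, skipping the AI judge for speed.
// With skipRubricEval: true the overallScore falls back to the rule-based
// validation score (0, 50, or 100).
const result = await quickTest(
  { profileName: 'balanced-tutor' },
  { scenarioId: 'new_user_first_visit', skipRubricEval: true, verbose: false }
);
console.log(result.overallScore, result.passesRequired, result.passesForbidden);

// Given a completed run ID from runEvaluation, render the text report:
// console.log(generateReport(runId));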