@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,739 @@
1
+ /**
2
+ * Evaluation Runner Service
3
+ *
4
+ * Orchestrates the evaluation of AI tutor configurations across
5
+ * test scenarios with rubric-based scoring.
6
+ */
7
+
8
+ import { tutorApiService as tutorApi, monitoringService } from '@machinespirits/tutor-core';
9
+ import * as rubricEvaluator from './rubricEvaluator.js';
10
+ import * as evaluationStore from './evaluationStore.js';
11
+
12
// Rate limiting settings
const DEFAULT_PARALLELISM = 2;       // Declared concurrency level (tests currently run sequentially)
const REQUEST_DELAY_MS = 500;        // Pause between consecutive tests to stay under provider rate limits
const MAX_RETRIES = 3;               // Max retries after a rate-limit (429) error before giving up
const INITIAL_RETRY_DELAY_MS = 2000; // Start with 2 seconds; doubles per retry (2s, 4s, 8s)
17
+
18
/**
 * Debug logging helper.
 * Output is suppressed when TUTOR_TRANSCRIPT=true so transcript mode stays clean.
 *
 * @param {...*} args - Values forwarded verbatim to console.log.
 */
function debugLog(...args) {
  const transcriptMode = process.env.TUTOR_TRANSCRIPT === 'true';
  if (!transcriptMode) {
    console.log(...args);
  }
}
24
+
25
/**
 * Sleep utility.
 *
 * @param {number} ms - Duration to wait, in milliseconds.
 * @returns {Promise<void>} Resolves after the given delay.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
31
+
32
/**
 * Retry wrapper for API calls with exponential backoff.
 * Handles 429 rate-limit errors (e.g. from the OpenRouter free tier) by
 * waiting 2s, 4s, 8s... between attempts. Any non-rate-limit error is
 * rethrown immediately without retrying.
 *
 * @param {Function} fn - Zero-argument async function to invoke.
 * @param {Object} [context] - Optional context; `context.log(message, level)` receives retry notices.
 * @param {number} [maxRetries] - Maximum retries after the first attempt.
 * @returns {Promise<*>} Resolved value of `fn`.
 * @throws The original error for non-rate-limit failures, or the last error once retries are exhausted.
 */
async function retryWithBackoff(fn, context = {}, maxRetries = MAX_RETRIES) {
  let lastError;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;

      // Detect rate limiting via the HTTP status when the client exposes it,
      // falling back to message sniffing for clients that only surface a string.
      const is429 = error?.status === 429 ||
        error?.response?.status === 429 ||
        error?.message?.includes('429') ||
        error?.message?.toLowerCase?.().includes('rate limit');

      // Give up on the final attempt, or immediately for non-429 errors.
      if (attempt === maxRetries || !is429) {
        throw error;
      }

      // Exponential backoff delay: 2s, 4s, 8s
      const delayMs = INITIAL_RETRY_DELAY_MS * Math.pow(2, attempt);

      debugLog(`[Retry ${attempt + 1}/${maxRetries}] Rate limit hit, waiting ${delayMs}ms before retry...`);
      if (context.log) {
        context.log(`Rate limit exceeded, retrying in ${delayMs / 1000}s (attempt ${attempt + 1}/${maxRetries})`, 'warning');
      }

      await sleep(delayMs);
    }
  }

  // Unreachable: the loop always returns or throws, but keep a guard.
  throw lastError;
}
70
+
71
/**
 * Run a complete evaluation across configurations and scenarios.
 *
 * @param {Object} options - Evaluation options.
 * @param {('all'|string[])} [options.scenarios='all'] - Scenario IDs to run, or 'all'.
 * @param {('all'|'profiles'|Object[])} [options.configurations='all'] - Configurations to test.
 * @param {number} [options.runsPerConfig=1] - Repetitions for statistical significance.
 * @param {number} [options.parallelism] - Declared concurrency (tests currently run sequentially).
 * @param {boolean} [options.skipRubricEval=false] - Skip AI-based rubric evaluation (faster).
 * @param {string|null} [options.description] - Human-readable run description.
 * @param {boolean} [options.verbose=false] - Log progress to the console.
 * @returns {Promise<Object>} { runId, totalTests, successfulTests, stats, scenarioStats }.
 * @throws {Error} When no scenarios or configurations resolve, or a test fails unrecoverably.
 */
export async function runEvaluation(options = {}) {
  const {
    scenarios = 'all', // Which scenarios to run ('all' or array of IDs)
    configurations = 'all', // Which configs to test ('all', 'profiles', or array)
    runsPerConfig = 1, // Repetitions for statistical significance
    parallelism = DEFAULT_PARALLELISM, // Reserved for future parallel execution
    skipRubricEval = false, // Skip AI-based rubric evaluation (faster)
    description = null,
    verbose = false,
  } = options;

  const log = verbose ? console.log : () => {};

  // Resolve scenarios
  const allScenarios = tutorApi.listScenarios();
  const targetScenarios = scenarios === 'all'
    ? allScenarios
    : allScenarios.filter(s => scenarios.includes(s.id));

  if (targetScenarios.length === 0) {
    throw new Error('No scenarios to run');
  }

  // Resolve configurations
  let targetConfigs = [];
  if (configurations === 'all') {
    targetConfigs = tutorApi.listConfigurations();
  } else if (configurations === 'profiles') {
    // Profile-based configs: provider/model come from the profile itself.
    const profiles = tutorApi.listProfiles();
    targetConfigs = profiles.map(p => ({
      provider: null,
      model: null,
      profileName: p.name,
      label: p.name,
    }));
  } else if (Array.isArray(configurations)) {
    targetConfigs = configurations;
  }

  if (targetConfigs.length === 0) {
    throw new Error('No configurations to test');
  }

  log(`\nStarting evaluation:`);
  log(`  Scenarios: ${targetScenarios.length}`);
  log(`  Configurations: ${targetConfigs.length}`);
  log(`  Runs per config: ${runsPerConfig}`);
  log(`  Total tests: ${targetScenarios.length * targetConfigs.length * runsPerConfig}`);

  // Create evaluation run record
  const run = evaluationStore.createRun({
    description: description || `Evaluation: ${targetConfigs.length} configs x ${targetScenarios.length} scenarios`,
    totalScenarios: targetScenarios.length,
    totalConfigurations: targetConfigs.length,
    metadata: {
      runsPerConfig,
      skipRubricEval,
    },
  });

  log(`\nRun ID: ${run.id}\n`);

  // Register with monitoring service for realtime tracking
  monitoringService.startSession(run.id, {
    userId: 'eval-runner',
    profileName: `${targetConfigs.length} configs`,
    modelId: 'evaluation-batch',
  });

  const results = [];
  let completedTests = 0;
  const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;

  try {
    // Run evaluations sequentially with a delay between tests for rate limiting.
    for (const config of targetConfigs) {
      log(`\nConfiguration: ${config.label || `${config.provider}/${config.model}`}`);
      log('='.repeat(60));

      for (const scenario of targetScenarios) {
        for (let runNum = 0; runNum < runsPerConfig; runNum++) {
          try {
            const result = await runSingleTest(scenario, config, {
              skipRubricEval,
              verbose,
            });

            // Store result
            evaluationStore.storeResult(run.id, result);
            results.push(result);

            completedTests++;
            log(` [${completedTests}/${totalTests}] ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);

            // Update monitoring session with progress
            monitoringService.recordEvent(run.id, {
              type: 'evaluation_test',
              inputTokens: result.inputTokens || 0,
              outputTokens: result.outputTokens || 0,
              latencyMs: result.latencyMs || 0,
              round: completedTests,
              approved: result.success,
            });

            // Rate limiting
            await sleep(REQUEST_DELAY_MS);
          } catch (error) {
            // Per-test failures are recorded and the run continues.
            log(` [${completedTests}/${totalTests}] ${scenario.id}: ERROR - ${error.message}`);
            completedTests++;

            // Record error in monitoring
            monitoringService.recordEvent(run.id, {
              type: 'evaluation_error',
              round: completedTests,
              error: error.message,
            });
          }
        }
      }
    }

    // Update run status
    evaluationStore.updateRun(run.id, {
      status: 'completed',
      totalTests: results.length,
      completedAt: new Date().toISOString(),
    });
  } catch (error) {
    // Unexpected failure outside the per-test handler (e.g. the store itself):
    // mark the run as failed so it does not linger in 'running' state forever.
    evaluationStore.updateRun(run.id, {
      status: 'failed',
      totalTests: results.length,
      completedAt: new Date().toISOString(),
      errorMessage: error.message,
    });
    throw error;
  } finally {
    // Always close the monitoring session, even on unexpected failure.
    monitoringService.endSession(run.id);
  }

  // Get aggregated stats
  const stats = evaluationStore.getRunStats(run.id);
  const scenarioStats = evaluationStore.getScenarioStats(run.id);

  log('\n' + '='.repeat(60));
  log('EVALUATION COMPLETE');
  log('='.repeat(60));
  log(`Run ID: ${run.id}`);
  log(`Total tests: ${results.length}`);
  log(`Successful: ${results.filter(r => r.success).length}`);

  return {
    runId: run.id,
    totalTests: results.length,
    successfulTests: results.filter(r => r.success).length,
    stats,
    scenarioStats,
  };
}
226
+
227
/**
 * Run a single test (one scenario + configuration combination).
 * Dispatches to the multi-turn runner when the scenario defines turns,
 * otherwise falls through to the single-turn path.
 *
 * @param {Object} scenario - Scenario summary ({ id, name }).
 * @param {Object} config - Configuration under test.
 * @param {Object} [options] - { skipRubricEval, outputSize, verbose, onLog, superegoStrategy }.
 * @returns {Promise<Object>} Result record from the single- or multi-turn runner.
 * @throws {Error} When the scenario ID does not resolve to a full scenario.
 */
async function runSingleTest(scenario, config, options = {}) {
  const { verbose = false, onLog } = options;

  // Unified logger: mirrors to the console (when verbose) and to the onLog callback.
  const log = (message, level = 'info') => {
    if (verbose) console.log(message);
    if (onLog) onLog(message, level);
  };

  const fullScenario = tutorApi.getScenario(scenario.id);
  if (!fullScenario) {
    throw new Error(`Scenario not found: ${scenario.id}`);
  }

  log(`Running scenario: ${scenario.name}`, 'info');

  // Multi-turn scenarios take a separate evaluation path.
  if (tutorApi.isMultiTurnScenario(scenario.id)) {
    log('Detected multi-turn scenario', 'info');
    return runMultiTurnTest(scenario, config, fullScenario, { ...options, log });
  }

  // Single-turn evaluation (original logic)
  return runSingleTurnTest(scenario, config, fullScenario, { ...options, log });
}
258
+
259
/**
 * Run a single-turn test: build the learner context, generate suggestions,
 * run rule-based validation, optionally score with the AI rubric judge, and
 * assemble a result record for the evaluation store.
 *
 * @param {Object} scenario - Scenario summary ({ id, name }).
 * @param {Object} config - Configuration under test (provider/model/profileName/egoModel/hyperparameters).
 * @param {Object} fullScenario - Full scenario definition from tutorApi.getScenario.
 * @param {Object} [options] - { skipRubricEval, outputSize, verbose, log, superegoStrategy }.
 * @returns {Promise<Object>} Result record with success flag, scores, and token/latency metadata.
 */
async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
  const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null } = options;

  // Build the learner context from the scenario definition
  log('Building learner context...', 'info');
  const context = tutorApi.buildContext(fullScenario.learner_context);
  context.isNewUser = fullScenario.is_new_user;

  // Generate suggestions
  log(`Generating suggestions with profile: ${config.profileName}`, 'info');
  log(`Provider: ${config.provider || 'from profile'}, Model: ${config.model || 'from profile'}`, 'info');
  if (config.egoModel) {
    log(`Ego model override: ${config.egoModel}`, 'info');
  }

  // Wrap API call with retry logic for rate limit handling
  const genResult = await retryWithBackoff(
    () => tutorApi.generateSuggestions(context, {
      provider: config.provider,
      model: config.model,
      egoModel: config.egoModel, // Override ego model for benchmarking
      profileName: config.profileName,
      hyperparameters: config.hyperparameters || {},
      trace: true, // Always capture trace for tension analysis
      superegoStrategy, // Pass through superego intervention strategy
      outputSize, // compact, normal, expanded - affects response length
    }),
    { log }
  );

  if (!genResult.success) {
    log(`Generation failed: ${genResult.error}`, 'error');
    // Failed generation still yields a result record so the run can aggregate failures.
    return {
      scenarioId: scenario.id,
      scenarioName: scenario.name,
      provider: config.provider || genResult.metadata?.provider,
      model: config.model || genResult.metadata?.model,
      profileName: config.profileName,
      success: false,
      errorMessage: genResult.error,
      latencyMs: genResult.metadata?.latencyMs,
    };
  }

  const suggestionCount = genResult.suggestions?.length || 0;
  log(`Generated ${suggestionCount} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');

  if (genResult.metadata?.dialogueRounds) {
    log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
  }

  // Quick validation (rule-based); only the first suggestion is checked.
  log('Running validation checks...', 'info');
  const suggestion = genResult.suggestions?.[0];
  const validation = suggestion
    ? rubricEvaluator.quickValidate(suggestion, {
        requiredElements: fullScenario.required_elements,
        forbiddenElements: fullScenario.forbidden_elements,
      })
    : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };

  log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validation.passesRequired && validation.passesForbidden ? 'success' : 'warning');

  let rubricResult = null;
  if (!skipRubricEval && suggestion) {
    // Full rubric evaluation with AI judge
    log('Running AI rubric evaluation...', 'info');
    debugLog(`[evaluationRunner] Running rubric evaluation for ${scenario.id}...`);
    rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
      name: fullScenario.name,
      description: fullScenario.description,
      expectedBehavior: fullScenario.expected_behavior,
      learnerContext: fullScenario.learner_context,
      requiredElements: fullScenario.required_elements,
      forbiddenElements: fullScenario.forbidden_elements,
    }, {});

    // Log rubric result summary
    if (rubricResult) {
      debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
        `overallScore=${rubricResult.overallScore}, ` +
        `scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
        `error=${rubricResult.error || 'none'}`);
      if (rubricResult.success) {
        log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
      } else {
        log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
      }
    }
  } else if (skipRubricEval) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
    log('Skipping AI rubric evaluation (fast mode)', 'info');
  } else if (!suggestion) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
    log('Skipping rubric evaluation (no suggestion generated)', 'warning');
  }

  // Calculate overall score
  let overallScore = null;
  if (rubricResult?.success) {
    overallScore = rubricResult.overallScore;
  } else if (suggestion) {
    // Fallback: simple validation-based score (50 points per passing check)
    overallScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
  }

  return {
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    provider: config.provider || genResult.metadata?.provider,
    model: config.model || genResult.metadata?.model,
    profileName: config.profileName,
    hyperparameters: config.hyperparameters,
    suggestions: genResult.suggestions,
    success: true,
    latencyMs: genResult.metadata?.latencyMs,
    inputTokens: genResult.metadata?.inputTokens,
    outputTokens: genResult.metadata?.outputTokens,
    dialogueRounds: genResult.metadata?.dialogueRounds,
    apiCalls: genResult.metadata?.apiCalls,
    cost: genResult.metadata?.totalCost, // OpenRouter API cost in USD
    dialogueId: genResult.metadata?.dialogueId, // For linking to logs
    // Flattened per-dimension scores; null when the rubric judge produced none.
    scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
      relevance: rubricResult.scores.relevance?.score,
      specificity: rubricResult.scores.specificity?.score,
      pedagogical: rubricResult.scores.pedagogical?.score,
      personalization: rubricResult.scores.personalization?.score,
      actionability: rubricResult.scores.actionability?.score,
      tone: rubricResult.scores.tone?.score,
    } : null,
    // Include full scores with reasoning for detailed analysis
    scoresWithReasoning: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0
      ? rubricResult.scores
      : null,
    overallScore,
    // Rubric verdicts win over rule-based validation when present (?? keeps false).
    passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
    passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
    requiredMissing: rubricResult?.requiredMissing || validation.requiredMissing,
    forbiddenFound: rubricResult?.forbiddenFound || validation.forbiddenFound,
    evaluatorModel: rubricResult?.evaluatorModel,
    evaluationReasoning: rubricResult?.summary,
    // Include dialogueResult for tension analysis
    dialogueResult: {
      dialogueTrace: genResult.dialogueTrace,
      dialogueRounds: genResult.metadata?.dialogueRounds,
      converged: genResult.metadata?.converged,
      dialogueId: genResult.metadata?.dialogueId,
    },
  };
}
412
+
413
/**
 * Run a multi-turn test: execute the scenario's full turn sequence through
 * tutorApi, evaluate each turn (validation + optional rubric judge), and
 * aggregate per-turn scores and usage metrics into one result record.
 *
 * @param {Object} scenario - Scenario summary ({ id, name }).
 * @param {Object} config - Configuration under test.
 * @param {Object} fullScenario - Full scenario definition (provides turns and thresholds).
 * @param {Object} [options] - { skipRubricEval, verbose, log }.
 * @returns {Promise<Object>} Aggregated result with per-turn breakdown in `turnResults`.
 * @throws {Error} When the multi-turn scenario produces no turn results.
 */
async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
  const { skipRubricEval = false, verbose = false } = options;
  // Prefer the structured logger handed down by runSingleTest (console + onLog
  // callback with levels). The previous local `verbose ? console.log : () => {}`
  // shadowed it and silently dropped onLog streaming for multi-turn scenarios.
  const log = options.log || (verbose ? console.log : () => {});

  log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);

  const turnResults = [];
  let totalLatencyMs = 0;
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  let totalApiCalls = 0;
  let totalCost = 0;

  // Run the multi-turn scenario through tutorApi (with retry for rate limits)
  const multiTurnResult = await retryWithBackoff(
    () => tutorApi.runMultiTurnScenario(scenario.id, {
      provider: config.provider,
      model: config.model,
      profileName: config.profileName,
      hyperparameters: config.hyperparameters || {},
      trace: verbose,
    }),
    { log }
  );

  // Validate that we got results
  if (!multiTurnResult.turnResults || multiTurnResult.turnResults.length === 0) {
    const errorMsg = `Multi-turn scenario returned no results (expected ${fullScenario.turns?.length + 1 || 1} turns)`;
    log(errorMsg, 'error');
    throw new Error(errorMsg);
  }

  // Evaluate each turn independently
  for (const turnResult of multiTurnResult.turnResults) {
    const suggestion = turnResult.suggestions?.[0];

    // Quick validation for this turn
    const validation = suggestion
      ? rubricEvaluator.quickValidate(suggestion, {
          requiredElements: turnResult.requiredElements,
          forbiddenElements: turnResult.forbiddenElements,
        })
      : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };

    let rubricResult = null;
    if (!skipRubricEval && suggestion) {
      log(`[evaluationRunner] Running rubric evaluation for turn ${turnResult.turnIndex}...`);
      rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
        name: `${fullScenario.name} - Turn ${turnResult.turnIndex}`,
        description: turnResult.turnId === 'initial' ? fullScenario.description : `Turn: ${turnResult.learnerAction}`,
        expectedBehavior: turnResult.expectedBehavior,
        learnerContext: turnResult.context,
        requiredElements: turnResult.requiredElements,
        forbiddenElements: turnResult.forbiddenElements,
      }, {});
    }

    // Calculate turn score: rubric score when available, else 50 points per passing check
    let turnScore = null;
    if (rubricResult?.success) {
      turnScore = rubricResult.overallScore;
    } else if (suggestion) {
      turnScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
    }

    turnResults.push({
      turnIndex: turnResult.turnIndex,
      turnId: turnResult.turnId,
      learnerAction: turnResult.learnerAction,
      expectedBehavior: turnResult.expectedBehavior,
      suggestion: suggestion,
      scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
        relevance: rubricResult.scores.relevance?.score,
        specificity: rubricResult.scores.specificity?.score,
        pedagogical: rubricResult.scores.pedagogical?.score,
        personalization: rubricResult.scores.personalization?.score,
        actionability: rubricResult.scores.actionability?.score,
        tone: rubricResult.scores.tone?.score,
      } : null,
      turnScore,
      passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
      passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
      requiredMissing: validation.requiredMissing,
      forbiddenFound: validation.forbiddenFound,
      minAcceptableScore: turnResult.minAcceptableScore || fullScenario.min_acceptable_score,
    });

    // Aggregate usage metrics across turns
    totalLatencyMs += turnResult.metadata?.latencyMs || 0;
    totalInputTokens += turnResult.metadata?.inputTokens || 0;
    totalOutputTokens += turnResult.metadata?.outputTokens || 0;
    totalApiCalls += turnResult.metadata?.apiCalls || 0;
    totalCost += turnResult.metadata?.totalCost || 0;
  }

  // Overall score = mean of scored turns (unscored turns are excluded)
  const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
  const overallScore = validTurnScores.length > 0
    ? validTurnScores.reduce((sum, s) => sum + s, 0) / validTurnScores.length
    : null;

  // Aggregate dimension scores (mean per dimension over turns that have it)
  const aggregateDimensions = {};
  const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
  for (const dim of dims) {
    const dimScores = turnResults
      .filter(t => t.scores?.[dim] !== undefined)
      .map(t => t.scores[dim]);
    if (dimScores.length > 0) {
      aggregateDimensions[dim] = dimScores.reduce((sum, s) => sum + s, 0) / dimScores.length;
    }
  }

  // Check if all turns pass their thresholds (unscored turns count as failing)
  const allTurnsPassed = turnResults.every(t => {
    if (t.turnScore === null) return false;
    const threshold = t.minAcceptableScore || fullScenario.min_acceptable_score || 0;
    return t.turnScore >= threshold;
  });

  log(`[evaluationRunner] Multi-turn complete: ${turnResults.length} turns, avgScore=${overallScore?.toFixed(1)}`);

  return {
    scenarioId: scenario.id,
    scenarioName: scenario.name,
    isMultiTurn: true,
    totalTurns: turnResults.length,
    provider: config.provider || multiTurnResult.turnResults[0]?.metadata?.provider,
    model: config.model || multiTurnResult.turnResults[0]?.metadata?.model,
    profileName: config.profileName,
    hyperparameters: config.hyperparameters,
    suggestions: multiTurnResult.turnResults.map(t => t.suggestions?.[0]).filter(Boolean),
    success: true,
    latencyMs: totalLatencyMs,
    inputTokens: totalInputTokens,
    outputTokens: totalOutputTokens,
    apiCalls: totalApiCalls,
    cost: totalCost, // OpenRouter API cost in USD
    dialogueId: multiTurnResult.dialogueId, // Single continuous dialogue ID for all turns
    dialogueRounds: multiTurnResult.turnResults.reduce((sum, t) => sum + (t.metadata?.dialogueRounds || 0), 0), // Total across all turns
    scores: Object.keys(aggregateDimensions).length > 0 ? aggregateDimensions : null,
    overallScore,
    turnResults,
    allTurnsPassed,
    passesRequired: turnResults.every(t => t.passesRequired),
    passesForbidden: turnResults.every(t => t.passesForbidden),
  };
}
566
+
567
/**
 * Compare two or more configurations over the same scenario set.
 * Runs a full evaluation, then ranks configurations by average score.
 *
 * @param {Object[]} configs - Configurations to compare.
 * @param {Object} [options] - { scenarios, runsPerConfig, verbose }.
 * @returns {Promise<Object>} { runId, configurations, rankings, scenarioBreakdown }.
 */
export async function compareConfigurations(configs, options = {}) {
  const {
    scenarios = 'all',
    runsPerConfig = 1,
    verbose = false,
  } = options;

  // Run evaluation with specified configs
  const result = await runEvaluation({
    scenarios,
    configurations: configs,
    runsPerConfig,
    verbose,
    description: `Comparison: ${configs.map(c => c.label || c.profileName || `${c.provider}/${c.model}`).join(' vs ')}`,
  });

  // Sort a copy: Array#sort mutates in place, and result.stats is shared
  // evaluation-store data that other consumers read in insertion order.
  const ranked = [...result.stats].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));

  // Build comparison
  const comparison = {
    runId: result.runId,
    configurations: configs,
    rankings: ranked.map((stat, i) => ({
      rank: i + 1,
      provider: stat.provider,
      model: stat.model,
      avgScore: stat.avgScore,
      successRate: stat.successRate,
      avgLatencyMs: stat.avgLatencyMs,
    })),
    scenarioBreakdown: result.scenarioStats,
  };

  return comparison;
}
603
+
604
/**
 * Quick test of a single configuration against one scenario.
 *
 * @param {Object} config - Configuration under test.
 * @param {Object} [options] - { scenarioId, verbose, skipRubricEval, outputSize, onLog, superegoStrategy }.
 * @returns {Promise<Object>} Result record from runSingleTest.
 * @throws {Error} When the scenario ID is unknown.
 */
export async function quickTest(config, options = {}) {
  const {
    scenarioId = 'new_user_first_visit',
    verbose = true,
    skipRubricEval = false,
    outputSize = 'normal', // compact, normal, expanded
    onLog,
    superegoStrategy = null, // Superego intervention strategy
  } = options;

  const scenario = tutorApi.listScenarios().find(s => s.id === scenarioId);
  if (!scenario) {
    throw new Error(`Scenario not found: ${scenarioId}`);
  }

  return runSingleTest(scenario, config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy });
}
625
+
626
/**
 * List available scenarios, configurations, and profiles.
 *
 * @returns {{scenarios: Object[], configurations: Object[], profiles: Object[]}}
 */
export function listOptions() {
  const scenarios = tutorApi.listScenarios();
  const configurations = tutorApi.listConfigurations();
  const profiles = tutorApi.listProfiles();

  return { scenarios, configurations, profiles };
}
636
+
637
/**
 * Get the stored results of a previous evaluation run.
 *
 * @param {string} runId - Evaluation run identifier.
 * @returns {{run: Object, stats: Object[], scenarioStats: Object[], results: Object[]}}
 * @throws {Error} When no run exists with the given ID.
 */
export function getRunResults(runId) {
  const run = evaluationStore.getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  const stats = evaluationStore.getRunStats(runId);
  const scenarioStats = evaluationStore.getScenarioStats(runId);
  const results = evaluationStore.getResults(runId);

  return { run, stats, scenarioStats, results };
}
653
+
654
/**
 * Generate a plain-text report for an evaluation run: configuration rankings,
 * per-dimension breakdown, and per-scenario performance.
 *
 * @param {string} runId - Evaluation run identifier.
 * @returns {string} Multi-line report text.
 * @throws {Error} When no run exists with the given ID.
 */
export function generateReport(runId) {
  const run = evaluationStore.getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  const stats = evaluationStore.getRunStats(runId);
  const scenarioStats = evaluationStore.getScenarioStats(runId);

  const lines = [];

  lines.push('='.repeat(80));
  lines.push(`TUTOR EVALUATION REPORT: ${runId}`);
  lines.push('='.repeat(80));
  lines.push('');
  lines.push(`Run Date: ${run.createdAt}`);
  lines.push(`Description: ${run.description || 'N/A'}`);
  lines.push(`Total Tests: ${run.totalTests}`);
  lines.push(`Status: ${run.status}`);
  lines.push('');

  // Rankings table
  lines.push('CONFIGURATION RANKINGS (by average score)');
  lines.push('-'.repeat(80));
  lines.push('| Rank | Configuration                    | Avg Score | Latency | Pass Rate |');
  lines.push('|------|----------------------------------|-----------|---------|-----------|');

  stats.forEach((stat, i) => {
    const label = `${stat.provider}/${stat.model}`.substring(0, 32).padEnd(32);
    // Explicit null checks: a legitimate score/latency of 0 is falsy and must
    // not render as N/A.
    const score = stat.avgScore != null ? stat.avgScore.toFixed(1).padStart(9) : 'N/A'.padStart(9);
    const latency = stat.avgLatencyMs != null ? `${stat.avgLatencyMs.toFixed(0)}ms`.padStart(7) : 'N/A'.padStart(7);
    const passRate = stat.validationPassRate != null
      ? `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(9)
      : 'N/A'.padStart(9);
    lines.push(`| ${(i + 1).toString().padStart(4)} | ${label} | ${score} | ${latency} | ${passRate} |`);
  });

  lines.push('');

  // Dimension breakdown
  if (stats.length > 0 && stats[0].dimensions) {
    lines.push('DIMENSION BREAKDOWN');
    lines.push('-'.repeat(80));

    const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
    const header = '| Dimension       |' + stats.map(s => ` ${s.model.substring(0, 12).padEnd(12)} |`).join('');
    lines.push(header);
    lines.push('|-----------------|' + stats.map(() => '--------------|').join(''));

    for (const dim of dims) {
      const row = `| ${dim.padEnd(15)} |` + stats.map(s => {
        const score = s.dimensions?.[dim];
        // != null keeps a valid 0.00 visible instead of collapsing to N/A.
        return ` ${score != null ? score.toFixed(2).padStart(12) : 'N/A'.padStart(12)} |`;
      }).join('');
      lines.push(row);
    }
    lines.push('');
  }

  // Scenario breakdown
  lines.push('SCENARIO PERFORMANCE');
  lines.push('-'.repeat(80));

  for (const scenario of scenarioStats) {
    lines.push(`\n${scenario.scenarioName} (${scenario.scenarioId})`);
    for (const config of scenario.configurations) {
      const status = config.passesValidation ? 'PASS' : 'FAIL';
      lines.push(`  ${config.provider}/${config.model}: ${config.avgScore?.toFixed(1) || 'N/A'} [${status}]`);
    }
  }

  lines.push('');
  lines.push('='.repeat(80));

  return lines.join('\n');
}
731
+
732
// Default export mirrors the named exports for consumers that prefer a single
// service object (e.g. `import evaluationRunner from './evaluationRunner.js'`).
export default {
  runEvaluation,
  compareConfigurations,
  quickTest,
  listOptions,
  getRunResults,
  generateReport,
};