@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -5,13 +5,227 @@
5
5
  * test scenarios with rubric-based scoring.
6
6
  */
7
7
 
8
- import { tutorApiService as tutorApi, monitoringService } from '@machinespirits/tutor-core';
8
+ import fs from 'fs';
9
+ import path from 'path';
10
+ import { fileURLToPath } from 'url';
11
+ import { execSync } from 'child_process';
12
+ import { tutorApiService as tutorApi, monitoringService, tutorDialogueEngine as dialogueEngine } from '@machinespirits/tutor-core';
9
13
  import * as rubricEvaluator from './rubricEvaluator.js';
10
14
  import * as evaluationStore from './evaluationStore.js';
15
+ import * as evalConfigLoader from './evalConfigLoader.js';
16
+ import * as contentResolver from './contentResolver.js';
17
+ import { ProgressLogger, getProgressLogPath } from './progressLogger.js';
18
+ import { StreamingReporter } from './streamingReporter.js';
19
+ import * as anovaStats from './anovaStats.js';
20
+ import { generateLearnerResponse } from './learnerTutorInteractionEngine.js';
21
+ import * as turnComparisonAnalyzer from './turnComparisonAnalyzer.js';
22
+ import * as dialogueTraceAnalyzer from './dialogueTraceAnalyzer.js';
23
+ import * as promptRewriter from './promptRewriter.js';
24
+
25
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
26
+ const EVAL_ROOT = path.resolve(__dirname, '..');
27
+ const LOGS_DIR = path.join(EVAL_ROOT, 'logs', 'tutor-dialogues');
28
+
29
+ // Redirect tutor-core logs to this repo's logs/ directory (if available)
30
+ import('@machinespirits/tutor-core').then(mod => {
31
+ if (typeof mod.setLogDir === 'function') mod.setLogDir(path.join(EVAL_ROOT, 'logs'));
32
+ }).catch(() => { /* setLogDir not available in this tutor-core version */ });
33
+
34
+ // Read package version once at import time
35
+ const pkg = JSON.parse(fs.readFileSync(path.join(EVAL_ROOT, 'package.json'), 'utf-8'));
36
+
37
+ /**
38
+ * Get the current git commit hash, or 'unknown' if not in a git repo.
39
+ */
40
+ function getGitCommitHash() {
41
+ try {
42
+ return execSync('git rev-parse --short HEAD', { cwd: EVAL_ROOT, encoding: 'utf-8' }).trim();
43
+ } catch {
44
+ return 'unknown';
45
+ }
46
+ }
47
+
48
+ import { isPidAlive } from './processUtils.js';
49
+
50
+ /**
51
+ * Eval-only profile names that need remapping to tutor-core profiles.
52
+ */
53
+ const EVAL_ONLY_PROFILES = [
54
+ 'single_baseline', 'single_baseline_paid',
55
+ 'single_recognition', 'single_recognition_paid',
56
+ 'single_enhanced',
57
+ 'baseline', 'baseline_paid',
58
+ 'recognition', 'recognition_paid',
59
+ 'enhanced',
60
+ 'cell_1_base_single_unified', 'cell_2_base_single_psycho',
61
+ 'cell_3_base_multi_unified', 'cell_4_base_multi_psycho',
62
+ 'cell_5_recog_single_unified', 'cell_6_recog_single_psycho',
63
+ 'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho',
64
+ 'cell_9_enhanced_single_unified', 'cell_10_enhanced_single_psycho',
65
+ 'cell_11_enhanced_multi_unified', 'cell_12_enhanced_multi_psycho',
66
+ 'cell_13_hardwired_single_unified', 'cell_14_hardwired_single_psycho',
67
+ 'cell_15_placebo_single_unified', 'cell_16_placebo_single_psycho',
68
+ 'cell_17_placebo_multi_unified', 'cell_18_placebo_multi_psycho',
69
+ 'cell_19_memory_single_unified', 'cell_20_recog_nomem_single_unified',
70
+ 'cell_21_recog_multi_unified_rewrite',
71
+ ];
72
+
73
+ /**
74
+ * Resolve an eval profile name into dialogue settings and a tutor-core profile.
75
+ *
76
+ * Eval profiles (cell_*, recognition, etc.) carry dialogue/recognition config that
77
+ * tutor-core doesn't know about. This function extracts those settings and maps the
78
+ * profile name to a tutor-core equivalent ('budget' or 'recognition').
79
+ *
80
+ * Exported for unit testing.
81
+ */
82
+ export function resolveEvalProfile(profileName) {
83
+ const evalProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[profileName];
84
+ const useDialogue = evalProfile?.dialogue?.enabled ?? false;
85
+ const maxRounds = evalProfile?.dialogue?.max_rounds ?? 0;
86
+ const recognitionMode = evalProfile?.recognition_mode ?? profileName?.includes('recognition') ?? false;
87
+
88
+ let resolvedProfileName = profileName;
89
+ if (profileName && EVAL_ONLY_PROFILES.includes(profileName)) {
90
+ // Map eval profile to tutor-core profile based on prompt_type
91
+ const promptType = evalProfile?.factors?.prompt_type;
92
+ if (promptType === 'enhanced') {
93
+ resolvedProfileName = 'enhanced';
94
+ } else if (promptType === 'placebo') {
95
+ resolvedProfileName = 'placebo';
96
+ } else if (promptType === 'hardwired') {
97
+ resolvedProfileName = 'hardwired';
98
+ } else if (promptType === 'memory') {
99
+ resolvedProfileName = 'memory';
100
+ } else if (promptType === 'recognition_nomem') {
101
+ resolvedProfileName = 'recognition_nomem';
102
+ } else if (recognitionMode) {
103
+ resolvedProfileName = 'recognition';
104
+ } else {
105
+ resolvedProfileName = 'budget';
106
+ }
107
+ }
108
+
109
+ return { useDialogue, maxRounds, recognitionMode, resolvedProfileName };
110
+ }
111
+
112
+ /**
113
+ * Resolve provider/model references in a config object through eval's providers.yaml.
114
+ * This ensures eval controls which model IDs get sent to tutorApi.
115
+ */
116
+ function resolveConfigModels(config) {
117
+ const resolved = { ...config };
118
+ if (config.provider && config.model) {
119
+ try {
120
+ const r = evalConfigLoader.resolveModel(`${config.provider}.${config.model}`);
121
+ resolved.provider = r.provider;
122
+ resolved.model = r.model;
123
+ } catch (e) { console.debug(`[evaluationRunner] resolveModel failed for ${config.provider}.${config.model}:`, e.message); }
124
+ }
125
+ if (config.egoModel) {
126
+ try {
127
+ const r = evalConfigLoader.resolveModel(config.egoModel);
128
+ resolved.egoModel = r.model;
129
+ resolved.egoProvider = r.provider;
130
+ } catch (e) { console.debug(`[evaluationRunner] resolveModel failed for egoModel ${config.egoModel}:`, e.message); }
131
+ }
132
+
133
+ // When a profileName is provided but no explicit provider/model,
134
+ // look up the profile from the eval repo's local tutor-agents.yaml
135
+ // and extract the ego provider/model as explicit overrides.
136
+ // Uses egoModel (not model) because tutor-core's generateSuggestions
137
+ // uses profileName to load its own config — egoModel is the override.
138
+ if (resolved.profileName && !resolved.provider && !resolved.model) {
139
+ const profile = evalConfigLoader.getTutorProfile(resolved.profileName);
140
+ if (profile?.ego) {
141
+ resolved.provider = profile.ego.resolvedProvider || profile.ego.provider;
142
+ resolved.model = profile.ego.resolvedModel || profile.ego.model;
143
+ // Pass egoModel as object { provider, model } — tutor-core's resolveModel()
144
+ // supports both string ("provider.model") and object formats, but aliases
145
+ // containing dots (e.g., "kimi-k2.5") break the string format's split('.').
146
+ resolved.egoModel = { provider: profile.ego.provider, model: profile.ego.model };
147
+ if (profile.ego.hyperparameters && !resolved.hyperparameters) {
148
+ resolved.hyperparameters = profile.ego.hyperparameters;
149
+ }
150
+ }
151
+ if (profile?.superego) {
152
+ resolved.superegoModel = { provider: profile.superego.provider, model: profile.superego.model };
153
+ if (profile.superego.hyperparameters && !resolved.superegoHyperparameters) {
154
+ resolved.superegoHyperparameters = profile.superego.hyperparameters;
155
+ }
156
+ }
157
+
158
+ // Extract factorial factor tags and learner architecture from profile
159
+ const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[resolved.profileName];
160
+ if (rawProfile?.factors) {
161
+ resolved.factors = rawProfile.factors;
162
+ }
163
+ if (rawProfile?.learner_architecture) {
164
+ resolved.learnerArchitecture = rawProfile.learner_architecture;
165
+ }
166
+ }
167
+
168
+ // Apply CLI --model override (replaces ego and superego models, preserves factorial metadata)
169
+ if (config.modelOverride) {
170
+ try {
171
+ const r = evalConfigLoader.resolveModel(config.modelOverride);
172
+ resolved.provider = r.provider;
173
+ resolved.model = r.model;
174
+ resolved.egoModel = { provider: r.provider, model: r.model };
175
+ if (resolved.superegoModel) {
176
+ resolved.superegoModel = { provider: r.provider, model: r.model };
177
+ }
178
+ } catch (e) {
179
+ throw new Error(`Invalid --model override "${config.modelOverride}": ${e.message}`);
180
+ }
181
+ }
182
+
183
+ // Apply CLI --ego-model override (replaces only ego model)
184
+ if (config.egoModelOverride) {
185
+ try {
186
+ const r = evalConfigLoader.resolveModel(config.egoModelOverride);
187
+ resolved.egoModel = { provider: r.provider, model: r.model };
188
+ // Also update top-level provider/model for compatibility
189
+ resolved.provider = r.provider;
190
+ resolved.model = r.model;
191
+ } catch (e) {
192
+ throw new Error(`Invalid --ego-model override "${config.egoModelOverride}": ${e.message}`);
193
+ }
194
+ }
195
+
196
+ // Apply CLI --superego-model override (replaces only superego model)
197
+ if (config.superegoModelOverride && resolved.superegoModel) {
198
+ try {
199
+ const r = evalConfigLoader.resolveModel(config.superegoModelOverride);
200
+ resolved.superegoModel = { provider: r.provider, model: r.model };
201
+ } catch (e) {
202
+ throw new Error(`Invalid --superego-model override "${config.superegoModelOverride}": ${e.message}`);
203
+ }
204
+ }
205
+
206
+ return resolved;
207
+ }
208
+
209
+ /**
210
+ * Filter scenarios by cluster name(s).
211
+ * Supported clusters: 'single-turn', 'multi-turn', or category names (core, mood, benchmark, recognition, multi_turn).
212
+ * Comma-separated values are OR'd together.
213
+ */
214
+ function applyScenarioFilter(scenarios, filter) {
215
+ const clusters = filter.split(',').map(s => s.trim().toLowerCase());
216
+ return scenarios.filter(s => {
217
+ for (const c of clusters) {
218
+ if (c === 'single-turn' && !s.isMultiTurn) return true;
219
+ if (c === 'multi-turn' && s.isMultiTurn) return true;
220
+ if (s.category === c) return true;
221
+ }
222
+ return false;
223
+ });
224
+ }
11
225
 
12
226
  // Rate limiting settings
13
- const DEFAULT_PARALLELISM = 2;
14
- const REQUEST_DELAY_MS = 500;
227
+ const DEFAULT_PARALLELISM = 3;
228
+ const REQUEST_DELAY_MS = 200;
15
229
  const MAX_RETRIES = 3;
16
230
  const INITIAL_RETRY_DELAY_MS = 2000; // Start with 2 seconds
17
231
 
@@ -29,6 +243,23 @@ function sleep(ms) {
29
243
  return new Promise(resolve => setTimeout(resolve, ms));
30
244
  }
31
245
 
246
+ /**
247
+ * Format a progress tag with percentage and elapsed time.
248
+ * @param {number} completed - Completed tests
249
+ * @param {number} total - Total tests
250
+ * @param {number} startTime - Start timestamp (Date.now())
251
+ * @returns {string} e.g. "[3/10] (30%) 1m 23s"
252
+ */
253
+ function formatProgress(completed, total, startTime) {
254
+ const pct = total > 0 ? Math.round((completed / total) * 100) : 0;
255
+ const elapsedMs = Date.now() - startTime;
256
+ const elapsedSec = Math.round(elapsedMs / 1000);
257
+ const min = Math.floor(elapsedSec / 60);
258
+ const sec = elapsedSec % 60;
259
+ const elapsed = min > 0 ? `${min}m ${sec}s` : `${sec}s`;
260
+ return `[${completed}/${total}] (${pct}%) ${elapsed}`;
261
+ }
262
+
32
263
  /**
33
264
  * Retry wrapper for API calls with exponential backoff
34
265
  * Handles 429 rate limit errors from OpenRouter free tier
@@ -68,6 +299,467 @@ async function retryWithBackoff(fn, context = {}, maxRetries = MAX_RETRIES) {
68
299
  throw lastError;
69
300
  }
70
301
 
302
+ // ---------------------------------------------------------------------------
303
+ // Structured context extraction — parse markdown learner context into
304
+ // labeled fields so the model can't miss key signals.
305
+ // See notes/baseline-prompt-v2-2026-02-02.md for rationale.
306
+ // ---------------------------------------------------------------------------
307
+
308
+ /**
309
+ * Extract key signals from markdown learner context and prepend a
310
+ * structured summary block. The original context is preserved below.
311
+ */
312
+ function structureLearnerContext(rawContext) {
313
+ if (!rawContext || typeof rawContext !== 'string') return rawContext;
314
+
315
+ const fields = {};
316
+
317
+ // User type
318
+ if (/\bnew user\b/i.test(rawContext)) {
319
+ fields['Learner Type'] = 'New user (no prior history)';
320
+ } else {
321
+ const sessionMatch = rawContext.match(/(\d+)\s+sessions?/i);
322
+ const eventMatch = rawContext.match(/(\d+)\s+total events?/i);
323
+ fields['Learner Type'] = 'Returning user' +
324
+ (sessionMatch ? `, ${sessionMatch[1]} sessions` : '') +
325
+ (eventMatch ? `, ${eventMatch[1]} events` : '');
326
+ }
327
+
328
+ // Current content
329
+ const viewingMatch = rawContext.match(/\*\*Currently viewing\*\*:\s*(.+)/);
330
+ if (viewingMatch) {
331
+ fields['Current Content'] = viewingMatch[1].trim();
332
+ }
333
+
334
+ // Struggle signals
335
+ const struggleMatch = rawContext.match(/\*\*Struggle signals? detected\*\*:\s*(\d+)/i);
336
+ if (struggleMatch) {
337
+ fields['Struggle Signals'] = `${struggleMatch[1]} detected`;
338
+ }
339
+
340
+ // Quiz/activity retries
341
+ const retryMatch = rawContext.match(/retried?\s+(\d+)\s+times?/i);
342
+ if (retryMatch) {
343
+ fields['Activity Retries'] = `${retryMatch[1]} retries`;
344
+ }
345
+ // Also check for "Retrying activity" lines
346
+ const retryLines = (rawContext.match(/Retrying activity/gi) || []).length;
347
+ if (retryLines > 0 && !retryMatch) {
348
+ fields['Activity Retries'] = `${retryLines} retries in timeline`;
349
+ }
350
+
351
+ // Primary struggle area
352
+ const struggleAreaMatch = rawContext.match(/\*\*Primary struggle area\*\*:\s*(.+)/);
353
+ if (struggleAreaMatch) {
354
+ fields['Primary Struggle'] = struggleAreaMatch[1].trim();
355
+ }
356
+
357
+ // Concept difficulty
358
+ const conceptMatch = rawContext.match(/\*\*Concept difficulty\*\*:\s*(.+)/);
359
+ if (conceptMatch) {
360
+ fields['Difficult Concepts'] = conceptMatch[1].trim();
361
+ }
362
+
363
+ // Mood / emotional signals from chat history
364
+ const chatLines = [];
365
+ const chatPattern = /- User:\s*"([^"]+)"/g;
366
+ let m;
367
+ while ((m = chatPattern.exec(rawContext)) !== null) {
368
+ chatLines.push(m[1]);
369
+ }
370
+ if (chatLines.length > 0) {
371
+ fields['Learner Messages'] = chatLines.join(' | ');
372
+ }
373
+
374
+ // Completed lectures
375
+ const completedMatch = rawContext.match(/\*\*Completed lectures?\*\*:\s*(.+)/);
376
+ if (completedMatch) {
377
+ fields['Completed Lectures'] = completedMatch[1].trim();
378
+ }
379
+
380
+ // Time on page
381
+ const timeMatch = rawContext.match(/\*\*Time on page\*\*:\s*(.+)/);
382
+ if (timeMatch) {
383
+ fields['Time on Page'] = timeMatch[1].trim();
384
+ }
385
+
386
+ // Scroll depth
387
+ const scrollMatch = rawContext.match(/\*\*Scroll depth\*\*:\s*(.+)/);
388
+ if (scrollMatch) {
389
+ fields['Scroll Depth'] = scrollMatch[1].trim();
390
+ }
391
+
392
+ // Performance / success rate
393
+ const avgScoreMatch = rawContext.match(/\*\*Average score\*\*:\s*(.+)/);
394
+ if (avgScoreMatch) {
395
+ fields['Average Score'] = avgScoreMatch[1].trim();
396
+ }
397
+
398
+ // Activities completion
399
+ const actCompMatch = rawContext.match(/\*\*Activities completed\*\*:\s*(.+)/);
400
+ if (actCompMatch) {
401
+ fields['Activities Completed'] = actCompMatch[1].trim();
402
+ }
403
+
404
+ // If no meaningful fields extracted, return original unchanged
405
+ const fieldKeys = Object.keys(fields);
406
+ if (fieldKeys.length <= 1) return rawContext; // only learner type
407
+
408
+ // Build structured summary block with explicit instruction header
409
+ const lines = [
410
+ '⚠️ YOU MUST REFERENCE AT LEAST ONE OF THESE SIGNALS BY NAME IN YOUR SUGGESTION:',
411
+ '<structured_context_summary>',
412
+ ];
413
+ for (const [key, value] of Object.entries(fields)) {
414
+ lines.push(`${key}: ${value}`);
415
+ }
416
+ lines.push('</structured_context_summary>');
417
+ lines.push('Your suggestion MUST mention specific data from the summary above. Generic responses are WRONG.');
418
+ lines.push('');
419
+
420
+ return lines.join('\n') + rawContext;
421
+ }
422
+
423
+ // ---------------------------------------------------------------------------
424
+ // Multi-turn context-building utilities (moved from multiTurnRunner.js)
425
+ // ---------------------------------------------------------------------------
426
+
427
+ /**
428
+ * Build updated context for a follow-up turn in a multi-turn scenario
429
+ */
430
+ function buildMultiTurnContext(options) {
431
+ const {
432
+ originalContext,
433
+ conversationHistory = [],
434
+ currentTurn,
435
+ previousSuggestion,
436
+ } = options;
437
+
438
+ const contextParts = [];
439
+
440
+ // sessionEvolution is now injected into the system prompt (not user context).
441
+ // See systemPromptExtension threading through generateAndEvaluateTurn → tutor-core.
442
+
443
+ contextParts.push(originalContext);
444
+
445
+ if (conversationHistory.length > 0) {
446
+ contextParts.push('\n### Conversation History');
447
+ for (const turn of conversationHistory) {
448
+ contextParts.push(formatTurnForContext(turn));
449
+ }
450
+ }
451
+
452
+ // Note: "Previous Tutor Suggestion" block removed — it duplicated the last
453
+ // entry already present in conversation history above.
454
+
455
+ if (currentTurn?.learner_action) {
456
+ contextParts.push('\n### Learner Action');
457
+ contextParts.push(formatLearnerAction(currentTurn));
458
+ }
459
+
460
+ if (currentTurn?.context_update) {
461
+ contextParts.push('\n' + currentTurn.context_update.trim());
462
+ }
463
+
464
+ return contextParts.join('\n');
465
+ }
466
+
467
+ /**
468
+ * Format a previous turn for inclusion in context
469
+ */
470
+ function formatTurnForContext(turn) {
471
+ const lines = [];
472
+ lines.push(`\n**Turn ${turn.turnIndex + 1}** (${turn.turnId})`);
473
+
474
+ if (turn.suggestion) {
475
+ lines.push(`- Tutor suggested: "${turn.suggestion.title || turn.suggestion.message?.substring(0, 100)}..."`);
476
+ if (turn.suggestion.actionTarget) {
477
+ lines.push(` - Action: ${turn.suggestion.action} → ${turn.suggestion.actionTarget}`);
478
+ }
479
+ }
480
+
481
+ if (turn.learnerAction) {
482
+ lines.push(`- Learner response: ${turn.learnerAction}`);
483
+ if (turn.learnerMessage) {
484
+ lines.push(` - Message: "${turn.learnerMessage}"`);
485
+ }
486
+ }
487
+
488
+ return lines.join('\n');
489
+ }
490
+
491
+ /**
492
+ * Format a suggestion for inclusion in conversation context
493
+ */
494
+ function formatSuggestionForContext(suggestion) {
495
+ const lines = [];
496
+
497
+ if (suggestion.title) {
498
+ lines.push(`**Title**: ${suggestion.title}`);
499
+ }
500
+ if (suggestion.message) {
501
+ lines.push(`**Message**: ${suggestion.message}`);
502
+ }
503
+ if (suggestion.action && suggestion.actionTarget) {
504
+ lines.push(`**Suggested Action**: ${suggestion.action} → ${suggestion.actionTarget}`);
505
+ }
506
+ // Note: reasoning intentionally excluded — it's internal justification that
507
+ // inflates context without helping the model generate the next suggestion.
508
+ // Title + message + action are sufficient for conversational continuity.
509
+
510
+ return lines.join('\n');
511
+ }
512
+
513
+ /**
514
+ * Format learner action for context
515
+ */
516
+ function formatLearnerAction(turn) {
517
+ const action = turn.learner_action;
518
+ const details = turn.action_details || {};
519
+ const lines = [];
520
+
521
+ switch (action) {
522
+ case 'followed_suggestion':
523
+ lines.push(`Learner **followed** the suggestion`);
524
+ if (details.action_taken) {
525
+ lines.push(`- Action: ${details.action_taken}`);
526
+ }
527
+ break;
528
+
529
+ case 'ignored_suggestion':
530
+ lines.push(`Learner **did not follow** the suggestion`);
531
+ if (details.explicit_rejection) {
532
+ lines.push(`- Explicitly rejected`);
533
+ }
534
+ break;
535
+
536
+ case 'asked_followup':
537
+ lines.push(`Learner **asked a follow-up question**`);
538
+ break;
539
+
540
+ case 'reported_confusion':
541
+ lines.push(`Learner **reported confusion**`);
542
+ break;
543
+
544
+ case 'completed_activity':
545
+ lines.push(`Learner **completed an activity**`);
546
+ if (details.activity_id) {
547
+ lines.push(`- Activity: ${details.activity_id}`);
548
+ }
549
+ if (details.success !== undefined) {
550
+ lines.push(`- Success: ${details.success}`);
551
+ }
552
+ if (details.score !== undefined) {
553
+ lines.push(`- Score: ${details.score}%`);
554
+ }
555
+ break;
556
+
557
+ default:
558
+ lines.push(`Learner action: ${action}`);
559
+ }
560
+
561
+ if (details.message) {
562
+ lines.push(`\n**Learner said**: "${details.message}"`);
563
+ }
564
+
565
+ return lines.join('\n');
566
+ }
567
+
568
+ /**
569
+ * Format learner action for transcript display (cleaner format for CLI)
570
+ */
571
+ function formatLearnerActionForTranscript(turn) {
572
+ const action = turn.learner_action;
573
+ const details = turn.action_details || {};
574
+ const lines = [];
575
+
576
+ const actionLabels = {
577
+ 'followed_suggestion': '✓ Followed suggestion',
578
+ 'ignored_suggestion': '✗ Ignored suggestion',
579
+ 'asked_followup': '❓ Asked follow-up question',
580
+ 'reported_confusion': '😕 Reported confusion',
581
+ 'completed_activity': '✅ Completed activity',
582
+ 'navigated_away': '🔄 Navigated away',
583
+ 'requested_hint': '💡 Requested hint',
584
+ };
585
+
586
+ lines.push(actionLabels[action] || `Action: ${action}`);
587
+
588
+ if (details.action_taken) {
589
+ lines.push(` → ${details.action_taken}`);
590
+ }
591
+ if (details.activity_id) {
592
+ lines.push(` Activity: ${details.activity_id}`);
593
+ }
594
+ if (details.success !== undefined) {
595
+ lines.push(` Success: ${details.success ? 'Yes' : 'No'}`);
596
+ }
597
+ if (details.score !== undefined) {
598
+ lines.push(` Score: ${details.score}%`);
599
+ }
600
+
601
+ if (details.message) {
602
+ lines.push(`\n "${details.message}"`);
603
+ }
604
+
605
+ return lines.join('\n');
606
+ }
607
+
608
+ // ---------------------------------------------------------------------------
609
+ // Shared generation + evaluation helper
610
+ // ---------------------------------------------------------------------------
611
+
612
+ /**
613
+ * Generate a tutor suggestion and evaluate it with the rubric.
614
+ *
615
+ * This is the single code path used by BOTH single-turn and multi-turn
616
+ * evaluations. It encapsulates:
617
+ * 1. retryWithBackoff → tutorApi.generateSuggestions
618
+ * 2. rubricEvaluator.quickValidate
619
+ * 3. rubricEvaluator.evaluateSuggestion (unless skipped)
620
+ *
621
+ * @param {Object} context - The learner context object (from tutorApi.buildContext)
622
+ * @param {Object} resolvedConfig - Resolved config with provider, model, egoModel, etc.
623
+ * @param {Object} turnMeta - Turn-level metadata for evaluation
624
+ * @param {string} turnMeta.scenarioName - Human-readable scenario name
625
+ * @param {string} turnMeta.description - Description for the rubric judge
626
+ * @param {string} turnMeta.expectedBehavior - Expected tutor behavior
627
+ * @param {string} turnMeta.learnerContext - Raw learner context string (for rubric)
628
+ * @param {string[]} turnMeta.requiredElements - Required elements for validation
629
+ * @param {string[]} turnMeta.forbiddenElements - Forbidden elements for validation
630
+ * @param {Object} options - Evaluation options
631
+ * @param {boolean} options.skipRubricEval
632
+ * @param {string} options.outputSize
633
+ * @param {string} options.superegoStrategy
634
+ * @param {string} options.judgeOverride
635
+ * @param {boolean} options.useDialogue
636
+ * @param {number} options.maxRounds
637
+ * @param {Function} options.log
638
+ * @param {string} options.scenarioId - Used for debug logging
639
+ * @returns {Promise<Object>} { genResult, suggestion, validation, rubricResult, turnScore }
640
+ */
641
+ async function generateAndEvaluateTurn(context, resolvedConfig, turnMeta, options = {}) {
642
+ const {
643
+ skipRubricEval = false,
644
+ outputSize = 'normal',
645
+ superegoStrategy = null,
646
+ judgeOverride = null,
647
+ useDialogue = false,
648
+ maxRounds = 0,
649
+ log = () => {},
650
+ scenarioId = '',
651
+ systemPromptExtension = null,
652
+ learnerId = null, // For Writing Pad memory persistence
653
+ } = options;
654
+
655
+ // Generate suggestions via tutor API with retry logic
656
+ const genResult = await retryWithBackoff(
657
+ () => tutorApi.generateSuggestions(context, {
658
+ provider: resolvedConfig.provider,
659
+ model: resolvedConfig.model,
660
+ egoModel: resolvedConfig.egoModel,
661
+ superegoModel: resolvedConfig.superegoModel || null,
662
+ profileName: resolvedConfig.profileName,
663
+ hyperparameters: resolvedConfig.hyperparameters || {},
664
+ trace: true,
665
+ superegoStrategy,
666
+ outputSize,
667
+ useDialogue,
668
+ maxRounds,
669
+ systemPromptExtension,
670
+ learnerId, // Activates Writing Pad three-layer memory
671
+ }),
672
+ { log }
673
+ );
674
+
675
+ if (!genResult.success) {
676
+ log(`Generation failed: ${genResult.error}`, 'error');
677
+ return { genResult, suggestion: null, validation: null, rubricResult: null, turnScore: null };
678
+ }
679
+
680
+ const suggestionCount = genResult.suggestions?.length || 0;
681
+ log(`Generated ${suggestionCount} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');
682
+
683
+ if (genResult.metadata?.dialogueRounds) {
684
+ log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
685
+ }
686
+
687
+ // Quick validation (rule-based)
688
+ log('Running validation checks...', 'info');
689
+ const suggestion = genResult.suggestions?.[0];
690
+ const validation = suggestion
691
+ ? rubricEvaluator.quickValidate(suggestion, {
692
+ requiredElements: turnMeta.requiredElements,
693
+ requiredElementsAny: turnMeta.requiredElementsAny,
694
+ forbiddenElements: turnMeta.forbiddenElements,
695
+ })
696
+ : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
697
+
698
+ log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validation.passesRequired && validation.passesForbidden ? 'success' : 'warning');
699
+
700
+ let rubricResult = null;
701
+ if (!skipRubricEval && suggestion) {
702
+ log('Running AI rubric evaluation...', 'info');
703
+ debugLog(`[evaluationRunner] Running rubric evaluation for ${scenarioId}...`);
704
+
705
+ // Build dialogue context for the judge (if available from multi-turn)
706
+ const dialogueContext = (options.conversationHistory || options.dialogueTrace || options.consolidatedTrace)
707
+ ? {
708
+ conversationHistory: options.conversationHistory || null,
709
+ dialogueTrace: options.dialogueTrace || null,
710
+ consolidatedTrace: options.consolidatedTrace || null,
711
+ }
712
+ : null;
713
+
714
+ rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
715
+ name: turnMeta.scenarioName,
716
+ description: turnMeta.description,
717
+ expectedBehavior: turnMeta.expectedBehavior,
718
+ learnerContext: turnMeta.learnerContext,
719
+ requiredElements: turnMeta.requiredElements,
720
+ forbiddenElements: turnMeta.forbiddenElements,
721
+ }, { dialogueContext }, { judgeOverride });
722
+
723
+ if (rubricResult) {
724
+ debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
725
+ `overallScore=${rubricResult.overallScore}, ` +
726
+ `scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
727
+ `error=${rubricResult.error || 'none'}`);
728
+ if (rubricResult.success) {
729
+ log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
730
+ } else {
731
+ log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
732
+ }
733
+ }
734
+ } else if (skipRubricEval) {
735
+ debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
736
+ log('Skipping AI rubric evaluation (fast mode)', 'info');
737
+ } else if (!suggestion) {
738
+ debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
739
+ log('Skipping rubric evaluation (no suggestion generated)', 'warning');
740
+ }
741
+
742
+ // Calculate turn score
743
+ let turnScore = null;
744
+ let scoringMethod = null;
745
+ if (rubricResult?.success) {
746
+ turnScore = rubricResult.overallScore;
747
+ scoringMethod = 'rubric';
748
+ } else if (suggestion && rubricResult && !rubricResult.success) {
749
+ // Judge API failed — do NOT silently produce a synthetic score.
750
+ // Store null so downstream aggregation excludes this data point.
751
+ turnScore = null;
752
+ scoringMethod = 'judge_failed';
753
+ log(`WARNING: Judge evaluation failed for ${scenarioId}; score stored as null (was: ${(validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0)} from keyword fallback). Error: ${rubricResult.error || 'unknown'}`, 'warning');
754
+ } else if (suggestion && !rubricResult) {
755
+ // Rubric evaluation was skipped (skipRubricEval=true) — no score available
756
+ turnScore = null;
757
+ scoringMethod = 'skipped';
758
+ }
759
+
760
+ return { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod };
761
+ }
762
+
71
763
  /**
72
764
  * Run a complete evaluation across configurations and scenarios
73
765
  *
@@ -83,16 +775,47 @@ export async function runEvaluation(options = {}) {
83
775
  skipRubricEval = false, // Skip AI-based rubric evaluation (faster)
84
776
  description = null,
85
777
  verbose = false,
778
+ scenarioFilter = null, // Cluster filter: 'single-turn', 'multi-turn', or category names
779
+ modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron")
780
+ egoModelOverride = null, // CLI --ego-model override (replaces only ego model)
781
+ superegoModelOverride = null, // CLI --superego-model override (replaces only superego model)
86
782
  } = options;
87
783
 
88
784
  const log = verbose ? console.log : () => {};
89
785
 
90
- // Resolve scenarios
91
- const allScenarios = tutorApi.listScenarios();
92
- const targetScenarios = scenarios === 'all'
786
+ // Log domain override env vars (always visible, not gated on verbose)
787
+ if (process.env.EVAL_CONTENT_PATH || process.env.EVAL_SCENARIOS_FILE) {
788
+ console.log('[evaluationRunner] Domain overrides detected:');
789
+ if (process.env.EVAL_CONTENT_PATH) console.log(` EVAL_CONTENT_PATH = ${process.env.EVAL_CONTENT_PATH}`);
790
+ if (process.env.EVAL_SCENARIOS_FILE) console.log(` EVAL_SCENARIOS_FILE = ${process.env.EVAL_SCENARIOS_FILE}`);
791
+ }
792
+
793
+ // Initialize content resolver from eval settings (opt-in)
794
+ const contentConfig = evalConfigLoader.getContentConfig();
795
+ if (contentConfig?.content_package_path) {
796
+ contentResolver.configure({
797
+ contentPackagePath: contentConfig.content_package_path,
798
+ maxLectureChars: contentConfig.max_lecture_chars,
799
+ includeSpeakerNotes: contentConfig.include_speaker_notes,
800
+ });
801
+ if (contentResolver.isConfigured()) {
802
+ console.log(`[evaluationRunner] Content: ${contentConfig.content_package_path}`);
803
+ } else {
804
+ console.warn('[evaluationRunner] Content path set but directory not found — using fallback curriculum');
805
+ }
806
+ }
807
+
808
+ // Resolve scenarios (loaded from eval repo's local rubric)
809
+ const allScenarios = evalConfigLoader.listScenarios();
810
+ let targetScenarios = scenarios === 'all'
93
811
  ? allScenarios
94
812
  : allScenarios.filter(s => scenarios.includes(s.id));
95
813
 
814
+ // Apply cluster filter if specified
815
+ if (scenarioFilter) {
816
+ targetScenarios = applyScenarioFilter(targetScenarios, scenarioFilter);
817
+ }
818
+
96
819
  if (targetScenarios.length === 0) {
97
820
  throw new Error('No scenarios to run');
98
821
  }
@@ -100,9 +823,22 @@ export async function runEvaluation(options = {}) {
100
823
  // Resolve configurations
101
824
  let targetConfigs = [];
102
825
  if (configurations === 'all') {
103
- targetConfigs = tutorApi.listConfigurations();
826
+ targetConfigs = evalConfigLoader.listConfigurations();
827
+ } else if (configurations === 'factorial') {
828
+ const FACTORIAL_CELLS = [
829
+ 'cell_1_base_single_unified', 'cell_2_base_single_psycho',
830
+ 'cell_3_base_multi_unified', 'cell_4_base_multi_psycho',
831
+ 'cell_5_recog_single_unified', 'cell_6_recog_single_psycho',
832
+ 'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho',
833
+ ];
834
+ targetConfigs = FACTORIAL_CELLS.map(name => ({
835
+ provider: null,
836
+ model: null,
837
+ profileName: name,
838
+ label: name,
839
+ }));
104
840
  } else if (configurations === 'profiles') {
105
- const profiles = tutorApi.listProfiles();
841
+ const profiles = evalConfigLoader.listTutorProfiles();
106
842
  targetConfigs = profiles.map(p => ({
107
843
  provider: null,
108
844
  model: null,
@@ -113,6 +849,24 @@ export async function runEvaluation(options = {}) {
113
849
  targetConfigs = configurations;
114
850
  }
115
851
 
852
+ // Apply model overrides: CLI flags take precedence over YAML-level config
853
+ const yamlOverrides = evalConfigLoader.getTutorModelOverrides();
854
+
855
+ // Effective overrides: CLI > YAML > none
856
+ const effectiveModelOverride = modelOverride || yamlOverrides.modelOverride;
857
+ const effectiveEgoModelOverride = egoModelOverride || yamlOverrides.egoModelOverride;
858
+ const effectiveSuperegoModelOverride = superegoModelOverride || yamlOverrides.superegoModelOverride;
859
+
860
+ if (effectiveModelOverride) {
861
+ targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride: effectiveModelOverride }));
862
+ }
863
+ if (effectiveEgoModelOverride) {
864
+ targetConfigs = targetConfigs.map(c => ({ ...c, egoModelOverride: effectiveEgoModelOverride }));
865
+ }
866
+ if (effectiveSuperegoModelOverride) {
867
+ targetConfigs = targetConfigs.map(c => ({ ...c, superegoModelOverride: effectiveSuperegoModelOverride }));
868
+ }
869
+
116
870
  if (targetConfigs.length === 0) {
117
871
  throw new Error('No configurations to test');
118
872
  }
@@ -123,7 +877,7 @@ export async function runEvaluation(options = {}) {
123
877
  log(` Runs per config: ${runsPerConfig}`);
124
878
  log(` Total tests: ${targetScenarios.length * targetConfigs.length * runsPerConfig}`);
125
879
 
126
- // Create evaluation run record
880
+ // Create evaluation run record with reproducibility metadata
127
881
  const run = evaluationStore.createRun({
128
882
  description: description || `Evaluation: ${targetConfigs.length} configs x ${targetScenarios.length} scenarios`,
129
883
  totalScenarios: targetScenarios.length,
@@ -131,10 +885,51 @@ export async function runEvaluation(options = {}) {
131
885
  metadata: {
132
886
  runsPerConfig,
133
887
  skipRubricEval,
888
+ modelOverride: effectiveModelOverride || null,
889
+ egoModelOverride: effectiveEgoModelOverride || null,
890
+ superegoModelOverride: effectiveSuperegoModelOverride || null,
891
+ // Store scenario IDs and profile names for accurate resume
892
+ scenarioIds: targetScenarios.map(s => s.id),
893
+ profileNames: targetConfigs.map(c => c.profileName).filter(Boolean),
894
+ // Store env overrides so evaluate/rejudge can re-apply them
895
+ scenariosFile: process.env.EVAL_SCENARIOS_FILE || null,
896
+ contentPath: process.env.EVAL_CONTENT_PATH || null,
897
+ packageVersion: pkg.version,
898
+ gitCommit: getGitCommitHash(),
899
+ pid: process.pid,
134
900
  },
135
901
  });
136
902
 
137
- log(`\nRun ID: ${run.id}\n`);
903
+ const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
904
+
905
+ // Store total_tests upfront so progress can be tracked for in-progress runs
906
+ evaluationStore.updateRun(run.id, { status: 'running', totalTests });
907
+
908
+ const profileNames = targetConfigs.map(c => c.label || c.profileName || `${c.provider}/${c.model}`);
909
+ const scenarioNames = targetScenarios.map(s => s.name || s.id);
910
+
911
+ // Print run ID + progress log path immediately so users can `watch`
912
+ const progressLogPath = getProgressLogPath(run.id);
913
+ console.log(`\nRun ID: ${run.id} (use 'watch ${run.id}' to monitor)`);
914
+ console.log(`Progress log: ${progressLogPath}\n`);
915
+
916
+ // Instantiate progress logger and streaming reporter
917
+ const progressLogger = new ProgressLogger(run.id);
918
+ const reporter = new StreamingReporter({
919
+ totalTests,
920
+ totalScenarios: targetScenarios.length,
921
+ profiles: profileNames,
922
+ scenarios: scenarioNames,
923
+ });
924
+
925
+ progressLogger.runStart({
926
+ totalTests,
927
+ totalScenarios: targetScenarios.length,
928
+ totalConfigurations: targetConfigs.length,
929
+ scenarios: scenarioNames,
930
+ profiles: profileNames,
931
+ description: description || run.description,
932
+ });
138
933
 
139
934
  // Register with monitoring service for realtime tracking
140
935
  monitoringService.startSession(run.id, {
@@ -145,59 +940,223 @@ export async function runEvaluation(options = {}) {
145
940
 
146
941
  const results = [];
147
942
  let completedTests = 0;
148
- const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
149
-
150
- // Run evaluations
151
- for (const config of targetConfigs) {
152
- log(`\nConfiguration: ${config.label || `${config.provider}/${config.model}`}`);
153
- log('='.repeat(60));
154
943
 
155
- for (const scenario of targetScenarios) {
944
+ // Build flat list of all tests — SCENARIO-FIRST ordering
945
+ // All profiles for scenario 1 complete before scenario 2 starts.
946
+ const allTests = [];
947
+ for (const scenario of targetScenarios) {
948
+ for (const config of targetConfigs) {
156
949
  for (let runNum = 0; runNum < runsPerConfig; runNum++) {
157
- try {
158
- const result = await runSingleTest(scenario, config, {
159
- skipRubricEval,
160
- verbose,
161
- });
162
-
163
- // Store result
164
- evaluationStore.storeResult(run.id, result);
165
- results.push(result);
166
-
167
- completedTests++;
168
- log(` [${completedTests}/${totalTests}] ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
169
-
170
- // Update monitoring session with progress
171
- monitoringService.recordEvent(run.id, {
172
- type: 'evaluation_test',
173
- inputTokens: result.inputTokens || 0,
174
- outputTokens: result.outputTokens || 0,
175
- latencyMs: result.latencyMs || 0,
176
- round: completedTests,
177
- approved: result.success,
178
- });
950
+ allTests.push({ config, scenario, runNum });
951
+ }
952
+ }
953
+ }
179
954
 
180
- // Rate limiting
181
- await sleep(REQUEST_DELAY_MS);
182
- } catch (error) {
183
- log(` [${completedTests}/${totalTests}] ${scenario.id}: ERROR - ${error.message}`);
184
- completedTests++;
185
-
186
- // Record error in monitoring
187
- monitoringService.recordEvent(run.id, {
188
- type: 'evaluation_error',
189
- round: completedTests,
190
- error: error.message,
191
- });
192
- }
955
+ // Scenario completion tracking
956
+ const scenarioProgress = new Map();
957
+ for (const scenario of targetScenarios) {
958
+ scenarioProgress.set(scenario.id, {
959
+ total: targetConfigs.length * runsPerConfig,
960
+ completed: 0,
961
+ scores: [],
962
+ scenarioName: scenario.name || scenario.id,
963
+ });
964
+ }
965
+ let completedScenarios = 0;
966
+
967
+ // Parallel worker pool
968
+ async function processQueue(queue, workerCount, processItem) {
969
+ const items = [...queue];
970
+ let index = 0;
971
+
972
+ async function worker() {
973
+ while (index < items.length) {
974
+ const i = index++;
975
+ await processItem(items[i]);
976
+ await sleep(REQUEST_DELAY_MS);
193
977
  }
194
978
  }
979
+
980
+ const workers = Array.from(
981
+ { length: Math.min(workerCount, items.length) },
982
+ () => worker()
983
+ );
984
+ await Promise.all(workers);
195
985
  }
196
986
 
197
- // Update run status
987
+ log(`\nRunning ${allTests.length} tests with parallelism=${parallelism}...\n`);
988
+
989
+ const runStartTime = Date.now();
990
+
991
+ await processQueue(allTests, parallelism, async ({ config, scenario }) => {
992
+ const profileLabel = config.label || config.profileName || '';
993
+
994
+ // Emit test_start
995
+ progressLogger.testStart({
996
+ scenarioId: scenario.id,
997
+ scenarioName: scenario.name || scenario.id,
998
+ profileName: profileLabel,
999
+ });
1000
+
1001
+ try {
1002
+ const result = await runSingleTest(scenario, config, {
1003
+ skipRubricEval,
1004
+ verbose,
1005
+ });
1006
+
1007
+ // Store result (better-sqlite3 is synchronous, thread-safe for concurrent writes)
1008
+ evaluationStore.storeResult(run.id, result);
1009
+ results.push(result);
1010
+
1011
+ completedTests++;
1012
+
1013
+ // Emit test_complete event
1014
+ progressLogger.testComplete({
1015
+ scenarioId: scenario.id,
1016
+ scenarioName: scenario.name || scenario.id,
1017
+ profileName: profileLabel,
1018
+ success: result.success,
1019
+ overallScore: result.overallScore,
1020
+ baseScore: result.baseScore ?? null,
1021
+ recognitionScore: result.recognitionScore ?? null,
1022
+ latencyMs: result.latencyMs,
1023
+ completedCount: completedTests,
1024
+ totalTests,
1025
+ });
1026
+
1027
+ // Streaming reporter line
1028
+ reporter.onTestComplete({
1029
+ ...result,
1030
+ profileName: profileLabel,
1031
+ scenarioName: scenario.name || scenario.id,
1032
+ });
1033
+
1034
+ log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
1035
+
1036
+ // Update monitoring session with progress
1037
+ monitoringService.recordEvent(run.id, {
1038
+ type: 'evaluation_test',
1039
+ inputTokens: result.inputTokens || 0,
1040
+ outputTokens: result.outputTokens || 0,
1041
+ latencyMs: result.latencyMs || 0,
1042
+ round: completedTests,
1043
+ approved: result.success,
1044
+ });
1045
+
1046
+ // Track scenario completion
1047
+ const sp = scenarioProgress.get(scenario.id);
1048
+ sp.completed++;
1049
+ if (result.overallScore != null) sp.scores.push(result.overallScore);
1050
+ if (sp.completed >= sp.total) {
1051
+ completedScenarios++;
1052
+ const avgScore = sp.scores.length > 0
1053
+ ? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
1054
+ : null;
1055
+ progressLogger.scenarioComplete({
1056
+ scenarioId: scenario.id,
1057
+ scenarioName: sp.scenarioName,
1058
+ profileNames,
1059
+ avgScore,
1060
+ completedScenarios,
1061
+ totalScenarios: targetScenarios.length,
1062
+ });
1063
+ reporter.onScenarioComplete({
1064
+ scenarioName: sp.scenarioName,
1065
+ avgScore,
1066
+ completedScenarios,
1067
+ totalScenarios: targetScenarios.length,
1068
+ });
1069
+ }
1070
+ } catch (error) {
1071
+ completedTests++;
1072
+ log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
1073
+
1074
+ // Store failed result so it shows up in the database instead of silently disappearing
1075
+ // Extract provider/model from nested ego config if not at top level (profile-based configs)
1076
+ const failedResult = {
1077
+ scenarioId: scenario.id,
1078
+ scenarioName: scenario.name || scenario.id,
1079
+ profileName: config.profileName,
1080
+ provider: config.provider || config.ego?.provider || 'unknown',
1081
+ model: config.model || config.ego?.model || 'unknown',
1082
+ egoModel: config.egoModel
1083
+ ? `${config.egoModel.provider}.${config.egoModel.model}`
1084
+ : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
1085
+ superegoModel: config.superegoModel
1086
+ ? `${config.superegoModel.provider}.${config.superegoModel.model}`
1087
+ : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
1088
+ factors: config.factors || null,
1089
+ learnerArchitecture: config.learnerArchitecture || null,
1090
+ success: false,
1091
+ errorMessage: error.message,
1092
+ };
1093
+ try {
1094
+ evaluationStore.storeResult(run.id, failedResult);
1095
+ results.push(failedResult);
1096
+ } catch (storeErr) {
1097
+ log(` [WARNING] Failed to store error result: ${storeErr.message}`);
1098
+ }
1099
+
1100
+ // Emit test_error event
1101
+ progressLogger.testError({
1102
+ scenarioId: scenario.id,
1103
+ scenarioName: scenario.name || scenario.id,
1104
+ profileName: profileLabel,
1105
+ errorMessage: error.message,
1106
+ completedCount: completedTests,
1107
+ totalTests,
1108
+ });
1109
+
1110
+ reporter.onTestError({
1111
+ scenarioName: scenario.name || scenario.id,
1112
+ profileName: profileLabel,
1113
+ errorMessage: error.message,
1114
+ });
1115
+
1116
+ // Record error in monitoring
1117
+ monitoringService.recordEvent(run.id, {
1118
+ type: 'evaluation_error',
1119
+ round: completedTests,
1120
+ error: error.message,
1121
+ });
1122
+
1123
+ // Track scenario completion even on error
1124
+ const sp = scenarioProgress.get(scenario.id);
1125
+ sp.completed++;
1126
+ if (sp.completed >= sp.total) {
1127
+ completedScenarios++;
1128
+ const avgScore = sp.scores.length > 0
1129
+ ? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
1130
+ : null;
1131
+ progressLogger.scenarioComplete({
1132
+ scenarioId: scenario.id,
1133
+ scenarioName: sp.scenarioName,
1134
+ profileNames,
1135
+ avgScore,
1136
+ completedScenarios,
1137
+ totalScenarios: targetScenarios.length,
1138
+ });
1139
+ reporter.onScenarioComplete({
1140
+ scenarioName: sp.scenarioName,
1141
+ avgScore,
1142
+ completedScenarios,
1143
+ totalScenarios: targetScenarios.length,
1144
+ });
1145
+ }
1146
+ }
1147
+ });
1148
+
1149
+ const durationMs = Date.now() - runStartTime;
1150
+ const successfulTests = results.filter(r => r.success).length;
1151
+ const failedTests = completedTests - successfulTests;
1152
+
1153
+ // Emit run_complete
1154
+ progressLogger.runComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
1155
+ reporter.onRunComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
1156
+
1157
+ // Update run status (keep original totalTests to show expected vs actual)
198
1158
  evaluationStore.updateRun(run.id, {
199
1159
  status: 'completed',
200
- totalTests: results.length,
201
1160
  completedAt: new Date().toISOString(),
202
1161
  });
203
1162
 
@@ -208,19 +1167,14 @@ export async function runEvaluation(options = {}) {
208
1167
  const stats = evaluationStore.getRunStats(run.id);
209
1168
  const scenarioStats = evaluationStore.getScenarioStats(run.id);
210
1169
 
211
- log('\n' + '='.repeat(60));
212
- log('EVALUATION COMPLETE');
213
- log('='.repeat(60));
214
- log(`Run ID: ${run.id}`);
215
- log(`Total tests: ${results.length}`);
216
- log(`Successful: ${results.filter(r => r.success).length}`);
217
-
218
1170
  return {
219
1171
  runId: run.id,
220
- totalTests: results.length,
221
- successfulTests: results.filter(r => r.success).length,
1172
+ totalTests,
1173
+ successfulTests,
1174
+ failedTests,
222
1175
  stats,
223
1176
  scenarioStats,
1177
+ progressLogPath,
224
1178
  };
225
1179
  }
226
1180
 
@@ -229,7 +1183,7 @@ export async function runEvaluation(options = {}) {
229
1183
  * Handles both single-turn and multi-turn scenarios
230
1184
  */
231
1185
  async function runSingleTest(scenario, config, options = {}) {
232
- const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null } = options;
1186
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null } = options;
233
1187
 
234
1188
  // Create a log function that calls both console and onLog callback
235
1189
  const log = (message, level = 'info') => {
@@ -237,7 +1191,7 @@ async function runSingleTest(scenario, config, options = {}) {
237
1191
  if (onLog) onLog(message, level);
238
1192
  };
239
1193
 
240
- const fullScenario = tutorApi.getScenario(scenario.id);
1194
+ const fullScenario = evalConfigLoader.getScenario(scenario.id);
241
1195
  if (!fullScenario) {
242
1196
  throw new Error(`Scenario not found: ${scenario.id}`);
243
1197
  }
@@ -245,132 +1199,103 @@ async function runSingleTest(scenario, config, options = {}) {
245
1199
  log(`Running scenario: ${scenario.name}`, 'info');
246
1200
 
247
1201
  // Check if this is a multi-turn scenario
248
- const isMultiTurn = tutorApi.isMultiTurnScenario(scenario.id);
1202
+ const isMultiTurn = evalConfigLoader.isMultiTurnScenario(scenario.id);
249
1203
 
250
1204
  if (isMultiTurn) {
251
1205
  log('Detected multi-turn scenario', 'info');
252
- return runMultiTurnTest(scenario, config, fullScenario, { ...options, log });
1206
+ return runMultiTurnTest(scenario, config, fullScenario, { ...options, log, judgeOverride });
253
1207
  }
254
1208
 
255
1209
  // Single-turn evaluation (original logic)
256
- return runSingleTurnTest(scenario, config, fullScenario, { ...options, log });
1210
+ return runSingleTurnTest(scenario, config, fullScenario, { ...options, log, judgeOverride });
257
1211
  }
258
1212
 
259
- /**
260
- * Run a single-turn test
261
- */
262
- async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
263
- const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null } = options;
264
-
265
- // Build context
266
- log('Building learner context...', 'info');
267
- const context = tutorApi.buildContext(fullScenario.learner_context);
268
- context.isNewUser = fullScenario.is_new_user;
269
-
270
- // Generate suggestions
271
- log(`Generating suggestions with profile: ${config.profileName}`, 'info');
272
- log(`Provider: ${config.provider || 'from profile'}, Model: ${config.model || 'from profile'}`, 'info');
273
- if (config.egoModel) {
274
- log(`Ego model override: ${config.egoModel}`, 'info');
275
- }
276
-
277
- // Wrap API call with retry logic for rate limit handling
278
- const genResult = await retryWithBackoff(
279
- () => tutorApi.generateSuggestions(context, {
280
- provider: config.provider,
281
- model: config.model,
282
- egoModel: config.egoModel, // Override ego model for benchmarking
283
- profileName: config.profileName,
284
- hyperparameters: config.hyperparameters || {},
285
- trace: true, // Always capture trace for tension analysis
286
- superegoStrategy, // Pass through superego intervention strategy
287
- outputSize, // compact, normal, expanded - affects response length
288
- }),
289
- { log }
290
- );
291
-
292
- if (!genResult.success) {
293
- log(`Generation failed: ${genResult.error}`, 'error');
294
- return {
295
- scenarioId: scenario.id,
296
- scenarioName: scenario.name,
297
- provider: config.provider || genResult.metadata?.provider,
298
- model: config.model || genResult.metadata?.model,
299
- profileName: config.profileName,
300
- success: false,
301
- errorMessage: genResult.error,
302
- latencyMs: genResult.metadata?.latencyMs,
303
- };
304
- }
305
-
306
- const suggestionCount = genResult.suggestions?.length || 0;
307
- log(`Generated ${suggestionCount} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');
308
-
309
- if (genResult.metadata?.dialogueRounds) {
310
- log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
311
- }
312
-
313
- // Quick validation (rule-based)
314
- log('Running validation checks...', 'info');
315
- const suggestion = genResult.suggestions?.[0];
316
- const validation = suggestion
317
- ? rubricEvaluator.quickValidate(suggestion, {
318
- requiredElements: fullScenario.required_elements,
319
- forbiddenElements: fullScenario.forbidden_elements,
320
- })
321
- : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
322
-
323
- log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validation.passesRequired && validation.passesForbidden ? 'success' : 'warning');
324
-
325
- let rubricResult = null;
326
- if (!skipRubricEval && suggestion) {
327
- // Full rubric evaluation with AI judge
328
- log('Running AI rubric evaluation...', 'info');
329
- debugLog(`[evaluationRunner] Running rubric evaluation for ${scenario.id}...`);
330
- rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
331
- name: fullScenario.name,
332
- description: fullScenario.description,
333
- expectedBehavior: fullScenario.expected_behavior,
334
- learnerContext: fullScenario.learner_context,
335
- requiredElements: fullScenario.required_elements,
336
- forbiddenElements: fullScenario.forbidden_elements,
337
- }, {});
338
-
339
- // Log rubric result summary
340
- if (rubricResult) {
341
- debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
342
- `overallScore=${rubricResult.overallScore}, ` +
343
- `scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
344
- `error=${rubricResult.error || 'none'}`);
345
- if (rubricResult.success) {
346
- log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
347
- } else {
348
- log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
349
- }
350
- }
351
- } else if (skipRubricEval) {
352
- debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
353
- log('Skipping AI rubric evaluation (fast mode)', 'info');
354
- } else if (!suggestion) {
355
- debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
356
- log('Skipping rubric evaluation (no suggestion generated)', 'warning');
1213
+ /**
1214
+ * Run a single-turn test
1215
+ */
1216
+ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
1217
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
1218
+
1219
+ // Resolve model aliases through eval's providers.yaml
1220
+ const resolvedConfig = resolveConfigModels(config);
1221
+
1222
+ // Build context with optional curriculum content
1223
+ log('Building learner context...', 'info');
1224
+ const curriculumContext = contentResolver.isConfigured()
1225
+ ? contentResolver.buildCurriculumContext(
1226
+ contentResolver.resolveScenarioContent(fullScenario)
1227
+ )
1228
+ : null;
1229
+ if (curriculumContext) {
1230
+ log(`Curriculum context loaded (${curriculumContext.length} chars)`, 'info');
357
1231
  }
1232
+ const structuredLearnerContext = structureLearnerContext(fullScenario.learner_context);
1233
+ const context = tutorApi.buildContext(structuredLearnerContext, curriculumContext);
1234
+ context.isNewUser = fullScenario.is_new_user;
358
1235
 
359
- // Calculate overall score
360
- let overallScore = null;
361
- if (rubricResult?.success) {
362
- overallScore = rubricResult.overallScore;
363
- } else if (suggestion) {
364
- // Fallback: simple validation-based score
365
- overallScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
1236
+ // Resolve profile: extract dialogue/recognition settings and remap to tutor-core profile.
1237
+ const profileResolution = resolveEvalProfile(resolvedConfig.profileName);
1238
+ const { useDialogue, maxRounds, recognitionMode } = profileResolution;
1239
+ resolvedConfig.profileName = profileResolution.resolvedProfileName;
1240
+
1241
+ // Log config info
1242
+ log(`Generating suggestions with profile: ${resolvedConfig.profileName} (dialogue=${useDialogue}, rounds=${maxRounds}, recognition=${recognitionMode})`, 'info');
1243
+ log(`Provider: ${resolvedConfig.provider || 'from profile'}, Model: ${resolvedConfig.model || 'from profile'}`, 'info');
1244
+ if (resolvedConfig.egoModel) {
1245
+ const egoLabel = typeof resolvedConfig.egoModel === 'object'
1246
+ ? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
1247
+ : resolvedConfig.egoModel;
1248
+ log(`Ego model override: ${egoLabel}`, 'info');
1249
+ }
1250
+
1251
+ // Use shared generation + evaluation helper
1252
+ const { genResult, suggestion, validation, rubricResult, turnScore: overallScore, scoringMethod } = await generateAndEvaluateTurn(
1253
+ context, resolvedConfig,
1254
+ {
1255
+ scenarioName: fullScenario.name,
1256
+ description: fullScenario.description,
1257
+ expectedBehavior: fullScenario.expected_behavior,
1258
+ learnerContext: fullScenario.learner_context,
1259
+ requiredElements: fullScenario.required_elements,
1260
+ requiredElementsAny: fullScenario.required_elements_any,
1261
+ forbiddenElements: fullScenario.forbidden_elements,
1262
+ },
1263
+ { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id }
1264
+ );
1265
+
1266
+ if (!genResult.success) {
1267
+ return {
1268
+ scenarioId: scenario.id,
1269
+ scenarioName: scenario.name,
1270
+ scenarioType: fullScenario.type || 'suggestion',
1271
+ provider: resolvedConfig.provider || genResult.metadata?.provider,
1272
+ model: resolvedConfig.model || genResult.metadata?.model,
1273
+ profileName: config.profileName,
1274
+ egoModel: resolvedConfig.egoModel
1275
+ ? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
1276
+ : null,
1277
+ superegoModel: resolvedConfig.superegoModel
1278
+ ? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
1279
+ : null,
1280
+ success: false,
1281
+ errorMessage: genResult.error,
1282
+ latencyMs: genResult.metadata?.latencyMs,
1283
+ };
366
1284
  }
367
1285
 
368
1286
  return {
369
1287
  scenarioId: scenario.id,
370
1288
  scenarioName: scenario.name,
371
- provider: config.provider || genResult.metadata?.provider,
372
- model: config.model || genResult.metadata?.model,
1289
+ scenarioType: fullScenario.type || 'suggestion',
1290
+ provider: resolvedConfig.provider || genResult.metadata?.provider,
1291
+ model: resolvedConfig.model || genResult.metadata?.model,
373
1292
  profileName: config.profileName,
1293
+ egoModel: resolvedConfig.egoModel
1294
+ ? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
1295
+ : null,
1296
+ superegoModel: resolvedConfig.superegoModel
1297
+ ? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
1298
+ : null,
374
1299
  hyperparameters: config.hyperparameters,
375
1300
  suggestions: genResult.suggestions,
376
1301
  success: true,
@@ -379,8 +1304,8 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
379
1304
  outputTokens: genResult.metadata?.outputTokens,
380
1305
  dialogueRounds: genResult.metadata?.dialogueRounds,
381
1306
  apiCalls: genResult.metadata?.apiCalls,
382
- cost: genResult.metadata?.totalCost, // OpenRouter API cost in USD
383
- dialogueId: genResult.metadata?.dialogueId, // For linking to logs
1307
+ cost: genResult.metadata?.totalCost,
1308
+ dialogueId: genResult.metadata?.dialogueId,
384
1309
  scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
385
1310
  relevance: rubricResult.scores.relevance?.score,
386
1311
  specificity: rubricResult.scores.specificity?.score,
@@ -389,18 +1314,21 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
389
1314
  actionability: rubricResult.scores.actionability?.score,
390
1315
  tone: rubricResult.scores.tone?.score,
391
1316
  } : null,
392
- // Include full scores with reasoning for detailed analysis
393
1317
  scoresWithReasoning: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0
394
1318
  ? rubricResult.scores
395
1319
  : null,
396
1320
  overallScore,
1321
+ scoringMethod,
1322
+ baseScore: rubricResult?.baseScore ?? null,
1323
+ recognitionScore: rubricResult?.recognitionScore ?? null,
397
1324
  passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
398
1325
  passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
399
1326
  requiredMissing: rubricResult?.requiredMissing || validation.requiredMissing,
400
1327
  forbiddenFound: rubricResult?.forbiddenFound || validation.forbiddenFound,
401
- evaluatorModel: rubricResult?.evaluatorModel,
1328
+ judgeModel: rubricResult?.judgeModel,
402
1329
  evaluationReasoning: rubricResult?.summary,
403
- // Include dialogueResult for tension analysis
1330
+ factors: resolvedConfig.factors || null,
1331
+ learnerArchitecture: resolvedConfig.learnerArchitecture || null,
404
1332
  dialogueResult: {
405
1333
  dialogueTrace: genResult.dialogueTrace,
406
1334
  dialogueRounds: genResult.metadata?.dialogueRounds,
@@ -411,81 +1339,185 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
411
1339
  }
412
1340
 
413
1341
  /**
414
- * Run a multi-turn test
415
- * Evaluates each turn and aggregates scores
1342
+ * Run a multi-turn test as an iterative loop.
1343
+ *
1344
+ * Each turn goes through the SAME generateAndEvaluateTurn() code path as
1345
+ * single-turn, with accumulated conversation context between turns.
1346
+ * This eliminates the separate multiTurnRunner orchestration.
416
1347
  */
417
1348
  async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
418
- const { skipRubricEval = false, verbose = false } = options;
419
- const log = verbose ? console.log : () => {};
1349
+ const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
420
1350
 
421
1351
  log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);
422
1352
 
423
- const turns = fullScenario.turns || [];
1353
+ // 1. Resolve config (models, profile) — same as single-turn
1354
+ const resolvedConfig = resolveConfigModels(config);
1355
+ const profileResolution = resolveEvalProfile(resolvedConfig.profileName);
1356
+ const { useDialogue, maxRounds } = profileResolution;
1357
+ resolvedConfig.profileName = profileResolution.resolvedProfileName;
1358
+
1359
+ // 2. Build curriculum context — same as single-turn
1360
+ const curriculumContext = contentResolver.isConfigured()
1361
+ ? contentResolver.buildCurriculumContext(
1362
+ contentResolver.resolveScenarioContent(fullScenario)
1363
+ )
1364
+ : null;
1365
+
1366
+ // 3. Generate dialogue ID for the session
1367
+ const dialogueId = `dialogue-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
1368
+ dialogueEngine.setCurrentDialogueId(dialogueId);
1369
+
1370
+ // Generate synthetic learnerId for Writing Pad persistence across turns
1371
+ const learnerId = `eval-learner-${dialogueId}-${scenario.id.replace(/[^a-zA-Z0-9]/g, '')}`;
1372
+ log(`[evaluationRunner] Generated learnerId for Writing Pad: ${learnerId}`, 'info');
1373
+
1374
+ // Deep-clone turns to prevent mutation of shared scenario objects across profiles
1375
+ const turns = JSON.parse(JSON.stringify(fullScenario.turns || []));
424
1376
  const turnResults = [];
425
1377
  let totalLatencyMs = 0;
426
1378
  let totalInputTokens = 0;
427
1379
  let totalOutputTokens = 0;
428
1380
  let totalApiCalls = 0;
429
1381
  let totalCost = 0;
1382
+ let totalDialogueRounds = 0;
430
1383
 
431
- // Run the multi-turn scenario through tutorApi (with retry for rate limits)
432
- const multiTurnResult = await retryWithBackoff(
433
- () => tutorApi.runMultiTurnScenario(scenario.id, {
434
- provider: config.provider,
435
- model: config.model,
436
- profileName: config.profileName,
437
- hyperparameters: config.hyperparameters || {},
438
- trace: verbose,
439
- }),
440
- { log }
441
- );
1384
+ let conversationHistory = [];
1385
+ let previousSuggestion = null;
1386
+ const consolidatedTrace = [];
1387
+
1388
+ const sharedTurnOptions = { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, learnerId };
1389
+
1390
+ // Check if prompt rewriting is enabled for this profile
1391
+ const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[config.profileName];
1392
+ const promptRewritingEnabled = rawProfile?.prompt_rewriting?.enabled ?? false;
1393
+ const promptRewritingStrategy = rawProfile?.prompt_rewriting?.strategy ?? 'template';
1394
+ let sessionEvolution = null;
1395
+
1396
+ // 4. Loop through turns (initial turn 0 + follow-up turns)
1397
+ const totalTurnCount = 1 + turns.length;
1398
+ for (let turnIdx = 0; turnIdx < totalTurnCount; turnIdx++) {
1399
+ const isInitialTurn = turnIdx === 0;
1400
+ const turnDef = isInitialTurn ? null : turns[turnIdx - 1];
1401
+
1402
+ log(`[evaluationRunner] Turn ${turnIdx}/${totalTurnCount - 1}${isInitialTurn ? ' (initial)' : ` (${turnDef.id})`}`, 'info');
1403
+
1404
+ // Show learner action in transcript mode (for follow-up turns)
1405
+ if (!isInitialTurn && dialogueEngine.isTranscriptMode()) {
1406
+ dialogueEngine.transcript('LEARNER ACTION', formatLearnerActionForTranscript(turnDef));
1407
+ }
1408
+
1409
+ // Build context for this turn
1410
+ let contextStr;
1411
+ if (isInitialTurn) {
1412
+ contextStr = fullScenario.learner_context;
1413
+ } else {
1414
+ // Add previous turn to conversation history
1415
+ conversationHistory.push({
1416
+ turnIndex: turnIdx - 1,
1417
+ turnId: turnIdx === 1 ? 'initial' : turns[turnIdx - 2]?.id,
1418
+ suggestion: previousSuggestion,
1419
+ learnerAction: turnDef.learner_action,
1420
+ learnerMessage: turnDef.action_details?.message,
1421
+ });
1422
+
1423
+ contextStr = buildMultiTurnContext({
1424
+ originalContext: fullScenario.learner_context,
1425
+ conversationHistory,
1426
+ currentTurn: turnDef,
1427
+ previousSuggestion,
1428
+ });
1429
+ }
1430
+
1431
+ const structuredContextStr = structureLearnerContext(contextStr);
1432
+ const context = tutorApi.buildContext(structuredContextStr, curriculumContext);
1433
+ context.isNewUser = isInitialTurn ? fullScenario.is_new_user : false;
1434
+
1435
+ // Build turn-specific rubric metadata
1436
+ const turnMeta = {
1437
+ scenarioName: isInitialTurn
1438
+ ? fullScenario.name
1439
+ : `${fullScenario.name} - Turn ${turnIdx}`,
1440
+ description: isInitialTurn
1441
+ ? fullScenario.description
1442
+ : `Turn: ${turnDef.learner_action}`,
1443
+ expectedBehavior: isInitialTurn
1444
+ ? fullScenario.expected_behavior
1445
+ : turnDef.expected_behavior,
1446
+ learnerContext: contextStr,
1447
+ requiredElements: isInitialTurn
1448
+ ? (fullScenario.required_elements || [])
1449
+ : (turnDef.required_elements || []),
1450
+ requiredElementsAny: isInitialTurn
1451
+ ? (fullScenario.required_elements_any || [])
1452
+ : (turnDef.required_elements_any || []),
1453
+ forbiddenElements: isInitialTurn
1454
+ ? (fullScenario.forbidden_elements || [])
1455
+ : (turnDef.forbidden_elements || []),
1456
+ };
1457
+
1458
+ // Call the SAME generation+evaluation code path as single-turn
1459
+ // Pass dialogue context so the judge can see the full exchange
1460
+ const turnOptions = {
1461
+ ...sharedTurnOptions,
1462
+ ...(sessionEvolution ? { systemPromptExtension: sessionEvolution } : {}),
1463
+ conversationHistory: conversationHistory.length > 0 ? conversationHistory : null,
1464
+ consolidatedTrace: consolidatedTrace.length > 0 ? consolidatedTrace : null,
1465
+ };
1466
+ const { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod } =
1467
+ await generateAndEvaluateTurn(context, resolvedConfig, turnMeta, turnOptions);
1468
+
1469
+ if (!genResult.success) {
1470
+ const turnId = isInitialTurn ? 'initial' : turnDef.id;
1471
+ throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions`);
1472
+ }
442
1473
 
443
- // Validate that we got results
444
- if (!multiTurnResult.turnResults || multiTurnResult.turnResults.length === 0) {
445
- const errorMsg = `Multi-turn scenario returned no results (expected ${fullScenario.turns?.length + 1 || 1} turns)`;
446
- log(errorMsg, 'error');
447
- throw new Error(errorMsg);
448
- }
449
-
450
- // Evaluate each turn
451
- for (const turnResult of multiTurnResult.turnResults) {
452
- const suggestion = turnResult.suggestions?.[0];
453
-
454
- // Quick validation for this turn
455
- const validation = suggestion
456
- ? rubricEvaluator.quickValidate(suggestion, {
457
- requiredElements: turnResult.requiredElements,
458
- forbiddenElements: turnResult.forbiddenElements,
459
- })
460
- : { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
461
-
462
- let rubricResult = null;
463
- if (!skipRubricEval && suggestion) {
464
- log(`[evaluationRunner] Running rubric evaluation for turn ${turnResult.turnIndex}...`);
465
- rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
466
- name: `${fullScenario.name} - Turn ${turnResult.turnIndex}`,
467
- description: turnResult.turnId === 'initial' ? fullScenario.description : `Turn: ${turnResult.learnerAction}`,
468
- expectedBehavior: turnResult.expectedBehavior,
469
- learnerContext: turnResult.context,
470
- requiredElements: turnResult.requiredElements,
471
- forbiddenElements: turnResult.forbiddenElements,
472
- }, {});
473
- }
474
-
475
- // Calculate turn score
476
- let turnScore = null;
477
- if (rubricResult?.success) {
478
- turnScore = rubricResult.overallScore;
479
- } else if (suggestion) {
480
- turnScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
1474
+ // Accumulate dialogue traces
1475
+ if (genResult.dialogueTrace && genResult.dialogueTrace.length > 0) {
1476
+ // Insert user turn action entry before each turn (except initial)
1477
+ if (!isInitialTurn) {
1478
+ const histEntry = conversationHistory[conversationHistory.length - 1];
1479
+ consolidatedTrace.push({
1480
+ agent: 'user',
1481
+ action: 'turn_action',
1482
+ turnIndex: turnIdx,
1483
+ contextSummary: histEntry?.learnerMessage || `${histEntry?.learnerAction || 'Action'}`,
1484
+ detail: `Learner: ${histEntry?.learnerAction}`,
1485
+ timestamp: new Date().toISOString(),
1486
+ });
1487
+ }
1488
+ consolidatedTrace.push(...genResult.dialogueTrace);
1489
+
1490
+ // Add final delivery to user for multi-agent mode
1491
+ const hasSuperego = genResult.dialogueTrace.some(entry => entry.agent === 'superego');
1492
+ if (hasSuperego) {
1493
+ const suggCount = genResult.suggestions?.length || 0;
1494
+ consolidatedTrace.push({
1495
+ agent: 'user',
1496
+ action: 'final_output',
1497
+ turnIndex: turnIdx,
1498
+ from: 'ego',
1499
+ to: 'user',
1500
+ direction: 'response',
1501
+ suggestionCount: suggCount,
1502
+ contextSummary: `Delivered ${suggCount} suggestion${suggCount !== 1 ? 's' : ''}`,
1503
+ detail: `Turn ${turnIdx + 1} complete`,
1504
+ timestamp: new Date().toISOString(),
1505
+ });
1506
+ }
481
1507
  }
482
1508
 
1509
+ // Collect per-turn result
483
1510
  turnResults.push({
484
- turnIndex: turnResult.turnIndex,
485
- turnId: turnResult.turnId,
486
- learnerAction: turnResult.learnerAction,
487
- expectedBehavior: turnResult.expectedBehavior,
488
- suggestion: suggestion,
1511
+ turnIndex: turnIdx,
1512
+ turnId: isInitialTurn ? 'initial' : turnDef.id,
1513
+ learnerAction: isInitialTurn ? undefined : turnDef.learner_action,
1514
+ learnerMessage: isInitialTurn ? undefined : turnDef.action_details?.message, // Include generated learner message for growth tracking
1515
+ expectedBehavior: turnMeta.expectedBehavior,
1516
+ suggestion,
1517
+ learnerDeliberation: turnDef?._learnerDeliberation || null,
1518
+ learnerEmotionalState: turnDef?._learnerEmotionalState || null,
1519
+ learnerMessageGenerated: !!turnDef?._learnerDeliberation,
1520
+ learnerOriginalMessage: turnDef?._originalMessage || null,
489
1521
  scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
490
1522
  relevance: rubricResult.scores.relevance?.score,
491
1523
  specificity: rubricResult.scores.specificity?.score,
@@ -495,31 +1527,127 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
495
1527
  tone: rubricResult.scores.tone?.score,
496
1528
  } : null,
497
1529
  turnScore,
1530
+ scoringMethod,
498
1531
  passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
499
1532
  passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
500
1533
  requiredMissing: validation.requiredMissing,
501
1534
  forbiddenFound: validation.forbiddenFound,
502
- minAcceptableScore: turnResult.minAcceptableScore || fullScenario.min_acceptable_score,
1535
+ minAcceptableScore: (!isInitialTurn ? turnDef.min_acceptable_score : null) || fullScenario.min_acceptable_score,
503
1536
  });
504
1537
 
505
1538
  // Aggregate metrics
506
- totalLatencyMs += turnResult.metadata?.latencyMs || 0;
507
- totalInputTokens += turnResult.metadata?.inputTokens || 0;
508
- totalOutputTokens += turnResult.metadata?.outputTokens || 0;
509
- totalApiCalls += turnResult.metadata?.apiCalls || 0;
510
- totalCost += turnResult.metadata?.totalCost || 0;
1539
+ totalLatencyMs += genResult.metadata?.latencyMs || 0;
1540
+ totalInputTokens += genResult.metadata?.inputTokens || 0;
1541
+ totalOutputTokens += genResult.metadata?.outputTokens || 0;
1542
+ totalApiCalls += genResult.metadata?.apiCalls || 0;
1543
+ totalCost += genResult.metadata?.totalCost || 0;
1544
+ totalDialogueRounds += genResult.metadata?.dialogueRounds || 0;
1545
+
1546
+ // Update for next iteration
1547
+ previousSuggestion = suggestion;
1548
+
1549
+ // Synthesize prompt rewriting directives for next turn (if enabled)
1550
+ if (promptRewritingEnabled && turnIdx < totalTurnCount - 1) {
1551
+ if (promptRewritingStrategy === 'llm') {
1552
+ // LLM-based directive synthesis using superego model
1553
+ try {
1554
+ sessionEvolution = await promptRewriter.synthesizeDirectivesLLM({
1555
+ turnResults,
1556
+ consolidatedTrace,
1557
+ conversationHistory,
1558
+ config: rawProfile,
1559
+ });
1560
+ if (sessionEvolution) {
1561
+ log(`[evaluationRunner] LLM rewriter generated directives for turn ${turnIdx + 1}`, 'info');
1562
+ }
1563
+ } catch (error) {
1564
+ log(`[evaluationRunner] LLM rewriter failed, falling back to template: ${error.message}`, 'warn');
1565
+ sessionEvolution = promptRewriter.synthesizeDirectives({
1566
+ turnResults,
1567
+ consolidatedTrace,
1568
+ conversationHistory,
1569
+ });
1570
+ }
1571
+ } else {
1572
+ // Template-based directive synthesis (deterministic, no LLM call)
1573
+ sessionEvolution = promptRewriter.synthesizeDirectives({
1574
+ turnResults,
1575
+ consolidatedTrace,
1576
+ conversationHistory,
1577
+ });
1578
+ }
1579
+ if (sessionEvolution) {
1580
+ log(`[evaluationRunner] Prompt rewriter (${promptRewritingStrategy}) generated ${sessionEvolution.split('\n').length - 2} directives for turn ${turnIdx + 1}`, 'info');
1581
+ }
1582
+ }
1583
+
1584
+ // Generate LLM learner response for next turn if ego_superego architecture
1585
+ // Note: check includes() to handle both 'ego_superego' and 'ego_superego_recognition'
1586
+ if (resolvedConfig.learnerArchitecture?.includes('ego_superego') && turnIdx < totalTurnCount - 1) {
1587
+ const nextTurnDef = turns[turnIdx]; // turnIdx is 0-based into the loop; turns[turnIdx] is the next follow-up turn
1588
+ if (nextTurnDef) {
1589
+ const learnerResponse = await generateLearnerResponse({
1590
+ tutorMessage: suggestion?.message || suggestion?.title || '',
1591
+ topic: fullScenario.topic || fullScenario.name || '',
1592
+ conversationHistory: conversationHistory.map(h => ({
1593
+ role: h.learnerMessage ? 'learner' : 'tutor',
1594
+ content: h.learnerMessage || h.suggestion?.message || '',
1595
+ })),
1596
+ learnerProfile: resolvedConfig.learnerArchitecture,
1597
+ personaId: fullScenario.learner_persona || 'eager_novice',
1598
+ modelOverride: config.modelOverride || null,
1599
+ });
1600
+
1601
+ // Override scripted message with LLM-generated one
1602
+ nextTurnDef._originalMessage = nextTurnDef.action_details?.message;
1603
+ nextTurnDef.action_details = nextTurnDef.action_details || {};
1604
+ nextTurnDef.action_details.message = learnerResponse.message;
1605
+ nextTurnDef._learnerDeliberation = learnerResponse.internalDeliberation;
1606
+ nextTurnDef._learnerEmotionalState = learnerResponse.emotionalState;
1607
+
1608
+ // Track learner LLM costs
1609
+ totalInputTokens += learnerResponse.tokenUsage?.inputTokens || 0;
1610
+ totalOutputTokens += learnerResponse.tokenUsage?.outputTokens || 0;
1611
+ totalApiCalls += learnerResponse.tokenUsage?.apiCalls || 0;
1612
+
1613
+ // Add learner deliberation to consolidated trace
1614
+ if (learnerResponse.internalDeliberation?.length > 0) {
1615
+ for (const delib of learnerResponse.internalDeliberation) {
1616
+ consolidatedTrace.push({
1617
+ agent: `learner_${delib.role}`,
1618
+ action: 'deliberation',
1619
+ turnIndex: turnIdx + 1,
1620
+ contextSummary: delib.content.substring(0, 100),
1621
+ detail: delib.content,
1622
+ timestamp: new Date().toISOString(),
1623
+ });
1624
+ }
1625
+ consolidatedTrace.push({
1626
+ agent: 'learner_synthesis',
1627
+ action: 'response',
1628
+ turnIndex: turnIdx + 1,
1629
+ contextSummary: learnerResponse.message.substring(0, 100),
1630
+ detail: learnerResponse.message,
1631
+ timestamp: new Date().toISOString(),
1632
+ });
1633
+ }
1634
+
1635
+ log(`[evaluationRunner] Generated LLM learner response (ego_superego): "${learnerResponse.message.substring(0, 80)}..."`, 'info');
1636
+ }
1637
+ }
511
1638
  }
512
1639
 
513
- // Calculate aggregate scores
1640
+ // 5. Aggregate scores across turns
514
1641
  const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
515
1642
  const overallScore = validTurnScores.length > 0
516
1643
  ? validTurnScores.reduce((sum, s) => sum + s, 0) / validTurnScores.length
517
1644
  : null;
518
1645
 
519
- // Aggregate dimension scores
520
1646
  const aggregateDimensions = {};
521
- const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
522
- for (const dim of dims) {
1647
+ const baseDims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
1648
+ const recognitionDims = ['mutual_recognition', 'dialectical_responsiveness', 'memory_integration', 'transformative_potential', 'tutor_adaptation', 'learner_growth'];
1649
+ const allDims = [...baseDims, ...recognitionDims];
1650
+ for (const dim of allDims) {
523
1651
  const dimScores = turnResults
524
1652
  .filter(t => t.scores?.[dim] !== undefined)
525
1653
  .map(t => t.scores[dim]);
@@ -528,39 +1656,580 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
528
1656
  }
529
1657
  }
530
1658
 
531
- // Check if all turns pass their thresholds
1659
+ const baseScoreValues = baseDims.filter(d => aggregateDimensions[d] !== undefined).map(d => aggregateDimensions[d]);
1660
+ const recognitionScoreValues = recognitionDims.filter(d => aggregateDimensions[d] !== undefined).map(d => aggregateDimensions[d]);
1661
+ const baseScore = baseScoreValues.length > 0
1662
+ ? ((baseScoreValues.reduce((s, v) => s + v, 0) / baseScoreValues.length - 1) / 4) * 100
1663
+ : null;
1664
+ const recognitionScore = recognitionScoreValues.length > 0
1665
+ ? ((recognitionScoreValues.reduce((s, v) => s + v, 0) / recognitionScoreValues.length - 1) / 4) * 100
1666
+ : null;
1667
+
532
1668
  const allTurnsPassed = turnResults.every(t => {
533
1669
  if (t.turnScore === null) return false;
534
1670
  const threshold = t.minAcceptableScore || fullScenario.min_acceptable_score || 0;
535
1671
  return t.turnScore >= threshold;
536
1672
  });
537
1673
 
1674
+ // 5b. Holistic dialogue evaluation — score the full transcript as a single unit
1675
+ let holisticDialogueScore = null;
1676
+ if (!skipRubricEval && consolidatedTrace.length > 0 && turnResults.length > 1) {
1677
+ log('[evaluationRunner] Running holistic dialogue evaluation on full transcript...', 'info');
1678
+ try {
1679
+ // Use the last turn's suggestion as the focal point, with full dialogue context
1680
+ const lastSuggestion = turnResults[turnResults.length - 1]?.suggestion;
1681
+ if (lastSuggestion) {
1682
+ const holisticResult = await rubricEvaluator.evaluateSuggestion(lastSuggestion, {
1683
+ name: `${fullScenario.name} (holistic dialogue)`,
1684
+ description: `Holistic evaluation of ${turnResults.length}-turn dialogue. Score the overall quality of the tutoring interaction, not just this final response.`,
1685
+ expectedBehavior: fullScenario.expected_behavior,
1686
+ learnerContext: fullScenario.learner_context,
1687
+ requiredElements: fullScenario.required_elements || [],
1688
+ forbiddenElements: fullScenario.forbidden_elements || [],
1689
+ }, {
1690
+ dialogueContext: {
1691
+ conversationHistory,
1692
+ consolidatedTrace,
1693
+ },
1694
+ }, { judgeOverride });
1695
+
1696
+ if (holisticResult?.success) {
1697
+ holisticDialogueScore = {
1698
+ overallScore: holisticResult.overallScore,
1699
+ baseScore: holisticResult.baseScore,
1700
+ recognitionScore: holisticResult.recognitionScore,
1701
+ scores: holisticResult.scores,
1702
+ summary: holisticResult.summary,
1703
+ judgeModel: holisticResult.judgeModel,
1704
+ };
1705
+ log(`[evaluationRunner] Holistic dialogue score: ${holisticResult.overallScore?.toFixed(1)}`, 'success');
1706
+ } else {
1707
+ log(`[evaluationRunner] Holistic dialogue evaluation failed: ${holisticResult?.error || 'unknown'}`, 'warning');
1708
+ }
1709
+ }
1710
+ } catch (error) {
1711
+ log(`[evaluationRunner] Holistic dialogue evaluation error: ${error.message}`, 'warning');
1712
+ }
1713
+ }
1714
+
1715
+ // 5c. Analyze bilateral transformation (tutor + learner evolution)
1716
+ const turnProgressionAnalysis = turnComparisonAnalyzer.analyzeTurnProgression(turnResults);
1717
+ const markerDefinitions = fullScenario.transformation_markers || fullScenario.transformationMarkers || null;
1718
+ const transformationMarkerAnalysis = markerDefinitions
1719
+ ? turnComparisonAnalyzer.analyzeTransformationMarkers(turnResults, markerDefinitions)
1720
+ : null;
1721
+ const dialogueTraceReport = dialogueTraceAnalyzer.generateTransformationReport(consolidatedTrace, turnResults);
1722
+
1723
+ log(`[evaluationRunner] Bilateral transformation analysis:`, 'info');
1724
+ log(` - Tutor adaptation index: ${turnProgressionAnalysis.adaptationIndex?.toFixed(2) ?? 'N/A'}`, 'info');
1725
+ log(` - Learner growth index: ${turnProgressionAnalysis.learnerGrowthIndex?.toFixed(2) ?? 'N/A'}`, 'info');
1726
+ log(` - Bilateral balance: ${dialogueTraceReport.bilateralMetrics.bilateralBalance?.toFixed(2) ?? 'N/A'}`, 'info');
1727
+ if (dialogueTraceReport.bilateralMetrics.summary) {
1728
+ log(` - ${dialogueTraceReport.bilateralMetrics.summary}`, 'info');
1729
+ }
1730
+
1731
+ // 6. Write consolidated dialogue log
1732
+ const consolidatedDialogue = {
1733
+ suggestions: turnResults[turnResults.length - 1]?.suggestion ? [turnResults[turnResults.length - 1].suggestion] : [],
1734
+ dialogueTrace: consolidatedTrace,
1735
+ converged: false,
1736
+ rounds: totalDialogueRounds,
1737
+ metrics: {
1738
+ totalLatencyMs,
1739
+ totalInputTokens,
1740
+ totalOutputTokens,
1741
+ totalCost,
1742
+ apiCalls: totalApiCalls,
1743
+ },
1744
+ dialogueId,
1745
+ profileName: resolvedConfig.profileName,
1746
+ provider: resolvedConfig.provider,
1747
+ model: resolvedConfig.model,
1748
+ learnerContext: fullScenario.learner_context,
1749
+ isMultiTurn: true,
1750
+ learnerArchitecture: resolvedConfig.learnerArchitecture || 'unified',
1751
+ totalTurns: turnResults.length,
1752
+ turnResults: turnResults.map(t => ({
1753
+ turnIndex: t.turnIndex,
1754
+ turnId: t.turnId,
1755
+ suggestions: t.suggestion ? [t.suggestion] : [],
1756
+ })),
1757
+ // Holistic dialogue evaluation
1758
+ holisticDialogueScore,
1759
+ // Bilateral transformation analysis
1760
+ transformationAnalysis: {
1761
+ turnProgression: turnProgressionAnalysis,
1762
+ markerAnalysis: transformationMarkerAnalysis,
1763
+ dialogueTraceReport: dialogueTraceReport,
1764
+ },
1765
+ };
1766
+
1767
+ if (!fs.existsSync(LOGS_DIR)) {
1768
+ fs.mkdirSync(LOGS_DIR, { recursive: true });
1769
+ }
1770
+ const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
1771
+ fs.writeFileSync(logPath, JSON.stringify(consolidatedDialogue, null, 2));
1772
+
538
1773
  log(`[evaluationRunner] Multi-turn complete: ${turnResults.length} turns, avgScore=${overallScore?.toFixed(1)}`);
539
1774
 
1775
+ // Aggregate requiredMissing/forbiddenFound from all turns
1776
+ const requiredMissing = [...new Set(turnResults.flatMap(t => t.requiredMissing || []))];
1777
+ const forbiddenFound = [...new Set(turnResults.flatMap(t => t.forbiddenFound || []))];
1778
+
1779
+ // 7. Return result
540
1780
  return {
541
1781
  scenarioId: scenario.id,
542
1782
  scenarioName: scenario.name,
1783
+ scenarioType: fullScenario.type || 'suggestion',
543
1784
  isMultiTurn: true,
544
1785
  totalTurns: turnResults.length,
545
- provider: config.provider || multiTurnResult.turnResults[0]?.metadata?.provider,
546
- model: config.model || multiTurnResult.turnResults[0]?.metadata?.model,
1786
+ provider: resolvedConfig.provider,
1787
+ model: resolvedConfig.model,
547
1788
  profileName: config.profileName,
1789
+ egoModel: resolvedConfig.egoModel
1790
+ ? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
1791
+ : null,
1792
+ superegoModel: resolvedConfig.superegoModel
1793
+ ? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
1794
+ : null,
548
1795
  hyperparameters: config.hyperparameters,
549
- suggestions: multiTurnResult.turnResults.map(t => t.suggestions?.[0]).filter(Boolean),
1796
+ suggestions: turnResults.map(t => t.suggestion).filter(Boolean),
550
1797
  success: true,
551
1798
  latencyMs: totalLatencyMs,
552
1799
  inputTokens: totalInputTokens,
553
1800
  outputTokens: totalOutputTokens,
554
1801
  apiCalls: totalApiCalls,
555
- cost: totalCost, // OpenRouter API cost in USD
556
- dialogueId: multiTurnResult.dialogueId, // Single continuous dialogue ID for all turns
557
- dialogueRounds: multiTurnResult.turnResults.reduce((sum, t) => sum + (t.metadata?.dialogueRounds || 0), 0), // Total across all turns
1802
+ cost: totalCost,
1803
+ dialogueId,
1804
+ dialogueRounds: totalDialogueRounds,
558
1805
  scores: Object.keys(aggregateDimensions).length > 0 ? aggregateDimensions : null,
559
1806
  overallScore,
1807
+ scoringMethod: turnResults.some(t => t.scoringMethod === 'judge_failed')
1808
+ ? 'partial_judge_failure'
1809
+ : turnResults.every(t => t.scoringMethod === 'rubric') ? 'rubric' : 'mixed',
1810
+ baseScore,
1811
+ recognitionScore,
560
1812
  turnResults,
561
1813
  allTurnsPassed,
562
1814
  passesRequired: turnResults.every(t => t.passesRequired),
563
1815
  passesForbidden: turnResults.every(t => t.passesForbidden),
1816
+ requiredMissing,
1817
+ forbiddenFound,
1818
+ factors: resolvedConfig.factors || null,
1819
+ learnerArchitecture: resolvedConfig.learnerArchitecture || null,
1820
+ // Holistic dialogue evaluation (full transcript scored as single unit)
1821
+ holisticDialogueScore,
1822
+ // Bilateral transformation metrics
1823
+ transformationMetrics: {
1824
+ tutorAdaptationIndex: turnProgressionAnalysis.adaptationIndex,
1825
+ learnerGrowthIndex: turnProgressionAnalysis.learnerGrowthIndex,
1826
+ bilateralTransformationIndex: turnProgressionAnalysis.bilateralTransformationIndex,
1827
+ framingEvolution: turnProgressionAnalysis.framingEvolution,
1828
+ dimensionConvergence: turnProgressionAnalysis.dimensionConvergence,
1829
+ markerAnalysis: transformationMarkerAnalysis,
1830
+ bilateralMetrics: dialogueTraceReport.bilateralMetrics,
1831
+ superegoMetrics: dialogueTraceReport.superegoMetrics,
1832
+ transformationQuality: dialogueTraceReport.overallAssessment?.transformationQuality ?? null,
1833
+ },
1834
+ };
1835
+ }
1836
+
1837
+ /**
1838
+ * Resume an incomplete evaluation run, re-running only the missing tests.
1839
+ *
1840
+ * @param {Object} options
1841
+ * @param {string} options.runId - The run ID to resume
1842
+ * @param {number} [options.parallelism] - Parallel worker count
1843
+ * @param {boolean} [options.verbose] - Enable verbose output
1844
+ * @returns {Promise<Object>} Evaluation results (same shape as runEvaluation)
1845
+ */
1846
+ export async function resumeEvaluation(options = {}) {
1847
+ const {
1848
+ runId,
1849
+ parallelism = DEFAULT_PARALLELISM,
1850
+ verbose = false,
1851
+ force = false, // Skip the "already running" check
1852
+ } = options;
1853
+
1854
+ const log = verbose ? console.log : () => {};
1855
+
1856
+ // 1. Load the run and validate it exists
1857
+ const run = evaluationStore.getRun(runId);
1858
+ if (!run) {
1859
+ throw new Error(`Run not found: ${runId}`);
1860
+ }
1861
+
1862
+ // 1b. Check if another process is already running this evaluation
1863
+ const existingPid = run.metadata?.pid;
1864
+ if (existingPid && existingPid !== process.pid && !force) {
1865
+ const isAlive = isPidAlive(existingPid);
1866
+ if (isAlive) {
1867
+ throw new Error(
1868
+ `Run ${runId} is already being processed by pid ${existingPid}. ` +
1869
+ `Use --force to override (may cause duplicates).`
1870
+ );
1871
+ }
1872
+ }
1873
+
1874
+ // 2. Extract metadata
1875
+ const metadata = run.metadata || {};
1876
+ const runsPerConfig = metadata.runsPerConfig || 1;
1877
+ const skipRubricEval = metadata.skipRubricEval || false;
1878
+ const modelOverride = metadata.modelOverride || null;
1879
+
1880
+ // 3. Get existing results for completion checking
1881
+ const existingResults = evaluationStore.getResults(runId);
1882
+
1883
+ // 4. Reconstruct scenarios - prefer metadata (complete list), fall back to inferring from results
1884
+ const allScenarios = evalConfigLoader.listScenarios();
1885
+ let scenarioIds;
1886
+ if (metadata.scenarioIds && metadata.scenarioIds.length > 0) {
1887
+ // Use stored scenario list (includes scenarios that haven't started yet)
1888
+ scenarioIds = metadata.scenarioIds;
1889
+ } else {
1890
+ // Legacy: infer from existing results (may miss unstarted scenarios)
1891
+ scenarioIds = [...new Set(existingResults.map(r => r.scenarioId).filter(Boolean))];
1892
+ }
1893
+ const targetScenarios = allScenarios.filter(s => scenarioIds.includes(s.id));
1894
+
1895
+ if (targetScenarios.length === 0) {
1896
+ throw new Error(`No matching scenarios found for run ${runId}`);
1897
+ }
1898
+
1899
+ // 5. Reconstruct profiles - prefer metadata, fall back to inferring from results
1900
+ let profileNames;
1901
+ if (metadata.profileNames && metadata.profileNames.length > 0) {
1902
+ // Use stored profile list
1903
+ profileNames = metadata.profileNames;
1904
+ } else {
1905
+ // Legacy: infer from existing results
1906
+ profileNames = [...new Set(existingResults.map(r => r.profileName).filter(Boolean))];
1907
+ }
1908
+
1909
+ if (profileNames.length === 0) {
1910
+ throw new Error(`No profiles found for run ${runId} — cannot determine what to resume`);
1911
+ }
1912
+
1913
+ let targetConfigs = profileNames.map(name => ({
1914
+ provider: null,
1915
+ model: null,
1916
+ profileName: name,
1917
+ label: name,
1918
+ }));
1919
+
1920
+ // 6. Re-apply modelOverride if present in metadata
1921
+ if (modelOverride) {
1922
+ targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride }));
1923
+ }
1924
+
1925
+ // 6. Count successful results per (profile, scenario) combo and fill up to runsPerConfig.
1926
+ // Failed results are excluded so they get retried.
1927
+ const completedCounts = {};
1928
+ for (const result of existingResults) {
1929
+ // Only count successful results — failed ones should be retried
1930
+ if (result.success === false || result.success === 0) continue;
1931
+ const key = `${result.profileName}:${result.scenarioId}`;
1932
+ completedCounts[key] = (completedCounts[key] || 0) + 1;
1933
+ }
1934
+
1935
+ // Build flat list of remaining tests
1936
+ const remainingTests = [];
1937
+ for (const scenario of targetScenarios) {
1938
+ for (const config of targetConfigs) {
1939
+ const key = `${config.profileName}:${scenario.id}`;
1940
+ const done = completedCounts[key] || 0;
1941
+ const needed = runsPerConfig - done;
1942
+ for (let i = 0; i < needed; i++) {
1943
+ remainingTests.push({ config, scenario, runNum: done + i });
1944
+ }
1945
+ }
1946
+ }
1947
+
1948
+ if (remainingTests.length === 0) {
1949
+ console.log(`\nRun ${runId}: all tests completed (${runsPerConfig} reps each). Nothing to resume.`);
1950
+ return {
1951
+ runId,
1952
+ totalTests: 0,
1953
+ successfulTests: 0,
1954
+ stats: evaluationStore.getRunStats(runId),
1955
+ scenarioStats: evaluationStore.getScenarioStats(runId),
1956
+ progressLogPath: getProgressLogPath(runId),
1957
+ resumed: true,
1958
+ alreadyComplete: true,
1959
+ };
1960
+ }
1961
+
1962
+ // 7. Set run status to 'running' and update PID
1963
+ evaluationStore.updateRun(runId, { status: 'running', metadata: { pid: process.pid } });
1964
+
1965
+ const totalRemainingTests = remainingTests.length;
1966
+ const totalExpectedTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
1967
+
1968
+ console.log(`\nResuming run: ${runId}`);
1969
+ console.log(` Previously completed: ${existingResults.length} tests`);
1970
+ console.log(` Remaining: ${totalRemainingTests} tests`);
1971
+ console.log(` Profiles: ${profileNames.join(', ')}`);
1972
+ console.log(` Scenarios: ${targetScenarios.length}`);
1973
+ if (modelOverride) console.log(` Model override: ${modelOverride}`);
1974
+
1975
+ // Initialize content resolver (same as runEvaluation)
1976
+ const contentConfig = evalConfigLoader.getContentConfig();
1977
+ if (contentConfig?.content_package_path) {
1978
+ contentResolver.configure({
1979
+ contentPackagePath: contentConfig.content_package_path,
1980
+ maxLectureChars: contentConfig.max_lecture_chars,
1981
+ includeSpeakerNotes: contentConfig.include_speaker_notes,
1982
+ });
1983
+ }
1984
+
1985
+ // 8. Set up progress logger and streaming reporter (appends to existing JSONL)
1986
+ const progressLogPath = getProgressLogPath(runId);
1987
+ console.log(`Progress log: ${progressLogPath}\n`);
1988
+
1989
+ const progressLogger = new ProgressLogger(runId);
1990
+ const scenarioNames = targetScenarios.map(s => s.name || s.id);
1991
+ const reporter = new StreamingReporter({
1992
+ totalTests: totalRemainingTests,
1993
+ totalScenarios: targetScenarios.length,
1994
+ profiles: profileNames,
1995
+ scenarios: scenarioNames,
1996
+ });
1997
+
1998
+ progressLogger.runStart({
1999
+ totalTests: totalRemainingTests,
2000
+ totalScenarios: targetScenarios.length,
2001
+ totalConfigurations: targetConfigs.length,
2002
+ scenarios: scenarioNames,
2003
+ profiles: profileNames,
2004
+ description: `Resumed: ${totalRemainingTests} remaining tests`,
2005
+ });
2006
+
2007
+ // Register with monitoring
2008
+ monitoringService.startSession(runId, {
2009
+ userId: 'eval-runner-resume',
2010
+ profileName: `${targetConfigs.length} configs`,
2011
+ modelId: 'evaluation-batch',
2012
+ });
2013
+
2014
+ const results = [];
2015
+ let completedTests = 0;
2016
+
2017
+ // Scenario completion tracking
2018
+ const scenarioProgress = new Map();
2019
+ for (const scenario of targetScenarios) {
2020
+ const testsForScenario = remainingTests.filter(t => t.scenario.id === scenario.id).length;
2021
+ scenarioProgress.set(scenario.id, {
2022
+ total: testsForScenario,
2023
+ completed: 0,
2024
+ scores: [],
2025
+ scenarioName: scenario.name || scenario.id,
2026
+ });
2027
+ }
2028
+ let completedScenarios = 0;
2029
+
2030
+ // 9. Reuse the same parallel worker pool pattern
2031
+ async function processQueue(queue, workerCount, processItem) {
2032
+ const items = [...queue];
2033
+ let index = 0;
2034
+
2035
+ async function worker() {
2036
+ while (index < items.length) {
2037
+ const i = index++;
2038
+ await processItem(items[i]);
2039
+ await sleep(REQUEST_DELAY_MS);
2040
+ }
2041
+ }
2042
+
2043
+ const workers = Array.from(
2044
+ { length: Math.min(workerCount, items.length) },
2045
+ () => worker()
2046
+ );
2047
+ await Promise.all(workers);
2048
+ }
2049
+
2050
+ log(`\nRunning ${totalRemainingTests} remaining tests with parallelism=${parallelism}...\n`);
2051
+
2052
+ const runStartTime = Date.now();
2053
+
2054
+ await processQueue(remainingTests, parallelism, async ({ config, scenario }) => {
2055
+ const profileLabel = config.label || config.profileName || '';
2056
+
2057
+ progressLogger.testStart({
2058
+ scenarioId: scenario.id,
2059
+ scenarioName: scenario.name || scenario.id,
2060
+ profileName: profileLabel,
2061
+ });
2062
+
2063
+ try {
2064
+ const result = await runSingleTest(scenario, config, {
2065
+ skipRubricEval,
2066
+ verbose,
2067
+ });
2068
+
2069
+ evaluationStore.storeResult(runId, result);
2070
+ results.push(result);
2071
+ completedTests++;
2072
+
2073
+ progressLogger.testComplete({
2074
+ scenarioId: scenario.id,
2075
+ scenarioName: scenario.name || scenario.id,
2076
+ profileName: profileLabel,
2077
+ success: result.success,
2078
+ overallScore: result.overallScore,
2079
+ baseScore: result.baseScore ?? null,
2080
+ recognitionScore: result.recognitionScore ?? null,
2081
+ latencyMs: result.latencyMs,
2082
+ completedCount: completedTests,
2083
+ totalTests: totalRemainingTests,
2084
+ });
2085
+
2086
+ reporter.onTestComplete({
2087
+ ...result,
2088
+ profileName: profileLabel,
2089
+ scenarioName: scenario.name || scenario.id,
2090
+ });
2091
+
2092
+ log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
2093
+
2094
+ monitoringService.recordEvent(runId, {
2095
+ type: 'evaluation_test',
2096
+ inputTokens: result.inputTokens || 0,
2097
+ outputTokens: result.outputTokens || 0,
2098
+ latencyMs: result.latencyMs || 0,
2099
+ round: completedTests,
2100
+ approved: result.success,
2101
+ });
2102
+
2103
+ // Track scenario completion
2104
+ const sp = scenarioProgress.get(scenario.id);
2105
+ sp.completed++;
2106
+ if (result.overallScore != null) sp.scores.push(result.overallScore);
2107
+ if (sp.completed >= sp.total) {
2108
+ completedScenarios++;
2109
+ const avgScore = sp.scores.length > 0
2110
+ ? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
2111
+ : null;
2112
+ progressLogger.scenarioComplete({
2113
+ scenarioId: scenario.id,
2114
+ scenarioName: sp.scenarioName,
2115
+ profileNames,
2116
+ avgScore,
2117
+ completedScenarios,
2118
+ totalScenarios: targetScenarios.length,
2119
+ });
2120
+ reporter.onScenarioComplete({
2121
+ scenarioName: sp.scenarioName,
2122
+ avgScore,
2123
+ completedScenarios,
2124
+ totalScenarios: targetScenarios.length,
2125
+ });
2126
+ }
2127
+ } catch (error) {
2128
+ completedTests++;
2129
+ log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
2130
+
2131
+ // Store failed result so it shows up in the database
2132
+ const failedResult = {
2133
+ scenarioId: scenario.id,
2134
+ scenarioName: scenario.name || scenario.id,
2135
+ profileName: config.profileName,
2136
+ provider: config.provider || config.ego?.provider || 'unknown',
2137
+ model: config.model || config.ego?.model || 'unknown',
2138
+ egoModel: config.egoModel
2139
+ ? `${config.egoModel.provider}.${config.egoModel.model}`
2140
+ : config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
2141
+ superegoModel: config.superegoModel
2142
+ ? `${config.superegoModel.provider}.${config.superegoModel.model}`
2143
+ : config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
2144
+ factors: config.factors || null,
2145
+ learnerArchitecture: config.learnerArchitecture || null,
2146
+ success: false,
2147
+ errorMessage: error.message,
2148
+ };
2149
+ try {
2150
+ evaluationStore.storeResult(runId, failedResult);
2151
+ results.push(failedResult);
2152
+ } catch (storeErr) {
2153
+ log(` [WARNING] Failed to store error result: ${storeErr.message}`);
2154
+ }
2155
+
2156
+ progressLogger.testError({
2157
+ scenarioId: scenario.id,
2158
+ scenarioName: scenario.name || scenario.id,
2159
+ profileName: profileLabel,
2160
+ errorMessage: error.message,
2161
+ completedCount: completedTests,
2162
+ totalTests: totalRemainingTests,
2163
+ });
2164
+
2165
+ reporter.onTestError({
2166
+ scenarioName: scenario.name || scenario.id,
2167
+ profileName: profileLabel,
2168
+ errorMessage: error.message,
2169
+ });
2170
+
2171
+ monitoringService.recordEvent(runId, {
2172
+ type: 'evaluation_error',
2173
+ round: completedTests,
2174
+ error: error.message,
2175
+ });
2176
+
2177
+ // Track scenario completion even on error
2178
+ const sp = scenarioProgress.get(scenario.id);
2179
+ sp.completed++;
2180
+ if (sp.completed >= sp.total) {
2181
+ completedScenarios++;
2182
+ const avgScore = sp.scores.length > 0
2183
+ ? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
2184
+ : null;
2185
+ progressLogger.scenarioComplete({
2186
+ scenarioId: scenario.id,
2187
+ scenarioName: sp.scenarioName,
2188
+ profileNames,
2189
+ avgScore,
2190
+ completedScenarios,
2191
+ totalScenarios: targetScenarios.length,
2192
+ });
2193
+ reporter.onScenarioComplete({
2194
+ scenarioName: sp.scenarioName,
2195
+ avgScore,
2196
+ completedScenarios,
2197
+ totalScenarios: targetScenarios.length,
2198
+ });
2199
+ }
2200
+ }
2201
+ });
2202
+
2203
+ const durationMs = Date.now() - runStartTime;
2204
+ const successfulTests = results.filter(r => r.success).length;
2205
+ const failedTests = completedTests - successfulTests;
2206
+
2207
+ progressLogger.runComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
2208
+ reporter.onRunComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
2209
+
2210
+ // 10. Mark run as completed (keep original totalTests to show expected vs actual)
2211
+ const allResults = evaluationStore.getResults(runId);
2212
+ evaluationStore.updateRun(runId, {
2213
+ status: 'completed',
2214
+ completedAt: new Date().toISOString(),
2215
+ });
2216
+
2217
+ monitoringService.endSession(runId);
2218
+
2219
+ const stats = evaluationStore.getRunStats(runId);
2220
+ const scenarioStats = evaluationStore.getScenarioStats(runId);
2221
+
2222
+ return {
2223
+ runId,
2224
+ totalTests: run.totalTests,
2225
+ completedTests: allResults.length,
2226
+ successfulTests,
2227
+ failedTests: allResults.filter(r => !r.success).length,
2228
+ resumedTests: totalRemainingTests,
2229
+ stats,
2230
+ scenarioStats,
2231
+ progressLogPath,
2232
+ resumed: true,
564
2233
  };
565
2234
  }
566
2235
 
@@ -591,7 +2260,12 @@ export async function compareConfigurations(configs, options = {}) {
591
2260
  rank: i + 1,
592
2261
  provider: stat.provider,
593
2262
  model: stat.model,
2263
+ profileName: stat.profileName,
2264
+ egoModel: stat.egoModel,
2265
+ superegoModel: stat.superegoModel,
594
2266
  avgScore: stat.avgScore,
2267
+ avgBaseScore: stat.avgBaseScore,
2268
+ avgRecognitionScore: stat.avgRecognitionScore,
595
2269
  successRate: stat.successRate,
596
2270
  avgLatencyMs: stat.avgLatencyMs,
597
2271
  })),
@@ -612,14 +2286,15 @@ export async function quickTest(config, options = {}) {
612
2286
  outputSize = 'normal', // compact, normal, expanded
613
2287
  onLog,
614
2288
  superegoStrategy = null, // Superego intervention strategy
2289
+ judgeOverride = null, // Override judge model for this run
615
2290
  } = options;
616
2291
 
617
- const scenarios = [tutorApi.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
2292
+ const scenarios = [evalConfigLoader.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
618
2293
  if (scenarios.length === 0) {
619
2294
  throw new Error(`Scenario not found: ${scenarioId}`);
620
2295
  }
621
2296
 
622
- const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy });
2297
+ const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride });
623
2298
  return result;
624
2299
  }
625
2300
 
@@ -628,9 +2303,9 @@ export async function quickTest(config, options = {}) {
628
2303
  */
629
2304
export function listOptions() {
  // Surface everything the eval config loader knows about, in one object.
  const scenarios = evalConfigLoader.listScenarios();
  const configurations = evalConfigLoader.listConfigurations();
  const profiles = evalConfigLoader.listTutorProfiles();
  return { scenarios, configurations, profiles };
}
636
2311
 
@@ -677,16 +2352,19 @@ export function generateReport(runId) {
677
2352
 
678
2353
  // Rankings table
679
2354
  lines.push('CONFIGURATION RANKINGS (by average score)');
680
- lines.push('-'.repeat(80));
681
- lines.push('| Rank | Configuration | Avg Score | Latency | Pass Rate |');
682
- lines.push('|------|----------------------------------|-----------|---------|-----------|');
2355
+ lines.push('-'.repeat(105));
2356
+ lines.push('| Rank | Profile | Model | Overall | Base | Recog | Latency | Pass |');
2357
+ lines.push('|------|----------------------------------|-------------------------|---------|--------|--------|---------|------|');
683
2358
 
684
2359
  stats.forEach((stat, i) => {
685
- const label = `${stat.provider}/${stat.model}`.substring(0, 32).padEnd(32);
686
- const score = stat.avgScore ? stat.avgScore.toFixed(1).padStart(9) : ' N/A';
2360
+ const profile = (stat.profileName || 'N/A').substring(0, 32).padEnd(32);
2361
+ const model = (stat.model || '').substring(0, 23).padEnd(23);
2362
+ const score = stat.avgScore ? stat.avgScore.toFixed(1).padStart(7) : ' N/A';
2363
+ const base = stat.avgBaseScore ? stat.avgBaseScore.toFixed(1).padStart(6) : ' N/A';
2364
+ const recog = stat.avgRecognitionScore ? stat.avgRecognitionScore.toFixed(1).padStart(6) : ' N/A';
687
2365
  const latency = stat.avgLatencyMs ? `${stat.avgLatencyMs.toFixed(0)}ms`.padStart(7) : ' N/A';
688
- const passRate = `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(9);
689
- lines.push(`| ${(i + 1).toString().padStart(4)} | ${label} | ${score} | ${latency} | ${passRate} |`);
2366
+ const passRate = `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(4);
2367
+ lines.push(`| ${(i + 1).toString().padStart(4)} | ${profile} | ${model} | ${score} | ${base} | ${recog} | ${latency} | ${passRate} |`);
690
2368
  });
691
2369
 
692
2370
  lines.push('');
@@ -697,7 +2375,7 @@ export function generateReport(runId) {
697
2375
  lines.push('-'.repeat(80));
698
2376
 
699
2377
  const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
700
- const header = '| Dimension |' + stats.map(s => ` ${s.model.substring(0, 12).padEnd(12)} |`).join('');
2378
+ const header = '| Dimension |' + stats.map(s => ` ${(s.profileName || s.model).substring(0, 12).padEnd(12)} |`).join('');
701
2379
  lines.push(header);
702
2380
  lines.push('|-----------------|' + stats.map(() => '--------------|').join(''));
703
2381
 
@@ -719,21 +2397,244 @@ export function generateReport(runId) {
719
2397
  lines.push(`\n${scenario.scenarioName} (${scenario.scenarioId})`);
720
2398
  for (const config of scenario.configurations) {
721
2399
  const status = config.passesValidation ? 'PASS' : 'FAIL';
722
- lines.push(` ${config.provider}/${config.model}: ${config.avgScore?.toFixed(1) || 'N/A'} [${status}]`);
2400
+ const profile = config.profileName || `${config.provider}/${config.model}`;
2401
+ const base = config.avgBaseScore != null ? `base=${config.avgBaseScore.toFixed(1)}` : '';
2402
+ const recog = config.avgRecognitionScore != null ? `recog=${config.avgRecognitionScore.toFixed(1)}` : '';
2403
+ const scores = [base, recog].filter(Boolean).join(', ');
2404
+ lines.push(` ${profile}: ${config.avgScore?.toFixed(1) || 'N/A'} (${scores}) [${status}]`);
723
2405
  }
724
2406
  }
725
2407
 
726
2408
  lines.push('');
2409
+
2410
+ // ANOVA analysis — if factorial data is available, run for each score type
2411
+ const scoreTypes = [
2412
+ { column: 'overall_score', label: 'Overall Score' },
2413
+ { column: 'base_score', label: 'Base Score' },
2414
+ { column: 'recognition_score', label: 'Recognition Score' },
2415
+ ];
2416
+
2417
+ for (const { column, label } of scoreTypes) {
2418
+ const cellData = evaluationStore.getFactorialCellData(runId, { scoreColumn: column });
2419
+ const cellKeys = Object.keys(cellData);
2420
+ if (cellKeys.length === 0) continue;
2421
+
2422
+ const totalSamples = Object.values(cellData).reduce((sum, arr) => sum + arr.length, 0);
2423
+ lines.push(`FACTORIAL ANOVA — ${label.toUpperCase()} (2x2x2)`);
2424
+ lines.push('-'.repeat(80));
2425
+ lines.push(`Cells with data: ${cellKeys.length}/8 | Total samples: ${totalSamples}`);
2426
+ lines.push('');
2427
+
2428
+ // Cell means summary
2429
+ for (const key of cellKeys.sort()) {
2430
+ const scores = cellData[key];
2431
+ const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
2432
+ const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
2433
+ `Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
2434
+ );
2435
+ lines.push(` ${cellLabel}: mean=${mean.toFixed(1)} (n=${scores.length})`);
2436
+ }
2437
+ lines.push('');
2438
+
2439
+ if (totalSamples > 8) {
2440
+ const anovaResult = anovaStats.runThreeWayANOVA(cellData);
2441
+ lines.push(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
2442
+ } else {
2443
+ lines.push(' (Need > 8 total samples for ANOVA — increase --runs)');
2444
+ }
2445
+ lines.push('');
2446
+ }
2447
+
727
2448
  lines.push('='.repeat(80));
728
2449
 
729
2450
  return lines.join('\n');
730
2451
  }
731
2452
 
2453
/**
 * Re-judge all results in an existing run without regenerating tutor responses.
 *
 * By default, creates NEW rows preserving judgment history (for inter-judge reliability).
 * Use --overwrite to replace existing scores instead.
 *
 * @param {string} runId - The run to rejudge
 * @param {Object} options
 * @param {string} [options.judgeOverride] - Override judge model (e.g. 'openrouter.nemotron')
 * @param {boolean} [options.verbose] - Show per-result progress
 * @param {string} [options.scenarioFilter] - Only rejudge results for this scenario ID
 * @param {number} [options.parallelism] - Concurrent judge calls (default 3)
 * @param {boolean} [options.overwrite] - If true, update existing rows instead of creating new ones
 * @returns {Promise<Object>} Summary stats: { runId, total, succeeded, failed,
 *   oldAvgScore, newAvgScore, scoreDelta }
 * @throws {Error} If the run does not exist, or no successful results with
 *   suggestions are found to rejudge.
 */
export async function rejudgeRun(runId, options = {}) {
  const {
    judgeOverride = null,
    verbose = false,
    scenarioFilter = null,
    parallelism = DEFAULT_PARALLELISM,
    overwrite = false,
  } = options;

  // No-op logger unless verbose was requested.
  const log = verbose ? console.log : () => {};

  const run = evaluationStore.getRun(runId);
  if (!run) throw new Error(`Run not found: ${runId}`);

  let results = evaluationStore.getResults(runId, {
    scenarioId: scenarioFilter || null,
  });

  // Skip results that have no suggestions (errors / failed generation).
  // `?.length` works whether suggestions is an array or a serialized string
  // (the dedupe below handles both representations).
  results = results.filter(r => r.success && r.suggestions?.length > 0);

  if (results.length === 0) {
    throw new Error('No successful results with suggestions found to rejudge');
  }

  // Deduplicate: only rejudge unique responses (by suggestions content).
  // This prevents cascading rejudgments when running multiple times.
  const seenSuggestions = new Set();
  const uniqueResults = [];
  for (const r of results) {
    const suggKey = typeof r.suggestions === 'string' ? r.suggestions : JSON.stringify(r.suggestions);
    if (!seenSuggestions.has(suggKey)) {
      seenSuggestions.add(suggKey);
      uniqueResults.push(r);
    }
  }

  const skipped = results.length - uniqueResults.length;
  results = uniqueResults;

  log(`\nRejudging ${results.length} unique results from run ${runId}${skipped > 0 ? ` (skipping ${skipped} duplicates)` : ''}`);
  if (judgeOverride) log(`  Judge override: ${judgeOverride}`);
  if (scenarioFilter) log(`  Scenario filter: ${scenarioFilter}`);

  // Capture old scores for before/after comparison.
  const oldScores = results.map(r => r.overallScore).filter(s => s != null);
  const oldAvg = oldScores.length > 0
    ? oldScores.reduce((a, b) => a + b, 0) / oldScores.length
    : null;

  let completed = 0;
  let succeeded = 0;
  let failed = 0;
  const newScores = [];

  // Build judge override object if provided.
  // rubricEvaluator expects { judgeOverride: { model: "..." } }
  const judgeOverrideObj = judgeOverride ? { judgeOverride: { model: judgeOverride } } : {};

  // Parallel worker pool (same pattern as main eval loop): workers share a
  // cursor into `items`; cursor claims are race-free on the single JS thread.
  const items = [...results];
  let index = 0;

  async function worker() {
    while (index < items.length) {
      const i = index++;
      const result = items[i];

      try {
        const fullScenario = evalConfigLoader.getScenario(result.scenarioId);
        if (!fullScenario) {
          throw new Error(`Scenario not found: ${result.scenarioId}`);
        }

        // NOTE(review): assumes suggestions is an array here; the dedupe
        // above implies it can also be a string, in which case [0] would
        // yield the first character — confirm the stored shape.
        const suggestion = result.suggestions[0];

        // Load dialogue context for multi-turn results so the judge sees the
        // full conversation, not just the final suggestion.
        let dialogueContext = null;
        if (result.dialogueId) {
          const logPath = path.join(LOGS_DIR, `${result.dialogueId}.json`);
          try {
            if (fs.existsSync(logPath)) {
              const dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
              if (dialogueLog.isMultiTurn && dialogueLog.dialogueTrace?.length > 0) {
                dialogueContext = {
                  consolidatedTrace: dialogueLog.dialogueTrace,
                  conversationHistory: (dialogueLog.turnResults || []).map((t, ti) => ({
                    turnIndex: ti,
                    turnId: t.turnId,
                    suggestion: t.suggestions?.[0],
                    learnerAction: t.learnerAction,
                    learnerMessage: t.learnerMessage,
                  })),
                };
              }
            }
          } catch (e) {
            // Best-effort: a missing/corrupt dialogue log degrades to judging
            // without context rather than failing the rejudgment.
            log(`  Warning: could not load dialogue log for ${result.dialogueId}: ${e.message}`);
          }
        }

        // `{}` = default retry options for retryWithBackoff.
        const evaluation = await retryWithBackoff(
          () => rubricEvaluator.evaluateSuggestion(suggestion, {
            name: fullScenario.name,
            description: fullScenario.description,
            expectedBehavior: fullScenario.expected_behavior,
            learnerContext: fullScenario.learner_context,
            requiredElements: fullScenario.required_elements,
            forbiddenElements: fullScenario.forbidden_elements,
          }, { dialogueContext }, judgeOverrideObj),
          {}
        );

        if (evaluation.success) {
          if (overwrite) {
            // Old behavior: update in place (loses history)
            evaluationStore.updateResultScores(result.id, evaluation);
          } else {
            // New behavior: create new row (preserves history for reliability analysis)
            evaluationStore.storeRejudgment(result, evaluation);
          }
          succeeded++;
          if (evaluation.overallScore != null) newScores.push(evaluation.overallScore);
          const modeLabel = overwrite ? 'replaced' : 'added';
          // NOTE(review): `completed` is shared across workers, so the
          // [n/total] labels in these progress lines can repeat/interleave.
          log(`  [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: ${evaluation.overallScore?.toFixed(1)} (${modeLabel}, was ${result.overallScore?.toFixed(1) ?? '--'})`);
        } else {
          failed++;
          log(`  [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: JUDGE FAILED - ${evaluation.error}`);
        }
      } catch (error) {
        failed++;
        log(`  [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: ERROR - ${error.message}`);
      }

      completed++;
      // Throttle between judge calls.
      await sleep(REQUEST_DELAY_MS);
    }
  }

  const workers = Array.from(
    { length: Math.min(parallelism, items.length) },
    () => worker()
  );
  await Promise.all(workers);

  const newAvg = newScores.length > 0
    ? newScores.reduce((a, b) => a + b, 0) / newScores.length
    : null;

  return {
    runId,
    total: results.length,
    succeeded,
    failed,
    oldAvgScore: oldAvg,
    newAvgScore: newAvg,
    scoreDelta: oldAvg != null && newAvg != null ? newAvg - oldAvg : null,
  };
}

// Named exports for unit testing (these are internal helpers not part of the public API)
export { structureLearnerContext, resolveConfigModels };

// Public API surface of this module.
export default {
  runEvaluation,
  resumeEvaluation,
  compareConfigurations,
  quickTest,
  listOptions,
  getRunResults,
  generateReport,
  rejudgeRun,
};