@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -5,13 +5,227 @@
|
|
|
5
5
|
* test scenarios with rubric-based scoring.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import
|
|
8
|
+
import fs from 'fs';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
import { execSync } from 'child_process';
|
|
12
|
+
import { tutorApiService as tutorApi, monitoringService, tutorDialogueEngine as dialogueEngine } from '@machinespirits/tutor-core';
|
|
9
13
|
import * as rubricEvaluator from './rubricEvaluator.js';
|
|
10
14
|
import * as evaluationStore from './evaluationStore.js';
|
|
15
|
+
import * as evalConfigLoader from './evalConfigLoader.js';
|
|
16
|
+
import * as contentResolver from './contentResolver.js';
|
|
17
|
+
import { ProgressLogger, getProgressLogPath } from './progressLogger.js';
|
|
18
|
+
import { StreamingReporter } from './streamingReporter.js';
|
|
19
|
+
import * as anovaStats from './anovaStats.js';
|
|
20
|
+
import { generateLearnerResponse } from './learnerTutorInteractionEngine.js';
|
|
21
|
+
import * as turnComparisonAnalyzer from './turnComparisonAnalyzer.js';
|
|
22
|
+
import * as dialogueTraceAnalyzer from './dialogueTraceAnalyzer.js';
|
|
23
|
+
import * as promptRewriter from './promptRewriter.js';
|
|
24
|
+
|
|
25
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
26
|
+
const EVAL_ROOT = path.resolve(__dirname, '..');
|
|
27
|
+
const LOGS_DIR = path.join(EVAL_ROOT, 'logs', 'tutor-dialogues');
|
|
28
|
+
|
|
29
|
+
// Redirect tutor-core logs to this repo's logs/ directory (if available)
|
|
30
|
+
import('@machinespirits/tutor-core').then(mod => {
|
|
31
|
+
if (typeof mod.setLogDir === 'function') mod.setLogDir(path.join(EVAL_ROOT, 'logs'));
|
|
32
|
+
}).catch(() => { /* setLogDir not available in this tutor-core version */ });
|
|
33
|
+
|
|
34
|
+
// Read package version once at import time
|
|
35
|
+
const pkg = JSON.parse(fs.readFileSync(path.join(EVAL_ROOT, 'package.json'), 'utf-8'));
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Get the current git commit hash, or 'unknown' if not in a git repo.
|
|
39
|
+
*/
|
|
40
|
+
function getGitCommitHash() {
|
|
41
|
+
try {
|
|
42
|
+
return execSync('git rev-parse --short HEAD', { cwd: EVAL_ROOT, encoding: 'utf-8' }).trim();
|
|
43
|
+
} catch {
|
|
44
|
+
return 'unknown';
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
import { isPidAlive } from './processUtils.js';
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Eval-only profile names that need remapping to tutor-core profiles.
|
|
52
|
+
*/
|
|
53
|
+
const EVAL_ONLY_PROFILES = [
|
|
54
|
+
'single_baseline', 'single_baseline_paid',
|
|
55
|
+
'single_recognition', 'single_recognition_paid',
|
|
56
|
+
'single_enhanced',
|
|
57
|
+
'baseline', 'baseline_paid',
|
|
58
|
+
'recognition', 'recognition_paid',
|
|
59
|
+
'enhanced',
|
|
60
|
+
'cell_1_base_single_unified', 'cell_2_base_single_psycho',
|
|
61
|
+
'cell_3_base_multi_unified', 'cell_4_base_multi_psycho',
|
|
62
|
+
'cell_5_recog_single_unified', 'cell_6_recog_single_psycho',
|
|
63
|
+
'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho',
|
|
64
|
+
'cell_9_enhanced_single_unified', 'cell_10_enhanced_single_psycho',
|
|
65
|
+
'cell_11_enhanced_multi_unified', 'cell_12_enhanced_multi_psycho',
|
|
66
|
+
'cell_13_hardwired_single_unified', 'cell_14_hardwired_single_psycho',
|
|
67
|
+
'cell_15_placebo_single_unified', 'cell_16_placebo_single_psycho',
|
|
68
|
+
'cell_17_placebo_multi_unified', 'cell_18_placebo_multi_psycho',
|
|
69
|
+
'cell_19_memory_single_unified', 'cell_20_recog_nomem_single_unified',
|
|
70
|
+
'cell_21_recog_multi_unified_rewrite',
|
|
71
|
+
];
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Resolve an eval profile name into dialogue settings and a tutor-core profile.
|
|
75
|
+
*
|
|
76
|
+
* Eval profiles (cell_*, recognition, etc.) carry dialogue/recognition config that
|
|
77
|
+
* tutor-core doesn't know about. This function extracts those settings and maps the
|
|
78
|
+
* profile name to a tutor-core equivalent ('budget' or 'recognition').
|
|
79
|
+
*
|
|
80
|
+
* Exported for unit testing.
|
|
81
|
+
*/
|
|
82
|
+
export function resolveEvalProfile(profileName) {
|
|
83
|
+
const evalProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[profileName];
|
|
84
|
+
const useDialogue = evalProfile?.dialogue?.enabled ?? false;
|
|
85
|
+
const maxRounds = evalProfile?.dialogue?.max_rounds ?? 0;
|
|
86
|
+
const recognitionMode = evalProfile?.recognition_mode ?? profileName?.includes('recognition') ?? false;
|
|
87
|
+
|
|
88
|
+
let resolvedProfileName = profileName;
|
|
89
|
+
if (profileName && EVAL_ONLY_PROFILES.includes(profileName)) {
|
|
90
|
+
// Map eval profile to tutor-core profile based on prompt_type
|
|
91
|
+
const promptType = evalProfile?.factors?.prompt_type;
|
|
92
|
+
if (promptType === 'enhanced') {
|
|
93
|
+
resolvedProfileName = 'enhanced';
|
|
94
|
+
} else if (promptType === 'placebo') {
|
|
95
|
+
resolvedProfileName = 'placebo';
|
|
96
|
+
} else if (promptType === 'hardwired') {
|
|
97
|
+
resolvedProfileName = 'hardwired';
|
|
98
|
+
} else if (promptType === 'memory') {
|
|
99
|
+
resolvedProfileName = 'memory';
|
|
100
|
+
} else if (promptType === 'recognition_nomem') {
|
|
101
|
+
resolvedProfileName = 'recognition_nomem';
|
|
102
|
+
} else if (recognitionMode) {
|
|
103
|
+
resolvedProfileName = 'recognition';
|
|
104
|
+
} else {
|
|
105
|
+
resolvedProfileName = 'budget';
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return { useDialogue, maxRounds, recognitionMode, resolvedProfileName };
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Resolve provider/model references in a config object through eval's providers.yaml.
|
|
114
|
+
* This ensures eval controls which model IDs get sent to tutorApi.
|
|
115
|
+
*/
|
|
116
|
+
function resolveConfigModels(config) {
|
|
117
|
+
const resolved = { ...config };
|
|
118
|
+
if (config.provider && config.model) {
|
|
119
|
+
try {
|
|
120
|
+
const r = evalConfigLoader.resolveModel(`${config.provider}.${config.model}`);
|
|
121
|
+
resolved.provider = r.provider;
|
|
122
|
+
resolved.model = r.model;
|
|
123
|
+
} catch (e) { console.debug(`[evaluationRunner] resolveModel failed for ${config.provider}.${config.model}:`, e.message); }
|
|
124
|
+
}
|
|
125
|
+
if (config.egoModel) {
|
|
126
|
+
try {
|
|
127
|
+
const r = evalConfigLoader.resolveModel(config.egoModel);
|
|
128
|
+
resolved.egoModel = r.model;
|
|
129
|
+
resolved.egoProvider = r.provider;
|
|
130
|
+
} catch (e) { console.debug(`[evaluationRunner] resolveModel failed for egoModel ${config.egoModel}:`, e.message); }
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// When a profileName is provided but no explicit provider/model,
|
|
134
|
+
// look up the profile from the eval repo's local tutor-agents.yaml
|
|
135
|
+
// and extract the ego provider/model as explicit overrides.
|
|
136
|
+
// Uses egoModel (not model) because tutor-core's generateSuggestions
|
|
137
|
+
// uses profileName to load its own config — egoModel is the override.
|
|
138
|
+
if (resolved.profileName && !resolved.provider && !resolved.model) {
|
|
139
|
+
const profile = evalConfigLoader.getTutorProfile(resolved.profileName);
|
|
140
|
+
if (profile?.ego) {
|
|
141
|
+
resolved.provider = profile.ego.resolvedProvider || profile.ego.provider;
|
|
142
|
+
resolved.model = profile.ego.resolvedModel || profile.ego.model;
|
|
143
|
+
// Pass egoModel as object { provider, model } — tutor-core's resolveModel()
|
|
144
|
+
// supports both string ("provider.model") and object formats, but aliases
|
|
145
|
+
// containing dots (e.g., "kimi-k2.5") break the string format's split('.').
|
|
146
|
+
resolved.egoModel = { provider: profile.ego.provider, model: profile.ego.model };
|
|
147
|
+
if (profile.ego.hyperparameters && !resolved.hyperparameters) {
|
|
148
|
+
resolved.hyperparameters = profile.ego.hyperparameters;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
if (profile?.superego) {
|
|
152
|
+
resolved.superegoModel = { provider: profile.superego.provider, model: profile.superego.model };
|
|
153
|
+
if (profile.superego.hyperparameters && !resolved.superegoHyperparameters) {
|
|
154
|
+
resolved.superegoHyperparameters = profile.superego.hyperparameters;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Extract factorial factor tags and learner architecture from profile
|
|
159
|
+
const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[resolved.profileName];
|
|
160
|
+
if (rawProfile?.factors) {
|
|
161
|
+
resolved.factors = rawProfile.factors;
|
|
162
|
+
}
|
|
163
|
+
if (rawProfile?.learner_architecture) {
|
|
164
|
+
resolved.learnerArchitecture = rawProfile.learner_architecture;
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Apply CLI --model override (replaces ego and superego models, preserves factorial metadata)
|
|
169
|
+
if (config.modelOverride) {
|
|
170
|
+
try {
|
|
171
|
+
const r = evalConfigLoader.resolveModel(config.modelOverride);
|
|
172
|
+
resolved.provider = r.provider;
|
|
173
|
+
resolved.model = r.model;
|
|
174
|
+
resolved.egoModel = { provider: r.provider, model: r.model };
|
|
175
|
+
if (resolved.superegoModel) {
|
|
176
|
+
resolved.superegoModel = { provider: r.provider, model: r.model };
|
|
177
|
+
}
|
|
178
|
+
} catch (e) {
|
|
179
|
+
throw new Error(`Invalid --model override "${config.modelOverride}": ${e.message}`);
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// Apply CLI --ego-model override (replaces only ego model)
|
|
184
|
+
if (config.egoModelOverride) {
|
|
185
|
+
try {
|
|
186
|
+
const r = evalConfigLoader.resolveModel(config.egoModelOverride);
|
|
187
|
+
resolved.egoModel = { provider: r.provider, model: r.model };
|
|
188
|
+
// Also update top-level provider/model for compatibility
|
|
189
|
+
resolved.provider = r.provider;
|
|
190
|
+
resolved.model = r.model;
|
|
191
|
+
} catch (e) {
|
|
192
|
+
throw new Error(`Invalid --ego-model override "${config.egoModelOverride}": ${e.message}`);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// Apply CLI --superego-model override (replaces only superego model)
|
|
197
|
+
if (config.superegoModelOverride && resolved.superegoModel) {
|
|
198
|
+
try {
|
|
199
|
+
const r = evalConfigLoader.resolveModel(config.superegoModelOverride);
|
|
200
|
+
resolved.superegoModel = { provider: r.provider, model: r.model };
|
|
201
|
+
} catch (e) {
|
|
202
|
+
throw new Error(`Invalid --superego-model override "${config.superegoModelOverride}": ${e.message}`);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
return resolved;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Filter scenarios by cluster name(s).
|
|
211
|
+
* Supported clusters: 'single-turn', 'multi-turn', or category names (core, mood, benchmark, recognition, multi_turn).
|
|
212
|
+
* Comma-separated values are OR'd together.
|
|
213
|
+
*/
|
|
214
|
+
function applyScenarioFilter(scenarios, filter) {
|
|
215
|
+
const clusters = filter.split(',').map(s => s.trim().toLowerCase());
|
|
216
|
+
return scenarios.filter(s => {
|
|
217
|
+
for (const c of clusters) {
|
|
218
|
+
if (c === 'single-turn' && !s.isMultiTurn) return true;
|
|
219
|
+
if (c === 'multi-turn' && s.isMultiTurn) return true;
|
|
220
|
+
if (s.category === c) return true;
|
|
221
|
+
}
|
|
222
|
+
return false;
|
|
223
|
+
});
|
|
224
|
+
}
|
|
11
225
|
|
|
12
226
|
// Rate limiting settings
|
|
13
|
-
const DEFAULT_PARALLELISM =
|
|
14
|
-
const REQUEST_DELAY_MS =
|
|
227
|
+
const DEFAULT_PARALLELISM = 3;
|
|
228
|
+
const REQUEST_DELAY_MS = 200;
|
|
15
229
|
const MAX_RETRIES = 3;
|
|
16
230
|
const INITIAL_RETRY_DELAY_MS = 2000; // Start with 2 seconds
|
|
17
231
|
|
|
@@ -29,6 +243,23 @@ function sleep(ms) {
|
|
|
29
243
|
return new Promise(resolve => setTimeout(resolve, ms));
|
|
30
244
|
}
|
|
31
245
|
|
|
246
|
+
/**
|
|
247
|
+
* Format a progress tag with percentage and elapsed time.
|
|
248
|
+
* @param {number} completed - Completed tests
|
|
249
|
+
* @param {number} total - Total tests
|
|
250
|
+
* @param {number} startTime - Start timestamp (Date.now())
|
|
251
|
+
* @returns {string} e.g. "[3/10] (30%) 1m 23s"
|
|
252
|
+
*/
|
|
253
|
+
function formatProgress(completed, total, startTime) {
|
|
254
|
+
const pct = total > 0 ? Math.round((completed / total) * 100) : 0;
|
|
255
|
+
const elapsedMs = Date.now() - startTime;
|
|
256
|
+
const elapsedSec = Math.round(elapsedMs / 1000);
|
|
257
|
+
const min = Math.floor(elapsedSec / 60);
|
|
258
|
+
const sec = elapsedSec % 60;
|
|
259
|
+
const elapsed = min > 0 ? `${min}m ${sec}s` : `${sec}s`;
|
|
260
|
+
return `[${completed}/${total}] (${pct}%) ${elapsed}`;
|
|
261
|
+
}
|
|
262
|
+
|
|
32
263
|
/**
|
|
33
264
|
* Retry wrapper for API calls with exponential backoff
|
|
34
265
|
* Handles 429 rate limit errors from OpenRouter free tier
|
|
@@ -68,6 +299,467 @@ async function retryWithBackoff(fn, context = {}, maxRetries = MAX_RETRIES) {
|
|
|
68
299
|
throw lastError;
|
|
69
300
|
}
|
|
70
301
|
|
|
302
|
+
// ---------------------------------------------------------------------------
|
|
303
|
+
// Structured context extraction — parse markdown learner context into
|
|
304
|
+
// labeled fields so the model can't miss key signals.
|
|
305
|
+
// See notes/baseline-prompt-v2-2026-02-02.md for rationale.
|
|
306
|
+
// ---------------------------------------------------------------------------
|
|
307
|
+
|
|
308
|
+
/**
|
|
309
|
+
* Extract key signals from markdown learner context and prepend a
|
|
310
|
+
* structured summary block. The original context is preserved below.
|
|
311
|
+
*/
|
|
312
|
+
function structureLearnerContext(rawContext) {
|
|
313
|
+
if (!rawContext || typeof rawContext !== 'string') return rawContext;
|
|
314
|
+
|
|
315
|
+
const fields = {};
|
|
316
|
+
|
|
317
|
+
// User type
|
|
318
|
+
if (/\bnew user\b/i.test(rawContext)) {
|
|
319
|
+
fields['Learner Type'] = 'New user (no prior history)';
|
|
320
|
+
} else {
|
|
321
|
+
const sessionMatch = rawContext.match(/(\d+)\s+sessions?/i);
|
|
322
|
+
const eventMatch = rawContext.match(/(\d+)\s+total events?/i);
|
|
323
|
+
fields['Learner Type'] = 'Returning user' +
|
|
324
|
+
(sessionMatch ? `, ${sessionMatch[1]} sessions` : '') +
|
|
325
|
+
(eventMatch ? `, ${eventMatch[1]} events` : '');
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Current content
|
|
329
|
+
const viewingMatch = rawContext.match(/\*\*Currently viewing\*\*:\s*(.+)/);
|
|
330
|
+
if (viewingMatch) {
|
|
331
|
+
fields['Current Content'] = viewingMatch[1].trim();
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
// Struggle signals
|
|
335
|
+
const struggleMatch = rawContext.match(/\*\*Struggle signals? detected\*\*:\s*(\d+)/i);
|
|
336
|
+
if (struggleMatch) {
|
|
337
|
+
fields['Struggle Signals'] = `${struggleMatch[1]} detected`;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Quiz/activity retries
|
|
341
|
+
const retryMatch = rawContext.match(/retried?\s+(\d+)\s+times?/i);
|
|
342
|
+
if (retryMatch) {
|
|
343
|
+
fields['Activity Retries'] = `${retryMatch[1]} retries`;
|
|
344
|
+
}
|
|
345
|
+
// Also check for "Retrying activity" lines
|
|
346
|
+
const retryLines = (rawContext.match(/Retrying activity/gi) || []).length;
|
|
347
|
+
if (retryLines > 0 && !retryMatch) {
|
|
348
|
+
fields['Activity Retries'] = `${retryLines} retries in timeline`;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
// Primary struggle area
|
|
352
|
+
const struggleAreaMatch = rawContext.match(/\*\*Primary struggle area\*\*:\s*(.+)/);
|
|
353
|
+
if (struggleAreaMatch) {
|
|
354
|
+
fields['Primary Struggle'] = struggleAreaMatch[1].trim();
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Concept difficulty
|
|
358
|
+
const conceptMatch = rawContext.match(/\*\*Concept difficulty\*\*:\s*(.+)/);
|
|
359
|
+
if (conceptMatch) {
|
|
360
|
+
fields['Difficult Concepts'] = conceptMatch[1].trim();
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
// Mood / emotional signals from chat history
|
|
364
|
+
const chatLines = [];
|
|
365
|
+
const chatPattern = /- User:\s*"([^"]+)"/g;
|
|
366
|
+
let m;
|
|
367
|
+
while ((m = chatPattern.exec(rawContext)) !== null) {
|
|
368
|
+
chatLines.push(m[1]);
|
|
369
|
+
}
|
|
370
|
+
if (chatLines.length > 0) {
|
|
371
|
+
fields['Learner Messages'] = chatLines.join(' | ');
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
// Completed lectures
|
|
375
|
+
const completedMatch = rawContext.match(/\*\*Completed lectures?\*\*:\s*(.+)/);
|
|
376
|
+
if (completedMatch) {
|
|
377
|
+
fields['Completed Lectures'] = completedMatch[1].trim();
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
// Time on page
|
|
381
|
+
const timeMatch = rawContext.match(/\*\*Time on page\*\*:\s*(.+)/);
|
|
382
|
+
if (timeMatch) {
|
|
383
|
+
fields['Time on Page'] = timeMatch[1].trim();
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
// Scroll depth
|
|
387
|
+
const scrollMatch = rawContext.match(/\*\*Scroll depth\*\*:\s*(.+)/);
|
|
388
|
+
if (scrollMatch) {
|
|
389
|
+
fields['Scroll Depth'] = scrollMatch[1].trim();
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
// Performance / success rate
|
|
393
|
+
const avgScoreMatch = rawContext.match(/\*\*Average score\*\*:\s*(.+)/);
|
|
394
|
+
if (avgScoreMatch) {
|
|
395
|
+
fields['Average Score'] = avgScoreMatch[1].trim();
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Activities completion
|
|
399
|
+
const actCompMatch = rawContext.match(/\*\*Activities completed\*\*:\s*(.+)/);
|
|
400
|
+
if (actCompMatch) {
|
|
401
|
+
fields['Activities Completed'] = actCompMatch[1].trim();
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
// If no meaningful fields extracted, return original unchanged
|
|
405
|
+
const fieldKeys = Object.keys(fields);
|
|
406
|
+
if (fieldKeys.length <= 1) return rawContext; // only learner type
|
|
407
|
+
|
|
408
|
+
// Build structured summary block with explicit instruction header
|
|
409
|
+
const lines = [
|
|
410
|
+
'⚠️ YOU MUST REFERENCE AT LEAST ONE OF THESE SIGNALS BY NAME IN YOUR SUGGESTION:',
|
|
411
|
+
'<structured_context_summary>',
|
|
412
|
+
];
|
|
413
|
+
for (const [key, value] of Object.entries(fields)) {
|
|
414
|
+
lines.push(`${key}: ${value}`);
|
|
415
|
+
}
|
|
416
|
+
lines.push('</structured_context_summary>');
|
|
417
|
+
lines.push('Your suggestion MUST mention specific data from the summary above. Generic responses are WRONG.');
|
|
418
|
+
lines.push('');
|
|
419
|
+
|
|
420
|
+
return lines.join('\n') + rawContext;
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
// ---------------------------------------------------------------------------
|
|
424
|
+
// Multi-turn context-building utilities (moved from multiTurnRunner.js)
|
|
425
|
+
// ---------------------------------------------------------------------------
|
|
426
|
+
|
|
427
|
+
/**
|
|
428
|
+
* Build updated context for a follow-up turn in a multi-turn scenario
|
|
429
|
+
*/
|
|
430
|
+
function buildMultiTurnContext(options) {
|
|
431
|
+
const {
|
|
432
|
+
originalContext,
|
|
433
|
+
conversationHistory = [],
|
|
434
|
+
currentTurn,
|
|
435
|
+
previousSuggestion,
|
|
436
|
+
} = options;
|
|
437
|
+
|
|
438
|
+
const contextParts = [];
|
|
439
|
+
|
|
440
|
+
// sessionEvolution is now injected into the system prompt (not user context).
|
|
441
|
+
// See systemPromptExtension threading through generateAndEvaluateTurn → tutor-core.
|
|
442
|
+
|
|
443
|
+
contextParts.push(originalContext);
|
|
444
|
+
|
|
445
|
+
if (conversationHistory.length > 0) {
|
|
446
|
+
contextParts.push('\n### Conversation History');
|
|
447
|
+
for (const turn of conversationHistory) {
|
|
448
|
+
contextParts.push(formatTurnForContext(turn));
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// Note: "Previous Tutor Suggestion" block removed — it duplicated the last
|
|
453
|
+
// entry already present in conversation history above.
|
|
454
|
+
|
|
455
|
+
if (currentTurn?.learner_action) {
|
|
456
|
+
contextParts.push('\n### Learner Action');
|
|
457
|
+
contextParts.push(formatLearnerAction(currentTurn));
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
if (currentTurn?.context_update) {
|
|
461
|
+
contextParts.push('\n' + currentTurn.context_update.trim());
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
return contextParts.join('\n');
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
/**
|
|
468
|
+
* Format a previous turn for inclusion in context
|
|
469
|
+
*/
|
|
470
|
+
function formatTurnForContext(turn) {
|
|
471
|
+
const lines = [];
|
|
472
|
+
lines.push(`\n**Turn ${turn.turnIndex + 1}** (${turn.turnId})`);
|
|
473
|
+
|
|
474
|
+
if (turn.suggestion) {
|
|
475
|
+
lines.push(`- Tutor suggested: "${turn.suggestion.title || turn.suggestion.message?.substring(0, 100)}..."`);
|
|
476
|
+
if (turn.suggestion.actionTarget) {
|
|
477
|
+
lines.push(` - Action: ${turn.suggestion.action} → ${turn.suggestion.actionTarget}`);
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
if (turn.learnerAction) {
|
|
482
|
+
lines.push(`- Learner response: ${turn.learnerAction}`);
|
|
483
|
+
if (turn.learnerMessage) {
|
|
484
|
+
lines.push(` - Message: "${turn.learnerMessage}"`);
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
return lines.join('\n');
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
/**
|
|
492
|
+
* Format a suggestion for inclusion in conversation context
|
|
493
|
+
*/
|
|
494
|
+
function formatSuggestionForContext(suggestion) {
|
|
495
|
+
const lines = [];
|
|
496
|
+
|
|
497
|
+
if (suggestion.title) {
|
|
498
|
+
lines.push(`**Title**: ${suggestion.title}`);
|
|
499
|
+
}
|
|
500
|
+
if (suggestion.message) {
|
|
501
|
+
lines.push(`**Message**: ${suggestion.message}`);
|
|
502
|
+
}
|
|
503
|
+
if (suggestion.action && suggestion.actionTarget) {
|
|
504
|
+
lines.push(`**Suggested Action**: ${suggestion.action} → ${suggestion.actionTarget}`);
|
|
505
|
+
}
|
|
506
|
+
// Note: reasoning intentionally excluded — it's internal justification that
|
|
507
|
+
// inflates context without helping the model generate the next suggestion.
|
|
508
|
+
// Title + message + action are sufficient for conversational continuity.
|
|
509
|
+
|
|
510
|
+
return lines.join('\n');
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
/**
|
|
514
|
+
* Format learner action for context
|
|
515
|
+
*/
|
|
516
|
+
function formatLearnerAction(turn) {
|
|
517
|
+
const action = turn.learner_action;
|
|
518
|
+
const details = turn.action_details || {};
|
|
519
|
+
const lines = [];
|
|
520
|
+
|
|
521
|
+
switch (action) {
|
|
522
|
+
case 'followed_suggestion':
|
|
523
|
+
lines.push(`Learner **followed** the suggestion`);
|
|
524
|
+
if (details.action_taken) {
|
|
525
|
+
lines.push(`- Action: ${details.action_taken}`);
|
|
526
|
+
}
|
|
527
|
+
break;
|
|
528
|
+
|
|
529
|
+
case 'ignored_suggestion':
|
|
530
|
+
lines.push(`Learner **did not follow** the suggestion`);
|
|
531
|
+
if (details.explicit_rejection) {
|
|
532
|
+
lines.push(`- Explicitly rejected`);
|
|
533
|
+
}
|
|
534
|
+
break;
|
|
535
|
+
|
|
536
|
+
case 'asked_followup':
|
|
537
|
+
lines.push(`Learner **asked a follow-up question**`);
|
|
538
|
+
break;
|
|
539
|
+
|
|
540
|
+
case 'reported_confusion':
|
|
541
|
+
lines.push(`Learner **reported confusion**`);
|
|
542
|
+
break;
|
|
543
|
+
|
|
544
|
+
case 'completed_activity':
|
|
545
|
+
lines.push(`Learner **completed an activity**`);
|
|
546
|
+
if (details.activity_id) {
|
|
547
|
+
lines.push(`- Activity: ${details.activity_id}`);
|
|
548
|
+
}
|
|
549
|
+
if (details.success !== undefined) {
|
|
550
|
+
lines.push(`- Success: ${details.success}`);
|
|
551
|
+
}
|
|
552
|
+
if (details.score !== undefined) {
|
|
553
|
+
lines.push(`- Score: ${details.score}%`);
|
|
554
|
+
}
|
|
555
|
+
break;
|
|
556
|
+
|
|
557
|
+
default:
|
|
558
|
+
lines.push(`Learner action: ${action}`);
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
if (details.message) {
|
|
562
|
+
lines.push(`\n**Learner said**: "${details.message}"`);
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
return lines.join('\n');
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
/**
|
|
569
|
+
* Format learner action for transcript display (cleaner format for CLI)
|
|
570
|
+
*/
|
|
571
|
+
function formatLearnerActionForTranscript(turn) {
|
|
572
|
+
const action = turn.learner_action;
|
|
573
|
+
const details = turn.action_details || {};
|
|
574
|
+
const lines = [];
|
|
575
|
+
|
|
576
|
+
const actionLabels = {
|
|
577
|
+
'followed_suggestion': '✓ Followed suggestion',
|
|
578
|
+
'ignored_suggestion': '✗ Ignored suggestion',
|
|
579
|
+
'asked_followup': '❓ Asked follow-up question',
|
|
580
|
+
'reported_confusion': '😕 Reported confusion',
|
|
581
|
+
'completed_activity': '✅ Completed activity',
|
|
582
|
+
'navigated_away': '🔄 Navigated away',
|
|
583
|
+
'requested_hint': '💡 Requested hint',
|
|
584
|
+
};
|
|
585
|
+
|
|
586
|
+
lines.push(actionLabels[action] || `Action: ${action}`);
|
|
587
|
+
|
|
588
|
+
if (details.action_taken) {
|
|
589
|
+
lines.push(` → ${details.action_taken}`);
|
|
590
|
+
}
|
|
591
|
+
if (details.activity_id) {
|
|
592
|
+
lines.push(` Activity: ${details.activity_id}`);
|
|
593
|
+
}
|
|
594
|
+
if (details.success !== undefined) {
|
|
595
|
+
lines.push(` Success: ${details.success ? 'Yes' : 'No'}`);
|
|
596
|
+
}
|
|
597
|
+
if (details.score !== undefined) {
|
|
598
|
+
lines.push(` Score: ${details.score}%`);
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
if (details.message) {
|
|
602
|
+
lines.push(`\n "${details.message}"`);
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
return lines.join('\n');
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
// ---------------------------------------------------------------------------
|
|
609
|
+
// Shared generation + evaluation helper
|
|
610
|
+
// ---------------------------------------------------------------------------
|
|
611
|
+
|
|
612
|
+
/**
 * Generate a tutor suggestion and evaluate it with the rubric.
 *
 * Single shared code path used by BOTH single-turn and multi-turn
 * evaluations. It encapsulates:
 *   1. retryWithBackoff → tutorApi.generateSuggestions
 *   2. rubricEvaluator.quickValidate
 *   3. rubricEvaluator.evaluateSuggestion (unless skipped)
 *
 * @param {Object} context - The learner context object (from tutorApi.buildContext)
 * @param {Object} resolvedConfig - Resolved config with provider, model, egoModel, etc.
 * @param {Object} turnMeta - Turn-level metadata for evaluation:
 *   scenarioName, description, expectedBehavior, learnerContext,
 *   requiredElements, requiredElementsAny, forbiddenElements.
 * @param {Object} [options] - Evaluation options: skipRubricEval, outputSize,
 *   superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId,
 *   systemPromptExtension, learnerId, plus optional dialogue context fields
 *   (conversationHistory / dialogueTrace / consolidatedTrace) for the judge.
 * @returns {Promise<Object>} { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod }
 */
async function generateAndEvaluateTurn(context, resolvedConfig, turnMeta, options = {}) {
  const {
    skipRubricEval = false,
    outputSize = 'normal',
    superegoStrategy = null,
    judgeOverride = null,
    useDialogue = false,
    maxRounds = 0,
    log = () => {},
    scenarioId = '',
    systemPromptExtension = null,
    learnerId = null, // For Writing Pad memory persistence
  } = options;

  // --- Step 1: generation, wrapped in retry/backoff for rate limits ---
  const genResult = await retryWithBackoff(
    () => tutorApi.generateSuggestions(context, {
      provider: resolvedConfig.provider,
      model: resolvedConfig.model,
      egoModel: resolvedConfig.egoModel,
      superegoModel: resolvedConfig.superegoModel || null,
      profileName: resolvedConfig.profileName,
      hyperparameters: resolvedConfig.hyperparameters || {},
      trace: true,
      superegoStrategy,
      outputSize,
      useDialogue,
      maxRounds,
      systemPromptExtension,
      learnerId, // Activates Writing Pad three-layer memory
    }),
    { log }
  );

  if (!genResult.success) {
    log(`Generation failed: ${genResult.error}`, 'error');
    return { genResult, suggestion: null, validation: null, rubricResult: null, turnScore: null };
  }

  log(`Generated ${genResult.suggestions?.length || 0} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');
  if (genResult.metadata?.dialogueRounds) {
    log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
  }

  // --- Step 2: rule-based quick validation ---
  log('Running validation checks...', 'info');
  const suggestion = genResult.suggestions?.[0];
  let validation;
  if (suggestion) {
    validation = rubricEvaluator.quickValidate(suggestion, {
      requiredElements: turnMeta.requiredElements,
      requiredElementsAny: turnMeta.requiredElementsAny,
      forbiddenElements: turnMeta.forbiddenElements,
    });
  } else {
    validation = { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
  }

  const validationOk = validation.passesRequired && validation.passesForbidden;
  log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validationOk ? 'success' : 'warning');

  // --- Step 3: AI rubric evaluation (unless skipped or nothing to judge) ---
  let rubricResult = null;
  if (skipRubricEval) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
    log('Skipping AI rubric evaluation (fast mode)', 'info');
  } else if (!suggestion) {
    debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
    log('Skipping rubric evaluation (no suggestion generated)', 'warning');
  } else {
    log('Running AI rubric evaluation...', 'info');
    debugLog(`[evaluationRunner] Running rubric evaluation for ${scenarioId}...`);

    // Dialogue context for the judge — only present in multi-turn runs.
    const hasDialogue = Boolean(
      options.conversationHistory || options.dialogueTrace || options.consolidatedTrace
    );
    const dialogueContext = hasDialogue
      ? {
          conversationHistory: options.conversationHistory || null,
          dialogueTrace: options.dialogueTrace || null,
          consolidatedTrace: options.consolidatedTrace || null,
        }
      : null;

    rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
      name: turnMeta.scenarioName,
      description: turnMeta.description,
      expectedBehavior: turnMeta.expectedBehavior,
      learnerContext: turnMeta.learnerContext,
      requiredElements: turnMeta.requiredElements,
      forbiddenElements: turnMeta.forbiddenElements,
    }, { dialogueContext }, { judgeOverride });

    if (rubricResult) {
      debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
        `overallScore=${rubricResult.overallScore}, ` +
        `scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
        `error=${rubricResult.error || 'none'}`);
      if (rubricResult.success) {
        log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
      } else {
        log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
      }
    }
  }

  // --- Step 4: derive the turn score and how it was obtained ---
  let turnScore = null;
  let scoringMethod = null;
  if (rubricResult?.success) {
    turnScore = rubricResult.overallScore;
    scoringMethod = 'rubric';
  } else if (suggestion && rubricResult && !rubricResult.success) {
    // Judge API failed — do NOT silently produce a synthetic score.
    // Store null so downstream aggregation excludes this data point.
    scoringMethod = 'judge_failed';
    log(`WARNING: Judge evaluation failed for ${scenarioId}; score stored as null (was: ${(validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0)} from keyword fallback). Error: ${rubricResult.error || 'unknown'}`, 'warning');
  } else if (suggestion && !rubricResult) {
    // Rubric evaluation was skipped (skipRubricEval=true) — no score available
    scoringMethod = 'skipped';
  }

  return { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod };
}
|
|
762
|
+
|
|
71
763
|
/**
|
|
72
764
|
* Run a complete evaluation across configurations and scenarios
|
|
73
765
|
*
|
|
@@ -83,16 +775,47 @@ export async function runEvaluation(options = {}) {
|
|
|
83
775
|
skipRubricEval = false, // Skip AI-based rubric evaluation (faster)
|
|
84
776
|
description = null,
|
|
85
777
|
verbose = false,
|
|
778
|
+
scenarioFilter = null, // Cluster filter: 'single-turn', 'multi-turn', or category names
|
|
779
|
+
modelOverride = null, // CLI --model override (e.g. "openrouter.nemotron")
|
|
780
|
+
egoModelOverride = null, // CLI --ego-model override (replaces only ego model)
|
|
781
|
+
superegoModelOverride = null, // CLI --superego-model override (replaces only superego model)
|
|
86
782
|
} = options;
|
|
87
783
|
|
|
88
784
|
const log = verbose ? console.log : () => {};
|
|
89
785
|
|
|
90
|
-
//
|
|
91
|
-
|
|
92
|
-
|
|
786
|
+
// Log domain override env vars (always visible, not gated on verbose)
|
|
787
|
+
if (process.env.EVAL_CONTENT_PATH || process.env.EVAL_SCENARIOS_FILE) {
|
|
788
|
+
console.log('[evaluationRunner] Domain overrides detected:');
|
|
789
|
+
if (process.env.EVAL_CONTENT_PATH) console.log(` EVAL_CONTENT_PATH = ${process.env.EVAL_CONTENT_PATH}`);
|
|
790
|
+
if (process.env.EVAL_SCENARIOS_FILE) console.log(` EVAL_SCENARIOS_FILE = ${process.env.EVAL_SCENARIOS_FILE}`);
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
// Initialize content resolver from eval settings (opt-in)
|
|
794
|
+
const contentConfig = evalConfigLoader.getContentConfig();
|
|
795
|
+
if (contentConfig?.content_package_path) {
|
|
796
|
+
contentResolver.configure({
|
|
797
|
+
contentPackagePath: contentConfig.content_package_path,
|
|
798
|
+
maxLectureChars: contentConfig.max_lecture_chars,
|
|
799
|
+
includeSpeakerNotes: contentConfig.include_speaker_notes,
|
|
800
|
+
});
|
|
801
|
+
if (contentResolver.isConfigured()) {
|
|
802
|
+
console.log(`[evaluationRunner] Content: ${contentConfig.content_package_path}`);
|
|
803
|
+
} else {
|
|
804
|
+
console.warn('[evaluationRunner] Content path set but directory not found — using fallback curriculum');
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
// Resolve scenarios (loaded from eval repo's local rubric)
|
|
809
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
810
|
+
let targetScenarios = scenarios === 'all'
|
|
93
811
|
? allScenarios
|
|
94
812
|
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
95
813
|
|
|
814
|
+
// Apply cluster filter if specified
|
|
815
|
+
if (scenarioFilter) {
|
|
816
|
+
targetScenarios = applyScenarioFilter(targetScenarios, scenarioFilter);
|
|
817
|
+
}
|
|
818
|
+
|
|
96
819
|
if (targetScenarios.length === 0) {
|
|
97
820
|
throw new Error('No scenarios to run');
|
|
98
821
|
}
|
|
@@ -100,9 +823,22 @@ export async function runEvaluation(options = {}) {
|
|
|
100
823
|
// Resolve configurations
|
|
101
824
|
let targetConfigs = [];
|
|
102
825
|
if (configurations === 'all') {
|
|
103
|
-
targetConfigs =
|
|
826
|
+
targetConfigs = evalConfigLoader.listConfigurations();
|
|
827
|
+
} else if (configurations === 'factorial') {
|
|
828
|
+
const FACTORIAL_CELLS = [
|
|
829
|
+
'cell_1_base_single_unified', 'cell_2_base_single_psycho',
|
|
830
|
+
'cell_3_base_multi_unified', 'cell_4_base_multi_psycho',
|
|
831
|
+
'cell_5_recog_single_unified', 'cell_6_recog_single_psycho',
|
|
832
|
+
'cell_7_recog_multi_unified', 'cell_8_recog_multi_psycho',
|
|
833
|
+
];
|
|
834
|
+
targetConfigs = FACTORIAL_CELLS.map(name => ({
|
|
835
|
+
provider: null,
|
|
836
|
+
model: null,
|
|
837
|
+
profileName: name,
|
|
838
|
+
label: name,
|
|
839
|
+
}));
|
|
104
840
|
} else if (configurations === 'profiles') {
|
|
105
|
-
const profiles =
|
|
841
|
+
const profiles = evalConfigLoader.listTutorProfiles();
|
|
106
842
|
targetConfigs = profiles.map(p => ({
|
|
107
843
|
provider: null,
|
|
108
844
|
model: null,
|
|
@@ -113,6 +849,24 @@ export async function runEvaluation(options = {}) {
|
|
|
113
849
|
targetConfigs = configurations;
|
|
114
850
|
}
|
|
115
851
|
|
|
852
|
+
// Apply model overrides: CLI flags take precedence over YAML-level config
|
|
853
|
+
const yamlOverrides = evalConfigLoader.getTutorModelOverrides();
|
|
854
|
+
|
|
855
|
+
// Effective overrides: CLI > YAML > none
|
|
856
|
+
const effectiveModelOverride = modelOverride || yamlOverrides.modelOverride;
|
|
857
|
+
const effectiveEgoModelOverride = egoModelOverride || yamlOverrides.egoModelOverride;
|
|
858
|
+
const effectiveSuperegoModelOverride = superegoModelOverride || yamlOverrides.superegoModelOverride;
|
|
859
|
+
|
|
860
|
+
if (effectiveModelOverride) {
|
|
861
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride: effectiveModelOverride }));
|
|
862
|
+
}
|
|
863
|
+
if (effectiveEgoModelOverride) {
|
|
864
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, egoModelOverride: effectiveEgoModelOverride }));
|
|
865
|
+
}
|
|
866
|
+
if (effectiveSuperegoModelOverride) {
|
|
867
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, superegoModelOverride: effectiveSuperegoModelOverride }));
|
|
868
|
+
}
|
|
869
|
+
|
|
116
870
|
if (targetConfigs.length === 0) {
|
|
117
871
|
throw new Error('No configurations to test');
|
|
118
872
|
}
|
|
@@ -123,7 +877,7 @@ export async function runEvaluation(options = {}) {
|
|
|
123
877
|
log(` Runs per config: ${runsPerConfig}`);
|
|
124
878
|
log(` Total tests: ${targetScenarios.length * targetConfigs.length * runsPerConfig}`);
|
|
125
879
|
|
|
126
|
-
// Create evaluation run record
|
|
880
|
+
// Create evaluation run record with reproducibility metadata
|
|
127
881
|
const run = evaluationStore.createRun({
|
|
128
882
|
description: description || `Evaluation: ${targetConfigs.length} configs x ${targetScenarios.length} scenarios`,
|
|
129
883
|
totalScenarios: targetScenarios.length,
|
|
@@ -131,10 +885,51 @@ export async function runEvaluation(options = {}) {
|
|
|
131
885
|
metadata: {
|
|
132
886
|
runsPerConfig,
|
|
133
887
|
skipRubricEval,
|
|
888
|
+
modelOverride: effectiveModelOverride || null,
|
|
889
|
+
egoModelOverride: effectiveEgoModelOverride || null,
|
|
890
|
+
superegoModelOverride: effectiveSuperegoModelOverride || null,
|
|
891
|
+
// Store scenario IDs and profile names for accurate resume
|
|
892
|
+
scenarioIds: targetScenarios.map(s => s.id),
|
|
893
|
+
profileNames: targetConfigs.map(c => c.profileName).filter(Boolean),
|
|
894
|
+
// Store env overrides so evaluate/rejudge can re-apply them
|
|
895
|
+
scenariosFile: process.env.EVAL_SCENARIOS_FILE || null,
|
|
896
|
+
contentPath: process.env.EVAL_CONTENT_PATH || null,
|
|
897
|
+
packageVersion: pkg.version,
|
|
898
|
+
gitCommit: getGitCommitHash(),
|
|
899
|
+
pid: process.pid,
|
|
134
900
|
},
|
|
135
901
|
});
|
|
136
902
|
|
|
137
|
-
|
|
903
|
+
const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
|
|
904
|
+
|
|
905
|
+
// Store total_tests upfront so progress can be tracked for in-progress runs
|
|
906
|
+
evaluationStore.updateRun(run.id, { status: 'running', totalTests });
|
|
907
|
+
|
|
908
|
+
const profileNames = targetConfigs.map(c => c.label || c.profileName || `${c.provider}/${c.model}`);
|
|
909
|
+
const scenarioNames = targetScenarios.map(s => s.name || s.id);
|
|
910
|
+
|
|
911
|
+
// Print run ID + progress log path immediately so users can `watch`
|
|
912
|
+
const progressLogPath = getProgressLogPath(run.id);
|
|
913
|
+
console.log(`\nRun ID: ${run.id} (use 'watch ${run.id}' to monitor)`);
|
|
914
|
+
console.log(`Progress log: ${progressLogPath}\n`);
|
|
915
|
+
|
|
916
|
+
// Instantiate progress logger and streaming reporter
|
|
917
|
+
const progressLogger = new ProgressLogger(run.id);
|
|
918
|
+
const reporter = new StreamingReporter({
|
|
919
|
+
totalTests,
|
|
920
|
+
totalScenarios: targetScenarios.length,
|
|
921
|
+
profiles: profileNames,
|
|
922
|
+
scenarios: scenarioNames,
|
|
923
|
+
});
|
|
924
|
+
|
|
925
|
+
progressLogger.runStart({
|
|
926
|
+
totalTests,
|
|
927
|
+
totalScenarios: targetScenarios.length,
|
|
928
|
+
totalConfigurations: targetConfigs.length,
|
|
929
|
+
scenarios: scenarioNames,
|
|
930
|
+
profiles: profileNames,
|
|
931
|
+
description: description || run.description,
|
|
932
|
+
});
|
|
138
933
|
|
|
139
934
|
// Register with monitoring service for realtime tracking
|
|
140
935
|
monitoringService.startSession(run.id, {
|
|
@@ -145,59 +940,223 @@ export async function runEvaluation(options = {}) {
|
|
|
145
940
|
|
|
146
941
|
const results = [];
|
|
147
942
|
let completedTests = 0;
|
|
148
|
-
const totalTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
|
|
149
|
-
|
|
150
|
-
// Run evaluations
|
|
151
|
-
for (const config of targetConfigs) {
|
|
152
|
-
log(`\nConfiguration: ${config.label || `${config.provider}/${config.model}`}`);
|
|
153
|
-
log('='.repeat(60));
|
|
154
943
|
|
|
155
|
-
|
|
944
|
+
// Build flat list of all tests — SCENARIO-FIRST ordering
|
|
945
|
+
// All profiles for scenario 1 complete before scenario 2 starts.
|
|
946
|
+
const allTests = [];
|
|
947
|
+
for (const scenario of targetScenarios) {
|
|
948
|
+
for (const config of targetConfigs) {
|
|
156
949
|
for (let runNum = 0; runNum < runsPerConfig; runNum++) {
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
});
|
|
162
|
-
|
|
163
|
-
// Store result
|
|
164
|
-
evaluationStore.storeResult(run.id, result);
|
|
165
|
-
results.push(result);
|
|
166
|
-
|
|
167
|
-
completedTests++;
|
|
168
|
-
log(` [${completedTests}/${totalTests}] ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
|
|
169
|
-
|
|
170
|
-
// Update monitoring session with progress
|
|
171
|
-
monitoringService.recordEvent(run.id, {
|
|
172
|
-
type: 'evaluation_test',
|
|
173
|
-
inputTokens: result.inputTokens || 0,
|
|
174
|
-
outputTokens: result.outputTokens || 0,
|
|
175
|
-
latencyMs: result.latencyMs || 0,
|
|
176
|
-
round: completedTests,
|
|
177
|
-
approved: result.success,
|
|
178
|
-
});
|
|
950
|
+
allTests.push({ config, scenario, runNum });
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
}
|
|
179
954
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
955
|
+
// Scenario completion tracking
|
|
956
|
+
const scenarioProgress = new Map();
|
|
957
|
+
for (const scenario of targetScenarios) {
|
|
958
|
+
scenarioProgress.set(scenario.id, {
|
|
959
|
+
total: targetConfigs.length * runsPerConfig,
|
|
960
|
+
completed: 0,
|
|
961
|
+
scores: [],
|
|
962
|
+
scenarioName: scenario.name || scenario.id,
|
|
963
|
+
});
|
|
964
|
+
}
|
|
965
|
+
let completedScenarios = 0;
|
|
966
|
+
|
|
967
|
+
// Parallel worker pool
|
|
968
|
+
async function processQueue(queue, workerCount, processItem) {
|
|
969
|
+
const items = [...queue];
|
|
970
|
+
let index = 0;
|
|
971
|
+
|
|
972
|
+
async function worker() {
|
|
973
|
+
while (index < items.length) {
|
|
974
|
+
const i = index++;
|
|
975
|
+
await processItem(items[i]);
|
|
976
|
+
await sleep(REQUEST_DELAY_MS);
|
|
193
977
|
}
|
|
194
978
|
}
|
|
979
|
+
|
|
980
|
+
const workers = Array.from(
|
|
981
|
+
{ length: Math.min(workerCount, items.length) },
|
|
982
|
+
() => worker()
|
|
983
|
+
);
|
|
984
|
+
await Promise.all(workers);
|
|
195
985
|
}
|
|
196
986
|
|
|
197
|
-
|
|
987
|
+
log(`\nRunning ${allTests.length} tests with parallelism=${parallelism}...\n`);
|
|
988
|
+
|
|
989
|
+
const runStartTime = Date.now();
|
|
990
|
+
|
|
991
|
+
await processQueue(allTests, parallelism, async ({ config, scenario }) => {
|
|
992
|
+
const profileLabel = config.label || config.profileName || '';
|
|
993
|
+
|
|
994
|
+
// Emit test_start
|
|
995
|
+
progressLogger.testStart({
|
|
996
|
+
scenarioId: scenario.id,
|
|
997
|
+
scenarioName: scenario.name || scenario.id,
|
|
998
|
+
profileName: profileLabel,
|
|
999
|
+
});
|
|
1000
|
+
|
|
1001
|
+
try {
|
|
1002
|
+
const result = await runSingleTest(scenario, config, {
|
|
1003
|
+
skipRubricEval,
|
|
1004
|
+
verbose,
|
|
1005
|
+
});
|
|
1006
|
+
|
|
1007
|
+
// Store result (better-sqlite3 is synchronous, thread-safe for concurrent writes)
|
|
1008
|
+
evaluationStore.storeResult(run.id, result);
|
|
1009
|
+
results.push(result);
|
|
1010
|
+
|
|
1011
|
+
completedTests++;
|
|
1012
|
+
|
|
1013
|
+
// Emit test_complete event
|
|
1014
|
+
progressLogger.testComplete({
|
|
1015
|
+
scenarioId: scenario.id,
|
|
1016
|
+
scenarioName: scenario.name || scenario.id,
|
|
1017
|
+
profileName: profileLabel,
|
|
1018
|
+
success: result.success,
|
|
1019
|
+
overallScore: result.overallScore,
|
|
1020
|
+
baseScore: result.baseScore ?? null,
|
|
1021
|
+
recognitionScore: result.recognitionScore ?? null,
|
|
1022
|
+
latencyMs: result.latencyMs,
|
|
1023
|
+
completedCount: completedTests,
|
|
1024
|
+
totalTests,
|
|
1025
|
+
});
|
|
1026
|
+
|
|
1027
|
+
// Streaming reporter line
|
|
1028
|
+
reporter.onTestComplete({
|
|
1029
|
+
...result,
|
|
1030
|
+
profileName: profileLabel,
|
|
1031
|
+
scenarioName: scenario.name || scenario.id,
|
|
1032
|
+
});
|
|
1033
|
+
|
|
1034
|
+
log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
|
|
1035
|
+
|
|
1036
|
+
// Update monitoring session with progress
|
|
1037
|
+
monitoringService.recordEvent(run.id, {
|
|
1038
|
+
type: 'evaluation_test',
|
|
1039
|
+
inputTokens: result.inputTokens || 0,
|
|
1040
|
+
outputTokens: result.outputTokens || 0,
|
|
1041
|
+
latencyMs: result.latencyMs || 0,
|
|
1042
|
+
round: completedTests,
|
|
1043
|
+
approved: result.success,
|
|
1044
|
+
});
|
|
1045
|
+
|
|
1046
|
+
// Track scenario completion
|
|
1047
|
+
const sp = scenarioProgress.get(scenario.id);
|
|
1048
|
+
sp.completed++;
|
|
1049
|
+
if (result.overallScore != null) sp.scores.push(result.overallScore);
|
|
1050
|
+
if (sp.completed >= sp.total) {
|
|
1051
|
+
completedScenarios++;
|
|
1052
|
+
const avgScore = sp.scores.length > 0
|
|
1053
|
+
? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
|
|
1054
|
+
: null;
|
|
1055
|
+
progressLogger.scenarioComplete({
|
|
1056
|
+
scenarioId: scenario.id,
|
|
1057
|
+
scenarioName: sp.scenarioName,
|
|
1058
|
+
profileNames,
|
|
1059
|
+
avgScore,
|
|
1060
|
+
completedScenarios,
|
|
1061
|
+
totalScenarios: targetScenarios.length,
|
|
1062
|
+
});
|
|
1063
|
+
reporter.onScenarioComplete({
|
|
1064
|
+
scenarioName: sp.scenarioName,
|
|
1065
|
+
avgScore,
|
|
1066
|
+
completedScenarios,
|
|
1067
|
+
totalScenarios: targetScenarios.length,
|
|
1068
|
+
});
|
|
1069
|
+
}
|
|
1070
|
+
} catch (error) {
|
|
1071
|
+
completedTests++;
|
|
1072
|
+
log(` ${formatProgress(completedTests, totalTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
|
|
1073
|
+
|
|
1074
|
+
// Store failed result so it shows up in the database instead of silently disappearing
|
|
1075
|
+
// Extract provider/model from nested ego config if not at top level (profile-based configs)
|
|
1076
|
+
const failedResult = {
|
|
1077
|
+
scenarioId: scenario.id,
|
|
1078
|
+
scenarioName: scenario.name || scenario.id,
|
|
1079
|
+
profileName: config.profileName,
|
|
1080
|
+
provider: config.provider || config.ego?.provider || 'unknown',
|
|
1081
|
+
model: config.model || config.ego?.model || 'unknown',
|
|
1082
|
+
egoModel: config.egoModel
|
|
1083
|
+
? `${config.egoModel.provider}.${config.egoModel.model}`
|
|
1084
|
+
: config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
|
|
1085
|
+
superegoModel: config.superegoModel
|
|
1086
|
+
? `${config.superegoModel.provider}.${config.superegoModel.model}`
|
|
1087
|
+
: config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
|
|
1088
|
+
factors: config.factors || null,
|
|
1089
|
+
learnerArchitecture: config.learnerArchitecture || null,
|
|
1090
|
+
success: false,
|
|
1091
|
+
errorMessage: error.message,
|
|
1092
|
+
};
|
|
1093
|
+
try {
|
|
1094
|
+
evaluationStore.storeResult(run.id, failedResult);
|
|
1095
|
+
results.push(failedResult);
|
|
1096
|
+
} catch (storeErr) {
|
|
1097
|
+
log(` [WARNING] Failed to store error result: ${storeErr.message}`);
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
// Emit test_error event
|
|
1101
|
+
progressLogger.testError({
|
|
1102
|
+
scenarioId: scenario.id,
|
|
1103
|
+
scenarioName: scenario.name || scenario.id,
|
|
1104
|
+
profileName: profileLabel,
|
|
1105
|
+
errorMessage: error.message,
|
|
1106
|
+
completedCount: completedTests,
|
|
1107
|
+
totalTests,
|
|
1108
|
+
});
|
|
1109
|
+
|
|
1110
|
+
reporter.onTestError({
|
|
1111
|
+
scenarioName: scenario.name || scenario.id,
|
|
1112
|
+
profileName: profileLabel,
|
|
1113
|
+
errorMessage: error.message,
|
|
1114
|
+
});
|
|
1115
|
+
|
|
1116
|
+
// Record error in monitoring
|
|
1117
|
+
monitoringService.recordEvent(run.id, {
|
|
1118
|
+
type: 'evaluation_error',
|
|
1119
|
+
round: completedTests,
|
|
1120
|
+
error: error.message,
|
|
1121
|
+
});
|
|
1122
|
+
|
|
1123
|
+
// Track scenario completion even on error
|
|
1124
|
+
const sp = scenarioProgress.get(scenario.id);
|
|
1125
|
+
sp.completed++;
|
|
1126
|
+
if (sp.completed >= sp.total) {
|
|
1127
|
+
completedScenarios++;
|
|
1128
|
+
const avgScore = sp.scores.length > 0
|
|
1129
|
+
? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
|
|
1130
|
+
: null;
|
|
1131
|
+
progressLogger.scenarioComplete({
|
|
1132
|
+
scenarioId: scenario.id,
|
|
1133
|
+
scenarioName: sp.scenarioName,
|
|
1134
|
+
profileNames,
|
|
1135
|
+
avgScore,
|
|
1136
|
+
completedScenarios,
|
|
1137
|
+
totalScenarios: targetScenarios.length,
|
|
1138
|
+
});
|
|
1139
|
+
reporter.onScenarioComplete({
|
|
1140
|
+
scenarioName: sp.scenarioName,
|
|
1141
|
+
avgScore,
|
|
1142
|
+
completedScenarios,
|
|
1143
|
+
totalScenarios: targetScenarios.length,
|
|
1144
|
+
});
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
});
|
|
1148
|
+
|
|
1149
|
+
const durationMs = Date.now() - runStartTime;
|
|
1150
|
+
const successfulTests = results.filter(r => r.success).length;
|
|
1151
|
+
const failedTests = completedTests - successfulTests;
|
|
1152
|
+
|
|
1153
|
+
// Emit run_complete
|
|
1154
|
+
progressLogger.runComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
|
|
1155
|
+
reporter.onRunComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
|
|
1156
|
+
|
|
1157
|
+
// Update run status (keep original totalTests to show expected vs actual)
|
|
198
1158
|
evaluationStore.updateRun(run.id, {
|
|
199
1159
|
status: 'completed',
|
|
200
|
-
totalTests: results.length,
|
|
201
1160
|
completedAt: new Date().toISOString(),
|
|
202
1161
|
});
|
|
203
1162
|
|
|
@@ -208,19 +1167,14 @@ export async function runEvaluation(options = {}) {
|
|
|
208
1167
|
const stats = evaluationStore.getRunStats(run.id);
|
|
209
1168
|
const scenarioStats = evaluationStore.getScenarioStats(run.id);
|
|
210
1169
|
|
|
211
|
-
log('\n' + '='.repeat(60));
|
|
212
|
-
log('EVALUATION COMPLETE');
|
|
213
|
-
log('='.repeat(60));
|
|
214
|
-
log(`Run ID: ${run.id}`);
|
|
215
|
-
log(`Total tests: ${results.length}`);
|
|
216
|
-
log(`Successful: ${results.filter(r => r.success).length}`);
|
|
217
|
-
|
|
218
1170
|
return {
|
|
219
1171
|
runId: run.id,
|
|
220
|
-
totalTests
|
|
221
|
-
successfulTests
|
|
1172
|
+
totalTests,
|
|
1173
|
+
successfulTests,
|
|
1174
|
+
failedTests,
|
|
222
1175
|
stats,
|
|
223
1176
|
scenarioStats,
|
|
1177
|
+
progressLogPath,
|
|
224
1178
|
};
|
|
225
1179
|
}
|
|
226
1180
|
|
|
@@ -229,7 +1183,7 @@ export async function runEvaluation(options = {}) {
|
|
|
229
1183
|
* Handles both single-turn and multi-turn scenarios
|
|
230
1184
|
*/
|
|
231
1185
|
async function runSingleTest(scenario, config, options = {}) {
|
|
232
|
-
const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null } = options;
|
|
1186
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, onLog, superegoStrategy = null, judgeOverride = null } = options;
|
|
233
1187
|
|
|
234
1188
|
// Create a log function that calls both console and onLog callback
|
|
235
1189
|
const log = (message, level = 'info') => {
|
|
@@ -237,7 +1191,7 @@ async function runSingleTest(scenario, config, options = {}) {
|
|
|
237
1191
|
if (onLog) onLog(message, level);
|
|
238
1192
|
};
|
|
239
1193
|
|
|
240
|
-
const fullScenario =
|
|
1194
|
+
const fullScenario = evalConfigLoader.getScenario(scenario.id);
|
|
241
1195
|
if (!fullScenario) {
|
|
242
1196
|
throw new Error(`Scenario not found: ${scenario.id}`);
|
|
243
1197
|
}
|
|
@@ -245,132 +1199,103 @@ async function runSingleTest(scenario, config, options = {}) {
|
|
|
245
1199
|
log(`Running scenario: ${scenario.name}`, 'info');
|
|
246
1200
|
|
|
247
1201
|
// Check if this is a multi-turn scenario
|
|
248
|
-
const isMultiTurn =
|
|
1202
|
+
const isMultiTurn = evalConfigLoader.isMultiTurnScenario(scenario.id);
|
|
249
1203
|
|
|
250
1204
|
if (isMultiTurn) {
|
|
251
1205
|
log('Detected multi-turn scenario', 'info');
|
|
252
|
-
return runMultiTurnTest(scenario, config, fullScenario, { ...options, log });
|
|
1206
|
+
return runMultiTurnTest(scenario, config, fullScenario, { ...options, log, judgeOverride });
|
|
253
1207
|
}
|
|
254
1208
|
|
|
255
1209
|
// Single-turn evaluation (original logic)
|
|
256
|
-
return runSingleTurnTest(scenario, config, fullScenario, { ...options, log });
|
|
1210
|
+
return runSingleTurnTest(scenario, config, fullScenario, { ...options, log, judgeOverride });
|
|
257
1211
|
}
|
|
258
1212
|
|
|
259
|
-
/**
|
|
260
|
-
* Run a single-turn test
|
|
261
|
-
*/
|
|
262
|
-
async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
263
|
-
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null } = options;
|
|
264
|
-
|
|
265
|
-
//
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
context
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
// Wrap API call with retry logic for rate limit handling
|
|
278
|
-
const genResult = await retryWithBackoff(
|
|
279
|
-
() => tutorApi.generateSuggestions(context, {
|
|
280
|
-
provider: config.provider,
|
|
281
|
-
model: config.model,
|
|
282
|
-
egoModel: config.egoModel, // Override ego model for benchmarking
|
|
283
|
-
profileName: config.profileName,
|
|
284
|
-
hyperparameters: config.hyperparameters || {},
|
|
285
|
-
trace: true, // Always capture trace for tension analysis
|
|
286
|
-
superegoStrategy, // Pass through superego intervention strategy
|
|
287
|
-
outputSize, // compact, normal, expanded - affects response length
|
|
288
|
-
}),
|
|
289
|
-
{ log }
|
|
290
|
-
);
|
|
291
|
-
|
|
292
|
-
if (!genResult.success) {
|
|
293
|
-
log(`Generation failed: ${genResult.error}`, 'error');
|
|
294
|
-
return {
|
|
295
|
-
scenarioId: scenario.id,
|
|
296
|
-
scenarioName: scenario.name,
|
|
297
|
-
provider: config.provider || genResult.metadata?.provider,
|
|
298
|
-
model: config.model || genResult.metadata?.model,
|
|
299
|
-
profileName: config.profileName,
|
|
300
|
-
success: false,
|
|
301
|
-
errorMessage: genResult.error,
|
|
302
|
-
latencyMs: genResult.metadata?.latencyMs,
|
|
303
|
-
};
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
const suggestionCount = genResult.suggestions?.length || 0;
|
|
307
|
-
log(`Generated ${suggestionCount} suggestion(s) in ${genResult.metadata?.latencyMs}ms`, 'success');
|
|
308
|
-
|
|
309
|
-
if (genResult.metadata?.dialogueRounds) {
|
|
310
|
-
log(`Dialogue rounds: ${genResult.metadata.dialogueRounds}`, 'info');
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
// Quick validation (rule-based)
|
|
314
|
-
log('Running validation checks...', 'info');
|
|
315
|
-
const suggestion = genResult.suggestions?.[0];
|
|
316
|
-
const validation = suggestion
|
|
317
|
-
? rubricEvaluator.quickValidate(suggestion, {
|
|
318
|
-
requiredElements: fullScenario.required_elements,
|
|
319
|
-
forbiddenElements: fullScenario.forbidden_elements,
|
|
320
|
-
})
|
|
321
|
-
: { passesRequired: false, passesForbidden: true, requiredMissing: ['No suggestions generated'] };
|
|
322
|
-
|
|
323
|
-
log(`Validation: required=${validation.passesRequired ? 'PASS' : 'FAIL'}, forbidden=${validation.passesForbidden ? 'PASS' : 'FAIL'}`, validation.passesRequired && validation.passesForbidden ? 'success' : 'warning');
|
|
324
|
-
|
|
325
|
-
let rubricResult = null;
|
|
326
|
-
if (!skipRubricEval && suggestion) {
|
|
327
|
-
// Full rubric evaluation with AI judge
|
|
328
|
-
log('Running AI rubric evaluation...', 'info');
|
|
329
|
-
debugLog(`[evaluationRunner] Running rubric evaluation for ${scenario.id}...`);
|
|
330
|
-
rubricResult = await rubricEvaluator.evaluateSuggestion(suggestion, {
|
|
331
|
-
name: fullScenario.name,
|
|
332
|
-
description: fullScenario.description,
|
|
333
|
-
expectedBehavior: fullScenario.expected_behavior,
|
|
334
|
-
learnerContext: fullScenario.learner_context,
|
|
335
|
-
requiredElements: fullScenario.required_elements,
|
|
336
|
-
forbiddenElements: fullScenario.forbidden_elements,
|
|
337
|
-
}, {});
|
|
338
|
-
|
|
339
|
-
// Log rubric result summary
|
|
340
|
-
if (rubricResult) {
|
|
341
|
-
debugLog(`[evaluationRunner] Rubric result: success=${rubricResult.success}, ` +
|
|
342
|
-
`overallScore=${rubricResult.overallScore}, ` +
|
|
343
|
-
`scoresCount=${Object.keys(rubricResult.scores || {}).length}, ` +
|
|
344
|
-
`error=${rubricResult.error || 'none'}`);
|
|
345
|
-
if (rubricResult.success) {
|
|
346
|
-
log(`Rubric evaluation complete: score=${rubricResult.overallScore?.toFixed(1)}`, 'success');
|
|
347
|
-
} else {
|
|
348
|
-
log(`Rubric evaluation failed: ${rubricResult.error || 'unknown error'}`, 'error');
|
|
349
|
-
}
|
|
350
|
-
}
|
|
351
|
-
} else if (skipRubricEval) {
|
|
352
|
-
debugLog(`[evaluationRunner] Skipping rubric evaluation (--fast mode)`);
|
|
353
|
-
log('Skipping AI rubric evaluation (fast mode)', 'info');
|
|
354
|
-
} else if (!suggestion) {
|
|
355
|
-
debugLog(`[evaluationRunner] Skipping rubric evaluation (no suggestion generated)`);
|
|
356
|
-
log('Skipping rubric evaluation (no suggestion generated)', 'warning');
|
|
1213
|
+
/**
|
|
1214
|
+
* Run a single-turn test
|
|
1215
|
+
*/
|
|
1216
|
+
async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
1217
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
|
|
1218
|
+
|
|
1219
|
+
// Resolve model aliases through eval's providers.yaml
|
|
1220
|
+
const resolvedConfig = resolveConfigModels(config);
|
|
1221
|
+
|
|
1222
|
+
// Build context with optional curriculum content
|
|
1223
|
+
log('Building learner context...', 'info');
|
|
1224
|
+
const curriculumContext = contentResolver.isConfigured()
|
|
1225
|
+
? contentResolver.buildCurriculumContext(
|
|
1226
|
+
contentResolver.resolveScenarioContent(fullScenario)
|
|
1227
|
+
)
|
|
1228
|
+
: null;
|
|
1229
|
+
if (curriculumContext) {
|
|
1230
|
+
log(`Curriculum context loaded (${curriculumContext.length} chars)`, 'info');
|
|
357
1231
|
}
|
|
1232
|
+
const structuredLearnerContext = structureLearnerContext(fullScenario.learner_context);
|
|
1233
|
+
const context = tutorApi.buildContext(structuredLearnerContext, curriculumContext);
|
|
1234
|
+
context.isNewUser = fullScenario.is_new_user;
|
|
358
1235
|
|
|
359
|
-
//
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
1236
|
+
// Resolve profile: extract dialogue/recognition settings and remap to tutor-core profile.
|
|
1237
|
+
const profileResolution = resolveEvalProfile(resolvedConfig.profileName);
|
|
1238
|
+
const { useDialogue, maxRounds, recognitionMode } = profileResolution;
|
|
1239
|
+
resolvedConfig.profileName = profileResolution.resolvedProfileName;
|
|
1240
|
+
|
|
1241
|
+
// Log config info
|
|
1242
|
+
log(`Generating suggestions with profile: ${resolvedConfig.profileName} (dialogue=${useDialogue}, rounds=${maxRounds}, recognition=${recognitionMode})`, 'info');
|
|
1243
|
+
log(`Provider: ${resolvedConfig.provider || 'from profile'}, Model: ${resolvedConfig.model || 'from profile'}`, 'info');
|
|
1244
|
+
if (resolvedConfig.egoModel) {
|
|
1245
|
+
const egoLabel = typeof resolvedConfig.egoModel === 'object'
|
|
1246
|
+
? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
|
|
1247
|
+
: resolvedConfig.egoModel;
|
|
1248
|
+
log(`Ego model override: ${egoLabel}`, 'info');
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
// Use shared generation + evaluation helper
|
|
1252
|
+
const { genResult, suggestion, validation, rubricResult, turnScore: overallScore, scoringMethod } = await generateAndEvaluateTurn(
|
|
1253
|
+
context, resolvedConfig,
|
|
1254
|
+
{
|
|
1255
|
+
scenarioName: fullScenario.name,
|
|
1256
|
+
description: fullScenario.description,
|
|
1257
|
+
expectedBehavior: fullScenario.expected_behavior,
|
|
1258
|
+
learnerContext: fullScenario.learner_context,
|
|
1259
|
+
requiredElements: fullScenario.required_elements,
|
|
1260
|
+
requiredElementsAny: fullScenario.required_elements_any,
|
|
1261
|
+
forbiddenElements: fullScenario.forbidden_elements,
|
|
1262
|
+
},
|
|
1263
|
+
{ skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id }
|
|
1264
|
+
);
|
|
1265
|
+
|
|
1266
|
+
if (!genResult.success) {
|
|
1267
|
+
return {
|
|
1268
|
+
scenarioId: scenario.id,
|
|
1269
|
+
scenarioName: scenario.name,
|
|
1270
|
+
scenarioType: fullScenario.type || 'suggestion',
|
|
1271
|
+
provider: resolvedConfig.provider || genResult.metadata?.provider,
|
|
1272
|
+
model: resolvedConfig.model || genResult.metadata?.model,
|
|
1273
|
+
profileName: config.profileName,
|
|
1274
|
+
egoModel: resolvedConfig.egoModel
|
|
1275
|
+
? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
|
|
1276
|
+
: null,
|
|
1277
|
+
superegoModel: resolvedConfig.superegoModel
|
|
1278
|
+
? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
|
|
1279
|
+
: null,
|
|
1280
|
+
success: false,
|
|
1281
|
+
errorMessage: genResult.error,
|
|
1282
|
+
latencyMs: genResult.metadata?.latencyMs,
|
|
1283
|
+
};
|
|
366
1284
|
}
|
|
367
1285
|
|
|
368
1286
|
return {
|
|
369
1287
|
scenarioId: scenario.id,
|
|
370
1288
|
scenarioName: scenario.name,
|
|
371
|
-
|
|
372
|
-
|
|
1289
|
+
scenarioType: fullScenario.type || 'suggestion',
|
|
1290
|
+
provider: resolvedConfig.provider || genResult.metadata?.provider,
|
|
1291
|
+
model: resolvedConfig.model || genResult.metadata?.model,
|
|
373
1292
|
profileName: config.profileName,
|
|
1293
|
+
egoModel: resolvedConfig.egoModel
|
|
1294
|
+
? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
|
|
1295
|
+
: null,
|
|
1296
|
+
superegoModel: resolvedConfig.superegoModel
|
|
1297
|
+
? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
|
|
1298
|
+
: null,
|
|
374
1299
|
hyperparameters: config.hyperparameters,
|
|
375
1300
|
suggestions: genResult.suggestions,
|
|
376
1301
|
success: true,
|
|
@@ -379,8 +1304,8 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
379
1304
|
outputTokens: genResult.metadata?.outputTokens,
|
|
380
1305
|
dialogueRounds: genResult.metadata?.dialogueRounds,
|
|
381
1306
|
apiCalls: genResult.metadata?.apiCalls,
|
|
382
|
-
cost: genResult.metadata?.totalCost,
|
|
383
|
-
dialogueId: genResult.metadata?.dialogueId,
|
|
1307
|
+
cost: genResult.metadata?.totalCost,
|
|
1308
|
+
dialogueId: genResult.metadata?.dialogueId,
|
|
384
1309
|
scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
|
|
385
1310
|
relevance: rubricResult.scores.relevance?.score,
|
|
386
1311
|
specificity: rubricResult.scores.specificity?.score,
|
|
@@ -389,18 +1314,21 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
389
1314
|
actionability: rubricResult.scores.actionability?.score,
|
|
390
1315
|
tone: rubricResult.scores.tone?.score,
|
|
391
1316
|
} : null,
|
|
392
|
-
// Include full scores with reasoning for detailed analysis
|
|
393
1317
|
scoresWithReasoning: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0
|
|
394
1318
|
? rubricResult.scores
|
|
395
1319
|
: null,
|
|
396
1320
|
overallScore,
|
|
1321
|
+
scoringMethod,
|
|
1322
|
+
baseScore: rubricResult?.baseScore ?? null,
|
|
1323
|
+
recognitionScore: rubricResult?.recognitionScore ?? null,
|
|
397
1324
|
passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
|
|
398
1325
|
passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
|
|
399
1326
|
requiredMissing: rubricResult?.requiredMissing || validation.requiredMissing,
|
|
400
1327
|
forbiddenFound: rubricResult?.forbiddenFound || validation.forbiddenFound,
|
|
401
|
-
|
|
1328
|
+
judgeModel: rubricResult?.judgeModel,
|
|
402
1329
|
evaluationReasoning: rubricResult?.summary,
|
|
403
|
-
|
|
1330
|
+
factors: resolvedConfig.factors || null,
|
|
1331
|
+
learnerArchitecture: resolvedConfig.learnerArchitecture || null,
|
|
404
1332
|
dialogueResult: {
|
|
405
1333
|
dialogueTrace: genResult.dialogueTrace,
|
|
406
1334
|
dialogueRounds: genResult.metadata?.dialogueRounds,
|
|
@@ -411,81 +1339,185 @@ async function runSingleTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
411
1339
|
}
|
|
412
1340
|
|
|
413
1341
|
/**
|
|
414
|
-
* Run a multi-turn test
|
|
415
|
-
*
|
|
1342
|
+
* Run a multi-turn test as an iterative loop.
|
|
1343
|
+
*
|
|
1344
|
+
* Each turn goes through the SAME generateAndEvaluateTurn() code path as
|
|
1345
|
+
* single-turn, with accumulated conversation context between turns.
|
|
1346
|
+
* This eliminates the separate multiTurnRunner orchestration.
|
|
416
1347
|
*/
|
|
417
1348
|
async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
418
|
-
const { skipRubricEval = false, verbose = false } = options;
|
|
419
|
-
const log = verbose ? console.log : () => {};
|
|
1349
|
+
const { skipRubricEval = false, outputSize = 'normal', verbose = false, log = () => {}, superegoStrategy = null, judgeOverride = null } = options;
|
|
420
1350
|
|
|
421
1351
|
log(`[evaluationRunner] Running multi-turn scenario: ${scenario.id}`);
|
|
422
1352
|
|
|
423
|
-
|
|
1353
|
+
// 1. Resolve config (models, profile) — same as single-turn
|
|
1354
|
+
const resolvedConfig = resolveConfigModels(config);
|
|
1355
|
+
const profileResolution = resolveEvalProfile(resolvedConfig.profileName);
|
|
1356
|
+
const { useDialogue, maxRounds } = profileResolution;
|
|
1357
|
+
resolvedConfig.profileName = profileResolution.resolvedProfileName;
|
|
1358
|
+
|
|
1359
|
+
// 2. Build curriculum context — same as single-turn
|
|
1360
|
+
const curriculumContext = contentResolver.isConfigured()
|
|
1361
|
+
? contentResolver.buildCurriculumContext(
|
|
1362
|
+
contentResolver.resolveScenarioContent(fullScenario)
|
|
1363
|
+
)
|
|
1364
|
+
: null;
|
|
1365
|
+
|
|
1366
|
+
// 3. Generate dialogue ID for the session
|
|
1367
|
+
const dialogueId = `dialogue-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
|
1368
|
+
dialogueEngine.setCurrentDialogueId(dialogueId);
|
|
1369
|
+
|
|
1370
|
+
// Generate synthetic learnerId for Writing Pad persistence across turns
|
|
1371
|
+
const learnerId = `eval-learner-${dialogueId}-${scenario.id.replace(/[^a-zA-Z0-9]/g, '')}`;
|
|
1372
|
+
log(`[evaluationRunner] Generated learnerId for Writing Pad: ${learnerId}`, 'info');
|
|
1373
|
+
|
|
1374
|
+
// Deep-clone turns to prevent mutation of shared scenario objects across profiles
|
|
1375
|
+
const turns = JSON.parse(JSON.stringify(fullScenario.turns || []));
|
|
424
1376
|
const turnResults = [];
|
|
425
1377
|
let totalLatencyMs = 0;
|
|
426
1378
|
let totalInputTokens = 0;
|
|
427
1379
|
let totalOutputTokens = 0;
|
|
428
1380
|
let totalApiCalls = 0;
|
|
429
1381
|
let totalCost = 0;
|
|
1382
|
+
let totalDialogueRounds = 0;
|
|
430
1383
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
1384
|
+
let conversationHistory = [];
|
|
1385
|
+
let previousSuggestion = null;
|
|
1386
|
+
const consolidatedTrace = [];
|
|
1387
|
+
|
|
1388
|
+
const sharedTurnOptions = { skipRubricEval, outputSize, superegoStrategy, judgeOverride, useDialogue, maxRounds, log, scenarioId: scenario.id, learnerId };
|
|
1389
|
+
|
|
1390
|
+
// Check if prompt rewriting is enabled for this profile
|
|
1391
|
+
const rawProfile = evalConfigLoader.loadTutorAgents()?.profiles?.[config.profileName];
|
|
1392
|
+
const promptRewritingEnabled = rawProfile?.prompt_rewriting?.enabled ?? false;
|
|
1393
|
+
const promptRewritingStrategy = rawProfile?.prompt_rewriting?.strategy ?? 'template';
|
|
1394
|
+
let sessionEvolution = null;
|
|
1395
|
+
|
|
1396
|
+
// 4. Loop through turns (initial turn 0 + follow-up turns)
|
|
1397
|
+
const totalTurnCount = 1 + turns.length;
|
|
1398
|
+
for (let turnIdx = 0; turnIdx < totalTurnCount; turnIdx++) {
|
|
1399
|
+
const isInitialTurn = turnIdx === 0;
|
|
1400
|
+
const turnDef = isInitialTurn ? null : turns[turnIdx - 1];
|
|
1401
|
+
|
|
1402
|
+
log(`[evaluationRunner] Turn ${turnIdx}/${totalTurnCount - 1}${isInitialTurn ? ' (initial)' : ` (${turnDef.id})`}`, 'info');
|
|
1403
|
+
|
|
1404
|
+
// Show learner action in transcript mode (for follow-up turns)
|
|
1405
|
+
if (!isInitialTurn && dialogueEngine.isTranscriptMode()) {
|
|
1406
|
+
dialogueEngine.transcript('LEARNER ACTION', formatLearnerActionForTranscript(turnDef));
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1409
|
+
// Build context for this turn
|
|
1410
|
+
let contextStr;
|
|
1411
|
+
if (isInitialTurn) {
|
|
1412
|
+
contextStr = fullScenario.learner_context;
|
|
1413
|
+
} else {
|
|
1414
|
+
// Add previous turn to conversation history
|
|
1415
|
+
conversationHistory.push({
|
|
1416
|
+
turnIndex: turnIdx - 1,
|
|
1417
|
+
turnId: turnIdx === 1 ? 'initial' : turns[turnIdx - 2]?.id,
|
|
1418
|
+
suggestion: previousSuggestion,
|
|
1419
|
+
learnerAction: turnDef.learner_action,
|
|
1420
|
+
learnerMessage: turnDef.action_details?.message,
|
|
1421
|
+
});
|
|
1422
|
+
|
|
1423
|
+
contextStr = buildMultiTurnContext({
|
|
1424
|
+
originalContext: fullScenario.learner_context,
|
|
1425
|
+
conversationHistory,
|
|
1426
|
+
currentTurn: turnDef,
|
|
1427
|
+
previousSuggestion,
|
|
1428
|
+
});
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
const structuredContextStr = structureLearnerContext(contextStr);
|
|
1432
|
+
const context = tutorApi.buildContext(structuredContextStr, curriculumContext);
|
|
1433
|
+
context.isNewUser = isInitialTurn ? fullScenario.is_new_user : false;
|
|
1434
|
+
|
|
1435
|
+
// Build turn-specific rubric metadata
|
|
1436
|
+
const turnMeta = {
|
|
1437
|
+
scenarioName: isInitialTurn
|
|
1438
|
+
? fullScenario.name
|
|
1439
|
+
: `${fullScenario.name} - Turn ${turnIdx}`,
|
|
1440
|
+
description: isInitialTurn
|
|
1441
|
+
? fullScenario.description
|
|
1442
|
+
: `Turn: ${turnDef.learner_action}`,
|
|
1443
|
+
expectedBehavior: isInitialTurn
|
|
1444
|
+
? fullScenario.expected_behavior
|
|
1445
|
+
: turnDef.expected_behavior,
|
|
1446
|
+
learnerContext: contextStr,
|
|
1447
|
+
requiredElements: isInitialTurn
|
|
1448
|
+
? (fullScenario.required_elements || [])
|
|
1449
|
+
: (turnDef.required_elements || []),
|
|
1450
|
+
requiredElementsAny: isInitialTurn
|
|
1451
|
+
? (fullScenario.required_elements_any || [])
|
|
1452
|
+
: (turnDef.required_elements_any || []),
|
|
1453
|
+
forbiddenElements: isInitialTurn
|
|
1454
|
+
? (fullScenario.forbidden_elements || [])
|
|
1455
|
+
: (turnDef.forbidden_elements || []),
|
|
1456
|
+
};
|
|
1457
|
+
|
|
1458
|
+
// Call the SAME generation+evaluation code path as single-turn
|
|
1459
|
+
// Pass dialogue context so the judge can see the full exchange
|
|
1460
|
+
const turnOptions = {
|
|
1461
|
+
...sharedTurnOptions,
|
|
1462
|
+
...(sessionEvolution ? { systemPromptExtension: sessionEvolution } : {}),
|
|
1463
|
+
conversationHistory: conversationHistory.length > 0 ? conversationHistory : null,
|
|
1464
|
+
consolidatedTrace: consolidatedTrace.length > 0 ? consolidatedTrace : null,
|
|
1465
|
+
};
|
|
1466
|
+
const { genResult, suggestion, validation, rubricResult, turnScore, scoringMethod } =
|
|
1467
|
+
await generateAndEvaluateTurn(context, resolvedConfig, turnMeta, turnOptions);
|
|
1468
|
+
|
|
1469
|
+
if (!genResult.success) {
|
|
1470
|
+
const turnId = isInitialTurn ? 'initial' : turnDef.id;
|
|
1471
|
+
throw new Error(`Multi-turn scenario ${scenario.id}: Turn ${turnIdx} (${turnId}) failed to generate suggestions`);
|
|
1472
|
+
}
|
|
442
1473
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
let turnScore = null;
|
|
477
|
-
if (rubricResult?.success) {
|
|
478
|
-
turnScore = rubricResult.overallScore;
|
|
479
|
-
} else if (suggestion) {
|
|
480
|
-
turnScore = (validation.passesRequired ? 50 : 0) + (validation.passesForbidden ? 50 : 0);
|
|
1474
|
+
// Accumulate dialogue traces
|
|
1475
|
+
if (genResult.dialogueTrace && genResult.dialogueTrace.length > 0) {
|
|
1476
|
+
// Insert user turn action entry before each turn (except initial)
|
|
1477
|
+
if (!isInitialTurn) {
|
|
1478
|
+
const histEntry = conversationHistory[conversationHistory.length - 1];
|
|
1479
|
+
consolidatedTrace.push({
|
|
1480
|
+
agent: 'user',
|
|
1481
|
+
action: 'turn_action',
|
|
1482
|
+
turnIndex: turnIdx,
|
|
1483
|
+
contextSummary: histEntry?.learnerMessage || `${histEntry?.learnerAction || 'Action'}`,
|
|
1484
|
+
detail: `Learner: ${histEntry?.learnerAction}`,
|
|
1485
|
+
timestamp: new Date().toISOString(),
|
|
1486
|
+
});
|
|
1487
|
+
}
|
|
1488
|
+
consolidatedTrace.push(...genResult.dialogueTrace);
|
|
1489
|
+
|
|
1490
|
+
// Add final delivery to user for multi-agent mode
|
|
1491
|
+
const hasSuperego = genResult.dialogueTrace.some(entry => entry.agent === 'superego');
|
|
1492
|
+
if (hasSuperego) {
|
|
1493
|
+
const suggCount = genResult.suggestions?.length || 0;
|
|
1494
|
+
consolidatedTrace.push({
|
|
1495
|
+
agent: 'user',
|
|
1496
|
+
action: 'final_output',
|
|
1497
|
+
turnIndex: turnIdx,
|
|
1498
|
+
from: 'ego',
|
|
1499
|
+
to: 'user',
|
|
1500
|
+
direction: 'response',
|
|
1501
|
+
suggestionCount: suggCount,
|
|
1502
|
+
contextSummary: `Delivered ${suggCount} suggestion${suggCount !== 1 ? 's' : ''}`,
|
|
1503
|
+
detail: `Turn ${turnIdx + 1} complete`,
|
|
1504
|
+
timestamp: new Date().toISOString(),
|
|
1505
|
+
});
|
|
1506
|
+
}
|
|
481
1507
|
}
|
|
482
1508
|
|
|
1509
|
+
// Collect per-turn result
|
|
483
1510
|
turnResults.push({
|
|
484
|
-
turnIndex:
|
|
485
|
-
turnId:
|
|
486
|
-
learnerAction:
|
|
487
|
-
|
|
488
|
-
|
|
1511
|
+
turnIndex: turnIdx,
|
|
1512
|
+
turnId: isInitialTurn ? 'initial' : turnDef.id,
|
|
1513
|
+
learnerAction: isInitialTurn ? undefined : turnDef.learner_action,
|
|
1514
|
+
learnerMessage: isInitialTurn ? undefined : turnDef.action_details?.message, // Include generated learner message for growth tracking
|
|
1515
|
+
expectedBehavior: turnMeta.expectedBehavior,
|
|
1516
|
+
suggestion,
|
|
1517
|
+
learnerDeliberation: turnDef?._learnerDeliberation || null,
|
|
1518
|
+
learnerEmotionalState: turnDef?._learnerEmotionalState || null,
|
|
1519
|
+
learnerMessageGenerated: !!turnDef?._learnerDeliberation,
|
|
1520
|
+
learnerOriginalMessage: turnDef?._originalMessage || null,
|
|
489
1521
|
scores: rubricResult?.scores && Object.keys(rubricResult.scores).length > 0 ? {
|
|
490
1522
|
relevance: rubricResult.scores.relevance?.score,
|
|
491
1523
|
specificity: rubricResult.scores.specificity?.score,
|
|
@@ -495,31 +1527,127 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
495
1527
|
tone: rubricResult.scores.tone?.score,
|
|
496
1528
|
} : null,
|
|
497
1529
|
turnScore,
|
|
1530
|
+
scoringMethod,
|
|
498
1531
|
passesRequired: rubricResult?.passesRequired ?? validation.passesRequired,
|
|
499
1532
|
passesForbidden: rubricResult?.passesForbidden ?? validation.passesForbidden,
|
|
500
1533
|
requiredMissing: validation.requiredMissing,
|
|
501
1534
|
forbiddenFound: validation.forbiddenFound,
|
|
502
|
-
minAcceptableScore:
|
|
1535
|
+
minAcceptableScore: (!isInitialTurn ? turnDef.min_acceptable_score : null) || fullScenario.min_acceptable_score,
|
|
503
1536
|
});
|
|
504
1537
|
|
|
505
1538
|
// Aggregate metrics
|
|
506
|
-
totalLatencyMs +=
|
|
507
|
-
totalInputTokens +=
|
|
508
|
-
totalOutputTokens +=
|
|
509
|
-
totalApiCalls +=
|
|
510
|
-
totalCost +=
|
|
1539
|
+
totalLatencyMs += genResult.metadata?.latencyMs || 0;
|
|
1540
|
+
totalInputTokens += genResult.metadata?.inputTokens || 0;
|
|
1541
|
+
totalOutputTokens += genResult.metadata?.outputTokens || 0;
|
|
1542
|
+
totalApiCalls += genResult.metadata?.apiCalls || 0;
|
|
1543
|
+
totalCost += genResult.metadata?.totalCost || 0;
|
|
1544
|
+
totalDialogueRounds += genResult.metadata?.dialogueRounds || 0;
|
|
1545
|
+
|
|
1546
|
+
// Update for next iteration
|
|
1547
|
+
previousSuggestion = suggestion;
|
|
1548
|
+
|
|
1549
|
+
// Synthesize prompt rewriting directives for next turn (if enabled)
|
|
1550
|
+
if (promptRewritingEnabled && turnIdx < totalTurnCount - 1) {
|
|
1551
|
+
if (promptRewritingStrategy === 'llm') {
|
|
1552
|
+
// LLM-based directive synthesis using superego model
|
|
1553
|
+
try {
|
|
1554
|
+
sessionEvolution = await promptRewriter.synthesizeDirectivesLLM({
|
|
1555
|
+
turnResults,
|
|
1556
|
+
consolidatedTrace,
|
|
1557
|
+
conversationHistory,
|
|
1558
|
+
config: rawProfile,
|
|
1559
|
+
});
|
|
1560
|
+
if (sessionEvolution) {
|
|
1561
|
+
log(`[evaluationRunner] LLM rewriter generated directives for turn ${turnIdx + 1}`, 'info');
|
|
1562
|
+
}
|
|
1563
|
+
} catch (error) {
|
|
1564
|
+
log(`[evaluationRunner] LLM rewriter failed, falling back to template: ${error.message}`, 'warn');
|
|
1565
|
+
sessionEvolution = promptRewriter.synthesizeDirectives({
|
|
1566
|
+
turnResults,
|
|
1567
|
+
consolidatedTrace,
|
|
1568
|
+
conversationHistory,
|
|
1569
|
+
});
|
|
1570
|
+
}
|
|
1571
|
+
} else {
|
|
1572
|
+
// Template-based directive synthesis (deterministic, no LLM call)
|
|
1573
|
+
sessionEvolution = promptRewriter.synthesizeDirectives({
|
|
1574
|
+
turnResults,
|
|
1575
|
+
consolidatedTrace,
|
|
1576
|
+
conversationHistory,
|
|
1577
|
+
});
|
|
1578
|
+
}
|
|
1579
|
+
if (sessionEvolution) {
|
|
1580
|
+
log(`[evaluationRunner] Prompt rewriter (${promptRewritingStrategy}) generated ${sessionEvolution.split('\n').length - 2} directives for turn ${turnIdx + 1}`, 'info');
|
|
1581
|
+
}
|
|
1582
|
+
}
|
|
1583
|
+
|
|
1584
|
+
// Generate LLM learner response for next turn if ego_superego architecture
|
|
1585
|
+
// Note: check includes() to handle both 'ego_superego' and 'ego_superego_recognition'
|
|
1586
|
+
if (resolvedConfig.learnerArchitecture?.includes('ego_superego') && turnIdx < totalTurnCount - 1) {
|
|
1587
|
+
const nextTurnDef = turns[turnIdx]; // turnIdx is 0-based into the loop; turns[turnIdx] is the next follow-up turn
|
|
1588
|
+
if (nextTurnDef) {
|
|
1589
|
+
const learnerResponse = await generateLearnerResponse({
|
|
1590
|
+
tutorMessage: suggestion?.message || suggestion?.title || '',
|
|
1591
|
+
topic: fullScenario.topic || fullScenario.name || '',
|
|
1592
|
+
conversationHistory: conversationHistory.map(h => ({
|
|
1593
|
+
role: h.learnerMessage ? 'learner' : 'tutor',
|
|
1594
|
+
content: h.learnerMessage || h.suggestion?.message || '',
|
|
1595
|
+
})),
|
|
1596
|
+
learnerProfile: resolvedConfig.learnerArchitecture,
|
|
1597
|
+
personaId: fullScenario.learner_persona || 'eager_novice',
|
|
1598
|
+
modelOverride: config.modelOverride || null,
|
|
1599
|
+
});
|
|
1600
|
+
|
|
1601
|
+
// Override scripted message with LLM-generated one
|
|
1602
|
+
nextTurnDef._originalMessage = nextTurnDef.action_details?.message;
|
|
1603
|
+
nextTurnDef.action_details = nextTurnDef.action_details || {};
|
|
1604
|
+
nextTurnDef.action_details.message = learnerResponse.message;
|
|
1605
|
+
nextTurnDef._learnerDeliberation = learnerResponse.internalDeliberation;
|
|
1606
|
+
nextTurnDef._learnerEmotionalState = learnerResponse.emotionalState;
|
|
1607
|
+
|
|
1608
|
+
// Track learner LLM costs
|
|
1609
|
+
totalInputTokens += learnerResponse.tokenUsage?.inputTokens || 0;
|
|
1610
|
+
totalOutputTokens += learnerResponse.tokenUsage?.outputTokens || 0;
|
|
1611
|
+
totalApiCalls += learnerResponse.tokenUsage?.apiCalls || 0;
|
|
1612
|
+
|
|
1613
|
+
// Add learner deliberation to consolidated trace
|
|
1614
|
+
if (learnerResponse.internalDeliberation?.length > 0) {
|
|
1615
|
+
for (const delib of learnerResponse.internalDeliberation) {
|
|
1616
|
+
consolidatedTrace.push({
|
|
1617
|
+
agent: `learner_${delib.role}`,
|
|
1618
|
+
action: 'deliberation',
|
|
1619
|
+
turnIndex: turnIdx + 1,
|
|
1620
|
+
contextSummary: delib.content.substring(0, 100),
|
|
1621
|
+
detail: delib.content,
|
|
1622
|
+
timestamp: new Date().toISOString(),
|
|
1623
|
+
});
|
|
1624
|
+
}
|
|
1625
|
+
consolidatedTrace.push({
|
|
1626
|
+
agent: 'learner_synthesis',
|
|
1627
|
+
action: 'response',
|
|
1628
|
+
turnIndex: turnIdx + 1,
|
|
1629
|
+
contextSummary: learnerResponse.message.substring(0, 100),
|
|
1630
|
+
detail: learnerResponse.message,
|
|
1631
|
+
timestamp: new Date().toISOString(),
|
|
1632
|
+
});
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
log(`[evaluationRunner] Generated LLM learner response (ego_superego): "${learnerResponse.message.substring(0, 80)}..."`, 'info');
|
|
1636
|
+
}
|
|
1637
|
+
}
|
|
511
1638
|
}
|
|
512
1639
|
|
|
513
|
-
//
|
|
1640
|
+
// 5. Aggregate scores across turns
|
|
514
1641
|
const validTurnScores = turnResults.filter(t => t.turnScore !== null).map(t => t.turnScore);
|
|
515
1642
|
const overallScore = validTurnScores.length > 0
|
|
516
1643
|
? validTurnScores.reduce((sum, s) => sum + s, 0) / validTurnScores.length
|
|
517
1644
|
: null;
|
|
518
1645
|
|
|
519
|
-
// Aggregate dimension scores
|
|
520
1646
|
const aggregateDimensions = {};
|
|
521
|
-
const
|
|
522
|
-
|
|
1647
|
+
const baseDims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
|
|
1648
|
+
const recognitionDims = ['mutual_recognition', 'dialectical_responsiveness', 'memory_integration', 'transformative_potential', 'tutor_adaptation', 'learner_growth'];
|
|
1649
|
+
const allDims = [...baseDims, ...recognitionDims];
|
|
1650
|
+
for (const dim of allDims) {
|
|
523
1651
|
const dimScores = turnResults
|
|
524
1652
|
.filter(t => t.scores?.[dim] !== undefined)
|
|
525
1653
|
.map(t => t.scores[dim]);
|
|
@@ -528,39 +1656,580 @@ async function runMultiTurnTest(scenario, config, fullScenario, options = {}) {
|
|
|
528
1656
|
}
|
|
529
1657
|
}
|
|
530
1658
|
|
|
531
|
-
|
|
1659
|
+
const baseScoreValues = baseDims.filter(d => aggregateDimensions[d] !== undefined).map(d => aggregateDimensions[d]);
|
|
1660
|
+
const recognitionScoreValues = recognitionDims.filter(d => aggregateDimensions[d] !== undefined).map(d => aggregateDimensions[d]);
|
|
1661
|
+
const baseScore = baseScoreValues.length > 0
|
|
1662
|
+
? ((baseScoreValues.reduce((s, v) => s + v, 0) / baseScoreValues.length - 1) / 4) * 100
|
|
1663
|
+
: null;
|
|
1664
|
+
const recognitionScore = recognitionScoreValues.length > 0
|
|
1665
|
+
? ((recognitionScoreValues.reduce((s, v) => s + v, 0) / recognitionScoreValues.length - 1) / 4) * 100
|
|
1666
|
+
: null;
|
|
1667
|
+
|
|
532
1668
|
const allTurnsPassed = turnResults.every(t => {
|
|
533
1669
|
if (t.turnScore === null) return false;
|
|
534
1670
|
const threshold = t.minAcceptableScore || fullScenario.min_acceptable_score || 0;
|
|
535
1671
|
return t.turnScore >= threshold;
|
|
536
1672
|
});
|
|
537
1673
|
|
|
1674
|
+
// 5b. Holistic dialogue evaluation — score the full transcript as a single unit
|
|
1675
|
+
let holisticDialogueScore = null;
|
|
1676
|
+
if (!skipRubricEval && consolidatedTrace.length > 0 && turnResults.length > 1) {
|
|
1677
|
+
log('[evaluationRunner] Running holistic dialogue evaluation on full transcript...', 'info');
|
|
1678
|
+
try {
|
|
1679
|
+
// Use the last turn's suggestion as the focal point, with full dialogue context
|
|
1680
|
+
const lastSuggestion = turnResults[turnResults.length - 1]?.suggestion;
|
|
1681
|
+
if (lastSuggestion) {
|
|
1682
|
+
const holisticResult = await rubricEvaluator.evaluateSuggestion(lastSuggestion, {
|
|
1683
|
+
name: `${fullScenario.name} (holistic dialogue)`,
|
|
1684
|
+
description: `Holistic evaluation of ${turnResults.length}-turn dialogue. Score the overall quality of the tutoring interaction, not just this final response.`,
|
|
1685
|
+
expectedBehavior: fullScenario.expected_behavior,
|
|
1686
|
+
learnerContext: fullScenario.learner_context,
|
|
1687
|
+
requiredElements: fullScenario.required_elements || [],
|
|
1688
|
+
forbiddenElements: fullScenario.forbidden_elements || [],
|
|
1689
|
+
}, {
|
|
1690
|
+
dialogueContext: {
|
|
1691
|
+
conversationHistory,
|
|
1692
|
+
consolidatedTrace,
|
|
1693
|
+
},
|
|
1694
|
+
}, { judgeOverride });
|
|
1695
|
+
|
|
1696
|
+
if (holisticResult?.success) {
|
|
1697
|
+
holisticDialogueScore = {
|
|
1698
|
+
overallScore: holisticResult.overallScore,
|
|
1699
|
+
baseScore: holisticResult.baseScore,
|
|
1700
|
+
recognitionScore: holisticResult.recognitionScore,
|
|
1701
|
+
scores: holisticResult.scores,
|
|
1702
|
+
summary: holisticResult.summary,
|
|
1703
|
+
judgeModel: holisticResult.judgeModel,
|
|
1704
|
+
};
|
|
1705
|
+
log(`[evaluationRunner] Holistic dialogue score: ${holisticResult.overallScore?.toFixed(1)}`, 'success');
|
|
1706
|
+
} else {
|
|
1707
|
+
log(`[evaluationRunner] Holistic dialogue evaluation failed: ${holisticResult?.error || 'unknown'}`, 'warning');
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
} catch (error) {
|
|
1711
|
+
log(`[evaluationRunner] Holistic dialogue evaluation error: ${error.message}`, 'warning');
|
|
1712
|
+
}
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
// 5c. Analyze bilateral transformation (tutor + learner evolution)
|
|
1716
|
+
const turnProgressionAnalysis = turnComparisonAnalyzer.analyzeTurnProgression(turnResults);
|
|
1717
|
+
const markerDefinitions = fullScenario.transformation_markers || fullScenario.transformationMarkers || null;
|
|
1718
|
+
const transformationMarkerAnalysis = markerDefinitions
|
|
1719
|
+
? turnComparisonAnalyzer.analyzeTransformationMarkers(turnResults, markerDefinitions)
|
|
1720
|
+
: null;
|
|
1721
|
+
const dialogueTraceReport = dialogueTraceAnalyzer.generateTransformationReport(consolidatedTrace, turnResults);
|
|
1722
|
+
|
|
1723
|
+
log(`[evaluationRunner] Bilateral transformation analysis:`, 'info');
|
|
1724
|
+
log(` - Tutor adaptation index: ${turnProgressionAnalysis.adaptationIndex?.toFixed(2) ?? 'N/A'}`, 'info');
|
|
1725
|
+
log(` - Learner growth index: ${turnProgressionAnalysis.learnerGrowthIndex?.toFixed(2) ?? 'N/A'}`, 'info');
|
|
1726
|
+
log(` - Bilateral balance: ${dialogueTraceReport.bilateralMetrics.bilateralBalance?.toFixed(2) ?? 'N/A'}`, 'info');
|
|
1727
|
+
if (dialogueTraceReport.bilateralMetrics.summary) {
|
|
1728
|
+
log(` - ${dialogueTraceReport.bilateralMetrics.summary}`, 'info');
|
|
1729
|
+
}
|
|
1730
|
+
|
|
1731
|
+
// 6. Write consolidated dialogue log
|
|
1732
|
+
const consolidatedDialogue = {
|
|
1733
|
+
suggestions: turnResults[turnResults.length - 1]?.suggestion ? [turnResults[turnResults.length - 1].suggestion] : [],
|
|
1734
|
+
dialogueTrace: consolidatedTrace,
|
|
1735
|
+
converged: false,
|
|
1736
|
+
rounds: totalDialogueRounds,
|
|
1737
|
+
metrics: {
|
|
1738
|
+
totalLatencyMs,
|
|
1739
|
+
totalInputTokens,
|
|
1740
|
+
totalOutputTokens,
|
|
1741
|
+
totalCost,
|
|
1742
|
+
apiCalls: totalApiCalls,
|
|
1743
|
+
},
|
|
1744
|
+
dialogueId,
|
|
1745
|
+
profileName: resolvedConfig.profileName,
|
|
1746
|
+
provider: resolvedConfig.provider,
|
|
1747
|
+
model: resolvedConfig.model,
|
|
1748
|
+
learnerContext: fullScenario.learner_context,
|
|
1749
|
+
isMultiTurn: true,
|
|
1750
|
+
learnerArchitecture: resolvedConfig.learnerArchitecture || 'unified',
|
|
1751
|
+
totalTurns: turnResults.length,
|
|
1752
|
+
turnResults: turnResults.map(t => ({
|
|
1753
|
+
turnIndex: t.turnIndex,
|
|
1754
|
+
turnId: t.turnId,
|
|
1755
|
+
suggestions: t.suggestion ? [t.suggestion] : [],
|
|
1756
|
+
})),
|
|
1757
|
+
// Holistic dialogue evaluation
|
|
1758
|
+
holisticDialogueScore,
|
|
1759
|
+
// Bilateral transformation analysis
|
|
1760
|
+
transformationAnalysis: {
|
|
1761
|
+
turnProgression: turnProgressionAnalysis,
|
|
1762
|
+
markerAnalysis: transformationMarkerAnalysis,
|
|
1763
|
+
dialogueTraceReport: dialogueTraceReport,
|
|
1764
|
+
},
|
|
1765
|
+
};
|
|
1766
|
+
|
|
1767
|
+
if (!fs.existsSync(LOGS_DIR)) {
|
|
1768
|
+
fs.mkdirSync(LOGS_DIR, { recursive: true });
|
|
1769
|
+
}
|
|
1770
|
+
const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
|
|
1771
|
+
fs.writeFileSync(logPath, JSON.stringify(consolidatedDialogue, null, 2));
|
|
1772
|
+
|
|
538
1773
|
log(`[evaluationRunner] Multi-turn complete: ${turnResults.length} turns, avgScore=${overallScore?.toFixed(1)}`);
|
|
539
1774
|
|
|
1775
|
+
// Aggregate requiredMissing/forbiddenFound from all turns
|
|
1776
|
+
const requiredMissing = [...new Set(turnResults.flatMap(t => t.requiredMissing || []))];
|
|
1777
|
+
const forbiddenFound = [...new Set(turnResults.flatMap(t => t.forbiddenFound || []))];
|
|
1778
|
+
|
|
1779
|
+
// 7. Return result
|
|
540
1780
|
return {
|
|
541
1781
|
scenarioId: scenario.id,
|
|
542
1782
|
scenarioName: scenario.name,
|
|
1783
|
+
scenarioType: fullScenario.type || 'suggestion',
|
|
543
1784
|
isMultiTurn: true,
|
|
544
1785
|
totalTurns: turnResults.length,
|
|
545
|
-
provider:
|
|
546
|
-
model:
|
|
1786
|
+
provider: resolvedConfig.provider,
|
|
1787
|
+
model: resolvedConfig.model,
|
|
547
1788
|
profileName: config.profileName,
|
|
1789
|
+
egoModel: resolvedConfig.egoModel
|
|
1790
|
+
? `${resolvedConfig.egoModel.provider}.${resolvedConfig.egoModel.model}`
|
|
1791
|
+
: null,
|
|
1792
|
+
superegoModel: resolvedConfig.superegoModel
|
|
1793
|
+
? `${resolvedConfig.superegoModel.provider}.${resolvedConfig.superegoModel.model}`
|
|
1794
|
+
: null,
|
|
548
1795
|
hyperparameters: config.hyperparameters,
|
|
549
|
-
suggestions:
|
|
1796
|
+
suggestions: turnResults.map(t => t.suggestion).filter(Boolean),
|
|
550
1797
|
success: true,
|
|
551
1798
|
latencyMs: totalLatencyMs,
|
|
552
1799
|
inputTokens: totalInputTokens,
|
|
553
1800
|
outputTokens: totalOutputTokens,
|
|
554
1801
|
apiCalls: totalApiCalls,
|
|
555
|
-
cost: totalCost,
|
|
556
|
-
dialogueId
|
|
557
|
-
dialogueRounds:
|
|
1802
|
+
cost: totalCost,
|
|
1803
|
+
dialogueId,
|
|
1804
|
+
dialogueRounds: totalDialogueRounds,
|
|
558
1805
|
scores: Object.keys(aggregateDimensions).length > 0 ? aggregateDimensions : null,
|
|
559
1806
|
overallScore,
|
|
1807
|
+
scoringMethod: turnResults.some(t => t.scoringMethod === 'judge_failed')
|
|
1808
|
+
? 'partial_judge_failure'
|
|
1809
|
+
: turnResults.every(t => t.scoringMethod === 'rubric') ? 'rubric' : 'mixed',
|
|
1810
|
+
baseScore,
|
|
1811
|
+
recognitionScore,
|
|
560
1812
|
turnResults,
|
|
561
1813
|
allTurnsPassed,
|
|
562
1814
|
passesRequired: turnResults.every(t => t.passesRequired),
|
|
563
1815
|
passesForbidden: turnResults.every(t => t.passesForbidden),
|
|
1816
|
+
requiredMissing,
|
|
1817
|
+
forbiddenFound,
|
|
1818
|
+
factors: resolvedConfig.factors || null,
|
|
1819
|
+
learnerArchitecture: resolvedConfig.learnerArchitecture || null,
|
|
1820
|
+
// Holistic dialogue evaluation (full transcript scored as single unit)
|
|
1821
|
+
holisticDialogueScore,
|
|
1822
|
+
// Bilateral transformation metrics
|
|
1823
|
+
transformationMetrics: {
|
|
1824
|
+
tutorAdaptationIndex: turnProgressionAnalysis.adaptationIndex,
|
|
1825
|
+
learnerGrowthIndex: turnProgressionAnalysis.learnerGrowthIndex,
|
|
1826
|
+
bilateralTransformationIndex: turnProgressionAnalysis.bilateralTransformationIndex,
|
|
1827
|
+
framingEvolution: turnProgressionAnalysis.framingEvolution,
|
|
1828
|
+
dimensionConvergence: turnProgressionAnalysis.dimensionConvergence,
|
|
1829
|
+
markerAnalysis: transformationMarkerAnalysis,
|
|
1830
|
+
bilateralMetrics: dialogueTraceReport.bilateralMetrics,
|
|
1831
|
+
superegoMetrics: dialogueTraceReport.superegoMetrics,
|
|
1832
|
+
transformationQuality: dialogueTraceReport.overallAssessment?.transformationQuality ?? null,
|
|
1833
|
+
},
|
|
1834
|
+
};
|
|
1835
|
+
}
|
|
1836
|
+
|
|
1837
|
+
/**
|
|
1838
|
+
* Resume an incomplete evaluation run, re-running only the missing tests.
|
|
1839
|
+
*
|
|
1840
|
+
* @param {Object} options
|
|
1841
|
+
* @param {string} options.runId - The run ID to resume
|
|
1842
|
+
* @param {number} [options.parallelism] - Parallel worker count
|
|
1843
|
+
* @param {boolean} [options.verbose] - Enable verbose output
|
|
1844
|
+
* @returns {Promise<Object>} Evaluation results (same shape as runEvaluation)
|
|
1845
|
+
*/
|
|
1846
|
+
export async function resumeEvaluation(options = {}) {
|
|
1847
|
+
const {
|
|
1848
|
+
runId,
|
|
1849
|
+
parallelism = DEFAULT_PARALLELISM,
|
|
1850
|
+
verbose = false,
|
|
1851
|
+
force = false, // Skip the "already running" check
|
|
1852
|
+
} = options;
|
|
1853
|
+
|
|
1854
|
+
const log = verbose ? console.log : () => {};
|
|
1855
|
+
|
|
1856
|
+
// 1. Load the run and validate it exists
|
|
1857
|
+
const run = evaluationStore.getRun(runId);
|
|
1858
|
+
if (!run) {
|
|
1859
|
+
throw new Error(`Run not found: ${runId}`);
|
|
1860
|
+
}
|
|
1861
|
+
|
|
1862
|
+
// 1b. Check if another process is already running this evaluation
|
|
1863
|
+
const existingPid = run.metadata?.pid;
|
|
1864
|
+
if (existingPid && existingPid !== process.pid && !force) {
|
|
1865
|
+
const isAlive = isPidAlive(existingPid);
|
|
1866
|
+
if (isAlive) {
|
|
1867
|
+
throw new Error(
|
|
1868
|
+
`Run ${runId} is already being processed by pid ${existingPid}. ` +
|
|
1869
|
+
`Use --force to override (may cause duplicates).`
|
|
1870
|
+
);
|
|
1871
|
+
}
|
|
1872
|
+
}
|
|
1873
|
+
|
|
1874
|
+
// 2. Extract metadata
|
|
1875
|
+
const metadata = run.metadata || {};
|
|
1876
|
+
const runsPerConfig = metadata.runsPerConfig || 1;
|
|
1877
|
+
const skipRubricEval = metadata.skipRubricEval || false;
|
|
1878
|
+
const modelOverride = metadata.modelOverride || null;
|
|
1879
|
+
|
|
1880
|
+
// 3. Get existing results for completion checking
|
|
1881
|
+
const existingResults = evaluationStore.getResults(runId);
|
|
1882
|
+
|
|
1883
|
+
// 4. Reconstruct scenarios - prefer metadata (complete list), fall back to inferring from results
|
|
1884
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
1885
|
+
let scenarioIds;
|
|
1886
|
+
if (metadata.scenarioIds && metadata.scenarioIds.length > 0) {
|
|
1887
|
+
// Use stored scenario list (includes scenarios that haven't started yet)
|
|
1888
|
+
scenarioIds = metadata.scenarioIds;
|
|
1889
|
+
} else {
|
|
1890
|
+
// Legacy: infer from existing results (may miss unstarted scenarios)
|
|
1891
|
+
scenarioIds = [...new Set(existingResults.map(r => r.scenarioId).filter(Boolean))];
|
|
1892
|
+
}
|
|
1893
|
+
const targetScenarios = allScenarios.filter(s => scenarioIds.includes(s.id));
|
|
1894
|
+
|
|
1895
|
+
if (targetScenarios.length === 0) {
|
|
1896
|
+
throw new Error(`No matching scenarios found for run ${runId}`);
|
|
1897
|
+
}
|
|
1898
|
+
|
|
1899
|
+
// 5. Reconstruct profiles - prefer metadata, fall back to inferring from results
|
|
1900
|
+
let profileNames;
|
|
1901
|
+
if (metadata.profileNames && metadata.profileNames.length > 0) {
|
|
1902
|
+
// Use stored profile list
|
|
1903
|
+
profileNames = metadata.profileNames;
|
|
1904
|
+
} else {
|
|
1905
|
+
// Legacy: infer from existing results
|
|
1906
|
+
profileNames = [...new Set(existingResults.map(r => r.profileName).filter(Boolean))];
|
|
1907
|
+
}
|
|
1908
|
+
|
|
1909
|
+
if (profileNames.length === 0) {
|
|
1910
|
+
throw new Error(`No profiles found for run ${runId} — cannot determine what to resume`);
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
let targetConfigs = profileNames.map(name => ({
|
|
1914
|
+
provider: null,
|
|
1915
|
+
model: null,
|
|
1916
|
+
profileName: name,
|
|
1917
|
+
label: name,
|
|
1918
|
+
}));
|
|
1919
|
+
|
|
1920
|
+
// 6. Re-apply modelOverride if present in metadata
|
|
1921
|
+
if (modelOverride) {
|
|
1922
|
+
targetConfigs = targetConfigs.map(c => ({ ...c, modelOverride }));
|
|
1923
|
+
}
|
|
1924
|
+
|
|
1925
|
+
// 6. Count successful results per (profile, scenario) combo and fill up to runsPerConfig.
|
|
1926
|
+
// Failed results are excluded so they get retried.
|
|
1927
|
+
const completedCounts = {};
|
|
1928
|
+
for (const result of existingResults) {
|
|
1929
|
+
// Only count successful results — failed ones should be retried
|
|
1930
|
+
if (result.success === false || result.success === 0) continue;
|
|
1931
|
+
const key = `${result.profileName}:${result.scenarioId}`;
|
|
1932
|
+
completedCounts[key] = (completedCounts[key] || 0) + 1;
|
|
1933
|
+
}
|
|
1934
|
+
|
|
1935
|
+
// Build flat list of remaining tests
|
|
1936
|
+
const remainingTests = [];
|
|
1937
|
+
for (const scenario of targetScenarios) {
|
|
1938
|
+
for (const config of targetConfigs) {
|
|
1939
|
+
const key = `${config.profileName}:${scenario.id}`;
|
|
1940
|
+
const done = completedCounts[key] || 0;
|
|
1941
|
+
const needed = runsPerConfig - done;
|
|
1942
|
+
for (let i = 0; i < needed; i++) {
|
|
1943
|
+
remainingTests.push({ config, scenario, runNum: done + i });
|
|
1944
|
+
}
|
|
1945
|
+
}
|
|
1946
|
+
}
|
|
1947
|
+
|
|
1948
|
+
if (remainingTests.length === 0) {
|
|
1949
|
+
console.log(`\nRun ${runId}: all tests completed (${runsPerConfig} reps each). Nothing to resume.`);
|
|
1950
|
+
return {
|
|
1951
|
+
runId,
|
|
1952
|
+
totalTests: 0,
|
|
1953
|
+
successfulTests: 0,
|
|
1954
|
+
stats: evaluationStore.getRunStats(runId),
|
|
1955
|
+
scenarioStats: evaluationStore.getScenarioStats(runId),
|
|
1956
|
+
progressLogPath: getProgressLogPath(runId),
|
|
1957
|
+
resumed: true,
|
|
1958
|
+
alreadyComplete: true,
|
|
1959
|
+
};
|
|
1960
|
+
}
|
|
1961
|
+
|
|
1962
|
+
// 7. Set run status to 'running' and update PID
|
|
1963
|
+
evaluationStore.updateRun(runId, { status: 'running', metadata: { pid: process.pid } });
|
|
1964
|
+
|
|
1965
|
+
const totalRemainingTests = remainingTests.length;
|
|
1966
|
+
const totalExpectedTests = targetScenarios.length * targetConfigs.length * runsPerConfig;
|
|
1967
|
+
|
|
1968
|
+
console.log(`\nResuming run: ${runId}`);
|
|
1969
|
+
console.log(` Previously completed: ${existingResults.length} tests`);
|
|
1970
|
+
console.log(` Remaining: ${totalRemainingTests} tests`);
|
|
1971
|
+
console.log(` Profiles: ${profileNames.join(', ')}`);
|
|
1972
|
+
console.log(` Scenarios: ${targetScenarios.length}`);
|
|
1973
|
+
if (modelOverride) console.log(` Model override: ${modelOverride}`);
|
|
1974
|
+
|
|
1975
|
+
// Initialize content resolver (same as runEvaluation)
|
|
1976
|
+
const contentConfig = evalConfigLoader.getContentConfig();
|
|
1977
|
+
if (contentConfig?.content_package_path) {
|
|
1978
|
+
contentResolver.configure({
|
|
1979
|
+
contentPackagePath: contentConfig.content_package_path,
|
|
1980
|
+
maxLectureChars: contentConfig.max_lecture_chars,
|
|
1981
|
+
includeSpeakerNotes: contentConfig.include_speaker_notes,
|
|
1982
|
+
});
|
|
1983
|
+
}
|
|
1984
|
+
|
|
1985
|
+
// 8. Set up progress logger and streaming reporter (appends to existing JSONL)
|
|
1986
|
+
const progressLogPath = getProgressLogPath(runId);
|
|
1987
|
+
console.log(`Progress log: ${progressLogPath}\n`);
|
|
1988
|
+
|
|
1989
|
+
const progressLogger = new ProgressLogger(runId);
|
|
1990
|
+
const scenarioNames = targetScenarios.map(s => s.name || s.id);
|
|
1991
|
+
const reporter = new StreamingReporter({
|
|
1992
|
+
totalTests: totalRemainingTests,
|
|
1993
|
+
totalScenarios: targetScenarios.length,
|
|
1994
|
+
profiles: profileNames,
|
|
1995
|
+
scenarios: scenarioNames,
|
|
1996
|
+
});
|
|
1997
|
+
|
|
1998
|
+
progressLogger.runStart({
|
|
1999
|
+
totalTests: totalRemainingTests,
|
|
2000
|
+
totalScenarios: targetScenarios.length,
|
|
2001
|
+
totalConfigurations: targetConfigs.length,
|
|
2002
|
+
scenarios: scenarioNames,
|
|
2003
|
+
profiles: profileNames,
|
|
2004
|
+
description: `Resumed: ${totalRemainingTests} remaining tests`,
|
|
2005
|
+
});
|
|
2006
|
+
|
|
2007
|
+
// Register with monitoring
|
|
2008
|
+
monitoringService.startSession(runId, {
|
|
2009
|
+
userId: 'eval-runner-resume',
|
|
2010
|
+
profileName: `${targetConfigs.length} configs`,
|
|
2011
|
+
modelId: 'evaluation-batch',
|
|
2012
|
+
});
|
|
2013
|
+
|
|
2014
|
+
const results = [];
|
|
2015
|
+
let completedTests = 0;
|
|
2016
|
+
|
|
2017
|
+
// Scenario completion tracking
|
|
2018
|
+
const scenarioProgress = new Map();
|
|
2019
|
+
for (const scenario of targetScenarios) {
|
|
2020
|
+
const testsForScenario = remainingTests.filter(t => t.scenario.id === scenario.id).length;
|
|
2021
|
+
scenarioProgress.set(scenario.id, {
|
|
2022
|
+
total: testsForScenario,
|
|
2023
|
+
completed: 0,
|
|
2024
|
+
scores: [],
|
|
2025
|
+
scenarioName: scenario.name || scenario.id,
|
|
2026
|
+
});
|
|
2027
|
+
}
|
|
2028
|
+
let completedScenarios = 0;
|
|
2029
|
+
|
|
2030
|
+
// 9. Reuse the same parallel worker pool pattern
|
|
2031
|
+
async function processQueue(queue, workerCount, processItem) {
|
|
2032
|
+
const items = [...queue];
|
|
2033
|
+
let index = 0;
|
|
2034
|
+
|
|
2035
|
+
async function worker() {
|
|
2036
|
+
while (index < items.length) {
|
|
2037
|
+
const i = index++;
|
|
2038
|
+
await processItem(items[i]);
|
|
2039
|
+
await sleep(REQUEST_DELAY_MS);
|
|
2040
|
+
}
|
|
2041
|
+
}
|
|
2042
|
+
|
|
2043
|
+
const workers = Array.from(
|
|
2044
|
+
{ length: Math.min(workerCount, items.length) },
|
|
2045
|
+
() => worker()
|
|
2046
|
+
);
|
|
2047
|
+
await Promise.all(workers);
|
|
2048
|
+
}
|
|
2049
|
+
|
|
2050
|
+
log(`\nRunning ${totalRemainingTests} remaining tests with parallelism=${parallelism}...\n`);
|
|
2051
|
+
|
|
2052
|
+
const runStartTime = Date.now();
|
|
2053
|
+
|
|
2054
|
+
await processQueue(remainingTests, parallelism, async ({ config, scenario }) => {
|
|
2055
|
+
const profileLabel = config.label || config.profileName || '';
|
|
2056
|
+
|
|
2057
|
+
progressLogger.testStart({
|
|
2058
|
+
scenarioId: scenario.id,
|
|
2059
|
+
scenarioName: scenario.name || scenario.id,
|
|
2060
|
+
profileName: profileLabel,
|
|
2061
|
+
});
|
|
2062
|
+
|
|
2063
|
+
try {
|
|
2064
|
+
const result = await runSingleTest(scenario, config, {
|
|
2065
|
+
skipRubricEval,
|
|
2066
|
+
verbose,
|
|
2067
|
+
});
|
|
2068
|
+
|
|
2069
|
+
evaluationStore.storeResult(runId, result);
|
|
2070
|
+
results.push(result);
|
|
2071
|
+
completedTests++;
|
|
2072
|
+
|
|
2073
|
+
progressLogger.testComplete({
|
|
2074
|
+
scenarioId: scenario.id,
|
|
2075
|
+
scenarioName: scenario.name || scenario.id,
|
|
2076
|
+
profileName: profileLabel,
|
|
2077
|
+
success: result.success,
|
|
2078
|
+
overallScore: result.overallScore,
|
|
2079
|
+
baseScore: result.baseScore ?? null,
|
|
2080
|
+
recognitionScore: result.recognitionScore ?? null,
|
|
2081
|
+
latencyMs: result.latencyMs,
|
|
2082
|
+
completedCount: completedTests,
|
|
2083
|
+
totalTests: totalRemainingTests,
|
|
2084
|
+
});
|
|
2085
|
+
|
|
2086
|
+
reporter.onTestComplete({
|
|
2087
|
+
...result,
|
|
2088
|
+
profileName: profileLabel,
|
|
2089
|
+
scenarioName: scenario.name || scenario.id,
|
|
2090
|
+
});
|
|
2091
|
+
|
|
2092
|
+
log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ${result.success ? `score=${result.overallScore?.toFixed(1)}` : 'FAILED'}`);
|
|
2093
|
+
|
|
2094
|
+
monitoringService.recordEvent(runId, {
|
|
2095
|
+
type: 'evaluation_test',
|
|
2096
|
+
inputTokens: result.inputTokens || 0,
|
|
2097
|
+
outputTokens: result.outputTokens || 0,
|
|
2098
|
+
latencyMs: result.latencyMs || 0,
|
|
2099
|
+
round: completedTests,
|
|
2100
|
+
approved: result.success,
|
|
2101
|
+
});
|
|
2102
|
+
|
|
2103
|
+
// Track scenario completion
|
|
2104
|
+
const sp = scenarioProgress.get(scenario.id);
|
|
2105
|
+
sp.completed++;
|
|
2106
|
+
if (result.overallScore != null) sp.scores.push(result.overallScore);
|
|
2107
|
+
if (sp.completed >= sp.total) {
|
|
2108
|
+
completedScenarios++;
|
|
2109
|
+
const avgScore = sp.scores.length > 0
|
|
2110
|
+
? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
|
|
2111
|
+
: null;
|
|
2112
|
+
progressLogger.scenarioComplete({
|
|
2113
|
+
scenarioId: scenario.id,
|
|
2114
|
+
scenarioName: sp.scenarioName,
|
|
2115
|
+
profileNames,
|
|
2116
|
+
avgScore,
|
|
2117
|
+
completedScenarios,
|
|
2118
|
+
totalScenarios: targetScenarios.length,
|
|
2119
|
+
});
|
|
2120
|
+
reporter.onScenarioComplete({
|
|
2121
|
+
scenarioName: sp.scenarioName,
|
|
2122
|
+
avgScore,
|
|
2123
|
+
completedScenarios,
|
|
2124
|
+
totalScenarios: targetScenarios.length,
|
|
2125
|
+
});
|
|
2126
|
+
}
|
|
2127
|
+
} catch (error) {
|
|
2128
|
+
completedTests++;
|
|
2129
|
+
log(` ${formatProgress(completedTests, totalRemainingTests, runStartTime)} ${profileLabel} / ${scenario.id}: ERROR - ${error.message}`);
|
|
2130
|
+
|
|
2131
|
+
// Store failed result so it shows up in the database
|
|
2132
|
+
const failedResult = {
|
|
2133
|
+
scenarioId: scenario.id,
|
|
2134
|
+
scenarioName: scenario.name || scenario.id,
|
|
2135
|
+
profileName: config.profileName,
|
|
2136
|
+
provider: config.provider || config.ego?.provider || 'unknown',
|
|
2137
|
+
model: config.model || config.ego?.model || 'unknown',
|
|
2138
|
+
egoModel: config.egoModel
|
|
2139
|
+
? `${config.egoModel.provider}.${config.egoModel.model}`
|
|
2140
|
+
: config.ego ? `${config.ego.provider}.${config.ego.model}` : null,
|
|
2141
|
+
superegoModel: config.superegoModel
|
|
2142
|
+
? `${config.superegoModel.provider}.${config.superegoModel.model}`
|
|
2143
|
+
: config.superego ? `${config.superego.provider}.${config.superego.model}` : null,
|
|
2144
|
+
factors: config.factors || null,
|
|
2145
|
+
learnerArchitecture: config.learnerArchitecture || null,
|
|
2146
|
+
success: false,
|
|
2147
|
+
errorMessage: error.message,
|
|
2148
|
+
};
|
|
2149
|
+
try {
|
|
2150
|
+
evaluationStore.storeResult(runId, failedResult);
|
|
2151
|
+
results.push(failedResult);
|
|
2152
|
+
} catch (storeErr) {
|
|
2153
|
+
log(` [WARNING] Failed to store error result: ${storeErr.message}`);
|
|
2154
|
+
}
|
|
2155
|
+
|
|
2156
|
+
progressLogger.testError({
|
|
2157
|
+
scenarioId: scenario.id,
|
|
2158
|
+
scenarioName: scenario.name || scenario.id,
|
|
2159
|
+
profileName: profileLabel,
|
|
2160
|
+
errorMessage: error.message,
|
|
2161
|
+
completedCount: completedTests,
|
|
2162
|
+
totalTests: totalRemainingTests,
|
|
2163
|
+
});
|
|
2164
|
+
|
|
2165
|
+
reporter.onTestError({
|
|
2166
|
+
scenarioName: scenario.name || scenario.id,
|
|
2167
|
+
profileName: profileLabel,
|
|
2168
|
+
errorMessage: error.message,
|
|
2169
|
+
});
|
|
2170
|
+
|
|
2171
|
+
monitoringService.recordEvent(runId, {
|
|
2172
|
+
type: 'evaluation_error',
|
|
2173
|
+
round: completedTests,
|
|
2174
|
+
error: error.message,
|
|
2175
|
+
});
|
|
2176
|
+
|
|
2177
|
+
// Track scenario completion even on error
|
|
2178
|
+
const sp = scenarioProgress.get(scenario.id);
|
|
2179
|
+
sp.completed++;
|
|
2180
|
+
if (sp.completed >= sp.total) {
|
|
2181
|
+
completedScenarios++;
|
|
2182
|
+
const avgScore = sp.scores.length > 0
|
|
2183
|
+
? sp.scores.reduce((a, b) => a + b, 0) / sp.scores.length
|
|
2184
|
+
: null;
|
|
2185
|
+
progressLogger.scenarioComplete({
|
|
2186
|
+
scenarioId: scenario.id,
|
|
2187
|
+
scenarioName: sp.scenarioName,
|
|
2188
|
+
profileNames,
|
|
2189
|
+
avgScore,
|
|
2190
|
+
completedScenarios,
|
|
2191
|
+
totalScenarios: targetScenarios.length,
|
|
2192
|
+
});
|
|
2193
|
+
reporter.onScenarioComplete({
|
|
2194
|
+
scenarioName: sp.scenarioName,
|
|
2195
|
+
avgScore,
|
|
2196
|
+
completedScenarios,
|
|
2197
|
+
totalScenarios: targetScenarios.length,
|
|
2198
|
+
});
|
|
2199
|
+
}
|
|
2200
|
+
}
|
|
2201
|
+
});
|
|
2202
|
+
|
|
2203
|
+
const durationMs = Date.now() - runStartTime;
|
|
2204
|
+
const successfulTests = results.filter(r => r.success).length;
|
|
2205
|
+
const failedTests = completedTests - successfulTests;
|
|
2206
|
+
|
|
2207
|
+
progressLogger.runComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
|
|
2208
|
+
reporter.onRunComplete({ totalTests: completedTests, successfulTests, failedTests, durationMs });
|
|
2209
|
+
|
|
2210
|
+
// 10. Mark run as completed (keep original totalTests to show expected vs actual)
|
|
2211
|
+
const allResults = evaluationStore.getResults(runId);
|
|
2212
|
+
evaluationStore.updateRun(runId, {
|
|
2213
|
+
status: 'completed',
|
|
2214
|
+
completedAt: new Date().toISOString(),
|
|
2215
|
+
});
|
|
2216
|
+
|
|
2217
|
+
monitoringService.endSession(runId);
|
|
2218
|
+
|
|
2219
|
+
const stats = evaluationStore.getRunStats(runId);
|
|
2220
|
+
const scenarioStats = evaluationStore.getScenarioStats(runId);
|
|
2221
|
+
|
|
2222
|
+
return {
|
|
2223
|
+
runId,
|
|
2224
|
+
totalTests: run.totalTests,
|
|
2225
|
+
completedTests: allResults.length,
|
|
2226
|
+
successfulTests,
|
|
2227
|
+
failedTests: allResults.filter(r => !r.success).length,
|
|
2228
|
+
resumedTests: totalRemainingTests,
|
|
2229
|
+
stats,
|
|
2230
|
+
scenarioStats,
|
|
2231
|
+
progressLogPath,
|
|
2232
|
+
resumed: true,
|
|
564
2233
|
};
|
|
565
2234
|
}
|
|
566
2235
|
|
|
@@ -591,7 +2260,12 @@ export async function compareConfigurations(configs, options = {}) {
|
|
|
591
2260
|
rank: i + 1,
|
|
592
2261
|
provider: stat.provider,
|
|
593
2262
|
model: stat.model,
|
|
2263
|
+
profileName: stat.profileName,
|
|
2264
|
+
egoModel: stat.egoModel,
|
|
2265
|
+
superegoModel: stat.superegoModel,
|
|
594
2266
|
avgScore: stat.avgScore,
|
|
2267
|
+
avgBaseScore: stat.avgBaseScore,
|
|
2268
|
+
avgRecognitionScore: stat.avgRecognitionScore,
|
|
595
2269
|
successRate: stat.successRate,
|
|
596
2270
|
avgLatencyMs: stat.avgLatencyMs,
|
|
597
2271
|
})),
|
|
@@ -612,14 +2286,15 @@ export async function quickTest(config, options = {}) {
|
|
|
612
2286
|
outputSize = 'normal', // compact, normal, expanded
|
|
613
2287
|
onLog,
|
|
614
2288
|
superegoStrategy = null, // Superego intervention strategy
|
|
2289
|
+
judgeOverride = null, // Override judge model for this run
|
|
615
2290
|
} = options;
|
|
616
2291
|
|
|
617
|
-
const scenarios = [
|
|
2292
|
+
const scenarios = [evalConfigLoader.listScenarios().find(s => s.id === scenarioId)].filter(Boolean);
|
|
618
2293
|
if (scenarios.length === 0) {
|
|
619
2294
|
throw new Error(`Scenario not found: ${scenarioId}`);
|
|
620
2295
|
}
|
|
621
2296
|
|
|
622
|
-
const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy });
|
|
2297
|
+
const result = await runSingleTest(scenarios[0], config, { verbose, skipRubricEval, outputSize, onLog, superegoStrategy, judgeOverride });
|
|
623
2298
|
return result;
|
|
624
2299
|
}
|
|
625
2300
|
|
|
@@ -628,9 +2303,9 @@ export async function quickTest(config, options = {}) {
|
|
|
628
2303
|
*/
|
|
629
2304
|
export function listOptions() {
|
|
630
2305
|
return {
|
|
631
|
-
scenarios:
|
|
632
|
-
configurations:
|
|
633
|
-
profiles:
|
|
2306
|
+
scenarios: evalConfigLoader.listScenarios(),
|
|
2307
|
+
configurations: evalConfigLoader.listConfigurations(),
|
|
2308
|
+
profiles: evalConfigLoader.listTutorProfiles(),
|
|
634
2309
|
};
|
|
635
2310
|
}
|
|
636
2311
|
|
|
@@ -677,16 +2352,19 @@ export function generateReport(runId) {
|
|
|
677
2352
|
|
|
678
2353
|
// Rankings table
|
|
679
2354
|
lines.push('CONFIGURATION RANKINGS (by average score)');
|
|
680
|
-
lines.push('-'.repeat(
|
|
681
|
-
lines.push('| Rank |
|
|
682
|
-
lines.push('
|
|
2355
|
+
lines.push('-'.repeat(105));
|
|
2356
|
+
lines.push('| Rank | Profile | Model | Overall | Base | Recog | Latency | Pass |');
|
|
2357
|
+
lines.push('|------|----------------------------------|-------------------------|---------|--------|--------|---------|------|');
|
|
683
2358
|
|
|
684
2359
|
stats.forEach((stat, i) => {
|
|
685
|
-
const
|
|
686
|
-
const
|
|
2360
|
+
const profile = (stat.profileName || 'N/A').substring(0, 32).padEnd(32);
|
|
2361
|
+
const model = (stat.model || '').substring(0, 23).padEnd(23);
|
|
2362
|
+
const score = stat.avgScore ? stat.avgScore.toFixed(1).padStart(7) : ' N/A';
|
|
2363
|
+
const base = stat.avgBaseScore ? stat.avgBaseScore.toFixed(1).padStart(6) : ' N/A';
|
|
2364
|
+
const recog = stat.avgRecognitionScore ? stat.avgRecognitionScore.toFixed(1).padStart(6) : ' N/A';
|
|
687
2365
|
const latency = stat.avgLatencyMs ? `${stat.avgLatencyMs.toFixed(0)}ms`.padStart(7) : ' N/A';
|
|
688
|
-
const passRate = `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(
|
|
689
|
-
lines.push(`| ${(i + 1).toString().padStart(4)} | ${
|
|
2366
|
+
const passRate = `${(stat.validationPassRate * 100).toFixed(0)}%`.padStart(4);
|
|
2367
|
+
lines.push(`| ${(i + 1).toString().padStart(4)} | ${profile} | ${model} | ${score} | ${base} | ${recog} | ${latency} | ${passRate} |`);
|
|
690
2368
|
});
|
|
691
2369
|
|
|
692
2370
|
lines.push('');
|
|
@@ -697,7 +2375,7 @@ export function generateReport(runId) {
|
|
|
697
2375
|
lines.push('-'.repeat(80));
|
|
698
2376
|
|
|
699
2377
|
const dims = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
700
|
-
const header = '| Dimension |' + stats.map(s => ` ${s.model.substring(0, 12).padEnd(12)} |`).join('');
|
|
2378
|
+
const header = '| Dimension |' + stats.map(s => ` ${(s.profileName || s.model).substring(0, 12).padEnd(12)} |`).join('');
|
|
701
2379
|
lines.push(header);
|
|
702
2380
|
lines.push('|-----------------|' + stats.map(() => '--------------|').join(''));
|
|
703
2381
|
|
|
@@ -719,21 +2397,244 @@ export function generateReport(runId) {
|
|
|
719
2397
|
lines.push(`\n${scenario.scenarioName} (${scenario.scenarioId})`);
|
|
720
2398
|
for (const config of scenario.configurations) {
|
|
721
2399
|
const status = config.passesValidation ? 'PASS' : 'FAIL';
|
|
722
|
-
|
|
2400
|
+
const profile = config.profileName || `${config.provider}/${config.model}`;
|
|
2401
|
+
const base = config.avgBaseScore != null ? `base=${config.avgBaseScore.toFixed(1)}` : '';
|
|
2402
|
+
const recog = config.avgRecognitionScore != null ? `recog=${config.avgRecognitionScore.toFixed(1)}` : '';
|
|
2403
|
+
const scores = [base, recog].filter(Boolean).join(', ');
|
|
2404
|
+
lines.push(` ${profile}: ${config.avgScore?.toFixed(1) || 'N/A'} (${scores}) [${status}]`);
|
|
723
2405
|
}
|
|
724
2406
|
}
|
|
725
2407
|
|
|
726
2408
|
lines.push('');
|
|
2409
|
+
|
|
2410
|
+
// ANOVA analysis — if factorial data is available, run for each score type
|
|
2411
|
+
const scoreTypes = [
|
|
2412
|
+
{ column: 'overall_score', label: 'Overall Score' },
|
|
2413
|
+
{ column: 'base_score', label: 'Base Score' },
|
|
2414
|
+
{ column: 'recognition_score', label: 'Recognition Score' },
|
|
2415
|
+
];
|
|
2416
|
+
|
|
2417
|
+
for (const { column, label } of scoreTypes) {
|
|
2418
|
+
const cellData = evaluationStore.getFactorialCellData(runId, { scoreColumn: column });
|
|
2419
|
+
const cellKeys = Object.keys(cellData);
|
|
2420
|
+
if (cellKeys.length === 0) continue;
|
|
2421
|
+
|
|
2422
|
+
const totalSamples = Object.values(cellData).reduce((sum, arr) => sum + arr.length, 0);
|
|
2423
|
+
lines.push(`FACTORIAL ANOVA — ${label.toUpperCase()} (2x2x2)`);
|
|
2424
|
+
lines.push('-'.repeat(80));
|
|
2425
|
+
lines.push(`Cells with data: ${cellKeys.length}/8 | Total samples: ${totalSamples}`);
|
|
2426
|
+
lines.push('');
|
|
2427
|
+
|
|
2428
|
+
// Cell means summary
|
|
2429
|
+
for (const key of cellKeys.sort()) {
|
|
2430
|
+
const scores = cellData[key];
|
|
2431
|
+
const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
2432
|
+
const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
|
|
2433
|
+
`Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
|
|
2434
|
+
);
|
|
2435
|
+
lines.push(` ${cellLabel}: mean=${mean.toFixed(1)} (n=${scores.length})`);
|
|
2436
|
+
}
|
|
2437
|
+
lines.push('');
|
|
2438
|
+
|
|
2439
|
+
if (totalSamples > 8) {
|
|
2440
|
+
const anovaResult = anovaStats.runThreeWayANOVA(cellData);
|
|
2441
|
+
lines.push(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
|
|
2442
|
+
} else {
|
|
2443
|
+
lines.push(' (Need > 8 total samples for ANOVA — increase --runs)');
|
|
2444
|
+
}
|
|
2445
|
+
lines.push('');
|
|
2446
|
+
}
|
|
2447
|
+
|
|
727
2448
|
lines.push('='.repeat(80));
|
|
728
2449
|
|
|
729
2450
|
return lines.join('\n');
|
|
730
2451
|
}
|
|
731
2452
|
|
|
2453
|
+
/**
|
|
2454
|
+
* Re-judge all results in an existing run without regenerating tutor responses.
|
|
2455
|
+
*
|
|
2456
|
+
* By default, creates NEW rows preserving judgment history (for inter-judge reliability).
|
|
2457
|
+
* Use --overwrite to replace existing scores instead.
|
|
2458
|
+
*
|
|
2459
|
+
* @param {string} runId - The run to rejudge
|
|
2460
|
+
* @param {Object} options
|
|
2461
|
+
* @param {string} [options.judgeOverride] - Override judge model (e.g. 'openrouter.nemotron')
|
|
2462
|
+
* @param {boolean} [options.verbose] - Show per-result progress
|
|
2463
|
+
* @param {string} [options.scenarioFilter] - Only rejudge results for this scenario ID
|
|
2464
|
+
* @param {number} [options.parallelism] - Concurrent judge calls (default 3)
|
|
2465
|
+
* @param {boolean} [options.overwrite] - If true, update existing rows instead of creating new ones
|
|
2466
|
+
* @returns {Promise<Object>} Summary stats
|
|
2467
|
+
*/
|
|
2468
|
+
export async function rejudgeRun(runId, options = {}) {
  const {
    judgeOverride = null,
    verbose = false,
    scenarioFilter = null,
    parallelism = DEFAULT_PARALLELISM,
    overwrite = false,
  } = options;

  // No-op logger when not verbose; all progress output funnels through `log`.
  const log = verbose ? console.log : () => {};

  // Fail fast if the run does not exist before touching any results.
  const run = evaluationStore.getRun(runId);
  if (!run) throw new Error(`Run not found: ${runId}`);

  // `scenarioId: null` means "no filter" for the store query.
  let results = evaluationStore.getResults(runId, {
    scenarioId: scenarioFilter || null,
  });

  // Skip results that have no suggestions (errors / failed generation)
  results = results.filter(r => r.success && r.suggestions?.length > 0);

  if (results.length === 0) {
    throw new Error('No successful results with suggestions found to rejudge');
  }

  // Deduplicate: only rejudge unique responses (by suggestions content)
  // This prevents cascading rejudgments when running multiple times
  const seenSuggestions = new Set();
  const uniqueResults = [];
  for (const r of results) {
    // Suggestions may be stored as a raw string or structured data; normalize
    // to a string key so identical content dedupes either way.
    const suggKey = typeof r.suggestions === 'string' ? r.suggestions : JSON.stringify(r.suggestions);
    if (!seenSuggestions.has(suggKey)) {
      seenSuggestions.add(suggKey);
      uniqueResults.push(r);
    }
  }

  const skipped = results.length - uniqueResults.length;
  results = uniqueResults;

  log(`\nRejudging ${results.length} unique results from run ${runId}${skipped > 0 ? ` (skipping ${skipped} duplicates)` : ''}`);
  if (judgeOverride) log(` Judge override: ${judgeOverride}`);
  if (scenarioFilter) log(` Scenario filter: ${scenarioFilter}`);

  // Capture old scores for before/after comparison
  // NOTE: computed AFTER dedup/filtering, so oldAvg and newAvg cover the same
  // set of results and the delta is apples-to-apples.
  const oldScores = results.map(r => r.overallScore).filter(s => s != null);
  const oldAvg = oldScores.length > 0
    ? oldScores.reduce((a, b) => a + b, 0) / oldScores.length
    : null;

  // Shared counters mutated by all workers. Safe without locks: JS is
  // single-threaded, and each counter update happens synchronously (no await
  // between read and write).
  let completed = 0;
  let succeeded = 0;
  let failed = 0;
  const newScores = [];

  // Build judge override object if provided
  // rubricEvaluator expects { judgeOverride: { model: "..." } }
  const judgeOverrideObj = judgeOverride ? { judgeOverride: { model: judgeOverride } } : {};

  // Parallel worker pool (same pattern as main eval loop)
  const items = [...results];
  let index = 0;

  // Each worker pulls the next unclaimed item. The check-and-increment
  // (`index < items.length` then `index++`) is synchronous, so two workers
  // can never claim the same index.
  async function worker() {
    while (index < items.length) {
      const i = index++;
      const result = items[i];

      try {
        const fullScenario = evalConfigLoader.getScenario(result.scenarioId);
        if (!fullScenario) {
          throw new Error(`Scenario not found: ${result.scenarioId}`);
        }

        // Only the first suggestion of each result is judged.
        // NOTE(review): presumably results carry one primary suggestion —
        // confirm against the generation path if multi-suggestion results exist.
        const suggestion = result.suggestions[0];

        // Load dialogue context for multi-turn results
        let dialogueContext = null;
        if (result.dialogueId) {
          const logPath = path.join(LOGS_DIR, `${result.dialogueId}.json`);
          try {
            if (fs.existsSync(logPath)) {
              const dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
              if (dialogueLog.isMultiTurn && dialogueLog.dialogueTrace?.length > 0) {
                dialogueContext = {
                  consolidatedTrace: dialogueLog.dialogueTrace,
                  conversationHistory: (dialogueLog.turnResults || []).map((t, ti) => ({
                    turnIndex: ti,
                    turnId: t.turnId,
                    suggestion: t.suggestions?.[0],
                    learnerAction: t.learnerAction,
                    learnerMessage: t.learnerMessage,
                  })),
                };
              }
            }
          } catch (e) {
            // Best-effort: a missing/corrupt dialogue log degrades to judging
            // without conversation context rather than failing the result.
            log(` Warning: could not load dialogue log for ${result.dialogueId}: ${e.message}`);
          }
        }

        const evaluation = await retryWithBackoff(
          () => rubricEvaluator.evaluateSuggestion(suggestion, {
            name: fullScenario.name,
            description: fullScenario.description,
            expectedBehavior: fullScenario.expected_behavior,
            learnerContext: fullScenario.learner_context,
            requiredElements: fullScenario.required_elements,
            forbiddenElements: fullScenario.forbidden_elements,
          }, { dialogueContext }, judgeOverrideObj),
          {}
        );

        if (evaluation.success) {
          if (overwrite) {
            // Old behavior: update in place (loses history)
            evaluationStore.updateResultScores(result.id, evaluation);
          } else {
            // New behavior: create new row (preserves history for reliability analysis)
            evaluationStore.storeRejudgment(result, evaluation);
          }
          succeeded++;
          if (evaluation.overallScore != null) newScores.push(evaluation.overallScore);
          const modeLabel = overwrite ? 'replaced' : 'added';
          // With parallelism > 1 the [k/N] counter may print out of order or
          // repeat; it is a progress hint, not an index.
          log(` [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: ${evaluation.overallScore?.toFixed(1)} (${modeLabel}, was ${result.overallScore?.toFixed(1) ?? '--'})`);
        } else {
          failed++;
          log(` [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: JUDGE FAILED - ${evaluation.error}`);
        }
      } catch (error) {
        // Per-item failures are counted and logged; the pool keeps draining.
        failed++;
        log(` [${completed + 1}/${results.length}] ${result.scenarioId} / ${result.profileName}: ERROR - ${error.message}`);
      }

      completed++;
      // Throttle between judge calls within each worker.
      await sleep(REQUEST_DELAY_MS);
    }
  }

  // Never spawn more workers than there are items.
  const workers = Array.from(
    { length: Math.min(parallelism, items.length) },
    () => worker()
  );
  await Promise.all(workers);

  const newAvg = newScores.length > 0
    ? newScores.reduce((a, b) => a + b, 0) / newScores.length
    : null;

  return {
    runId,
    total: results.length,
    succeeded,
    failed,
    oldAvgScore: oldAvg,
    newAvgScore: newAvg,
    scoreDelta: oldAvg != null && newAvg != null ? newAvg - oldAvg : null,
  };
}
|
|
2627
|
+
|
|
2628
|
+
// Named exports for unit testing (these are internal helpers not part of the
// public API — import them only from test code, not from consumers).
export { structureLearnerContext, resolveConfigModels };
|
|
2630
|
+
|
|
732
2631
|
// Public API surface of this module, aggregated as the default export.
export default {
  runEvaluation,
  resumeEvaluation,
  compareConfigurations,
  quickTest,
  listOptions,
  getRunResults,
  generateReport,
  rejudgeRun,
};
|