@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Eval Config Loader
|
|
3
|
+
*
|
|
4
|
+
* Loads the evaluation rubric locally from the eval repo's own config directory,
|
|
5
|
+
* removing the dependency on tutorApiService.loadRubric() for rubric/scenario data.
|
|
6
|
+
*
|
|
7
|
+
* Uses mtime-based caching (same pattern as tutor-core's configLoaderBase).
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import fs from 'fs';
|
|
11
|
+
import path from 'path';
|
|
12
|
+
import { fileURLToPath } from 'url';
|
|
13
|
+
import yaml from 'yaml';
|
|
14
|
+
|
|
15
|
+
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Configuration YAML lives in the `config/` directory one level above this module.
const EVAL_CONFIG_DIR = path.resolve(__dirname, '..', 'config');

// Per-file caches. Each loader keeps the last parsed document together with the
// mtime (in ms) of the file it was parsed from, and reuses the document while
// the file's mtime is unchanged.

// evaluation-rubric.yaml
let rubricCache = null;
let rubricMtime = null;

// suggestion-scenarios.yaml
let scenariosCache = null;
let scenariosMtime = null;

// providers.yaml
let providersCache = null;
let providersMtime = null;

// tutor-agents.yaml
let tutorAgentsCache = null;
let tutorAgentsMtime = null;

// eval-settings.yaml
let evalSettingsCache = null;
let evalSettingsMtime = null;
|
|
29
|
+
|
|
30
|
+
// Path of the file whose parsed contents are held in rubricCache. A cache hit
// requires the same file, not merely the same mtime: distinct files can share
// an mtime (for example when unpacked from the same archive).
let rubricCachePath = null;

/**
 * Load the evaluation rubric YAML from the eval repo's config directory.
 *
 * The parsed document is cached and reused while the same file path is
 * requested and its mtime is unchanged. The cache is only updated after a
 * successful read and parse, so a broken file yields null on every call until
 * it is fixed instead of alternating between null and a stale document.
 *
 * @param {Object} [options]
 * @param {string} [options.rubricPath] - Override path to rubric YAML file
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Parsed rubric object, or null if the file cannot be
 *   stat'ed, read, or parsed
 */
export function loadRubric({ rubricPath, forceReload } = {}) {
  const effectivePath = rubricPath || path.join(EVAL_CONFIG_DIR, 'evaluation-rubric.yaml');

  let mtimeMs;
  try {
    ({ mtimeMs } = fs.statSync(effectivePath));
  } catch (err) {
    console.warn('[evalConfigLoader] Rubric file not found:', err.message);
    return null;
  }

  const isCacheHit =
    !forceReload &&
    rubricCache &&
    rubricCachePath === effectivePath &&
    rubricMtime === mtimeMs;
  if (isCacheHit) {
    return rubricCache;
  }

  try {
    const parsed = yaml.parse(fs.readFileSync(effectivePath, 'utf-8'));
    // Commit cache state only once parsing has succeeded.
    rubricCache = parsed;
    rubricMtime = mtimeMs;
    rubricCachePath = effectivePath;
    return parsed;
  } catch (err) {
    console.error('[evalConfigLoader] Failed to parse rubric:', err.message);
    return null;
  }
}
|
|
61
|
+
|
|
62
|
+
// Path of the file whose parsed contents are held in scenariosCache. The path
// can change between calls via EVAL_SCENARIOS_FILE, so a cache hit requires the
// same file as well as the same mtime.
let scenariosCachePath = null;

/**
 * Load suggestion scenarios from the dedicated scenarios YAML file.
 *
 * Environment variable overrides:
 * - EVAL_SCENARIOS_FILE: Override the scenarios file path (for testing different content domains).
 *   Relative values are resolved against the eval repo root (the parent of the config directory).
 *
 * The parsed document is cached and reused while the same file path is in
 * effect and its mtime is unchanged. The cache is only updated after a
 * successful read and parse.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Parsed scenarios object, or null if the file cannot be
 *   stat'ed, read, or parsed
 */
export function loadSuggestionScenarios({ forceReload } = {}) {
  // Allow environment variable override for scenarios file (domain generalizability testing)
  const envScenariosFile = process.env.EVAL_SCENARIOS_FILE;
  let effectivePath;

  if (envScenariosFile) {
    const evalRoot = path.resolve(EVAL_CONFIG_DIR, '..');
    effectivePath = path.resolve(evalRoot, envScenariosFile);
    console.log(`[evalConfigLoader] Using EVAL_SCENARIOS_FILE override: ${effectivePath}`);
  } else {
    effectivePath = path.join(EVAL_CONFIG_DIR, 'suggestion-scenarios.yaml');
  }

  let mtimeMs;
  try {
    ({ mtimeMs } = fs.statSync(effectivePath));
  } catch (err) {
    console.warn('[evalConfigLoader] Suggestion scenarios file not found:', err.message);
    return null;
  }

  const isCacheHit =
    !forceReload &&
    scenariosCache &&
    scenariosCachePath === effectivePath &&
    scenariosMtime === mtimeMs;
  if (isCacheHit) {
    return scenariosCache;
  }

  try {
    const parsed = yaml.parse(fs.readFileSync(effectivePath, 'utf-8'));
    // Commit cache state only once parsing has succeeded.
    scenariosCache = parsed;
    scenariosMtime = mtimeMs;
    scenariosCachePath = effectivePath;
    return parsed;
  } catch (err) {
    console.error('[evalConfigLoader] Failed to parse suggestion scenarios:', err.message);
    return null;
  }
}
|
|
105
|
+
|
|
106
|
+
/**
 * Load the providers YAML from the eval repo's config directory.
 *
 * The parsed document is cached and reused while the file's mtime is
 * unchanged. The cached mtime is only advanced after a successful read and
 * parse, so a broken file yields null on every call until it is fixed rather
 * than null once and then the previous (stale) document.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Parsed providers object, or null if the file cannot be
 *   stat'ed, read, or parsed
 */
export function loadProviders({ forceReload } = {}) {
  const effectivePath = path.join(EVAL_CONFIG_DIR, 'providers.yaml');

  let mtimeMs;
  try {
    ({ mtimeMs } = fs.statSync(effectivePath));
  } catch (err) {
    console.warn('[evalConfigLoader] Providers file not found:', err.message);
    return null;
  }

  if (!forceReload && providersCache && providersMtime === mtimeMs) {
    return providersCache;
  }

  try {
    const parsed = yaml.parse(fs.readFileSync(effectivePath, 'utf-8'));
    // Commit cache state only once parsing has succeeded.
    providersCache = parsed;
    providersMtime = mtimeMs;
    return parsed;
  } catch (err) {
    console.error('[evalConfigLoader] Failed to parse providers:', err.message);
    return null;
  }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Look up one provider entry from providers.yaml and attach its API key.
 *
 * The key is read from the environment variable named by the entry's
 * `api_key_env` field (empty string when the field or the variable is unset).
 * A provider counts as configured when it has an API key, except the provider
 * named 'local', which counts as configured when it has a `base_url`.
 *
 * @param {string} providerName - Provider key (e.g. 'anthropic', 'openrouter')
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object} The provider entry's own fields plus `apiKey` and `isConfigured`
 * @throws {Error} If no provider with that name is defined
 */
export function getProviderConfig(providerName, options = {}) {
  const providers = loadProviders(options)?.providers;
  const provider = providers?.[providerName];
  if (!provider) {
    throw new Error(`Unknown provider: ${providerName}`);
  }

  const envVarName = provider.api_key_env;
  const apiKey = (envVarName && process.env[envVarName]) || '';

  // The local provider needs an endpoint instead of a key.
  const isConfigured = Boolean(providerName === 'local' ? provider.base_url : apiKey);

  return { ...provider, apiKey, isConfigured };
}
|
|
163
|
+
|
|
164
|
+
/**
 * Turn a model reference into a concrete provider/model configuration.
 *
 * Accepted reference forms:
 * - String "provider.alias" (e.g. "openrouter.sonnet"). The string is split at
 *   the FIRST dot, so the alias part may itself contain dots.
 * - Object { provider, model } (e.g. { provider: 'anthropic', model: 'sonnet' })
 *
 * The alias is looked up in the provider's `models` map; when it has no entry
 * there, the alias itself is used as the model id.
 *
 * @param {string|Object} ref - Model reference
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object} { provider, model, apiKey, isConfigured, baseUrl }
 * @throws {Error} If the reference is malformed or names an unknown provider
 */
export function resolveModel(ref, options = {}) {
  const isStringRef = typeof ref === 'string';
  const isObjectRef = typeof ref === 'object' && ref !== null;
  if (!isStringRef && !isObjectRef) {
    throw new Error('Model reference must be a string or object');
  }

  let providerName;
  let modelAlias;

  if (isStringRef) {
    const separatorAt = ref.indexOf('.');
    // Both sides of the dot must be non-empty.
    const hasBothParts = separatorAt > 0 && separatorAt < ref.length - 1;
    if (!hasBothParts) {
      throw new Error(
        `Invalid model reference: "${ref}". Use format "provider.model" (e.g., "openrouter.haiku", "anthropic.sonnet")`
      );
    }
    providerName = ref.slice(0, separatorAt);
    modelAlias = ref.slice(separatorAt + 1);
  } else {
    providerName = ref.provider;
    modelAlias = ref.model;
    if (!providerName || !modelAlias) {
      throw new Error('Model reference object must have both "provider" and "model" properties');
    }
  }

  const providerConfig = getProviderConfig(providerName, options);
  const knownModels = providerConfig.models;

  return {
    provider: providerName,
    model: knownModels?.[modelAlias] || modelAlias,
    apiKey: providerConfig.apiKey,
    isConfigured: providerConfig.isConfigured,
    baseUrl: providerConfig.base_url,
  };
}
|
|
210
|
+
|
|
211
|
+
/**
 * Read the `judge` section of the evaluation rubric.
 *
 * @param {Object} [options] - Passed through to loadRubric
 * @param {string} [options.rubricPath] - Override rubric path
 * @returns {Object|null} The rubric's `judge` value, or null when the rubric
 *   could not be loaded or has no (truthy) `judge` section
 */
export function getJudgeConfig(options = {}) {
  const judge = loadRubric(options)?.judge;
  return judge || null;
}
|
|
222
|
+
|
|
223
|
+
/**
 * Read the `dimensions` section of the evaluation rubric.
 *
 * @param {Object} [options] - Passed through to loadRubric
 * @param {string} [options.rubricPath] - Override rubric path
 * @returns {Object} The rubric's `dimensions` value, or an empty object when
 *   the rubric could not be loaded or has no (truthy) `dimensions` section
 */
export function getRubricDimensions(options = {}) {
  const dimensions = loadRubric(options)?.dimensions;
  return dimensions || {};
}
|
|
234
|
+
|
|
235
|
+
/**
 * Fetch one scenario by its ID.
 *
 * The dedicated suggestion-scenarios file is consulted first. If the ID is not
 * found there, the `scenarios` section of the rubric file is tried as a
 * backward-compatible fallback (and a warning is logged when it is used).
 *
 * The returned object is a shallow copy of the scenario with `id` set to
 * `scenarioId`. `type` defaults to 'suggestion' for scenarios from the
 * dedicated file and is always 'suggestion' for legacy rubric scenarios.
 *
 * @param {string} scenarioId
 * @param {Object} [options] - Passed through to both loaders
 * @param {string} [options.rubricPath] - Override rubric path
 * @returns {Object|null} Scenario object or null
 */
export function getScenario(scenarioId, options = {}) {
  // Preferred location: the dedicated scenarios file.
  const dedicated = loadSuggestionScenarios(options)?.scenarios?.[scenarioId];
  if (dedicated) {
    return { ...dedicated, type: dedicated.type || 'suggestion', id: scenarioId };
  }

  // Legacy location: scenarios embedded in the rubric.
  const legacy = loadRubric(options)?.scenarios?.[scenarioId];
  if (!legacy) {
    return null;
  }

  console.warn(`[evalConfigLoader] Scenario '${scenarioId}' loaded from legacy rubric location`);
  return { ...legacy, type: 'suggestion', id: scenarioId };
}
|
|
264
|
+
|
|
265
|
+
/**
 * Summarize every known scenario.
 *
 * Scenarios come from the dedicated suggestion-scenarios file when it provides
 * a `scenarios` map; otherwise the rubric's `scenarios` section is used as a
 * backward-compatible fallback (with a logged warning).
 *
 * @param {Object} [options] - Passed through to both loaders
 * @param {string} [options.rubricPath] - Override rubric path
 * @returns {Array} Array of { id, name, description, type, category, isNewUser,
 *   minAcceptableScore, turnCount, isMultiTurn }. `type` defaults to
 *   'suggestion' and `category` to 'core'. `turnCount` is the length of the
 *   scenario's `turns` plus one.
 */
export function listScenarios(options = {}) {
  let scenarioMap = loadSuggestionScenarios(options)?.scenarios;

  if (!scenarioMap) {
    // Nothing in the dedicated file: fall back to the rubric.
    scenarioMap = loadRubric(options)?.scenarios;
    if (!scenarioMap) {
      return [];
    }
    console.warn('[evalConfigLoader] Scenarios loaded from legacy rubric location');
  }

  const summaries = [];
  for (const [id, scenario] of Object.entries(scenarioMap)) {
    summaries.push({
      id,
      name: scenario.name,
      description: scenario.description,
      type: scenario.type || 'suggestion',
      category: scenario.category || 'core',
      isNewUser: scenario.is_new_user,
      minAcceptableScore: scenario.min_acceptable_score,
      turnCount: (scenario.turns?.length || 0) + 1,
      isMultiTurn: Array.isArray(scenario.turns) && scenario.turns.length > 0,
    });
  }
  return summaries;
}
|
|
303
|
+
|
|
304
|
+
/**
 * Report whether a scenario defines a non-empty `turns` array.
 *
 * @param {string} scenarioId
 * @param {Object} [options] - Passed through to getScenario
 * @returns {boolean} false when the scenario is unknown or has no turns
 */
export function isMultiTurnScenario(scenarioId, options = {}) {
  const turns = getScenario(scenarioId, options)?.turns;
  return Array.isArray(turns) && turns.length > 0;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Read run-level evaluation settings from the rubric's `settings` section.
 *
 * Missing (null/undefined) values fall back to defaults:
 * useAIJudge = true, runsPerConfig = 3, parallelism = 2.
 *
 * @param {Object} [options] - Passed through to loadRubric
 * @returns {Object} { useAIJudge, runsPerConfig, parallelism }
 */
export function getEvalSettings(options = {}) {
  const settings = loadRubric(options)?.settings || {};

  const useAIJudge = settings.use_ai_judge ?? true;
  const runsPerConfig = settings.runs_per_config ?? 3;
  const parallelism = settings.parallelism ?? 2;

  return { useAIJudge, runsPerConfig, parallelism };
}
|
|
331
|
+
|
|
332
|
+
/**
 * Read benchmark settings from the rubric's `settings.benchmark` section.
 *
 * Defaults: useAIJudge = true when unset (null/undefined);
 * forceAIJudgeDimensions = ['specificity'] when the configured value is falsy.
 *
 * @param {Object} [options] - Passed through to loadRubric
 * @returns {Object} { useAIJudge, forceAIJudgeDimensions }
 */
export function getBenchmarkSettings(options = {}) {
  const benchmark = loadRubric(options)?.settings?.benchmark || {};

  const useAIJudge = benchmark.use_ai_judge ?? true;
  const forceAIJudgeDimensions = benchmark.force_ai_judge_dimensions || ['specificity'];

  return { useAIJudge, forceAIJudgeDimensions };
}
|
|
346
|
+
|
|
347
|
+
/**
 * Load the tutor-agents YAML from the eval repo's config directory.
 *
 * The parsed document is cached and reused while the file's mtime is
 * unchanged. The cached mtime is only advanced after a successful read and
 * parse, so a broken file yields null on every call until it is fixed rather
 * than null once and then the previous (stale) document.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Parsed tutor-agents object, or null if the file cannot
 *   be stat'ed, read, or parsed
 */
export function loadTutorAgents({ forceReload } = {}) {
  const effectivePath = path.join(EVAL_CONFIG_DIR, 'tutor-agents.yaml');

  let mtimeMs;
  try {
    ({ mtimeMs } = fs.statSync(effectivePath));
  } catch (err) {
    console.warn('[evalConfigLoader] Tutor agents file not found:', err.message);
    return null;
  }

  if (!forceReload && tutorAgentsCache && tutorAgentsMtime === mtimeMs) {
    return tutorAgentsCache;
  }

  try {
    const parsed = yaml.parse(fs.readFileSync(effectivePath, 'utf-8'));
    // Commit cache state only once parsing has succeeded.
    tutorAgentsCache = parsed;
    tutorAgentsMtime = mtimeMs;
    return parsed;
  } catch (err) {
    console.error('[evalConfigLoader] Failed to parse tutor agents:', err.message);
    return null;
  }
}
|
|
377
|
+
|
|
378
|
+
/**
 * Look up a tutor profile and resolve its agents' models via providers.yaml.
 *
 * The `ego` and `superego` entries are shallow-copied from the profile. For
 * each one that has both `provider` and `model`, the pair is resolved with
 * resolveModel and the outcome is stored as `resolvedProvider` and
 * `resolvedModel` on the copy. If resolution throws, the copy is left with its
 * raw values only.
 *
 * @param {string} profileName - Profile key (e.g. 'budget', 'quality')
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} { name, description, dialogue, ego, superego }, or
 *   null when the profile does not exist
 */
export function getTutorProfile(profileName, options = {}) {
  const profile = loadTutorAgents(options)?.profiles?.[profileName];
  if (!profile) {
    return null;
  }

  const result = {
    name: profileName,
    description: profile.description,
    dialogue: profile.dialogue,
    ego: profile.ego ? { ...profile.ego } : null,
    superego: profile.superego ? { ...profile.superego } : null,
  };

  // Both agents are resolved the same way, ego first.
  for (const role of ['ego', 'superego']) {
    const agent = result[role];
    if (!(agent?.provider && agent?.model)) {
      continue;
    }
    try {
      const resolved = resolveModel(`${agent.provider}.${agent.model}`, options);
      agent.resolvedProvider = resolved.provider;
      agent.resolvedModel = resolved.model;
    } catch (e) {
      // Best effort: keep the raw provider/model values if resolution fails.
    }
  }

  return result;
}
|
|
426
|
+
|
|
427
|
+
/**
 * Summarize every tutor profile defined in the local tutor-agents.yaml.
 *
 * Provider and model values are reported as written in the profile; they are
 * not resolved through providers.yaml here. Defaults: description '',
 * dialogueEnabled true, maxRounds 0.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Array} Array of { name, description, dialogueEnabled, maxRounds, egoProvider, egoModel, superegoProvider, superegoModel }
 */
export function listTutorProfiles(options = {}) {
  const profiles = loadTutorAgents(options)?.profiles || {};

  const summaries = [];
  for (const [name, profile] of Object.entries(profiles)) {
    const { dialogue, ego, superego } = profile;
    summaries.push({
      name,
      description: profile.description || '',
      dialogueEnabled: dialogue?.enabled ?? true,
      maxRounds: dialogue?.max_rounds ?? 0,
      egoProvider: ego?.provider,
      egoModel: ego?.model,
      superegoProvider: superego?.provider,
      superegoModel: superego?.model,
    });
  }
  return summaries;
}
|
|
449
|
+
|
|
450
|
+
/**
 * Enumerate every provider/model pair declared in eval's providers.yaml.
 *
 * One entry is produced per alias in each provider's `models` map. `model`
 * is the mapped model id and `label` is "<provider>/<alias>".
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Array} Array of { provider, model, label }
 */
export function listConfigurations(options = {}) {
  const providers = loadProviders(options)?.providers || {};

  return Object.entries(providers).flatMap(([providerId, provider]) =>
    Object.entries(provider.models || {}).map(([alias, modelId]) => ({
      provider: providerId,
      model: modelId,
      label: `${providerId}/${alias}`,
    }))
  );
}
|
|
474
|
+
|
|
475
|
+
/**
 * Return the scenario summaries from listScenarios whose category matches exactly.
 *
 * @param {string} category - Category to filter by (e.g. 'core', 'recognition', 'multi_turn')
 * @param {Object} [options] - Passed through to listScenarios
 * @returns {Array} Filtered scenario list
 */
export function listScenariosByCategory(category, options = {}) {
  const belongsToCategory = (scenario) => scenario.category === category;
  return listScenarios(options).filter(belongsToCategory);
}
|
|
485
|
+
|
|
486
|
+
/**
 * Read the judge configuration used for interaction evaluation.
 *
 * Prefers the rubric's `interaction_judge` section and falls back to the
 * suggestion `judge` section when that is not defined.
 *
 * @param {Object} [options] - Passed through to loadRubric
 * @param {string} [options.rubricPath] - Override rubric path
 * @returns {Object|null} The selected judge section, or null when neither is available
 */
export function getInteractionJudgeConfig(options = {}) {
  const rubric = loadRubric(options);

  const interactionJudge = rubric?.interaction_judge;
  if (interactionJudge) {
    return interactionJudge;
  }

  return rubric?.judge || null;
}
|
|
500
|
+
|
|
501
|
+
/**
 * Load eval-settings.yaml from the eval repo's config directory.
 *
 * The file is optional: when it cannot be stat'ed, null is returned without
 * logging. The parsed document is cached and reused while the file's mtime is
 * unchanged. The cached mtime is only advanced after a successful read and
 * parse, so a broken file yields null on every call until it is fixed rather
 * than null once and then the previous (stale) document.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Parsed eval settings, or null if the file is missing
 *   or cannot be read or parsed
 */
export function loadEvalSettings({ forceReload } = {}) {
  const effectivePath = path.join(EVAL_CONFIG_DIR, 'eval-settings.yaml');

  let mtimeMs;
  try {
    ({ mtimeMs } = fs.statSync(effectivePath));
  } catch (err) {
    // File is optional — not a warning
    return null;
  }

  if (!forceReload && evalSettingsCache && evalSettingsMtime === mtimeMs) {
    return evalSettingsCache;
  }

  try {
    const parsed = yaml.parse(fs.readFileSync(effectivePath, 'utf-8'));
    // Commit cache state only once parsing has succeeded.
    evalSettingsCache = parsed;
    evalSettingsMtime = mtimeMs;
    return parsed;
  } catch (err) {
    console.error('[evalConfigLoader] Failed to parse eval-settings:', err.message);
    return null;
  }
}
|
|
531
|
+
|
|
532
|
+
/**
 * Read the `content` section of eval-settings.yaml with its package path resolved.
 *
 * A shallow copy of the section is returned. Its `content_package_path` is
 * resolved against the eval repo root (the parent of the config directory).
 *
 * Environment variable overrides:
 * - EVAL_CONTENT_PATH: Replaces content_package_path (for testing different content domains).
 *   It is only applied when eval-settings.yaml provides a `content` section.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object|null} Content config with resolved paths, or null when no
 *   `content` section is available
 */
export function getContentConfig(options = {}) {
  const content = loadEvalSettings(options)?.content;
  if (!content) {
    return null;
  }

  const evalRoot = path.resolve(EVAL_CONFIG_DIR, '..');
  const resolved = { ...content };
  const envContentPath = process.env.EVAL_CONTENT_PATH;

  if (envContentPath) {
    // The environment override wins over the configured path.
    resolved.content_package_path = path.resolve(evalRoot, envContentPath);
    console.log(`[evalConfigLoader] Using EVAL_CONTENT_PATH override: ${resolved.content_package_path}`);
    return resolved;
  }

  const configuredPath = resolved.content_package_path;
  if (configuredPath) {
    resolved.content_package_path = path.resolve(evalRoot, configuredPath);
  }
  return resolved;
}
|
|
562
|
+
|
|
563
|
+
/**
 * Read and parse a scenarios YAML file from an arbitrary location.
 * Intended for domain-generalizability tests that use alternate content.
 *
 * Relative paths are resolved against the eval repo root (the parent of the
 * config directory). The result is not cached.
 *
 * @param {string} scenariosPath - Path to scenarios YAML file
 * @returns {Object|null} Parsed scenarios object, or null if the file could
 *   not be read or parsed
 */
export function loadCustomScenarios(scenariosPath) {
  const resolvedPath = path.resolve(path.resolve(EVAL_CONFIG_DIR, '..'), scenariosPath);

  let parsed;
  try {
    const rawYaml = fs.readFileSync(resolvedPath, 'utf-8');
    parsed = yaml.parse(rawYaml);
    console.log(`[evalConfigLoader] Loaded custom scenarios from: ${resolvedPath}`);
  } catch (err) {
    console.error(`[evalConfigLoader] Failed to load custom scenarios from ${resolvedPath}:`, err.message);
    return null;
  }
  return parsed;
}
|
|
584
|
+
|
|
585
|
+
// Aggregate of every public function in this module, for callers that prefer a
// single default import over named imports. getTutorModelOverrides is declared
// further down in the file; function declarations are hoisted, so it is
// already defined here.
const evalConfigLoader = {
  loadRubric,
  loadSuggestionScenarios,
  loadCustomScenarios,
  loadProviders,
  getProviderConfig,
  resolveModel,
  getJudgeConfig,
  getInteractionJudgeConfig,
  getRubricDimensions,
  getScenario,
  listScenarios,
  listScenariosByCategory,
  isMultiTurnScenario,
  getEvalSettings,
  getBenchmarkSettings,
  loadTutorAgents,
  getTutorProfile,
  listTutorProfiles,
  listConfigurations,
  loadEvalSettings,
  getContentConfig,
  getTutorModelOverrides,
};

export default evalConfigLoader;
|
|
609
|
+
|
|
610
|
+
/**
 * Read the top-level model override keys from tutor-agents.yaml
 * (`model_override`, `ego_model_override`, `superego_model_override`).
 * These are lower priority than CLI flags.
 *
 * @param {Object} [options]
 * @param {boolean} [options.forceReload] - Bypass mtime cache
 * @returns {Object} { modelOverride, egoModelOverride, superegoModelOverride };
 *   each is null when not set (or when the file could not be loaded)
 */
export function getTutorModelOverrides(options = {}) {
  const tutorAgents = loadTutorAgents(options);

  const modelOverride = tutorAgents?.model_override || null;
  const egoModelOverride = tutorAgents?.ego_model_override || null;
  const superegoModelOverride = tutorAgents?.superego_model_override || null;

  return { modelOverride, egoModelOverride, superegoModelOverride };
}
|