@machinespirits/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
|
@@ -0,0 +1,826 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rubric Evaluator Service
|
|
3
|
+
*
|
|
4
|
+
* Uses AI to evaluate tutor suggestions against the pedagogical rubric.
|
|
5
|
+
* Judge model configuration is loaded from config/evaluation-rubric.yaml
|
|
6
|
+
* Provider details are resolved from config/providers.yaml
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';
|
|
10
|
+
|
|
11
|
+
// Diagnostic logger. Output is muted when TUTOR_TRANSCRIPT is exactly 'true'
// so that transcript runs print nothing but the transcript itself.
function debugLog(...args) {
  const transcriptMode = process.env.TUTOR_TRANSCRIPT === 'true';
  if (transcriptMode) {
    return;
  }
  console.log(...args);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Pick the judge configuration to use for an evaluation.
 *
 * Reads the rubric via `tutorApi.loadRubric()` and takes its `judge` section
 * (or the legacy `evaluator` section). Resolution order:
 *   1. no model configured  -> built-in OpenRouter default
 *   2. primary model, when `configLoader.resolveModel` reports it configured
 *   3. fallback model, when one is listed and reported configured
 *   4. the primary model anyway (without apiKey/baseUrl), so the caller's
 *      request fails with a provider-specific error
 *
 * @returns {{provider: string, model: string, apiKey?: string, baseUrl?: string, hyperparameters: Object}}
 */
function getAvailableEvaluator() {
  const rubric = tutorApi.loadRubric();
  // 'judge' is the current key; 'evaluator' is kept for older rubric files.
  const judgeSettings = rubric?.judge || rubric?.evaluator;

  if (!judgeSettings?.model) {
    console.warn('[rubricEvaluator] No judge config in evaluation-rubric.yaml, using defaults');
    const defaults = {
      provider: 'openrouter',
      model: 'deepseek/deepseek-chat-v3-0324',
      hyperparameters: { temperature: 0.2, max_tokens: 4000 },
    };
    return defaults;
  }

  // 1) Primary model.
  try {
    const primary = configLoader.resolveModel(judgeSettings.model);
    if (primary.isConfigured) {
      const { provider, model, apiKey, baseUrl } = primary;
      return {
        provider,
        model,
        apiKey,
        baseUrl,
        hyperparameters: judgeSettings.hyperparameters || {},
      };
    }
  } catch (err) {
    console.warn(`[rubricEvaluator] Failed to resolve primary evaluator: ${err.message}`);
  }

  // 2) Fallback model, if the rubric names one.
  if (judgeSettings.fallback?.model) {
    try {
      const backup = configLoader.resolveModel(judgeSettings.fallback.model);
      if (backup.isConfigured) {
        debugLog(`[rubricEvaluator] Using fallback evaluator: ${backup.provider}/${backup.model}`);
        const { provider, model, apiKey, baseUrl } = backup;
        return {
          provider,
          model,
          apiKey,
          baseUrl,
          hyperparameters: judgeSettings.fallback.hyperparameters || judgeSettings.hyperparameters || {},
        };
      }
    } catch (err) {
      console.warn(`[rubricEvaluator] Failed to resolve fallback evaluator: ${err.message}`);
    }
  }

  // 3) Nothing is configured: hand back the primary so the eventual API call
  //    reports the problem. This resolve is not guarded, so it may throw.
  const unconfigured = configLoader.resolveModel(judgeSettings.model);
  return {
    provider: unconfigured.provider,
    model: unconfigured.model,
    hyperparameters: judgeSettings.hyperparameters || {},
  };
}
|
|
79
|
+
|
|
80
|
+
/**
 * Resolve the rubric's fallback judge, if one is listed and usable.
 *
 * @returns {{provider: string, model: string, apiKey: string, baseUrl: string, hyperparameters: Object}|null}
 *   The fallback configuration, or null when no fallback model is listed,
 *   it is not reported as configured, or resolving it throws.
 */
function getFallbackEvaluator() {
  const rubric = tutorApi.loadRubric();
  // 'judge' is the current key; 'evaluator' is the legacy one.
  const judgeSettings = rubric?.judge || rubric?.evaluator;

  const hasFallback = Boolean(judgeSettings?.fallback?.model);
  if (!hasFallback) {
    return null;
  }

  try {
    const backup = configLoader.resolveModel(judgeSettings.fallback.model);
    if (!backup.isConfigured) {
      return null;
    }
    const { provider, model, apiKey, baseUrl } = backup;
    return {
      provider,
      model,
      apiKey,
      baseUrl,
      // Fallback-specific settings win, then the primary's, then none.
      hyperparameters: judgeSettings.fallback.hyperparameters || judgeSettings.hyperparameters || {},
    };
  } catch (err) {
    console.warn(`[rubricEvaluator] Failed to resolve fallback: ${err.message}`);
    return null;
  }
}
|
|
106
|
+
|
|
107
|
+
// Upper bound for one fallback-judge HTTP exchange (headers AND body).
const FALLBACK_JUDGE_TIMEOUT_MS = 60000;

/**
 * POST a JSON payload to a fallback judge endpoint and return the decoded
 * JSON reply.
 *
 * The abort timer stays armed until the response body has been fully read,
 * so a provider that sends headers and then stalls cannot hang the caller.
 *
 * @param {string} label - Provider name used in error messages.
 * @param {string} url - Endpoint URL.
 * @param {Object} extraHeaders - Headers added after Content-Type.
 * @param {Object} payload - Request body, serialised with JSON.stringify.
 * @returns {Promise<*>} Decoded JSON response body.
 * @throws {Error} On timeout, non-2xx status, or an undecodable body.
 */
async function postFallbackJudgeRequest(label, url, extraHeaders, payload) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FALLBACK_JUDGE_TIMEOUT_MS);

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', ...extraHeaders },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });

    if (!res.ok) {
      const errorBody = await res.text().catch(() => '');
      throw new Error(`${label} API error: ${res.status} - ${errorBody.slice(0, 200)}`);
    }

    try {
      return await res.json();
    } catch (err) {
      // An abort while streaming the body is a timeout, not a parse failure.
      if (err.name === 'AbortError') throw err;
      throw new Error(`Failed to parse ${label} response: ${err.message}`);
    }
  } catch (err) {
    if (err.name === 'AbortError') {
      throw new Error(`${label} API request timed out after 60s`);
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Call a judge model using an explicit configuration (used for the fallback
 * judge). Supports the 'openrouter' and 'gemini' providers.
 *
 * API keys are read from OPENROUTER_API_KEY / GEMINI_API_KEY.
 * NOTE(review): `config.apiKey` and `config.baseUrl` are supplied by
 * getFallbackEvaluator but are not consulted here - confirm whether they
 * should take precedence over the environment.
 *
 * @param {string} prompt - Prompt sent as a single user message.
 * @param {{provider: string, model: string, hyperparameters?: Object}} config
 * @returns {Promise<string>} The model's text reply, or '' when the reply
 *   carries no text.
 * @throws {Error} Missing API key, unsupported provider, HTTP error, timeout
 *   or undecodable response. Every error is logged before being re-thrown.
 */
async function callJudgeModelWithConfig(prompt, config) {
  const { provider, model, hyperparameters } = config;
  const temperature = hyperparameters?.temperature ?? 0.2;
  const maxTokens = hyperparameters?.max_tokens ?? 1500;

  debugLog(`[rubricEvaluator] Calling fallback judge: ${provider}/${model}`);

  try {
    if (provider === 'openrouter') {
      const apiKey = process.env.OPENROUTER_API_KEY;
      if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');

      const data = await postFallbackJudgeRequest(
        'OpenRouter',
        'https://openrouter.ai/api/v1/chat/completions',
        { 'Authorization': `Bearer ${apiKey}` },
        {
          model,
          max_tokens: maxTokens,
          temperature,
          messages: [{ role: 'user', content: prompt }],
        },
      );
      return data?.choices?.[0]?.message?.content || '';
    }

    if (provider === 'gemini') {
      const apiKey = process.env.GEMINI_API_KEY;
      if (!apiKey) throw new Error('GEMINI_API_KEY not set');

      // The key travels in a header rather than the query string so it does
      // not end up in URL logs.
      const data = await postFallbackJudgeRequest(
        'Gemini',
        `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`,
        { 'x-goog-api-key': apiKey },
        {
          contents: [{ parts: [{ text: prompt }] }],
          generationConfig: {
            temperature,
            maxOutputTokens: maxTokens,
          },
        },
      );
      return data?.candidates?.[0]?.content?.parts?.[0]?.text || '';
    }

    throw new Error(`Unsupported fallback provider: ${provider}`);
  } catch (error) {
    // Log before re-throwing so the failure is visible even if a caller
    // reduces it to a result object.
    console.error(`[rubricEvaluator] Fallback judge error: ${error.message}`);
    throw error;
  }
}
|
|
217
|
+
|
|
218
|
+
/**
 * Build the evaluation prompt for the judge model.
 *
 * The prompt contains: the rubric dimensions returned by
 * `tutorApi.loadRubric()` (name, weight, description, per-score criteria),
 * the scenario description, the suggestion serialised as JSON, the scenario's
 * required/forbidden elements, and the JSON response format the judge must
 * follow.
 *
 * Incomplete rubric entries degrade gracefully: a dimension without a name is
 * labelled with its key, a missing weight omits the "(weight: N%)" suffix
 * instead of printing "NaN%", and a missing description prints as empty.
 *
 * @param {Object} suggestion - Suggestion to evaluate; embedded as JSON.
 * @param {Object} scenario - Scenario (name, description, expectedBehavior,
 *   optional learnerContext, requiredElements, forbiddenElements).
 * @param {Object} [context] - Optional extra context; only `learnerContext`
 *   is read, as a fallback for `scenario.learnerContext`.
 * @returns {string} The complete prompt text.
 */
function buildEvaluationPrompt(suggestion, scenario, context) {
  const rubric = tutorApi.loadRubric();
  const dimensions = rubric?.dimensions || {};

  // Render a list of elements as markdown bullets, or a placeholder bullet.
  const formatElements = (elements) =>
    (elements || []).map(e => `- ${e}`).join('\n') || '- None specified';

  // Build dimension criteria text
  const dimensionCriteria = Object.entries(dimensions).map(([key, rawDim]) => {
    const dim = rawDim ?? {};
    const criteriaText = Object.entries(dim.criteria || {})
      .map(([score, desc]) => ` ${score}: ${desc}`)
      .join('\n');
    const weight = dim.weight == null ? NaN : Number(dim.weight);
    const weightLabel = Number.isFinite(weight) ? ` (weight: ${(weight * 100).toFixed(0)}%)` : '';
    return `**${dim.name ?? key}**${weightLabel}
${dim.description ?? ''}
Criteria:
${criteriaText}`;
  }).join('\n\n');

  return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.

## EVALUATION RUBRIC

Score each dimension from 1-5:
- 1: Completely fails this criterion
- 2: Weak, significant issues
- 3: Adequate, meets basic expectations
- 4: Good, exceeds expectations
- 5: Excellent, exemplary

${dimensionCriteria}

## SCENARIO CONTEXT

**Scenario**: ${scenario.name}
**Description**: ${scenario.description}
**Expected Behavior**: ${scenario.expectedBehavior}

**Learner Context**:
${scenario.learnerContext || context?.learnerContext || 'No context provided'}

## SUGGESTION TO EVALUATE

\`\`\`json
${JSON.stringify(suggestion, null, 2)}
\`\`\`

## VALIDATION REQUIREMENTS

Required elements (must include):
${formatElements(scenario.requiredElements)}

Forbidden elements (must NOT include):
${formatElements(scenario.forbiddenElements)}

## YOUR TASK

Evaluate the suggestion and provide:
1. A score (1-5) for each dimension with reasoning AND a direct quote from the suggestion that supports your assessment
2. Whether it passes the required/forbidden element checks
3. An overall score (weighted average, 0-100 scale)

For each dimension, include:
- **score**: 1-5 rating
- **reasoning**: Brief explanation of why this score was given
- **quote**: A short direct quote from the suggestion (title, message, or actionTarget) that exemplifies this dimension's score. Use "N/A" if no relevant quote exists.

Respond with ONLY a JSON object in this exact format:
\`\`\`json
{
"scores": {
"relevance": {"score": 4, "reasoning": "Matches learner's idle state", "quote": "Take your time with this concept"},
"specificity": {"score": 5, "reasoning": "Names exact lecture", "quote": "479-lecture-3"},
"pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding", "quote": "Start with the basics before..."},
"personalization": {"score": 3, "reasoning": "Generic advice", "quote": "N/A"},
"actionability": {"score": 5, "reasoning": "Clear next step", "quote": "Click to continue to..."},
"tone": {"score": 4, "reasoning": "Encouraging", "quote": "You're making great progress"},
"mutual_recognition": {"score": 4, "reasoning": "Acknowledges learner's interpretation", "quote": "Your metaphor captures..."},
"dialectical_responsiveness": {"score": 3, "reasoning": "Responds but doesn't create tension", "quote": "N/A"},
"memory_integration": {"score": 4, "reasoning": "References previous session", "quote": "Building on your insight..."},
"transformative_potential": {"score": 3, "reasoning": "Informative but not transformative", "quote": "N/A"}
},
"validation": {
"passes_required": true,
"required_missing": [],
"passes_forbidden": true,
"forbidden_found": []
},
"overall_score": 82,
"summary": "Brief overall assessment"
}
\`\`\``;
}
|
|
310
|
+
|
|
311
|
+
// Upper bound for one judge HTTP exchange (headers AND body).
const JUDGE_TIMEOUT_MS = 60000;

/**
 * POST a JSON payload to a judge endpoint and return the decoded JSON reply.
 *
 * The abort timer stays armed until the response body has been fully read,
 * so a provider that sends headers and then stalls cannot hang the caller.
 *
 * @param {string} label - Provider name used in error messages.
 * @param {string} url - Endpoint URL.
 * @param {Object} extraHeaders - Headers added after Content-Type.
 * @param {Object} payload - Request body, serialised with JSON.stringify.
 * @returns {Promise<*>} Decoded JSON response body.
 * @throws {Error} On timeout, non-2xx status, or an undecodable body.
 */
async function postJudgeRequest(label, url, extraHeaders, payload) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), JUDGE_TIMEOUT_MS);

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', ...extraHeaders },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });

    if (!res.ok) {
      const errorBody = await res.text().catch(() => '');
      throw new Error(`${label} API error: ${res.status} - ${errorBody.slice(0, 200)}`);
    }

    try {
      return await res.json();
    } catch (err) {
      // An abort while streaming the body is a timeout, not a parse failure.
      if (err.name === 'AbortError') throw err;
      throw new Error(`Failed to parse ${label} response: ${err.message}`);
    }
  } catch (err) {
    if (err.name === 'AbortError') {
      throw new Error(`${label} API request timed out after 60s`);
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Call the judge model selected by getAvailableEvaluator().
 *
 * Supported providers: 'anthropic', 'openrouter', 'openai', 'gemini'. API
 * keys are read from ANTHROPIC_API_KEY, OPENROUTER_API_KEY, OPENAI_API_KEY
 * and GEMINI_API_KEY respectively. Temperature defaults to 0.2 and the token
 * limit to 1500 when the evaluator's hyperparameters do not set them.
 *
 * @param {string} prompt - Prompt sent as a single user message.
 * @returns {Promise<string>} The model's text reply, or '' when the reply
 *   carries no text.
 * @throws {Error} Missing API key, unsupported provider, HTTP error, timeout
 *   or undecodable response.
 */
async function callJudgeModel(prompt) {
  const { provider, model, hyperparameters } = getAvailableEvaluator();
  const temperature = hyperparameters?.temperature ?? 0.2;
  const maxTokens = hyperparameters?.max_tokens ?? 1500;

  // Request body shared by the Anthropic, OpenRouter and OpenAI chat APIs.
  const chatPayload = {
    model,
    max_tokens: maxTokens,
    temperature,
    messages: [{ role: 'user', content: prompt }],
  };

  if (provider === 'anthropic') {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey) throw new Error('ANTHROPIC_API_KEY not set');

    const data = await postJudgeRequest(
      'Anthropic',
      'https://api.anthropic.com/v1/messages',
      { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
      chatPayload,
    );
    return data?.content?.[0]?.text || '';
  }

  if (provider === 'openrouter') {
    const apiKey = process.env.OPENROUTER_API_KEY;
    if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');

    const data = await postJudgeRequest(
      'OpenRouter',
      'https://openrouter.ai/api/v1/chat/completions',
      { 'Authorization': `Bearer ${apiKey}` },
      chatPayload,
    );
    return data?.choices?.[0]?.message?.content || '';
  }

  if (provider === 'openai') {
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) throw new Error('OPENAI_API_KEY not set');

    const data = await postJudgeRequest(
      'OpenAI',
      'https://api.openai.com/v1/chat/completions',
      { 'Authorization': `Bearer ${apiKey}` },
      chatPayload,
    );
    return data?.choices?.[0]?.message?.content || '';
  }

  if (provider === 'gemini') {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) throw new Error('GEMINI_API_KEY not set');

    // The key travels in a header rather than the query string so it does
    // not end up in URL logs.
    const data = await postJudgeRequest(
      'Gemini',
      `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`,
      { 'x-goog-api-key': apiKey },
      {
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: {
          temperature,
          maxOutputTokens: maxTokens,
        },
      },
    );
    return data?.candidates?.[0]?.content?.parts?.[0]?.text || '';
  }

  throw new Error(`Unsupported judge provider: ${provider}`);
}
|
|
504
|
+
|
|
505
|
+
/**
 * Parse the judge model's reply into a JSON value.
 *
 * Judges often wrap the JSON in a markdown code fence and sometimes add prose
 * around it, so several candidates are tried in order and the first one that
 * decodes to an object (or array) wins:
 *   1. the contents of each ``` / ```json fenced block, in document order
 *   2. the span from the first '{' to the last '}' in the whole reply
 * The second candidate also rescues replies whose fence is never closed and
 * replies whose JSON strings themselves contain backtick fences.
 *
 * @param {string} responseText - Raw text returned by the judge model.
 * @returns {Object} The decoded JSON object.
 * @throws {Error} 'Could not parse judge response as JSON' (followed by the
 *   last JSON syntax error, when there is one) if no candidate decodes.
 */
function parseJudgeResponse(responseText) {
  if (typeof responseText !== 'string' || responseText.trim() === '') {
    throw new Error('Could not parse judge response as JSON');
  }

  const candidates = [];
  for (const fence of responseText.matchAll(/```(?:json)?\s*([\s\S]*?)```/g)) {
    candidates.push(fence[1]);
  }
  const braceSpan = responseText.match(/\{[\s\S]*\}/);
  if (braceSpan) {
    candidates.push(braceSpan[0]);
  }

  let lastError = null;
  for (const candidate of candidates) {
    if (candidate.trim() === '') continue;
    try {
      const parsed = JSON.parse(candidate);
      // Bare scalars (e.g. a fenced "5") are not a usable evaluation.
      if (parsed !== null && typeof parsed === 'object') {
        return parsed;
      }
    } catch (err) {
      lastError = err;
    }
  }

  const detail = lastError ? `: ${lastError.message}` : '';
  throw new Error(`Could not parse judge response as JSON${detail}`);
}
|
|
520
|
+
|
|
521
|
+
// Judge dimension keys that are stored under a different (shorter) name.
const JUDGE_DIMENSION_ALIASES = new Map([
  ['pedagogical_soundness', 'pedagogical'],
]);

/**
 * Turn a judge-reported score into a number when it is a numeric string
 * (e.g. "4"); every other value is returned unchanged.
 */
function coerceJudgeScore(raw) {
  if (typeof raw === 'string' && raw.trim() !== '' && Number.isFinite(Number(raw))) {
    return Number(raw);
  }
  return raw;
}

/**
 * Normalise the judge's `scores` object into
 * `{ [dimension]: { score, reasoning, quote } }`.
 * Accepts both `{score, reasoning, quote}` entries and bare scores; entries of
 * any other shape are dropped.
 */
function normalizeJudgeScores(rawScores) {
  const normalized = {};
  for (const [key, value] of Object.entries(rawScores || {})) {
    // A Map lookup cannot be fooled by keys such as "constructor".
    const dimension = JUDGE_DIMENSION_ALIASES.get(key) ?? key;
    if (typeof value === 'object' && value !== null) {
      normalized[dimension] = {
        score: coerceJudgeScore(value.score),
        reasoning: value.reasoning,
        quote: value.quote || null,
      };
      continue;
    }
    const bare = coerceJudgeScore(value);
    if (typeof bare === 'number') {
      normalized[dimension] = { score: bare, reasoning: null, quote: null };
    }
  }
  return normalized;
}

/**
 * Evaluate a single suggestion against the rubric
 *
 * Asks the primary judge first; when it answers with an empty reply the
 * fallback judge (if any) is asked instead. This function never rejects:
 * every failure, including a failure to resolve the judge configuration, is
 * reported as `{ success: false, error }`.
 *
 * @param {Object} suggestion - The suggestion to evaluate
 * @param {Object} scenario - The test scenario
 * @param {Object} context - Additional context
 * @returns {Promise<Object>} Evaluation result. On success: `scores`,
 *   `overallScore` (recomputed from the dimension scores when that yields a
 *   positive value, otherwise the judge's own `overall_score`), the
 *   required/forbidden validation fields and `summary`. Always:
 *   `evaluatorModel` ("provider/model" of the judge that was used last, or
 *   "unknown" if none could be resolved) and `evaluationTimeMs`.
 */
export async function evaluateSuggestion(suggestion, scenario, context = {}) {
  const startTime = Date.now();
  let evaluatorModel = 'unknown';

  try {
    const evaluator = getAvailableEvaluator();
    evaluatorModel = `${evaluator.provider}/${evaluator.model}`;

    const prompt = buildEvaluationPrompt(suggestion, scenario, context);
    let responseText = await callJudgeModel(prompt);

    // Log raw response for debugging
    debugLog('[rubricEvaluator] Judge raw response (first 300 chars):', (responseText || '').slice(0, 300));

    // Handle empty response - try fallback model
    if (!responseText || responseText.trim() === '') {
      console.warn('[rubricEvaluator] Primary judge returned empty response, trying fallback...');
      const fallbackConfig = getFallbackEvaluator();
      if (fallbackConfig) {
        // From here on the result is attributable to the fallback judge.
        evaluatorModel = `${fallbackConfig.provider}/${fallbackConfig.model}`;
        responseText = await callJudgeModelWithConfig(prompt, fallbackConfig);
        debugLog('[rubricEvaluator] Fallback response (first 300 chars):', (responseText || '').slice(0, 300));
      }
      if (!responseText || responseText.trim() === '') {
        throw new Error('Judge model returned empty response (primary and fallback)');
      }
    }

    const parsed = parseJudgeResponse(responseText);

    debugLog('[rubricEvaluator] Parsed keys:', Object.keys(parsed));
    if (parsed.scores) {
      debugLog('[rubricEvaluator] Score keys:', Object.keys(parsed.scores));
    }

    if (!parsed.scores || Object.keys(parsed.scores).length === 0) {
      console.warn('[rubricEvaluator] Warning: Judge response missing dimension scores');
      console.warn('[rubricEvaluator] Full parsed response:', JSON.stringify(parsed, null, 2).slice(0, 800));
    }

    const scores = normalizeJudgeScores(parsed.scores);

    // Prefer a score computed from the dimensions; keep the judge's own
    // figure only when nothing usable could be computed.
    let overallScore = parsed.overall_score;
    if (Object.keys(scores).length > 0) {
      const calculatedScore = calculateOverallScore(scores);
      if (calculatedScore > 0) {
        overallScore = calculatedScore;
      }
    }

    return {
      success: true,
      scores,
      overallScore,
      passesRequired: parsed.validation?.passes_required ?? true,
      passesForbidden: parsed.validation?.passes_forbidden ?? true,
      requiredMissing: parsed.validation?.required_missing || [],
      forbiddenFound: parsed.validation?.forbidden_found || [],
      summary: parsed.summary,
      evaluatorModel,
      evaluationTimeMs: Date.now() - startTime,
    };
  } catch (error) {
    return {
      success: false,
      error: error.message,
      evaluatorModel,
      evaluationTimeMs: Date.now() - startTime,
    };
  }
}
|
|
627
|
+
|
|
628
|
+
/**
 * Evaluate multiple suggestions (batch)
 *
 * Suggestions are evaluated one at a time, in order. Aggregates are computed
 * over every successful evaluation; a failure of one suggestion (including
 * the first) does not discard the others.
 *
 * @param {Array<Object>} suggestions - Suggestions to evaluate.
 * @param {Object} scenario - The test scenario shared by all suggestions.
 * @param {Object} [context] - Additional context passed to each evaluation.
 * @returns {Promise<Object>} `individualResults` (one entry per suggestion),
 *   `aggregateScores` (mean score per dimension: the six core dimensions
 *   first, then any further dimensions the judge reported; dimensions without
 *   a numeric score are omitted), `aggregateOverall` (mean overall score, 0
 *   when none is numeric), and `allPassRequired` / `allPassForbidden`, which
 *   are true only if every suggestion was evaluated and passed.
 */
export async function evaluateSuggestions(suggestions, scenario, context = {}) {
  const results = [];

  // Sequential on purpose: the original implementation issued one judge call
  // at a time, and callers may rely on that to stay within rate limits.
  for (const suggestion of suggestions ?? []) {
    results.push(await evaluateSuggestion(suggestion, scenario, context));
  }

  const successful = results.filter(r => r.success);
  if (successful.length === 0) {
    return {
      individualResults: results,
      aggregateScores: {},
      aggregateOverall: 0,
      allPassRequired: false,
      allPassForbidden: false,
    };
  }

  const mean = (values) => values.reduce((a, b) => a + b, 0) / values.length;
  const isNumber = (value) => typeof value === 'number' && Number.isFinite(value);

  // Core dimensions keep their historical order; extra dimensions reported by
  // the judge (e.g. the recognition dimensions) follow in first-seen order.
  const dimensions = new Set(['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone']);
  for (const result of successful) {
    for (const key of Object.keys(result.scores || {})) {
      dimensions.add(key);
    }
  }

  const avgScores = {};
  for (const dim of dimensions) {
    const values = successful
      .map(r => r.scores?.[dim]?.score)
      .filter(isNumber);

    if (values.length > 0) {
      avgScores[dim] = mean(values);
    }
  }

  const overallScores = successful.map(r => r.overallScore).filter(isNumber);

  return {
    individualResults: results,
    aggregateScores: avgScores,
    aggregateOverall: overallScores.length > 0 ? mean(overallScores) : 0,
    allPassRequired: results.every(r => r.passesRequired),
    allPassForbidden: results.every(r => r.passesForbidden),
  };
}
|
|
676
|
+
|
|
677
|
+
/**
 * Quick validation without AI (rule-based checks only)
 *
 * All matching is case-insensitive substring matching.
 * - Required elements may appear anywhere in the suggestion: in its JSON
 *   serialisation (which covers nested and internal fields such as
 *   `reasoning`) or directly in `actionTarget`, `title` or `message`.
 * - Forbidden elements are only looked for in the user-facing `title` and
 *   `message`; internal fields may legitimately echo context terms.
 *
 * Malformed input is tolerated: a missing suggestion or scenario and
 * non-string fields or elements do not throw.
 *
 * @param {Object} suggestion - The suggestion to validate
 * @param {Object} scenario - The test scenario (`requiredElements`,
 *   `forbiddenElements`)
 * @returns {Object} Validation result: `passesRequired`, `passesForbidden`,
 *   `requiredMissing` and `forbiddenFound` (elements as given in the scenario)
 */
export function quickValidate(suggestion, scenario) {
  // Lower-cased text of a field; objects contribute nothing here because the
  // JSON serialisation below already covers their contents.
  const fieldText = (value) => {
    if (value == null || typeof value === 'object') return '';
    return String(value).toLowerCase();
  };

  const title = fieldText(suggestion?.title);
  const message = fieldText(suggestion?.message);
  const actionTarget = fieldText(suggestion?.actionTarget);

  // JSON.stringify(undefined) yields undefined, hence the fallback.
  const fullSuggestionText = (JSON.stringify(suggestion) ?? '').toLowerCase();

  // The raw fields are searched in addition to the JSON text because JSON
  // escaping (quotes, newlines) can hide an otherwise literal match.
  const requiredHaystacks = [fullSuggestionText, actionTarget, title, message];
  const userFacingText = `${title} ${message}`;

  const result = {
    passesRequired: true,
    passesForbidden: true,
    requiredMissing: [],
    forbiddenFound: [],
  };

  for (const required of scenario?.requiredElements || []) {
    const needle = String(required).toLowerCase();
    if (!requiredHaystacks.some(text => text.includes(needle))) {
      result.passesRequired = false;
      result.requiredMissing.push(required);
    }
  }

  for (const forbidden of scenario?.forbiddenElements || []) {
    const needle = String(forbidden).toLowerCase();
    if (userFacingText.includes(needle)) {
      result.passesForbidden = false;
      result.forbiddenFound.push(forbidden);
    }
  }

  return result;
}
|
|
728
|
+
|
|
729
|
+
/**
 * Calculate weighted overall score from dimension scores.
 *
 * Weights come from the rubric returned by `tutorApi.loadRubric()`: each entry
 * of `rubric.dimensions` contributes its `weight` when a finite numeric score
 * is available for it. A score may be given either as a bare number or as an
 * object with a numeric `score` property. Dimensions without a usable score
 * are left out of both the weighted sum and the total weight.
 *
 * @param {Object} scores - Dimension scores keyed by rubric key or by its
 *   normalized key (e.g. `pedagogical` for `pedagogical_soundness`)
 * @returns {number} Weighted average converted from the 1-5 scale to 0-100,
 *   or 0 when no weighted dimension has a usable score
 */
export function calculateOverallScore(scores) {
  const rubric = tutorApi.loadRubric();
  const dimensions = rubric?.dimensions || {};
  const providedScores = scores ?? {};

  // Map rubric keys to normalized score keys (pedagogical_soundness -> pedagogical)
  const keyMap = {
    pedagogical_soundness: 'pedagogical',
  };

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dimensions)) {
    // Try the normalized key first, then the rubric key. `??` (not `||`) so a
    // legitimate score of 0 under the normalized key is not skipped.
    const normalizedKey = keyMap[key] || key;
    const scoreData = providedScores[normalizedKey] ?? providedScores[key];
    const score = scoreData?.score ?? scoreData;

    // Number.isFinite rejects non-numbers as well as NaN/Infinity, which would
    // otherwise poison the weighted average.
    if (!Number.isFinite(score)) continue;

    // Coerce so a weight given as a numeric string adds arithmetically instead
    // of concatenating; missing or invalid weights count as 0.
    const weight = Number(dim?.weight) || 0;
    weightedSum += score * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) return 0;

  // Convert 1-5 scale to 0-100
  const avgScore = weightedSum / totalWeight;
  return ((avgScore - 1) / 4) * 100;
}
|
|
762
|
+
|
|
763
|
+
/**
 * Calculate recognition-specific metrics from scores.
 * These metrics track the quality of mutual recognition between tutor and learner.
 *
 * Only the four recognition dimensions are read from `scores`
 * (mutual_recognition, dialectical_responsiveness, memory_integration,
 * transformative_potential). Each may be a bare number or an object with a
 * numeric `score` property; anything else (including NaN) is ignored.
 *
 * @param {Object} scores - Scores object from evaluation
 * @returns {{recognitionScore: number, transformationRate: boolean,
 *   memoryUtilization: boolean, mutualAcknowledgment: boolean,
 *   dimensionScores: Object, hasRecognitionData: boolean}} Recognition
 *   metrics; `recognitionScore` is the mean of the scored recognition
 *   dimensions, and `hasRecognitionData` is true when at least one was scored
 */
export function calculateRecognitionMetrics(scores) {
  const recognitionDimensions = [
    'mutual_recognition',
    'dialectical_responsiveness',
    'memory_integration',
    'transformative_potential',
  ];

  // Dimension -> boolean metric that flips to true once the score reaches `min`.
  const thresholdFlags = {
    transformative_potential: { flag: 'transformationRate', min: 4 },
    memory_integration: { flag: 'memoryUtilization', min: 3 },
    mutual_recognition: { flag: 'mutualAcknowledgment', min: 4 },
  };

  const metrics = {
    recognitionScore: 0,
    transformationRate: false,
    memoryUtilization: false,
    mutualAcknowledgment: false,
    dimensionScores: {},
    hasRecognitionData: false,
  };

  // A missing scores object is treated as "no recognition data" rather than throwing.
  const providedScores = scores ?? {};

  let totalScore = 0;
  let scoredCount = 0;

  for (const dim of recognitionDimensions) {
    const scoreData = providedScores[dim];
    const score = scoreData?.score ?? scoreData;

    // Skip non-numbers and NaN/Infinity so they cannot corrupt the average.
    if (!Number.isFinite(score)) continue;

    metrics.dimensionScores[dim] = score;
    totalScore += score;
    scoredCount++;

    // Track specific thresholds
    const threshold = thresholdFlags[dim];
    if (threshold && score >= threshold.min) {
      metrics[threshold.flag] = true;
    }
  }

  if (scoredCount > 0) {
    metrics.recognitionScore = totalScore / scoredCount;
    metrics.hasRecognitionData = true;
  }

  return metrics;
}
|
|
819
|
+
|
|
820
|
+
// Default export: the module's evaluation helpers bundled into one object,
// for callers that import the module as a whole rather than by name.
const rubricEvaluatorApi = {
  evaluateSuggestion,
  evaluateSuggestions,
  quickValidate,
  calculateOverallScore,
  calculateRecognitionMetrics,
};

export default rubricEvaluatorApi;
|