@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,826 @@
1
+ /**
2
+ * Rubric Evaluator Service
3
+ *
4
+ * Uses AI to evaluate tutor suggestions against the pedagogical rubric.
5
+ * Judge model configuration is loaded from config/evaluation-rubric.yaml
6
+ * Provider details are resolved from config/providers.yaml
7
+ */
8
+
9
+ import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';
10
+
11
// Debug logging helper - suppressed in transcript mode for clean output
function debugLog(...args) {
  const transcriptMode = process.env.TUTOR_TRANSCRIPT === 'true';
  if (!transcriptMode) {
    console.log(...args);
  }
}
17
+
18
/**
 * Get available evaluator configuration, resolving model references via providers.yaml.
 * Tries the primary judge model first, then the configured fallback. When neither
 * resolves to a configured provider, returns the primary reference anyway so the
 * eventual API call fails with a helpful error instead of crashing here.
 *
 * @returns {{provider: string, model: string, apiKey?: string, baseUrl?: string, hyperparameters: Object}}
 */
function getAvailableEvaluator() {
  const rubric = tutorApi.loadRubric();
  // Prefer 'judge' config, fall back to legacy 'evaluator' for backwards compatibility
  const evalConfig = rubric?.judge || rubric?.evaluator;

  if (!evalConfig?.model) {
    console.warn('[rubricEvaluator] No judge config in evaluation-rubric.yaml, using defaults');
    return {
      provider: 'openrouter',
      model: 'deepseek/deepseek-chat-v3-0324',
      hyperparameters: { temperature: 0.2, max_tokens: 4000 },
    };
  }

  // Try primary model
  try {
    const resolved = configLoader.resolveModel(evalConfig.model);
    if (resolved.isConfigured) {
      return {
        provider: resolved.provider,
        model: resolved.model,
        apiKey: resolved.apiKey,
        baseUrl: resolved.baseUrl,
        hyperparameters: evalConfig.hyperparameters || {},
      };
    }
  } catch (e) {
    console.warn(`[rubricEvaluator] Failed to resolve primary evaluator: ${e.message}`);
  }

  // Try fallback
  if (evalConfig.fallback?.model) {
    try {
      const fallback = configLoader.resolveModel(evalConfig.fallback.model);
      if (fallback.isConfigured) {
        debugLog(`[rubricEvaluator] Using fallback evaluator: ${fallback.provider}/${fallback.model}`);
        return {
          provider: fallback.provider,
          model: fallback.model,
          apiKey: fallback.apiKey,
          baseUrl: fallback.baseUrl,
          hyperparameters: evalConfig.fallback.hyperparameters || evalConfig.hyperparameters || {},
        };
      }
    } catch (e) {
      console.warn(`[rubricEvaluator] Failed to resolve fallback evaluator: ${e.message}`);
    }
  }

  // Return primary anyway - the downstream API call will fail with a helpful error.
  // BUG FIX: the original re-resolved the primary model UNGUARDED here; if the
  // resolution in the try-block above had thrown, this call threw again and the
  // function crashed instead of returning a config as intended.
  try {
    const resolved = configLoader.resolveModel(evalConfig.model);
    return {
      provider: resolved.provider,
      model: resolved.model,
      hyperparameters: evalConfig.hyperparameters || {},
    };
  } catch (e) {
    // Resolution itself failed: surface the raw reference so later error messages stay useful.
    return {
      provider: 'unknown',
      model: String(evalConfig.model),
      hyperparameters: evalConfig.hyperparameters || {},
    };
  }
}
79
+
80
/**
 * Get the fallback evaluator config (if different from primary).
 * Returns null when no fallback is declared, when resolution throws,
 * or when the resolved provider is not configured.
 */
function getFallbackEvaluator() {
  const rubric = tutorApi.loadRubric();
  // Prefer 'judge' config, fall back to legacy 'evaluator'
  const evalConfig = rubric?.judge || rubric?.evaluator;
  const fallbackModel = evalConfig?.fallback?.model;

  if (!fallbackModel) return null;

  let resolved;
  try {
    resolved = configLoader.resolveModel(fallbackModel);
  } catch (e) {
    console.warn(`[rubricEvaluator] Failed to resolve fallback: ${e.message}`);
    return null;
  }

  if (!resolved.isConfigured) return null;

  return {
    provider: resolved.provider,
    model: resolved.model,
    apiKey: resolved.apiKey,
    baseUrl: resolved.baseUrl,
    // Fallback-specific hyperparameters win; otherwise inherit the primary's.
    hyperparameters: evalConfig.fallback.hyperparameters || evalConfig.hyperparameters || {},
  };
}
106
+
107
/**
 * Call a judge model using an explicit evaluator config (used for the fallback judge).
 *
 * @param {string} prompt - Full evaluation prompt for the judge.
 * @param {Object} config - Evaluator config: { provider, model, apiKey?, hyperparameters? }.
 * @returns {Promise<string>} Raw completion text ('' when the API returns no content).
 * @throws {Error} On missing API key, HTTP error, 60s timeout, unparseable body, or unsupported provider.
 */
async function callJudgeModelWithConfig(prompt, config) {
  const { provider, model, hyperparameters } = config;
  const temperature = hyperparameters?.temperature ?? 0.2;
  const maxTokens = hyperparameters?.max_tokens ?? 1500;

  debugLog(`[rubricEvaluator] Calling fallback judge: ${provider}/${model}`);

  // Shared POST helper: JSON request with a 60s abort timeout and consistent
  // error reporting. `label` is the provider name used in error messages.
  const postJson = async (url, headers, body, label) => {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 60000); // 60 second timeout
    try {
      const res = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', ...headers },
        body: JSON.stringify(body),
        signal: controller.signal,
      });

      if (!res.ok) {
        const errorBody = await res.text().catch(() => '');
        throw new Error(`${label} API error: ${res.status} - ${errorBody.slice(0, 200)}`);
      }

      return await res.json().catch(err => {
        throw new Error(`Failed to parse ${label} response: ${err.message}`);
      });
    } catch (err) {
      if (err.name === 'AbortError') {
        throw new Error(`${label} API request timed out after 60s`);
      }
      throw err;
    } finally {
      clearTimeout(timeout);
    }
  };

  // Wrap in try-catch so every failure is logged before propagating.
  try {
    if (provider === 'openrouter') {
      // BUG FIX: honor the apiKey resolved from providers.yaml (carried in
      // config.apiKey by getFallbackEvaluator) before falling back to the
      // environment - the original ignored config.apiKey entirely.
      const apiKey = config.apiKey ?? process.env.OPENROUTER_API_KEY;
      if (!apiKey) throw new Error('OPENROUTER_API_KEY not set');

      const data = await postJson(
        'https://openrouter.ai/api/v1/chat/completions',
        { 'Authorization': `Bearer ${apiKey}` },
        {
          model,
          max_tokens: maxTokens,
          temperature,
          messages: [{ role: 'user', content: prompt }],
        },
        'OpenRouter'
      );
      return data.choices?.[0]?.message?.content || '';
    }

    if (provider === 'gemini') {
      const apiKey = config.apiKey ?? process.env.GEMINI_API_KEY;
      if (!apiKey) throw new Error('GEMINI_API_KEY not set');

      const data = await postJson(
        `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`,
        {},
        {
          contents: [{ parts: [{ text: prompt }] }],
          generationConfig: { temperature, maxOutputTokens: maxTokens },
        },
        'Gemini'
      );
      return data.candidates?.[0]?.content?.parts?.[0]?.text || '';
    }

    throw new Error(`Unsupported fallback provider: ${provider}`);
  } catch (error) {
    // Log the error before re-throwing to help debugging
    console.error(`[rubricEvaluator] Fallback judge error: ${error.message}`);
    throw error;
  }
}
217
+
218
/**
 * Build the evaluation prompt for the judge model.
 *
 * Renders the rubric dimensions (with weights and per-score criteria) loaded
 * from evaluation-rubric.yaml, the scenario context, the suggestion as JSON,
 * the required/forbidden element checklists, and an exact JSON response
 * template the judge must follow (parsed later by parseJudgeResponse).
 *
 * @param {Object} suggestion - The tutor suggestion being evaluated (serialized verbatim into the prompt)
 * @param {Object} scenario - Test scenario: name, description, expectedBehavior,
 *   optional learnerContext, requiredElements, forbiddenElements
 * @param {Object} context - Additional context; context.learnerContext is used
 *   when the scenario does not provide one
 * @returns {string} The complete prompt text for the judge model
 */
function buildEvaluationPrompt(suggestion, scenario, context) {
  const rubric = tutorApi.loadRubric();
  const dimensions = rubric?.dimensions || {};

  // Build dimension criteria text
  // Each dimension becomes a "**Name** (weight: N%)" section listing its 1-5 criteria.
  const dimensionCriteria = Object.entries(dimensions).map(([key, dim]) => {
    const criteriaText = Object.entries(dim.criteria || {})
      .map(([score, desc]) => ` ${score}: ${desc}`)
      .join('\n');
    return `**${dim.name}** (weight: ${(dim.weight * 100).toFixed(0)}%)
${dim.description}
Criteria:
${criteriaText}`;
  }).join('\n\n');

  // NOTE: the template below is runtime text sent to the judge verbatim -
  // whitespace and wording changes alter the prompt, so edit with care.
  return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.

## EVALUATION RUBRIC

Score each dimension from 1-5:
- 1: Completely fails this criterion
- 2: Weak, significant issues
- 3: Adequate, meets basic expectations
- 4: Good, exceeds expectations
- 5: Excellent, exemplary

${dimensionCriteria}

## SCENARIO CONTEXT

**Scenario**: ${scenario.name}
**Description**: ${scenario.description}
**Expected Behavior**: ${scenario.expectedBehavior}

**Learner Context**:
${scenario.learnerContext || context.learnerContext || 'No context provided'}

## SUGGESTION TO EVALUATE

\`\`\`json
${JSON.stringify(suggestion, null, 2)}
\`\`\`

## VALIDATION REQUIREMENTS

Required elements (must include):
${(scenario.requiredElements || []).map(e => `- ${e}`).join('\n') || '- None specified'}

Forbidden elements (must NOT include):
${(scenario.forbiddenElements || []).map(e => `- ${e}`).join('\n') || '- None specified'}

## YOUR TASK

Evaluate the suggestion and provide:
1. A score (1-5) for each dimension with reasoning AND a direct quote from the suggestion that supports your assessment
2. Whether it passes the required/forbidden element checks
3. An overall score (weighted average, 0-100 scale)

For each dimension, include:
- **score**: 1-5 rating
- **reasoning**: Brief explanation of why this score was given
- **quote**: A short direct quote from the suggestion (title, message, or actionTarget) that exemplifies this dimension's score. Use "N/A" if no relevant quote exists.

Respond with ONLY a JSON object in this exact format:
\`\`\`json
{
  "scores": {
    "relevance": {"score": 4, "reasoning": "Matches learner's idle state", "quote": "Take your time with this concept"},
    "specificity": {"score": 5, "reasoning": "Names exact lecture", "quote": "479-lecture-3"},
    "pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding", "quote": "Start with the basics before..."},
    "personalization": {"score": 3, "reasoning": "Generic advice", "quote": "N/A"},
    "actionability": {"score": 5, "reasoning": "Clear next step", "quote": "Click to continue to..."},
    "tone": {"score": 4, "reasoning": "Encouraging", "quote": "You're making great progress"},
    "mutual_recognition": {"score": 4, "reasoning": "Acknowledges learner's interpretation", "quote": "Your metaphor captures..."},
    "dialectical_responsiveness": {"score": 3, "reasoning": "Responds but doesn't create tension", "quote": "N/A"},
    "memory_integration": {"score": 4, "reasoning": "References previous session", "quote": "Building on your insight..."},
    "transformative_potential": {"score": 3, "reasoning": "Informative but not transformative", "quote": "N/A"}
  },
  "validation": {
    "passes_required": true,
    "required_missing": [],
    "passes_forbidden": true,
    "forbidden_found": []
  },
  "overall_score": 82,
  "summary": "Brief overall assessment"
}
\`\`\``;
}
310
+
311
/**
 * Call the configured judge model (primary evaluator) with the given prompt.
 * Resolves the evaluator via getAvailableEvaluator() and dispatches to the
 * matching provider API: anthropic, openrouter, openai, or gemini.
 *
 * @param {string} prompt - Full evaluation prompt for the judge.
 * @returns {Promise<string>} Raw completion text ('' when the API returns no content).
 * @throws {Error} On missing API key, HTTP error, 60s timeout, unparseable body, or unsupported provider.
 */
async function callJudgeModel(prompt) {
  const evaluator = getAvailableEvaluator();
  const { provider, model, hyperparameters } = evaluator;
  const temperature = hyperparameters?.temperature ?? 0.2;
  const maxTokens = hyperparameters?.max_tokens ?? 1500;

  // Shared POST helper: JSON request with a 60s abort timeout and consistent
  // error reporting. `label` is the provider name used in error messages.
  // DEDUPLICATION: the original repeated this fetch/timeout/parse scaffolding
  // in all four provider branches (~90 duplicated lines).
  const postJson = async (url, headers, body, label) => {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 60000); // 60 second timeout
    try {
      const res = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', ...headers },
        body: JSON.stringify(body),
        signal: controller.signal,
      });

      if (!res.ok) {
        const errorBody = await res.text().catch(() => '');
        throw new Error(`${label} API error: ${res.status} - ${errorBody.slice(0, 200)}`);
      }

      return await res.json().catch(err => {
        throw new Error(`Failed to parse ${label} response: ${err.message}`);
      });
    } catch (err) {
      if (err.name === 'AbortError') {
        throw new Error(`${label} API request timed out after 60s`);
      }
      throw err;
    } finally {
      clearTimeout(timeout);
    }
  };

  // BUG FIX: prefer the apiKey resolved from providers.yaml (evaluator.apiKey)
  // before the environment variable - the original ignored evaluator.apiKey.
  const resolveKey = (envVar) => {
    const apiKey = evaluator.apiKey ?? process.env[envVar];
    if (!apiKey) throw new Error(`${envVar} not set`);
    return apiKey;
  };

  // Chat-completions-style request body shared by OpenRouter/OpenAI/Anthropic.
  const chatBody = {
    model,
    max_tokens: maxTokens,
    temperature,
    messages: [{ role: 'user', content: prompt }],
  };

  if (provider === 'anthropic') {
    const apiKey = resolveKey('ANTHROPIC_API_KEY');
    const data = await postJson(
      'https://api.anthropic.com/v1/messages',
      { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
      chatBody,
      'Anthropic'
    );
    return data.content?.[0]?.text || '';
  }

  if (provider === 'openrouter') {
    const apiKey = resolveKey('OPENROUTER_API_KEY');
    const data = await postJson(
      'https://openrouter.ai/api/v1/chat/completions',
      { 'Authorization': `Bearer ${apiKey}` },
      chatBody,
      'OpenRouter'
    );
    return data.choices?.[0]?.message?.content || '';
  }

  if (provider === 'openai') {
    const apiKey = resolveKey('OPENAI_API_KEY');
    const data = await postJson(
      'https://api.openai.com/v1/chat/completions',
      { 'Authorization': `Bearer ${apiKey}` },
      chatBody,
      'OpenAI'
    );
    return data.choices?.[0]?.message?.content || '';
  }

  if (provider === 'gemini') {
    const apiKey = resolveKey('GEMINI_API_KEY');
    const data = await postJson(
      `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`,
      {},
      {
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature, maxOutputTokens: maxTokens },
      },
      'Gemini'
    );
    return data.candidates?.[0]?.content?.parts?.[0]?.text || '';
  }

  throw new Error(`Unsupported judge provider: ${provider}`);
}
504
+
505
/**
 * Parse the judge model's JSON response.
 *
 * Accepts either a markdown-fenced ```json block or a bare JSON object
 * embedded in surrounding prose.
 *
 * @param {string} responseText - Raw text returned by the judge model.
 * @returns {Object} The parsed JSON verdict.
 * @throws {Error} When no JSON candidate is found or none parses.
 */
function parseJudgeResponse(responseText) {
  // Prefer a fenced block; fall back to the first-to-last brace span.
  const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
  const braceMatch = responseText.match(/\{[\s\S]*\}/);

  const candidates = [];
  if (fenceMatch) candidates.push(fenceMatch[1]);
  if (braceMatch) candidates.push(braceMatch[0]);

  if (candidates.length === 0) {
    throw new Error('Could not parse judge response as JSON');
  }

  // ROBUSTNESS FIX: the original committed to the fence candidate and let a raw
  // SyntaxError escape when it was malformed, even if a valid bare JSON object
  // was also present. Try each candidate in order, keeping the last error.
  let lastError;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate.trim());
    } catch (err) {
      lastError = err;
    }
  }
  throw new Error(`Could not parse judge response as JSON: ${lastError.message}`);
}
520
+
521
/**
 * Evaluate a single suggestion against the rubric.
 *
 * Pipeline: build the judge prompt, call the primary judge model, retry once
 * with the configured fallback judge if the response is empty, parse the JSON
 * verdict, normalize dimension keys, and recompute the weighted overall score
 * from the dimension scores when possible.
 *
 * @param {Object} suggestion - The suggestion to evaluate
 * @param {Object} scenario - The test scenario
 * @param {Object} context - Additional context
 * @returns {Promise<Object>} Evaluation result; never throws - failures come
 *   back as { success: false, error, evaluatorModel, evaluationTimeMs }
 */
export async function evaluateSuggestion(suggestion, scenario, context = {}) {
  const startTime = Date.now();
  const evaluator = getAvailableEvaluator();

  try {
    const prompt = buildEvaluationPrompt(suggestion, scenario, context);
    let responseText = await callJudgeModel(prompt);

    // Log raw response for debugging
    debugLog('[rubricEvaluator] Judge raw response (first 300 chars):', responseText.slice(0, 300));

    // Handle empty response - try fallback model
    if (!responseText || responseText.trim() === '') {
      console.warn('[rubricEvaluator] Primary judge returned empty response, trying fallback...');
      const fallbackConfig = getFallbackEvaluator();
      if (fallbackConfig) {
        responseText = await callJudgeModelWithConfig(prompt, fallbackConfig);
        debugLog('[rubricEvaluator] Fallback response (first 300 chars):', responseText.slice(0, 300));
      }
      if (!responseText || responseText.trim() === '') {
        throw new Error('Judge model returned empty response (primary and fallback)');
      }
    }

    const parsed = parseJudgeResponse(responseText);

    // Debug: log what was parsed
    debugLog('[rubricEvaluator] Parsed keys:', Object.keys(parsed));
    if (parsed.scores) {
      debugLog('[rubricEvaluator] Score keys:', Object.keys(parsed.scores));
    }

    // Warning if scores are missing
    if (!parsed.scores || Object.keys(parsed.scores).length === 0) {
      console.warn('[rubricEvaluator] Warning: Judge response missing dimension scores');
      console.warn('[rubricEvaluator] Full parsed response:', JSON.stringify(parsed, null, 2).slice(0, 800));
    }

    // Normalize dimension keys: the judge may emit rubric-style names
    // (pedagogical_soundness) or short names (pedagogical); unknown keys
    // (e.g. recognition dimensions) pass through unchanged.
    const scores = {};
    const dimensionMap = {
      relevance: 'relevance',
      specificity: 'specificity',
      pedagogical_soundness: 'pedagogical',
      pedagogical: 'pedagogical',
      personalization: 'personalization',
      actionability: 'actionability',
      tone: 'tone',
    };

    for (const [key, value] of Object.entries(parsed.scores || {})) {
      const normalizedKey = dimensionMap[key] || key;
      // Handle both {score, reasoning, quote} objects and plain numbers
      if (typeof value === 'object' && value !== null) {
        scores[normalizedKey] = {
          score: value.score,
          reasoning: value.reasoning,
          quote: value.quote || null,
        };
      } else if (typeof value === 'number') {
        scores[normalizedKey] = {
          score: value,
          reasoning: null,
          quote: null,
        };
      }
      // Anything else (strings, null) is silently dropped.
    }

    // Calculate overall score from dimension scores if available, otherwise use judge's score
    // NOTE(review): a recomputed score of exactly 0 (all dimensions scored 1)
    // is discarded in favor of the judge's self-reported overall_score here -
    // confirm that is intended rather than an edge-case bug.
    let overallScore = parsed.overall_score;
    if (Object.keys(scores).length > 0) {
      const calculatedScore = calculateOverallScore(scores);
      if (calculatedScore > 0) {
        overallScore = calculatedScore;
      }
    }

    return {
      success: true,
      scores,
      overallScore,
      // Validation defaults are permissive: missing judge validation counts as a pass.
      passesRequired: parsed.validation?.passes_required ?? true,
      passesForbidden: parsed.validation?.passes_forbidden ?? true,
      requiredMissing: parsed.validation?.required_missing || [],
      forbiddenFound: parsed.validation?.forbidden_found || [],
      summary: parsed.summary,
      evaluatorModel: `${evaluator.provider}/${evaluator.model}`,
      evaluationTimeMs: Date.now() - startTime,
    };
  } catch (error) {
    // Surface failure as a result object rather than throwing, so batch runs continue.
    return {
      success: false,
      error: error.message,
      evaluatorModel: `${evaluator.provider}/${evaluator.model}`,
      evaluationTimeMs: Date.now() - startTime,
    };
  }
}
627
+
628
/**
 * Evaluate multiple suggestions (batch).
 *
 * Runs evaluations sequentially (deliberately - judge APIs are rate-limited),
 * then aggregates per-dimension and overall scores across the successful results.
 *
 * @param {Object[]} suggestions - Suggestions to evaluate.
 * @param {Object} scenario - The test scenario shared by all suggestions.
 * @param {Object} context - Additional context passed to each evaluation.
 * @returns {Promise<Object>} { individualResults, aggregateScores, aggregateOverall,
 *   allPassRequired, allPassForbidden }
 */
export async function evaluateSuggestions(suggestions, scenario, context = {}) {
  const results = [];

  for (const suggestion of suggestions) {
    const result = await evaluateSuggestion(suggestion, scenario, context);
    results.push(result);
  }

  const successes = results.filter(r => r.success);

  // BUG FIX: the original only aggregated when the FIRST result succeeded
  // (results[0].success), silently discarding valid scores from every later
  // evaluation whenever the first one happened to fail.
  if (successes.length > 0) {
    const avgScores = {};
    const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];

    for (const dim of dimensions) {
      const dimScores = successes
        .filter(r => r.scores?.[dim])
        .map(r => r.scores[dim].score);

      if (dimScores.length > 0) {
        avgScores[dim] = dimScores.reduce((a, b) => a + b, 0) / dimScores.length;
      }
    }

    const overallScores = successes.map(r => r.overallScore);
    const avgOverall = overallScores.reduce((a, b) => a + b, 0) / overallScores.length;

    return {
      individualResults: results,
      aggregateScores: avgScores,
      aggregateOverall: avgOverall,
      // Pass flags consider ALL results - a failed evaluation (no passesRequired
      // field) makes the batch fail these checks, as in the original.
      allPassRequired: results.every(r => r.passesRequired),
      allPassForbidden: results.every(r => r.passesForbidden),
    };
  }

  // No successful evaluations (or empty input): empty aggregate, failing flags.
  return {
    individualResults: results,
    aggregateScores: {},
    aggregateOverall: 0,
    allPassRequired: false,
    allPassForbidden: false,
  };
}
676
+
677
/**
 * Quick validation without AI (rule-based checks only).
 *
 * Required elements may appear anywhere in the suggestion (serialized JSON,
 * actionTarget, title, or message). Forbidden elements are only flagged when
 * they appear in user-facing text (title, message) - NOT the internal
 * 'reasoning' field, which may legitimately reference context-derived terms.
 *
 * @param {Object} suggestion - The suggestion to validate
 * @param {Object} scenario - The test scenario
 * @returns {Object} Validation result
 */
export function quickValidate(suggestion, scenario) {
  const serialized = JSON.stringify(suggestion).toLowerCase();
  const userFacingText = [suggestion.title || '', suggestion.message || '']
    .join(' ')
    .toLowerCase();

  const requiredMissing = [];
  const forbiddenFound = [];

  // Required elements: search the serialized suggestion plus the key fields directly.
  for (const required of scenario.requiredElements || []) {
    const needle = required.toLowerCase();
    const present =
      serialized.includes(needle) ||
      (suggestion.actionTarget && suggestion.actionTarget.toLowerCase().includes(needle)) ||
      (suggestion.title && suggestion.title.toLowerCase().includes(needle)) ||
      (suggestion.message && suggestion.message.toLowerCase().includes(needle));

    if (!present) requiredMissing.push(required);
  }

  // Forbidden elements: only user-facing text counts.
  for (const forbidden of scenario.forbiddenElements || []) {
    if (userFacingText.includes(forbidden.toLowerCase())) {
      forbiddenFound.push(forbidden);
    }
  }

  return {
    passesRequired: requiredMissing.length === 0,
    passesForbidden: forbiddenFound.length === 0,
    requiredMissing,
    forbiddenFound,
  };
}
728
+
729
/**
 * Calculate the weighted overall score (0-100) from per-dimension scores,
 * using dimension weights loaded from the rubric. Dimension scores arrive on
 * a 1-5 scale, either as bare numbers or {score, reasoning, quote} objects.
 * Returns 0 when no weighted dimension has a numeric score.
 */
export function calculateOverallScore(scores) {
  const rubric = tutorApi.loadRubric();
  const dimensions = rubric?.dimensions || {};

  // Rubric keys may differ from normalized score keys (pedagogical_soundness -> pedagogical).
  const keyMap = {
    pedagogical_soundness: 'pedagogical',
  };

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [rubricKey, dim] of Object.entries(dimensions)) {
    // Look up under the normalized key first, then the raw rubric key.
    const entry = scores[keyMap[rubricKey] || rubricKey] || scores[rubricKey];
    const value = entry?.score ?? entry;

    if (typeof value !== 'number') continue;

    const weight = dim.weight || 0;
    weightedSum += value * weight;
    totalWeight += weight;
  }

  if (totalWeight === 0) return 0;

  // Map the 1-5 weighted average onto a 0-100 scale.
  const avgScore = weightedSum / totalWeight;
  return ((avgScore - 1) / 4) * 100;
}
762
+
763
/**
 * Calculate recognition-specific metrics from scores.
 * These metrics track the quality of mutual recognition between tutor and learner.
 *
 * Boolean flags fire when the corresponding dimension meets its threshold:
 * transformative_potential >= 4, memory_integration >= 3, mutual_recognition >= 4.
 *
 * @param {Object} scores - Scores object from evaluation (numbers or {score} objects)
 * @returns {Object} Recognition metrics
 */
export function calculateRecognitionMetrics(scores) {
  const recognitionDimensions = [
    'mutual_recognition',
    'dialectical_responsiveness',
    'memory_integration',
    'transformative_potential',
  ];

  // dimension -> [metric flag name, minimum score that sets it]
  const thresholdRules = {
    transformative_potential: ['transformationRate', 4],
    memory_integration: ['memoryUtilization', 3],
    mutual_recognition: ['mutualAcknowledgment', 4],
  };

  const metrics = {
    recognitionScore: 0,
    transformationRate: false,
    memoryUtilization: false,
    mutualAcknowledgment: false,
    dimensionScores: {},
    hasRecognitionData: false,
  };

  const observed = [];

  for (const dim of recognitionDimensions) {
    const raw = scores[dim];
    // Accept both {score, ...} objects and bare numbers.
    const value = raw?.score ?? raw;
    if (typeof value !== 'number') continue;

    metrics.dimensionScores[dim] = value;
    observed.push(value);

    const rule = thresholdRules[dim];
    if (rule && value >= rule[1]) {
      metrics[rule[0]] = true;
    }
  }

  if (observed.length > 0) {
    metrics.recognitionScore = observed.reduce((a, b) => a + b, 0) / observed.length;
    metrics.hasRecognitionData = true;
  }

  return metrics;
}
819
+
820
// Default export mirrors the named exports for consumers that import the
// service as a single object (e.g. `import rubricEvaluator from ...`).
export default {
  evaluateSuggestion,
  evaluateSuggestions,
  quickValidate,
  calculateOverallScore,
  calculateRecognitionMetrics,
};