@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -6,7 +6,8 @@
6
6
  * Provider details are resolved from config/providers.yaml
7
7
  */
8
8
 
9
- import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';
9
+ import * as evalConfigLoader from './evalConfigLoader.js';
10
+ import { jsonrepair } from 'jsonrepair';
10
11
 
11
12
  // Debug logging helper - suppressed in transcript mode for clean output
12
13
  function debugLog(...args) {
@@ -16,13 +17,41 @@ function debugLog(...args) {
16
17
  }
17
18
 
18
19
  /**
19
- * Get available evaluator configuration, resolving model references via providers.yaml
20
+ * Get available judge configuration, resolving model references via providers.yaml
20
21
  * Tries primary model first, then fallback if primary is not configured
22
+ *
23
+ * @param {Object} [overrides] - Optional judge override
24
+ * @param {Object} [overrides.judgeOverride] - Override judge model config
25
+ * @param {string} [overrides.judgeOverride.model] - Model reference (e.g. 'anthropic/claude-opus-4.5')
26
+ * @param {string} [overrides.judgeOverride.apiKeyEnv] - Env var name for API key
27
+ * @param {Object} [overrides.judgeOverride.hyperparameters] - Override hyperparameters
21
28
  */
22
- function getAvailableEvaluator() {
23
- const rubric = tutorApi.loadRubric();
24
- // Prefer 'judge' config, fall back to legacy 'evaluator' for backwards compatibility
25
- const evalConfig = rubric?.judge || rubric?.evaluator;
29
+ export function getAvailableJudge(overrides = {}) {
30
+ const { judgeOverride } = overrides;
31
+
32
+ // If a judge override is provided, resolve and return it directly
33
+ if (judgeOverride?.model) {
34
+ try {
35
+ const resolved = evalConfigLoader.resolveModel(judgeOverride.model);
36
+ // Allow apiKeyEnv override
37
+ let apiKey = resolved.apiKey;
38
+ if (judgeOverride.apiKeyEnv) {
39
+ apiKey = process.env[judgeOverride.apiKeyEnv] || apiKey;
40
+ }
41
+ return {
42
+ provider: resolved.provider,
43
+ model: resolved.model,
44
+ apiKey,
45
+ baseUrl: resolved.baseUrl,
46
+ hyperparameters: judgeOverride.hyperparameters || {},
47
+ };
48
+ } catch (e) {
49
+ console.warn(`[rubricEvaluator] Failed to resolve judge override: ${e.message}, falling back to rubric config`);
50
+ }
51
+ }
52
+
53
+ const rubric = evalConfigLoader.loadRubric();
54
+ const evalConfig = rubric?.judge;
26
55
 
27
56
  if (!evalConfig?.model) {
28
57
  console.warn('[rubricEvaluator] No judge config in evaluation-rubric.yaml, using defaults');
@@ -35,7 +64,7 @@ function getAvailableEvaluator() {
35
64
 
36
65
  // Try primary model
37
66
  try {
38
- const resolved = configLoader.resolveModel(evalConfig.model);
67
+ const resolved = evalConfigLoader.resolveModel(evalConfig.model);
39
68
  if (resolved.isConfigured) {
40
69
  return {
41
70
  provider: resolved.provider,
@@ -46,15 +75,15 @@ function getAvailableEvaluator() {
46
75
  };
47
76
  }
48
77
  } catch (e) {
49
- console.warn(`[rubricEvaluator] Failed to resolve primary evaluator: ${e.message}`);
78
+ console.warn(`[rubricEvaluator] Failed to resolve primary judge: ${e.message}`);
50
79
  }
51
80
 
52
81
  // Try fallback
53
82
  if (evalConfig.fallback?.model) {
54
83
  try {
55
- const fallback = configLoader.resolveModel(evalConfig.fallback.model);
84
+ const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
56
85
  if (fallback.isConfigured) {
57
- debugLog(`[rubricEvaluator] Using fallback evaluator: ${fallback.provider}/${fallback.model}`);
86
+ debugLog(`[rubricEvaluator] Using fallback judge: ${fallback.provider}/${fallback.model}`);
58
87
  return {
59
88
  provider: fallback.provider,
60
89
  model: fallback.model,
@@ -64,12 +93,12 @@ function getAvailableEvaluator() {
64
93
  };
65
94
  }
66
95
  } catch (e) {
67
- console.warn(`[rubricEvaluator] Failed to resolve fallback evaluator: ${e.message}`);
96
+ console.warn(`[rubricEvaluator] Failed to resolve fallback judge: ${e.message}`);
68
97
  }
69
98
  }
70
99
 
71
100
  // Return primary anyway - will fail with helpful error
72
- const resolved = configLoader.resolveModel(evalConfig.model);
101
+ const resolved = evalConfigLoader.resolveModel(evalConfig.model);
73
102
  return {
74
103
  provider: resolved.provider,
75
104
  model: resolved.model,
@@ -78,17 +107,16 @@ function getAvailableEvaluator() {
78
107
  }
79
108
 
80
109
  /**
81
- * Get the fallback evaluator config (if different from primary)
110
+ * Get the fallback judge config (if different from primary)
82
111
  */
83
- function getFallbackEvaluator() {
84
- const rubric = tutorApi.loadRubric();
85
- // Prefer 'judge' config, fall back to legacy 'evaluator'
86
- const evalConfig = rubric?.judge || rubric?.evaluator;
112
+ function getFallbackJudge() {
113
+ const rubric = evalConfigLoader.loadRubric();
114
+ const evalConfig = rubric?.judge;
87
115
 
88
116
  if (!evalConfig?.fallback?.model) return null;
89
117
 
90
118
  try {
91
- const fallback = configLoader.resolveModel(evalConfig.fallback.model);
119
+ const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
92
120
  if (fallback.isConfigured) {
93
121
  return {
94
122
  provider: fallback.provider,
@@ -135,6 +163,7 @@ async function callJudgeModelWithConfig(prompt, config) {
135
163
  model,
136
164
  max_tokens: maxTokens,
137
165
  temperature,
166
+ include_reasoning: false,
138
167
  messages: [{ role: 'user', content: prompt }],
139
168
  }),
140
169
  signal: controller.signal,
@@ -215,12 +244,94 @@ async function callJudgeModelWithConfig(prompt, config) {
215
244
  }
216
245
  }
217
246
 
247
+ /**
248
+ * Format a dialogue transcript for the judge prompt.
249
+ * Renders the conversation history and internal deliberation traces as
250
+ * a readable exchange so the judge can evaluate the suggestion in context.
251
+ *
252
+ * @param {Object} dialogueContext - Dialogue context from the evaluation runner
253
+ * @param {Array} dialogueContext.conversationHistory - Array of turn objects
254
+ * @param {Array} dialogueContext.dialogueTrace - Current turn's dialogue trace
255
+ * @param {Array} dialogueContext.consolidatedTrace - Full multi-turn consolidated trace
256
+ * @returns {string|null} Formatted transcript section, or null if no dialogue data
257
+ */
258
+ function formatDialogueTranscript(dialogueContext) {
259
+ if (!dialogueContext) return null;
260
+
261
+ const { conversationHistory, dialogueTrace, consolidatedTrace } = dialogueContext;
262
+
263
+ // Use consolidatedTrace if available (richest source), otherwise fall back to conversationHistory
264
+ const trace = consolidatedTrace?.length > 0 ? consolidatedTrace : null;
265
+ const history = conversationHistory?.length > 0 ? conversationHistory : null;
266
+
267
+ if (!trace && !history) return null;
268
+
269
+ const lines = [];
270
+
271
+ if (trace) {
272
+ // Format from consolidated trace (includes internal deliberation)
273
+ let currentTurnIdx = -1;
274
+ for (const entry of trace) {
275
+ // Turn separator
276
+ if (entry.turnIndex !== undefined && entry.turnIndex !== currentTurnIdx) {
277
+ currentTurnIdx = entry.turnIndex;
278
+ lines.push(`\n--- Turn ${currentTurnIdx} ---`);
279
+ }
280
+
281
+ if (entry.agent === 'user' && entry.action === 'turn_action') {
282
+ lines.push(`[Learner Action] ${entry.detail || entry.contextSummary}`);
283
+ } else if (entry.agent === 'learner_ego') {
284
+ lines.push(` (Learner Ego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
285
+ } else if (entry.agent === 'learner_superego') {
286
+ lines.push(` (Learner Superego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
287
+ } else if (entry.agent === 'learner_synthesis') {
288
+ lines.push(`[Learner] "${truncate(entry.detail || entry.contextSummary, 300)}"`);
289
+ } else if (entry.agent === 'ego' && entry.action === 'initial_draft') {
290
+ lines.push(` (Tutor Ego draft: ${truncate(entry.contextSummary || '', 150)})`);
291
+ } else if (entry.agent === 'superego') {
292
+ lines.push(` (Tutor Superego: ${truncate(entry.contextSummary || '', 150)})`);
293
+ } else if (entry.agent === 'ego' && (entry.action === 'revision' || entry.action === 'final_revision')) {
294
+ lines.push(`[Tutor] (revised after superego feedback)`);
295
+ } else if (entry.agent === 'user' && entry.action === 'final_output') {
296
+ lines.push(`[Tutor → Learner] Delivered ${entry.suggestionCount} suggestion(s)`);
297
+ } else if (entry.agent === 'ego') {
298
+ // Single-agent tutor response
299
+ lines.push(`[Tutor] ${truncate(entry.contextSummary || '', 200)}`);
300
+ }
301
+ }
302
+ } else if (history) {
303
+ // Format from conversation history (less detail, no internal deliberation)
304
+ for (const turn of history) {
305
+ lines.push(`\n--- Turn ${turn.turnIndex} ---`);
306
+ if (turn.learnerMessage) {
307
+ lines.push(`[Learner] "${truncate(turn.learnerMessage, 300)}"`);
308
+ } else if (turn.learnerAction) {
309
+ lines.push(`[Learner Action] ${turn.learnerAction}`);
310
+ }
311
+ if (turn.suggestion) {
312
+ const msg = turn.suggestion.message || turn.suggestion.title || '';
313
+ lines.push(`[Tutor] "${truncate(msg, 300)}"`);
314
+ }
315
+ }
316
+ }
317
+
318
+ return lines.join('\n');
319
+ }
320
+
321
+ /**
322
+ * Truncate a string to maxLen characters, adding ellipsis if needed.
323
+ */
324
+ function truncate(str, maxLen) {
325
+ if (!str) return '';
326
+ if (str.length <= maxLen) return str;
327
+ return str.slice(0, maxLen - 3) + '...';
328
+ }
329
+
218
330
  /**
219
331
  * Build the evaluation prompt for the judge model
220
332
  */
221
333
  function buildEvaluationPrompt(suggestion, scenario, context) {
222
- const rubric = tutorApi.loadRubric();
223
- const dimensions = rubric?.dimensions || {};
334
+ const dimensions = evalConfigLoader.getRubricDimensions();
224
335
 
225
336
  // Build dimension criteria text
226
337
  const dimensionCriteria = Object.entries(dimensions).map(([key, dim]) => {
@@ -233,7 +344,18 @@ Criteria:
233
344
  ${criteriaText}`;
234
345
  }).join('\n\n');
235
346
 
236
- return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.
347
+ // Build optional dialogue transcript section
348
+ const dialogueTranscript = formatDialogueTranscript(context.dialogueContext);
349
+ const dialogueSection = dialogueTranscript
350
+ ? `\n## DIALOGUE TRANSCRIPT
351
+
352
+ The following is the full learner-tutor exchange leading to this suggestion. Internal deliberation traces (ego/superego) show the reasoning process. Use this context to evaluate how well the tutor responded to the learner's actual engagement, struggle, and development.
353
+
354
+ ${dialogueTranscript}
355
+ `
356
+ : '';
357
+
358
+ return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.${dialogueTranscript ? ' The suggestion was produced in the context of a multi-turn dialogue — evaluate it in that context, considering how the tutor responds to the learner\'s actual engagement and development.' : ''}
237
359
 
238
360
  ## EVALUATION RUBRIC
239
361
 
@@ -254,7 +376,7 @@ ${dimensionCriteria}
254
376
 
255
377
  **Learner Context**:
256
378
  ${scenario.learnerContext || context.learnerContext || 'No context provided'}
257
-
379
+ ${dialogueSection}
258
380
  ## SUGGESTION TO EVALUATE
259
381
 
260
382
  \`\`\`json
@@ -271,30 +393,39 @@ ${(scenario.forbiddenElements || []).map(e => `- ${e}`).join('\n') || '- None sp
271
393
 
272
394
  ## YOUR TASK
273
395
 
274
- Evaluate the suggestion and provide:
275
- 1. A score (1-5) for each dimension with reasoning AND a direct quote from the suggestion that supports your assessment
396
+ Evaluate the suggestion${dialogueTranscript ? ' in the context of the dialogue above' : ''} and provide:
397
+ 1. A score (1-5) for each dimension with reasoning
276
398
  2. Whether it passes the required/forbidden element checks
277
399
  3. An overall score (weighted average, 0-100 scale)
278
400
 
279
401
  For each dimension, include:
280
402
  - **score**: 1-5 rating
281
- - **reasoning**: Brief explanation of why this score was given
282
- - **quote**: A short direct quote from the suggestion (title, message, or actionTarget) that exemplifies this dimension's score. Use "N/A" if no relevant quote exists.
403
+ - **reasoning**: Brief explanation of why this score was given${dialogueTranscript ? '. For recognition dimensions, consider how the tutor engaged with the learner\'s actual responses and development.' : ''}
404
+
405
+ CRITICAL JSON RULES:
406
+ - Never use unescaped double quotes inside JSON string values. Use single quotes or rephrase.
407
+ - Keep "reasoning" values under 25 words.
408
+ - BAD: "reasoning": "Says "great job" which is encouraging"
409
+ - GOOD: "reasoning": "Says 'great job' which is encouraging"
283
410
 
284
- Respond with ONLY a JSON object in this exact format:
411
+ Respond with ONLY a JSON object in this exact format (no other text before or after):
285
412
  \`\`\`json
286
413
  {
287
414
  "scores": {
288
- "relevance": {"score": 4, "reasoning": "Matches learner's idle state", "quote": "Take your time with this concept"},
289
- "specificity": {"score": 5, "reasoning": "Names exact lecture", "quote": "479-lecture-3"},
290
- "pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding", "quote": "Start with the basics before..."},
291
- "personalization": {"score": 3, "reasoning": "Generic advice", "quote": "N/A"},
292
- "actionability": {"score": 5, "reasoning": "Clear next step", "quote": "Click to continue to..."},
293
- "tone": {"score": 4, "reasoning": "Encouraging", "quote": "You're making great progress"},
294
- "mutual_recognition": {"score": 4, "reasoning": "Acknowledges learner's interpretation", "quote": "Your metaphor captures..."},
295
- "dialectical_responsiveness": {"score": 3, "reasoning": "Responds but doesn't create tension", "quote": "N/A"},
296
- "memory_integration": {"score": 4, "reasoning": "References previous session", "quote": "Building on your insight..."},
297
- "transformative_potential": {"score": 3, "reasoning": "Informative but not transformative", "quote": "N/A"}
415
+ "relevance": {"score": 4, "reasoning": "Matches idle state well"},
416
+ "specificity": {"score": 5, "reasoning": "Names exact lecture"},
417
+ "pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding"},
418
+ "personalization": {"score": 3, "reasoning": "Generic advice"},
419
+ "actionability": {"score": 5, "reasoning": "Clear next step"},
420
+ "tone": {"score": 4, "reasoning": "Encouraging tone"},
421
+ "mutual_recognition": {"score": 4, "reasoning": "Acknowledges interpretation"},
422
+ "dialectical_responsiveness": {"score": 3, "reasoning": "Responds without tension"},
423
+ "memory_integration": {"score": 4, "reasoning": "References prior session"},
424
+ "transformative_potential": {"score": 3, "reasoning": "Informative not transformative"},
425
+ "tutor_adaptation": {"score": 3, "reasoning": "Some adjustment to input"},
426
+ "learner_growth": {"score": 4, "reasoning": "Shows conceptual development"},
427
+ "productive_struggle": {"score": 4, "reasoning": "Sustains appropriate tension"},
428
+ "epistemic_honesty": {"score": 4, "reasoning": "Represents complexity fairly"}
298
429
  },
299
430
  "validation": {
300
431
  "passes_required": true,
@@ -310,10 +441,20 @@ Respond with ONLY a JSON object in this exact format:
310
441
 
311
442
  /**
312
443
  * Call the judge model (simple single-model approach)
444
+ *
445
+ * @param {string} prompt - The evaluation prompt
446
+ * @param {Object} [overrides] - Optional overrides (passed to getAvailableEvaluator)
313
447
  */
314
- async function callJudgeModel(prompt) {
315
- const evaluator = getAvailableEvaluator();
316
- const { provider, model, hyperparameters } = evaluator;
448
+ // Models/prefixes that support response_format: { type: "json_object" }
449
+ const JSON_MODE_PREFIXES = ['gpt-', 'deepseek-', 'claude-'];
450
+
451
+ function supportsJsonMode(model) {
452
+ return JSON_MODE_PREFIXES.some(prefix => model.startsWith(prefix));
453
+ }
454
+
455
+ async function callJudgeModel(prompt, overrides = {}) {
456
+ const judge = getAvailableJudge(overrides);
457
+ const { provider, model, hyperparameters } = judge;
317
458
  const temperature = hyperparameters?.temperature ?? 0.2;
318
459
  const maxTokens = hyperparameters?.max_tokens ?? 1500;
319
460
 
@@ -372,18 +513,24 @@ async function callJudgeModel(prompt) {
372
513
  const timeout = setTimeout(() => controller.abort(), 60000);
373
514
 
374
515
  try {
516
+ const body = {
517
+ model,
518
+ max_tokens: maxTokens,
519
+ temperature,
520
+ include_reasoning: false,
521
+ messages: [{ role: 'user', content: prompt }],
522
+ };
523
+ if (supportsJsonMode(model)) {
524
+ body.response_format = { type: 'json_object' };
525
+ }
526
+
375
527
  const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
376
528
  method: 'POST',
377
529
  headers: {
378
530
  'Content-Type': 'application/json',
379
531
  'Authorization': `Bearer ${apiKey}`,
380
532
  },
381
- body: JSON.stringify({
382
- model,
383
- max_tokens: maxTokens,
384
- temperature,
385
- messages: [{ role: 'user', content: prompt }],
386
- }),
533
+ body: JSON.stringify(body),
387
534
  signal: controller.signal,
388
535
  });
389
536
 
@@ -417,18 +564,23 @@ async function callJudgeModel(prompt) {
417
564
  const timeout = setTimeout(() => controller.abort(), 60000);
418
565
 
419
566
  try {
567
+ const body = {
568
+ model,
569
+ max_tokens: maxTokens,
570
+ temperature,
571
+ messages: [{ role: 'user', content: prompt }],
572
+ };
573
+ if (supportsJsonMode(model)) {
574
+ body.response_format = { type: 'json_object' };
575
+ }
576
+
420
577
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
421
578
  method: 'POST',
422
579
  headers: {
423
580
  'Content-Type': 'application/json',
424
581
  'Authorization': `Bearer ${apiKey}`,
425
582
  },
426
- body: JSON.stringify({
427
- model,
428
- max_tokens: maxTokens,
429
- temperature,
430
- messages: [{ role: 'user', content: prompt }],
431
- }),
583
+ body: JSON.stringify(body),
432
584
  signal: controller.signal,
433
585
  });
434
586
 
@@ -502,20 +654,155 @@ async function callJudgeModel(prompt) {
502
654
  throw new Error(`Unsupported judge provider: ${provider}`);
503
655
  }
504
656
 
657
+ /**
658
+ * Repair unescaped double quotes inside JSON string values.
659
+ * Targets patterns like: "key": "text with "inner" quotes"
660
+ * Replaces inner unescaped quotes with single quotes.
661
+ */
662
+ function repairUnescapedQuotes(jsonStr) {
663
+ // Strategy: walk through the string tracking whether we're inside a JSON string value.
664
+ // When we find a quote that isn't at a key/value boundary, replace it with a single quote.
665
+ let result = '';
666
+ let i = 0;
667
+ const len = jsonStr.length;
668
+
669
+ while (i < len) {
670
+ const ch = jsonStr[i];
671
+
672
+ if (ch === '"') {
673
+ // Find the matching close quote for this JSON string
674
+ result += '"';
675
+ i++;
676
+ // Scan for the true end of this string value
677
+ while (i < len) {
678
+ const c = jsonStr[i];
679
+ if (c === '\\') {
680
+ // Escaped character — pass through both chars
681
+ result += jsonStr[i] + (jsonStr[i + 1] || '');
682
+ i += 2;
683
+ continue;
684
+ }
685
+ if (c === '"') {
686
+ // Is this the real end of the string? Look ahead for JSON structure chars
687
+ const after = jsonStr.slice(i + 1).trimStart();
688
+ if (after[0] === ':' || after[0] === ',' || after[0] === '}' || after[0] === ']' || after.length === 0) {
689
+ // This is a real closing quote
690
+ result += '"';
691
+ i++;
692
+ break;
693
+ } else {
694
+ // This is an unescaped inner quote — replace with single quote
695
+ result += "'";
696
+ i++;
697
+ continue;
698
+ }
699
+ }
700
+ result += c;
701
+ i++;
702
+ }
703
+ } else {
704
+ result += ch;
705
+ i++;
706
+ }
707
+ }
708
+
709
+ return result;
710
+ }
711
+
712
+ /**
713
+ * Last-resort regex extraction of individual dimension scores.
714
+ * Returns a partial result object or null if too few scores found.
715
+ */
716
+ function regexScoreRescue(text) {
717
+ const dimensionNames = [
718
+ 'relevance', 'specificity', 'pedagogical_soundness', 'personalization',
719
+ 'actionability', 'tone', 'mutual_recognition', 'dialectical_responsiveness',
720
+ 'memory_integration', 'transformative_potential', 'tutor_adaptation',
721
+ 'learner_growth', 'productive_struggle', 'epistemic_honesty',
722
+ ];
723
+
724
+ const scores = {};
725
+ for (const dim of dimensionNames) {
726
+ // Match patterns like: "relevance": {"score": 4 or "relevance":{"score":4
727
+ const pattern = new RegExp(`"${dim}"\\s*:\\s*\\{?\\s*"?score"?\\s*:\\s*(\\d)`, 'i');
728
+ const match = text.match(pattern);
729
+ if (match) {
730
+ scores[dim] = { score: parseInt(match[1], 10), reasoning: null };
731
+ }
732
+ }
733
+
734
+ // Need at least 3 scores for a useful partial result
735
+ if (Object.keys(scores).length < 3) return null;
736
+
737
+ debugLog(`[rubricEvaluator] Regex rescue recovered ${Object.keys(scores).length} scores`);
738
+
739
+ // Try to extract overall_score and summary
740
+ const overallMatch = text.match(/"overall_score"\s*:\s*(\d+)/);
741
+ const summaryMatch = text.match(/"summary"\s*:\s*"([^"]+)"/);
742
+
743
+ return {
744
+ scores,
745
+ validation: { passes_required: true, required_missing: [], passes_forbidden: true, forbidden_found: [] },
746
+ overall_score: overallMatch ? parseInt(overallMatch[1], 10) : null,
747
+ summary: summaryMatch ? summaryMatch[1] : 'Partial scores recovered via regex rescue',
748
+ };
749
+ }
750
+
505
751
  /**
506
752
  * Parse the judge model's JSON response
507
753
  */
508
754
  function parseJudgeResponse(responseText) {
509
755
  // Extract JSON from response (may be wrapped in markdown code block)
510
- const jsonMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/) ||
511
- responseText.match(/\{[\s\S]*\}/);
756
+ let jsonMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
757
+
758
+ if (!jsonMatch) {
759
+ // Strip preamble/postamble text — find first { and last }
760
+ const firstBrace = responseText.indexOf('{');
761
+ const lastBrace = responseText.lastIndexOf('}');
762
+ if (firstBrace !== -1 && lastBrace > firstBrace) {
763
+ jsonMatch = [null, responseText.slice(firstBrace, lastBrace + 1)];
764
+ }
765
+ }
512
766
 
513
767
  if (!jsonMatch) {
514
768
  throw new Error('Could not parse judge response as JSON');
515
769
  }
516
770
 
517
771
  const jsonStr = jsonMatch[1] || jsonMatch[0];
518
- return JSON.parse(jsonStr);
772
+
773
+ try {
774
+ return JSON.parse(jsonStr);
775
+ } catch (e) {
776
+ // Try to fix common JSON issues: trailing commas, unescaped newlines in strings
777
+ const cleaned = jsonStr
778
+ .replace(/,\s*([}\]])/g, '$1') // trailing commas
779
+ .replace(/[\x00-\x1f]/g, m => // control chars in strings
780
+ m === '\n' ? '\\n' : m === '\t' ? '\\t' : m === '\r' ? '\\r' : '');
781
+ try {
782
+ return JSON.parse(cleaned);
783
+ } catch (e2) {
784
+ // Attempt JSON repair: fix unescaped double quotes inside string values
785
+ // Pattern: "key": "text with "inner" quotes" → "key": "text with 'inner' quotes"
786
+ debugLog('[rubricEvaluator] Attempting JSON repair for unescaped quotes...');
787
+ try {
788
+ const repaired = repairUnescapedQuotes(cleaned);
789
+ return JSON.parse(repaired);
790
+ } catch (e3) {
791
+ // Final fallback: use jsonrepair library which handles many more edge cases
792
+ debugLog('[rubricEvaluator] Attempting jsonrepair library fallback...');
793
+ try {
794
+ const robustRepaired = jsonrepair(jsonStr);
795
+ return JSON.parse(robustRepaired);
796
+ } catch (e4) {
797
+ // Last resort: regex rescue — extract individual scores
798
+ debugLog('[rubricEvaluator] Attempting regex score rescue...');
799
+ const rescued = regexScoreRescue(jsonStr);
800
+ if (rescued) return rescued;
801
+ throw new Error(`Could not parse judge response as JSON: initial=${e.message}, repair=${e3.message}, jsonrepair=${e4.message}`);
802
+ }
803
+ }
804
+ }
805
+ }
519
806
  }
520
807
 
521
808
  /**
@@ -524,15 +811,17 @@ function parseJudgeResponse(responseText) {
524
811
  * @param {Object} suggestion - The suggestion to evaluate
525
812
  * @param {Object} scenario - The test scenario
526
813
  * @param {Object} context - Additional context
814
+ * @param {Object} [overrides] - Optional overrides
815
+ * @param {Object} [overrides.judgeOverride] - Override judge model config
527
816
  * @returns {Promise<Object>} Evaluation result
528
817
  */
529
- export async function evaluateSuggestion(suggestion, scenario, context = {}) {
818
+ export async function evaluateSuggestion(suggestion, scenario, context = {}, overrides = {}) {
530
819
  const startTime = Date.now();
531
- const evaluator = getAvailableEvaluator();
820
+ const judge = getAvailableJudge(overrides);
532
821
 
533
822
  try {
534
823
  const prompt = buildEvaluationPrompt(suggestion, scenario, context);
535
- let responseText = await callJudgeModel(prompt);
824
+ let responseText = await callJudgeModel(prompt, overrides);
536
825
 
537
826
  // Log raw response for debugging
538
827
  debugLog('[rubricEvaluator] Judge raw response (first 300 chars):', responseText.slice(0, 300));
@@ -540,7 +829,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
540
829
  // Handle empty response - try fallback model
541
830
  if (!responseText || responseText.trim() === '') {
542
831
  console.warn('[rubricEvaluator] Primary judge returned empty response, trying fallback...');
543
- const fallbackConfig = getFallbackEvaluator();
832
+ const fallbackConfig = getFallbackJudge();
544
833
  if (fallbackConfig) {
545
834
  responseText = await callJudgeModelWithConfig(prompt, fallbackConfig);
546
835
  debugLog('[rubricEvaluator] Fallback response (first 300 chars):', responseText.slice(0, 300));
@@ -550,7 +839,35 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
550
839
  }
551
840
  }
552
841
 
553
- const parsed = parseJudgeResponse(responseText);
842
+ let parsed;
843
+ try {
844
+ parsed = parseJudgeResponse(responseText);
845
+ } catch (parseError) {
846
+ // JSON parse failed — retry with fallback model before giving up
847
+ console.warn(`[rubricEvaluator] Parse failed (${parseError.message}), retrying with fallback...`);
848
+ const fallbackConfig = getFallbackJudge();
849
+ if (fallbackConfig) {
850
+ let retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
851
+ if (retryText && retryText.trim()) {
852
+ try {
853
+ parsed = parseJudgeResponse(retryText);
854
+ } catch (retryParseError) {
855
+ // Second attempt: models are non-deterministic, retry once more
856
+ console.warn(`[rubricEvaluator] Fallback parse also failed (${retryParseError.message}), retrying once more...`);
857
+ retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
858
+ if (retryText && retryText.trim()) {
859
+ parsed = parseJudgeResponse(retryText);
860
+ } else {
861
+ throw retryParseError;
862
+ }
863
+ }
864
+ } else {
865
+ throw parseError;
866
+ }
867
+ } else {
868
+ throw parseError;
869
+ }
870
+ }
554
871
 
555
872
  // Debug: log what was parsed
556
873
  debugLog('[rubricEvaluator] Parsed keys:', Object.keys(parsed));
@@ -578,18 +895,16 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
578
895
 
579
896
  for (const [key, value] of Object.entries(parsed.scores || {})) {
580
897
  const normalizedKey = dimensionMap[key] || key;
581
- // Handle both {score, reasoning, quote} objects and plain numbers
898
+ // Handle both {score, reasoning} objects and plain numbers
582
899
  if (typeof value === 'object' && value !== null) {
583
900
  scores[normalizedKey] = {
584
901
  score: value.score,
585
902
  reasoning: value.reasoning,
586
- quote: value.quote || null,
587
903
  };
588
904
  } else if (typeof value === 'number') {
589
905
  scores[normalizedKey] = {
590
906
  score: value,
591
907
  reasoning: null,
592
- quote: null,
593
908
  };
594
909
  }
595
910
  }
@@ -607,19 +922,25 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
607
922
  success: true,
608
923
  scores,
609
924
  overallScore,
925
+ baseScore: calculateBaseScore(scores),
926
+ recognitionScore: calculateRecognitionScore(scores),
610
927
  passesRequired: parsed.validation?.passes_required ?? true,
611
928
  passesForbidden: parsed.validation?.passes_forbidden ?? true,
612
929
  requiredMissing: parsed.validation?.required_missing || [],
613
930
  forbiddenFound: parsed.validation?.forbidden_found || [],
614
931
  summary: parsed.summary,
615
- evaluatorModel: `${evaluator.provider}/${evaluator.model}`,
932
+ judgeModel: `${judge.provider}/${judge.model}`,
616
933
  evaluationTimeMs: Date.now() - startTime,
617
934
  };
618
935
  } catch (error) {
619
936
  return {
620
937
  success: false,
938
+ scores: {},
939
+ overallScore: null,
940
+ baseScore: null,
941
+ recognitionScore: null,
621
942
  error: error.message,
622
- evaluatorModel: `${evaluator.provider}/${evaluator.model}`,
943
+ judgeModel: `${judge.provider}/${judge.model}`,
623
944
  evaluationTimeMs: Date.now() - startTime,
624
945
  };
625
946
  }
@@ -628,18 +949,18 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
628
949
  /**
629
950
  * Evaluate multiple suggestions (batch)
630
951
  */
631
- export async function evaluateSuggestions(suggestions, scenario, context = {}) {
952
+ export async function evaluateSuggestions(suggestions, scenario, context = {}, overrides = {}) {
632
953
  const results = [];
633
954
 
634
955
  for (const suggestion of suggestions) {
635
- const result = await evaluateSuggestion(suggestion, scenario, context);
956
+ const result = await evaluateSuggestion(suggestion, scenario, context, overrides);
636
957
  results.push(result);
637
958
  }
638
959
 
639
960
  // Aggregate scores if multiple suggestions
640
961
  if (results.length > 0 && results[0].success) {
641
962
  const avgScores = {};
642
- const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
963
+ const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
643
964
 
644
965
  for (const dim of dimensions) {
645
966
  const scores = results
@@ -697,9 +1018,18 @@ export function quickValidate(suggestion, scenario) {
697
1018
  passesForbidden: true,
698
1019
  requiredMissing: [],
699
1020
  forbiddenFound: [],
1021
+ // Transformation marker analysis (for multi-turn scenarios)
1022
+ transformationMarkersFound: [],
1023
+ staticMarkersFound: [],
1024
+ learnerGrowthMarkersFound: [],
1025
+ learnerStaticMarkersFound: [],
1026
+ transformationScore: null,
1027
+ learnerGrowthScore: null,
1028
+ bilateralTransformationScore: null,
700
1029
  };
701
1030
 
702
1031
  // Check required elements (can appear anywhere including actionTarget, reasoning)
1032
+ // ALL elements in requiredElements must be present
703
1033
  for (const required of scenario.requiredElements || []) {
704
1034
  const normalizedRequired = required.toLowerCase();
705
1035
  const found = fullSuggestionText.includes(normalizedRequired) ||
@@ -713,6 +1043,23 @@ export function quickValidate(suggestion, scenario) {
713
1043
  }
714
1044
  }
715
1045
 
1046
+ // Check requiredElementsAny - ANY one of these must be present
1047
+ const anyElements = scenario.requiredElementsAny || [];
1048
+ if (anyElements.length > 0) {
1049
+ const anyFound = anyElements.some(required => {
1050
+ const normalizedRequired = required.toLowerCase();
1051
+ return fullSuggestionText.includes(normalizedRequired) ||
1052
+ (suggestion.actionTarget && suggestion.actionTarget.toLowerCase().includes(normalizedRequired)) ||
1053
+ (suggestion.title && suggestion.title.toLowerCase().includes(normalizedRequired)) ||
1054
+ (suggestion.message && suggestion.message.toLowerCase().includes(normalizedRequired));
1055
+ });
1056
+
1057
+ if (!anyFound) {
1058
+ result.passesRequired = false;
1059
+ result.requiredMissing.push(`one of: ${anyElements.join(', ')}`);
1060
+ }
1061
+ }
1062
+
716
1063
  // Check forbidden elements (only in user-facing text: title, message)
717
1064
  // The 'reasoning' field is internal and may legitimately reference context terms
718
1065
  for (const forbidden of scenario.forbiddenElements || []) {
@@ -723,15 +1070,121 @@ export function quickValidate(suggestion, scenario) {
723
1070
  }
724
1071
  }
725
1072
 
1073
+ // Check transformation markers (for multi-turn scenarios)
1074
+ const markers = scenario.transformationMarkers || scenario.transformation_markers;
1075
+ if (markers) {
1076
+ // Tutor evolving markers (in tutor response)
1077
+ const tutorEvolving = markers.tutor_evolving || markers.tutorEvolving || [];
1078
+ for (const marker of tutorEvolving) {
1079
+ if (userFacingText.includes(marker.toLowerCase())) {
1080
+ result.transformationMarkersFound.push(marker);
1081
+ }
1082
+ }
1083
+
1084
+ // Tutor static markers (in tutor response)
1085
+ const tutorStatic = markers.tutor_static || markers.tutorStatic || [];
1086
+ for (const marker of tutorStatic) {
1087
+ if (userFacingText.includes(marker.toLowerCase())) {
1088
+ result.staticMarkersFound.push(marker);
1089
+ }
1090
+ }
1091
+
1092
+ // Calculate tutor transformation score
1093
+ const tutorEvolvingCount = result.transformationMarkersFound.length;
1094
+ const tutorStaticCount = result.staticMarkersFound.length;
1095
+ const tutorTotal = tutorEvolvingCount + tutorStaticCount;
1096
+ if (tutorTotal > 0) {
1097
+ result.transformationScore = tutorEvolvingCount / tutorTotal;
1098
+ }
1099
+
1100
+ // Learner growth markers (these will typically be found in context/history, not suggestion)
1101
+ // Included for completeness when analyzing full dialogue
1102
+ const learnerEvolving = markers.learner_evolving || markers.learnerEvolving || [];
1103
+ const learnerStatic = markers.learner_static || markers.learnerStatic || [];
1104
+
1105
+ // Store marker definitions for use by turn analysis
1106
+ result._markerDefinitions = {
1107
+ tutorEvolving,
1108
+ tutorStatic,
1109
+ learnerEvolving,
1110
+ learnerStatic,
1111
+ };
1112
+ }
1113
+
726
1114
  return result;
727
1115
  }
728
1116
 
1117
+ // Dimension groups for dual scoring
1118
+ const BASE_DIMENSIONS = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
1119
+ const RECOGNITION_DIMENSIONS = ['mutual_recognition', 'dialectical_responsiveness', 'memory_integration', 'transformative_potential', 'tutor_adaptation', 'learner_growth'];
1120
+
1121
+ /**
1122
+ * Calculate base score from the 6 core pedagogical dimensions.
1123
+ * Weights are re-normalized to sum to 1.0 across only the base dimensions.
1124
+ *
1125
+ * @param {Object} scores - Scores object from evaluation
1126
+ * @returns {number} 0-100 score
1127
+ */
1128
+ export function calculateBaseScore(scores) {
1129
+ const dimensions = evalConfigLoader.getRubricDimensions();
1130
+ const keyMap = { pedagogical_soundness: 'pedagogical' };
1131
+
1132
+ let weightedSum = 0;
1133
+ let totalWeight = 0;
1134
+
1135
+ for (const [key, dim] of Object.entries(dimensions)) {
1136
+ const normalizedKey = keyMap[key] || key;
1137
+ if (!BASE_DIMENSIONS.includes(normalizedKey)) continue;
1138
+
1139
+ const scoreData = scores[normalizedKey] || scores[key];
1140
+ const score = scoreData?.score ?? scoreData;
1141
+
1142
+ if (typeof score === 'number') {
1143
+ weightedSum += score * (dim.weight || 0);
1144
+ totalWeight += dim.weight || 0;
1145
+ }
1146
+ }
1147
+
1148
+ if (totalWeight === 0) return 0;
1149
+ const avgScore = weightedSum / totalWeight;
1150
+ return ((avgScore - 1) / 4) * 100;
1151
+ }
1152
+
1153
+ /**
1154
+ * Calculate recognition score from the 4 recognition dimensions.
1155
+ * Weights are re-normalized to sum to 1.0 across only the recognition dimensions.
1156
+ *
1157
+ * @param {Object} scores - Scores object from evaluation
1158
+ * @returns {number} 0-100 score
1159
+ */
1160
+ export function calculateRecognitionScore(scores) {
1161
+ const dimensions = evalConfigLoader.getRubricDimensions();
1162
+
1163
+ let weightedSum = 0;
1164
+ let totalWeight = 0;
1165
+
1166
+ for (const [key, dim] of Object.entries(dimensions)) {
1167
+ if (!RECOGNITION_DIMENSIONS.includes(key)) continue;
1168
+
1169
+ const scoreData = scores[key];
1170
+ const score = scoreData?.score ?? scoreData;
1171
+
1172
+ if (typeof score === 'number') {
1173
+ weightedSum += score * (dim.weight || 0);
1174
+ totalWeight += dim.weight || 0;
1175
+ }
1176
+ }
1177
+
1178
+ if (totalWeight === 0) return 0;
1179
+ const avgScore = weightedSum / totalWeight;
1180
+ return ((avgScore - 1) / 4) * 100;
1181
+ }
1182
+
729
1183
  /**
730
1184
  * Calculate weighted overall score from dimension scores
731
1185
  */
732
1186
  export function calculateOverallScore(scores) {
733
- const rubric = tutorApi.loadRubric();
734
- const dimensions = rubric?.dimensions || {};
1187
+ const dimensions = evalConfigLoader.getRubricDimensions();
735
1188
 
736
1189
  // Map rubric keys to normalized score keys (pedagogical_soundness -> pedagogical)
737
1190
  const keyMap = {
@@ -773,6 +1226,8 @@ export function calculateRecognitionMetrics(scores) {
773
1226
  'dialectical_responsiveness',
774
1227
  'memory_integration',
775
1228
  'transformative_potential',
1229
+ 'tutor_adaptation',
1230
+ 'learner_growth',
776
1231
  ];
777
1232
 
778
1233
  const metrics = {
@@ -780,6 +1235,9 @@ export function calculateRecognitionMetrics(scores) {
780
1235
  transformationRate: false,
781
1236
  memoryUtilization: false,
782
1237
  mutualAcknowledgment: false,
1238
+ tutorAdaptation: false,
1239
+ learnerGrowth: false,
1240
+ bilateralTransformation: false,
783
1241
  dimensionScores: {},
784
1242
  hasRecognitionData: false,
785
1243
  };
@@ -806,9 +1264,18 @@ export function calculateRecognitionMetrics(scores) {
806
1264
  if (dim === 'mutual_recognition' && score >= 4) {
807
1265
  metrics.mutualAcknowledgment = true;
808
1266
  }
1267
+ if (dim === 'tutor_adaptation' && score >= 4) {
1268
+ metrics.tutorAdaptation = true;
1269
+ }
1270
+ if (dim === 'learner_growth' && score >= 4) {
1271
+ metrics.learnerGrowth = true;
1272
+ }
809
1273
  }
810
1274
  }
811
1275
 
1276
+ // Bilateral transformation: both tutor and learner show adaptation
1277
+ metrics.bilateralTransformation = metrics.tutorAdaptation && metrics.learnerGrowth;
1278
+
812
1279
  if (scoredCount > 0) {
813
1280
  metrics.recognitionScore = totalScore / scoredCount;
814
1281
  metrics.hasRecognitionData = true;
@@ -817,10 +1284,16 @@ export function calculateRecognitionMetrics(scores) {
817
1284
  return metrics;
818
1285
  }
819
1286
 
1287
// Also exposed as a named export (in addition to the default-export object
// below) so it can be imported directly.
export { buildEvaluationPrompt };

// Public surface of the rubric-evaluator module.
export default {
  evaluateSuggestion,
  evaluateSuggestions,
  quickValidate,
  calculateOverallScore,
  calculateBaseScore,
  calculateRecognitionScore,
  calculateRecognitionMetrics,
  getAvailableJudge,
  buildEvaluationPrompt,
};