@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
* Provider details are resolved from config/providers.yaml
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import
|
|
9
|
+
import * as evalConfigLoader from './evalConfigLoader.js';
|
|
10
|
+
import { jsonrepair } from 'jsonrepair';
|
|
10
11
|
|
|
11
12
|
// Debug logging helper - suppressed in transcript mode for clean output
|
|
12
13
|
function debugLog(...args) {
|
|
@@ -16,13 +17,41 @@ function debugLog(...args) {
|
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
/**
|
|
19
|
-
* Get available
|
|
20
|
+
* Get available judge configuration, resolving model references via providers.yaml
|
|
20
21
|
* Tries primary model first, then fallback if primary is not configured
|
|
22
|
+
*
|
|
23
|
+
* @param {Object} [overrides] - Optional judge override
|
|
24
|
+
* @param {Object} [overrides.judgeOverride] - Override judge model config
|
|
25
|
+
* @param {string} [overrides.judgeOverride.model] - Model reference (e.g. 'anthropic/claude-opus-4.5')
|
|
26
|
+
* @param {string} [overrides.judgeOverride.apiKeyEnv] - Env var name for API key
|
|
27
|
+
* @param {Object} [overrides.judgeOverride.hyperparameters] - Override hyperparameters
|
|
21
28
|
*/
|
|
22
|
-
function
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
29
|
+
export function getAvailableJudge(overrides = {}) {
|
|
30
|
+
const { judgeOverride } = overrides;
|
|
31
|
+
|
|
32
|
+
// If a judge override is provided, resolve and return it directly
|
|
33
|
+
if (judgeOverride?.model) {
|
|
34
|
+
try {
|
|
35
|
+
const resolved = evalConfigLoader.resolveModel(judgeOverride.model);
|
|
36
|
+
// Allow apiKeyEnv override
|
|
37
|
+
let apiKey = resolved.apiKey;
|
|
38
|
+
if (judgeOverride.apiKeyEnv) {
|
|
39
|
+
apiKey = process.env[judgeOverride.apiKeyEnv] || apiKey;
|
|
40
|
+
}
|
|
41
|
+
return {
|
|
42
|
+
provider: resolved.provider,
|
|
43
|
+
model: resolved.model,
|
|
44
|
+
apiKey,
|
|
45
|
+
baseUrl: resolved.baseUrl,
|
|
46
|
+
hyperparameters: judgeOverride.hyperparameters || {},
|
|
47
|
+
};
|
|
48
|
+
} catch (e) {
|
|
49
|
+
console.warn(`[rubricEvaluator] Failed to resolve judge override: ${e.message}, falling back to rubric config`);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const rubric = evalConfigLoader.loadRubric();
|
|
54
|
+
const evalConfig = rubric?.judge;
|
|
26
55
|
|
|
27
56
|
if (!evalConfig?.model) {
|
|
28
57
|
console.warn('[rubricEvaluator] No judge config in evaluation-rubric.yaml, using defaults');
|
|
@@ -35,7 +64,7 @@ function getAvailableEvaluator() {
|
|
|
35
64
|
|
|
36
65
|
// Try primary model
|
|
37
66
|
try {
|
|
38
|
-
const resolved =
|
|
67
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
39
68
|
if (resolved.isConfigured) {
|
|
40
69
|
return {
|
|
41
70
|
provider: resolved.provider,
|
|
@@ -46,15 +75,15 @@ function getAvailableEvaluator() {
|
|
|
46
75
|
};
|
|
47
76
|
}
|
|
48
77
|
} catch (e) {
|
|
49
|
-
console.warn(`[rubricEvaluator] Failed to resolve primary
|
|
78
|
+
console.warn(`[rubricEvaluator] Failed to resolve primary judge: ${e.message}`);
|
|
50
79
|
}
|
|
51
80
|
|
|
52
81
|
// Try fallback
|
|
53
82
|
if (evalConfig.fallback?.model) {
|
|
54
83
|
try {
|
|
55
|
-
const fallback =
|
|
84
|
+
const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
|
|
56
85
|
if (fallback.isConfigured) {
|
|
57
|
-
debugLog(`[rubricEvaluator] Using fallback
|
|
86
|
+
debugLog(`[rubricEvaluator] Using fallback judge: ${fallback.provider}/${fallback.model}`);
|
|
58
87
|
return {
|
|
59
88
|
provider: fallback.provider,
|
|
60
89
|
model: fallback.model,
|
|
@@ -64,12 +93,12 @@ function getAvailableEvaluator() {
|
|
|
64
93
|
};
|
|
65
94
|
}
|
|
66
95
|
} catch (e) {
|
|
67
|
-
console.warn(`[rubricEvaluator] Failed to resolve fallback
|
|
96
|
+
console.warn(`[rubricEvaluator] Failed to resolve fallback judge: ${e.message}`);
|
|
68
97
|
}
|
|
69
98
|
}
|
|
70
99
|
|
|
71
100
|
// Return primary anyway - will fail with helpful error
|
|
72
|
-
const resolved =
|
|
101
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
73
102
|
return {
|
|
74
103
|
provider: resolved.provider,
|
|
75
104
|
model: resolved.model,
|
|
@@ -78,17 +107,16 @@ function getAvailableEvaluator() {
|
|
|
78
107
|
}
|
|
79
108
|
|
|
80
109
|
/**
|
|
81
|
-
* Get the fallback
|
|
110
|
+
* Get the fallback judge config (if different from primary)
|
|
82
111
|
*/
|
|
83
|
-
function
|
|
84
|
-
const rubric =
|
|
85
|
-
|
|
86
|
-
const evalConfig = rubric?.judge || rubric?.evaluator;
|
|
112
|
+
function getFallbackJudge() {
|
|
113
|
+
const rubric = evalConfigLoader.loadRubric();
|
|
114
|
+
const evalConfig = rubric?.judge;
|
|
87
115
|
|
|
88
116
|
if (!evalConfig?.fallback?.model) return null;
|
|
89
117
|
|
|
90
118
|
try {
|
|
91
|
-
const fallback =
|
|
119
|
+
const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
|
|
92
120
|
if (fallback.isConfigured) {
|
|
93
121
|
return {
|
|
94
122
|
provider: fallback.provider,
|
|
@@ -135,6 +163,7 @@ async function callJudgeModelWithConfig(prompt, config) {
|
|
|
135
163
|
model,
|
|
136
164
|
max_tokens: maxTokens,
|
|
137
165
|
temperature,
|
|
166
|
+
include_reasoning: false,
|
|
138
167
|
messages: [{ role: 'user', content: prompt }],
|
|
139
168
|
}),
|
|
140
169
|
signal: controller.signal,
|
|
@@ -215,12 +244,94 @@ async function callJudgeModelWithConfig(prompt, config) {
|
|
|
215
244
|
}
|
|
216
245
|
}
|
|
217
246
|
|
|
247
|
+
/**
|
|
248
|
+
* Format a dialogue transcript for the judge prompt.
|
|
249
|
+
* Renders the conversation history and internal deliberation traces as
|
|
250
|
+
* a readable exchange so the judge can evaluate the suggestion in context.
|
|
251
|
+
*
|
|
252
|
+
* @param {Object} dialogueContext - Dialogue context from the evaluation runner
|
|
253
|
+
* @param {Array} dialogueContext.conversationHistory - Array of turn objects
|
|
254
|
+
* @param {Array} dialogueContext.dialogueTrace - Current turn's dialogue trace
|
|
255
|
+
* @param {Array} dialogueContext.consolidatedTrace - Full multi-turn consolidated trace
|
|
256
|
+
* @returns {string|null} Formatted transcript section, or null if no dialogue data
|
|
257
|
+
*/
|
|
258
|
+
function formatDialogueTranscript(dialogueContext) {
|
|
259
|
+
if (!dialogueContext) return null;
|
|
260
|
+
|
|
261
|
+
const { conversationHistory, dialogueTrace, consolidatedTrace } = dialogueContext;
|
|
262
|
+
|
|
263
|
+
// Use consolidatedTrace if available (richest source), otherwise fall back to conversationHistory
|
|
264
|
+
const trace = consolidatedTrace?.length > 0 ? consolidatedTrace : null;
|
|
265
|
+
const history = conversationHistory?.length > 0 ? conversationHistory : null;
|
|
266
|
+
|
|
267
|
+
if (!trace && !history) return null;
|
|
268
|
+
|
|
269
|
+
const lines = [];
|
|
270
|
+
|
|
271
|
+
if (trace) {
|
|
272
|
+
// Format from consolidated trace (includes internal deliberation)
|
|
273
|
+
let currentTurnIdx = -1;
|
|
274
|
+
for (const entry of trace) {
|
|
275
|
+
// Turn separator
|
|
276
|
+
if (entry.turnIndex !== undefined && entry.turnIndex !== currentTurnIdx) {
|
|
277
|
+
currentTurnIdx = entry.turnIndex;
|
|
278
|
+
lines.push(`\n--- Turn ${currentTurnIdx} ---`);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
if (entry.agent === 'user' && entry.action === 'turn_action') {
|
|
282
|
+
lines.push(`[Learner Action] ${entry.detail || entry.contextSummary}`);
|
|
283
|
+
} else if (entry.agent === 'learner_ego') {
|
|
284
|
+
lines.push(` (Learner Ego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
|
|
285
|
+
} else if (entry.agent === 'learner_superego') {
|
|
286
|
+
lines.push(` (Learner Superego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
|
|
287
|
+
} else if (entry.agent === 'learner_synthesis') {
|
|
288
|
+
lines.push(`[Learner] "${truncate(entry.detail || entry.contextSummary, 300)}"`);
|
|
289
|
+
} else if (entry.agent === 'ego' && entry.action === 'initial_draft') {
|
|
290
|
+
lines.push(` (Tutor Ego draft: ${truncate(entry.contextSummary || '', 150)})`);
|
|
291
|
+
} else if (entry.agent === 'superego') {
|
|
292
|
+
lines.push(` (Tutor Superego: ${truncate(entry.contextSummary || '', 150)})`);
|
|
293
|
+
} else if (entry.agent === 'ego' && (entry.action === 'revision' || entry.action === 'final_revision')) {
|
|
294
|
+
lines.push(`[Tutor] (revised after superego feedback)`);
|
|
295
|
+
} else if (entry.agent === 'user' && entry.action === 'final_output') {
|
|
296
|
+
lines.push(`[Tutor → Learner] Delivered ${entry.suggestionCount} suggestion(s)`);
|
|
297
|
+
} else if (entry.agent === 'ego') {
|
|
298
|
+
// Single-agent tutor response
|
|
299
|
+
lines.push(`[Tutor] ${truncate(entry.contextSummary || '', 200)}`);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
} else if (history) {
|
|
303
|
+
// Format from conversation history (less detail, no internal deliberation)
|
|
304
|
+
for (const turn of history) {
|
|
305
|
+
lines.push(`\n--- Turn ${turn.turnIndex} ---`);
|
|
306
|
+
if (turn.learnerMessage) {
|
|
307
|
+
lines.push(`[Learner] "${truncate(turn.learnerMessage, 300)}"`);
|
|
308
|
+
} else if (turn.learnerAction) {
|
|
309
|
+
lines.push(`[Learner Action] ${turn.learnerAction}`);
|
|
310
|
+
}
|
|
311
|
+
if (turn.suggestion) {
|
|
312
|
+
const msg = turn.suggestion.message || turn.suggestion.title || '';
|
|
313
|
+
lines.push(`[Tutor] "${truncate(msg, 300)}"`);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
return lines.join('\n');
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/**
|
|
322
|
+
* Truncate a string to maxLen characters, adding ellipsis if needed.
|
|
323
|
+
*/
|
|
324
|
+
function truncate(str, maxLen) {
|
|
325
|
+
if (!str) return '';
|
|
326
|
+
if (str.length <= maxLen) return str;
|
|
327
|
+
return str.slice(0, maxLen - 3) + '...';
|
|
328
|
+
}
|
|
329
|
+
|
|
218
330
|
/**
|
|
219
331
|
* Build the evaluation prompt for the judge model
|
|
220
332
|
*/
|
|
221
333
|
function buildEvaluationPrompt(suggestion, scenario, context) {
|
|
222
|
-
const
|
|
223
|
-
const dimensions = rubric?.dimensions || {};
|
|
334
|
+
const dimensions = evalConfigLoader.getRubricDimensions();
|
|
224
335
|
|
|
225
336
|
// Build dimension criteria text
|
|
226
337
|
const dimensionCriteria = Object.entries(dimensions).map(([key, dim]) => {
|
|
@@ -233,7 +344,18 @@ Criteria:
|
|
|
233
344
|
${criteriaText}`;
|
|
234
345
|
}).join('\n\n');
|
|
235
346
|
|
|
236
|
-
|
|
347
|
+
// Build optional dialogue transcript section
|
|
348
|
+
const dialogueTranscript = formatDialogueTranscript(context.dialogueContext);
|
|
349
|
+
const dialogueSection = dialogueTranscript
|
|
350
|
+
? `\n## DIALOGUE TRANSCRIPT
|
|
351
|
+
|
|
352
|
+
The following is the full learner-tutor exchange leading to this suggestion. Internal deliberation traces (ego/superego) show the reasoning process. Use this context to evaluate how well the tutor responded to the learner's actual engagement, struggle, and development.
|
|
353
|
+
|
|
354
|
+
${dialogueTranscript}
|
|
355
|
+
`
|
|
356
|
+
: '';
|
|
357
|
+
|
|
358
|
+
return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.${dialogueTranscript ? ' The suggestion was produced in the context of a multi-turn dialogue — evaluate it in that context, considering how the tutor responds to the learner\'s actual engagement and development.' : ''}
|
|
237
359
|
|
|
238
360
|
## EVALUATION RUBRIC
|
|
239
361
|
|
|
@@ -254,7 +376,7 @@ ${dimensionCriteria}
|
|
|
254
376
|
|
|
255
377
|
**Learner Context**:
|
|
256
378
|
${scenario.learnerContext || context.learnerContext || 'No context provided'}
|
|
257
|
-
|
|
379
|
+
${dialogueSection}
|
|
258
380
|
## SUGGESTION TO EVALUATE
|
|
259
381
|
|
|
260
382
|
\`\`\`json
|
|
@@ -271,30 +393,39 @@ ${(scenario.forbiddenElements || []).map(e => `- ${e}`).join('\n') || '- None sp
|
|
|
271
393
|
|
|
272
394
|
## YOUR TASK
|
|
273
395
|
|
|
274
|
-
Evaluate the suggestion and provide:
|
|
275
|
-
1. A score (1-5) for each dimension with reasoning
|
|
396
|
+
Evaluate the suggestion${dialogueTranscript ? ' in the context of the dialogue above' : ''} and provide:
|
|
397
|
+
1. A score (1-5) for each dimension with reasoning
|
|
276
398
|
2. Whether it passes the required/forbidden element checks
|
|
277
399
|
3. An overall score (weighted average, 0-100 scale)
|
|
278
400
|
|
|
279
401
|
For each dimension, include:
|
|
280
402
|
- **score**: 1-5 rating
|
|
281
|
-
- **reasoning**: Brief explanation of why this score was given
|
|
282
|
-
|
|
403
|
+
- **reasoning**: Brief explanation of why this score was given${dialogueTranscript ? '. For recognition dimensions, consider how the tutor engaged with the learner\'s actual responses and development.' : ''}
|
|
404
|
+
|
|
405
|
+
CRITICAL JSON RULES:
|
|
406
|
+
- Never use unescaped double quotes inside JSON string values. Use single quotes or rephrase.
|
|
407
|
+
- Keep "reasoning" values under 25 words.
|
|
408
|
+
- BAD: "reasoning": "Says "great job" which is encouraging"
|
|
409
|
+
- GOOD: "reasoning": "Says 'great job' which is encouraging"
|
|
283
410
|
|
|
284
|
-
Respond with ONLY a JSON object in this exact format:
|
|
411
|
+
Respond with ONLY a JSON object in this exact format (no other text before or after):
|
|
285
412
|
\`\`\`json
|
|
286
413
|
{
|
|
287
414
|
"scores": {
|
|
288
|
-
"relevance": {"score": 4, "reasoning": "Matches
|
|
289
|
-
"specificity": {"score": 5, "reasoning": "Names exact lecture"
|
|
290
|
-
"pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding"
|
|
291
|
-
"personalization": {"score": 3, "reasoning": "Generic advice"
|
|
292
|
-
"actionability": {"score": 5, "reasoning": "Clear next step"
|
|
293
|
-
"tone": {"score": 4, "reasoning": "Encouraging
|
|
294
|
-
"mutual_recognition": {"score": 4, "reasoning": "Acknowledges
|
|
295
|
-
"dialectical_responsiveness": {"score": 3, "reasoning": "Responds
|
|
296
|
-
"memory_integration": {"score": 4, "reasoning": "References
|
|
297
|
-
"transformative_potential": {"score": 3, "reasoning": "Informative
|
|
415
|
+
"relevance": {"score": 4, "reasoning": "Matches idle state well"},
|
|
416
|
+
"specificity": {"score": 5, "reasoning": "Names exact lecture"},
|
|
417
|
+
"pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding"},
|
|
418
|
+
"personalization": {"score": 3, "reasoning": "Generic advice"},
|
|
419
|
+
"actionability": {"score": 5, "reasoning": "Clear next step"},
|
|
420
|
+
"tone": {"score": 4, "reasoning": "Encouraging tone"},
|
|
421
|
+
"mutual_recognition": {"score": 4, "reasoning": "Acknowledges interpretation"},
|
|
422
|
+
"dialectical_responsiveness": {"score": 3, "reasoning": "Responds without tension"},
|
|
423
|
+
"memory_integration": {"score": 4, "reasoning": "References prior session"},
|
|
424
|
+
"transformative_potential": {"score": 3, "reasoning": "Informative not transformative"},
|
|
425
|
+
"tutor_adaptation": {"score": 3, "reasoning": "Some adjustment to input"},
|
|
426
|
+
"learner_growth": {"score": 4, "reasoning": "Shows conceptual development"},
|
|
427
|
+
"productive_struggle": {"score": 4, "reasoning": "Sustains appropriate tension"},
|
|
428
|
+
"epistemic_honesty": {"score": 4, "reasoning": "Represents complexity fairly"}
|
|
298
429
|
},
|
|
299
430
|
"validation": {
|
|
300
431
|
"passes_required": true,
|
|
@@ -310,10 +441,20 @@ Respond with ONLY a JSON object in this exact format:
|
|
|
310
441
|
|
|
311
442
|
/**
|
|
312
443
|
* Call the judge model (simple single-model approach)
|
|
444
|
+
*
|
|
445
|
+
* @param {string} prompt - The evaluation prompt
|
|
446
|
+
* @param {Object} [overrides] - Optional overrides (passed to getAvailableEvaluator)
|
|
313
447
|
*/
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
448
|
+
// Models/prefixes that support response_format: { type: "json_object" }
|
|
449
|
+
const JSON_MODE_PREFIXES = ['gpt-', 'deepseek-', 'claude-'];
|
|
450
|
+
|
|
451
|
+
function supportsJsonMode(model) {
|
|
452
|
+
return JSON_MODE_PREFIXES.some(prefix => model.startsWith(prefix));
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
async function callJudgeModel(prompt, overrides = {}) {
|
|
456
|
+
const judge = getAvailableJudge(overrides);
|
|
457
|
+
const { provider, model, hyperparameters } = judge;
|
|
317
458
|
const temperature = hyperparameters?.temperature ?? 0.2;
|
|
318
459
|
const maxTokens = hyperparameters?.max_tokens ?? 1500;
|
|
319
460
|
|
|
@@ -372,18 +513,24 @@ async function callJudgeModel(prompt) {
|
|
|
372
513
|
const timeout = setTimeout(() => controller.abort(), 60000);
|
|
373
514
|
|
|
374
515
|
try {
|
|
516
|
+
const body = {
|
|
517
|
+
model,
|
|
518
|
+
max_tokens: maxTokens,
|
|
519
|
+
temperature,
|
|
520
|
+
include_reasoning: false,
|
|
521
|
+
messages: [{ role: 'user', content: prompt }],
|
|
522
|
+
};
|
|
523
|
+
if (supportsJsonMode(model)) {
|
|
524
|
+
body.response_format = { type: 'json_object' };
|
|
525
|
+
}
|
|
526
|
+
|
|
375
527
|
const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
|
|
376
528
|
method: 'POST',
|
|
377
529
|
headers: {
|
|
378
530
|
'Content-Type': 'application/json',
|
|
379
531
|
'Authorization': `Bearer ${apiKey}`,
|
|
380
532
|
},
|
|
381
|
-
body: JSON.stringify(
|
|
382
|
-
model,
|
|
383
|
-
max_tokens: maxTokens,
|
|
384
|
-
temperature,
|
|
385
|
-
messages: [{ role: 'user', content: prompt }],
|
|
386
|
-
}),
|
|
533
|
+
body: JSON.stringify(body),
|
|
387
534
|
signal: controller.signal,
|
|
388
535
|
});
|
|
389
536
|
|
|
@@ -417,18 +564,23 @@ async function callJudgeModel(prompt) {
|
|
|
417
564
|
const timeout = setTimeout(() => controller.abort(), 60000);
|
|
418
565
|
|
|
419
566
|
try {
|
|
567
|
+
const body = {
|
|
568
|
+
model,
|
|
569
|
+
max_tokens: maxTokens,
|
|
570
|
+
temperature,
|
|
571
|
+
messages: [{ role: 'user', content: prompt }],
|
|
572
|
+
};
|
|
573
|
+
if (supportsJsonMode(model)) {
|
|
574
|
+
body.response_format = { type: 'json_object' };
|
|
575
|
+
}
|
|
576
|
+
|
|
420
577
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
421
578
|
method: 'POST',
|
|
422
579
|
headers: {
|
|
423
580
|
'Content-Type': 'application/json',
|
|
424
581
|
'Authorization': `Bearer ${apiKey}`,
|
|
425
582
|
},
|
|
426
|
-
body: JSON.stringify(
|
|
427
|
-
model,
|
|
428
|
-
max_tokens: maxTokens,
|
|
429
|
-
temperature,
|
|
430
|
-
messages: [{ role: 'user', content: prompt }],
|
|
431
|
-
}),
|
|
583
|
+
body: JSON.stringify(body),
|
|
432
584
|
signal: controller.signal,
|
|
433
585
|
});
|
|
434
586
|
|
|
@@ -502,20 +654,155 @@ async function callJudgeModel(prompt) {
|
|
|
502
654
|
throw new Error(`Unsupported judge provider: ${provider}`);
|
|
503
655
|
}
|
|
504
656
|
|
|
657
|
+
/**
|
|
658
|
+
* Repair unescaped double quotes inside JSON string values.
|
|
659
|
+
* Targets patterns like: "key": "text with "inner" quotes"
|
|
660
|
+
* Replaces inner unescaped quotes with single quotes.
|
|
661
|
+
*/
|
|
662
|
+
function repairUnescapedQuotes(jsonStr) {
|
|
663
|
+
// Strategy: walk through the string tracking whether we're inside a JSON string value.
|
|
664
|
+
// When we find a quote that isn't at a key/value boundary, replace it with a single quote.
|
|
665
|
+
let result = '';
|
|
666
|
+
let i = 0;
|
|
667
|
+
const len = jsonStr.length;
|
|
668
|
+
|
|
669
|
+
while (i < len) {
|
|
670
|
+
const ch = jsonStr[i];
|
|
671
|
+
|
|
672
|
+
if (ch === '"') {
|
|
673
|
+
// Find the matching close quote for this JSON string
|
|
674
|
+
result += '"';
|
|
675
|
+
i++;
|
|
676
|
+
// Scan for the true end of this string value
|
|
677
|
+
while (i < len) {
|
|
678
|
+
const c = jsonStr[i];
|
|
679
|
+
if (c === '\\') {
|
|
680
|
+
// Escaped character — pass through both chars
|
|
681
|
+
result += jsonStr[i] + (jsonStr[i + 1] || '');
|
|
682
|
+
i += 2;
|
|
683
|
+
continue;
|
|
684
|
+
}
|
|
685
|
+
if (c === '"') {
|
|
686
|
+
// Is this the real end of the string? Look ahead for JSON structure chars
|
|
687
|
+
const after = jsonStr.slice(i + 1).trimStart();
|
|
688
|
+
if (after[0] === ':' || after[0] === ',' || after[0] === '}' || after[0] === ']' || after.length === 0) {
|
|
689
|
+
// This is a real closing quote
|
|
690
|
+
result += '"';
|
|
691
|
+
i++;
|
|
692
|
+
break;
|
|
693
|
+
} else {
|
|
694
|
+
// This is an unescaped inner quote — replace with single quote
|
|
695
|
+
result += "'";
|
|
696
|
+
i++;
|
|
697
|
+
continue;
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
result += c;
|
|
701
|
+
i++;
|
|
702
|
+
}
|
|
703
|
+
} else {
|
|
704
|
+
result += ch;
|
|
705
|
+
i++;
|
|
706
|
+
}
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
return result;
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
/**
|
|
713
|
+
* Last-resort regex extraction of individual dimension scores.
|
|
714
|
+
* Returns a partial result object or null if too few scores found.
|
|
715
|
+
*/
|
|
716
|
+
function regexScoreRescue(text) {
|
|
717
|
+
const dimensionNames = [
|
|
718
|
+
'relevance', 'specificity', 'pedagogical_soundness', 'personalization',
|
|
719
|
+
'actionability', 'tone', 'mutual_recognition', 'dialectical_responsiveness',
|
|
720
|
+
'memory_integration', 'transformative_potential', 'tutor_adaptation',
|
|
721
|
+
'learner_growth', 'productive_struggle', 'epistemic_honesty',
|
|
722
|
+
];
|
|
723
|
+
|
|
724
|
+
const scores = {};
|
|
725
|
+
for (const dim of dimensionNames) {
|
|
726
|
+
// Match patterns like: "relevance": {"score": 4 or "relevance":{"score":4
|
|
727
|
+
const pattern = new RegExp(`"${dim}"\\s*:\\s*\\{?\\s*"?score"?\\s*:\\s*(\\d)`, 'i');
|
|
728
|
+
const match = text.match(pattern);
|
|
729
|
+
if (match) {
|
|
730
|
+
scores[dim] = { score: parseInt(match[1], 10), reasoning: null };
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
|
|
734
|
+
// Need at least 3 scores for a useful partial result
|
|
735
|
+
if (Object.keys(scores).length < 3) return null;
|
|
736
|
+
|
|
737
|
+
debugLog(`[rubricEvaluator] Regex rescue recovered ${Object.keys(scores).length} scores`);
|
|
738
|
+
|
|
739
|
+
// Try to extract overall_score and summary
|
|
740
|
+
const overallMatch = text.match(/"overall_score"\s*:\s*(\d+)/);
|
|
741
|
+
const summaryMatch = text.match(/"summary"\s*:\s*"([^"]+)"/);
|
|
742
|
+
|
|
743
|
+
return {
|
|
744
|
+
scores,
|
|
745
|
+
validation: { passes_required: true, required_missing: [], passes_forbidden: true, forbidden_found: [] },
|
|
746
|
+
overall_score: overallMatch ? parseInt(overallMatch[1], 10) : null,
|
|
747
|
+
summary: summaryMatch ? summaryMatch[1] : 'Partial scores recovered via regex rescue',
|
|
748
|
+
};
|
|
749
|
+
}
|
|
750
|
+
|
|
505
751
|
/**
|
|
506
752
|
* Parse the judge model's JSON response
|
|
507
753
|
*/
|
|
508
754
|
function parseJudgeResponse(responseText) {
|
|
509
755
|
// Extract JSON from response (may be wrapped in markdown code block)
|
|
510
|
-
|
|
511
|
-
|
|
756
|
+
let jsonMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
757
|
+
|
|
758
|
+
if (!jsonMatch) {
|
|
759
|
+
// Strip preamble/postamble text — find first { and last }
|
|
760
|
+
const firstBrace = responseText.indexOf('{');
|
|
761
|
+
const lastBrace = responseText.lastIndexOf('}');
|
|
762
|
+
if (firstBrace !== -1 && lastBrace > firstBrace) {
|
|
763
|
+
jsonMatch = [null, responseText.slice(firstBrace, lastBrace + 1)];
|
|
764
|
+
}
|
|
765
|
+
}
|
|
512
766
|
|
|
513
767
|
if (!jsonMatch) {
|
|
514
768
|
throw new Error('Could not parse judge response as JSON');
|
|
515
769
|
}
|
|
516
770
|
|
|
517
771
|
const jsonStr = jsonMatch[1] || jsonMatch[0];
|
|
518
|
-
|
|
772
|
+
|
|
773
|
+
try {
|
|
774
|
+
return JSON.parse(jsonStr);
|
|
775
|
+
} catch (e) {
|
|
776
|
+
// Try to fix common JSON issues: trailing commas, unescaped newlines in strings
|
|
777
|
+
const cleaned = jsonStr
|
|
778
|
+
.replace(/,\s*([}\]])/g, '$1') // trailing commas
|
|
779
|
+
.replace(/[\x00-\x1f]/g, m => // control chars in strings
|
|
780
|
+
m === '\n' ? '\\n' : m === '\t' ? '\\t' : m === '\r' ? '\\r' : '');
|
|
781
|
+
try {
|
|
782
|
+
return JSON.parse(cleaned);
|
|
783
|
+
} catch (e2) {
|
|
784
|
+
// Attempt JSON repair: fix unescaped double quotes inside string values
|
|
785
|
+
// Pattern: "key": "text with "inner" quotes" → "key": "text with 'inner' quotes"
|
|
786
|
+
debugLog('[rubricEvaluator] Attempting JSON repair for unescaped quotes...');
|
|
787
|
+
try {
|
|
788
|
+
const repaired = repairUnescapedQuotes(cleaned);
|
|
789
|
+
return JSON.parse(repaired);
|
|
790
|
+
} catch (e3) {
|
|
791
|
+
// Final fallback: use jsonrepair library which handles many more edge cases
|
|
792
|
+
debugLog('[rubricEvaluator] Attempting jsonrepair library fallback...');
|
|
793
|
+
try {
|
|
794
|
+
const robustRepaired = jsonrepair(jsonStr);
|
|
795
|
+
return JSON.parse(robustRepaired);
|
|
796
|
+
} catch (e4) {
|
|
797
|
+
// Last resort: regex rescue — extract individual scores
|
|
798
|
+
debugLog('[rubricEvaluator] Attempting regex score rescue...');
|
|
799
|
+
const rescued = regexScoreRescue(jsonStr);
|
|
800
|
+
if (rescued) return rescued;
|
|
801
|
+
throw new Error(`Could not parse judge response as JSON: initial=${e.message}, repair=${e3.message}, jsonrepair=${e4.message}`);
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
}
|
|
519
806
|
}
|
|
520
807
|
|
|
521
808
|
/**
|
|
@@ -524,15 +811,17 @@ function parseJudgeResponse(responseText) {
|
|
|
524
811
|
* @param {Object} suggestion - The suggestion to evaluate
|
|
525
812
|
* @param {Object} scenario - The test scenario
|
|
526
813
|
* @param {Object} context - Additional context
|
|
814
|
+
* @param {Object} [overrides] - Optional overrides
|
|
815
|
+
* @param {Object} [overrides.judgeOverride] - Override judge model config
|
|
527
816
|
* @returns {Promise<Object>} Evaluation result
|
|
528
817
|
*/
|
|
529
|
-
export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
818
|
+
export async function evaluateSuggestion(suggestion, scenario, context = {}, overrides = {}) {
|
|
530
819
|
const startTime = Date.now();
|
|
531
|
-
const
|
|
820
|
+
const judge = getAvailableJudge(overrides);
|
|
532
821
|
|
|
533
822
|
try {
|
|
534
823
|
const prompt = buildEvaluationPrompt(suggestion, scenario, context);
|
|
535
|
-
let responseText = await callJudgeModel(prompt);
|
|
824
|
+
let responseText = await callJudgeModel(prompt, overrides);
|
|
536
825
|
|
|
537
826
|
// Log raw response for debugging
|
|
538
827
|
debugLog('[rubricEvaluator] Judge raw response (first 300 chars):', responseText.slice(0, 300));
|
|
@@ -540,7 +829,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
540
829
|
// Handle empty response - try fallback model
|
|
541
830
|
if (!responseText || responseText.trim() === '') {
|
|
542
831
|
console.warn('[rubricEvaluator] Primary judge returned empty response, trying fallback...');
|
|
543
|
-
const fallbackConfig =
|
|
832
|
+
const fallbackConfig = getFallbackJudge();
|
|
544
833
|
if (fallbackConfig) {
|
|
545
834
|
responseText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
546
835
|
debugLog('[rubricEvaluator] Fallback response (first 300 chars):', responseText.slice(0, 300));
|
|
@@ -550,7 +839,35 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
550
839
|
}
|
|
551
840
|
}
|
|
552
841
|
|
|
553
|
-
|
|
842
|
+
let parsed;
|
|
843
|
+
try {
|
|
844
|
+
parsed = parseJudgeResponse(responseText);
|
|
845
|
+
} catch (parseError) {
|
|
846
|
+
// JSON parse failed — retry with fallback model before giving up
|
|
847
|
+
console.warn(`[rubricEvaluator] Parse failed (${parseError.message}), retrying with fallback...`);
|
|
848
|
+
const fallbackConfig = getFallbackJudge();
|
|
849
|
+
if (fallbackConfig) {
|
|
850
|
+
let retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
851
|
+
if (retryText && retryText.trim()) {
|
|
852
|
+
try {
|
|
853
|
+
parsed = parseJudgeResponse(retryText);
|
|
854
|
+
} catch (retryParseError) {
|
|
855
|
+
// Second attempt: models are non-deterministic, retry once more
|
|
856
|
+
console.warn(`[rubricEvaluator] Fallback parse also failed (${retryParseError.message}), retrying once more...`);
|
|
857
|
+
retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
858
|
+
if (retryText && retryText.trim()) {
|
|
859
|
+
parsed = parseJudgeResponse(retryText);
|
|
860
|
+
} else {
|
|
861
|
+
throw retryParseError;
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
} else {
|
|
865
|
+
throw parseError;
|
|
866
|
+
}
|
|
867
|
+
} else {
|
|
868
|
+
throw parseError;
|
|
869
|
+
}
|
|
870
|
+
}
|
|
554
871
|
|
|
555
872
|
// Debug: log what was parsed
|
|
556
873
|
debugLog('[rubricEvaluator] Parsed keys:', Object.keys(parsed));
|
|
@@ -578,18 +895,16 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
578
895
|
|
|
579
896
|
for (const [key, value] of Object.entries(parsed.scores || {})) {
|
|
580
897
|
const normalizedKey = dimensionMap[key] || key;
|
|
581
|
-
// Handle both {score, reasoning
|
|
898
|
+
// Handle both {score, reasoning} objects and plain numbers
|
|
582
899
|
if (typeof value === 'object' && value !== null) {
|
|
583
900
|
scores[normalizedKey] = {
|
|
584
901
|
score: value.score,
|
|
585
902
|
reasoning: value.reasoning,
|
|
586
|
-
quote: value.quote || null,
|
|
587
903
|
};
|
|
588
904
|
} else if (typeof value === 'number') {
|
|
589
905
|
scores[normalizedKey] = {
|
|
590
906
|
score: value,
|
|
591
907
|
reasoning: null,
|
|
592
|
-
quote: null,
|
|
593
908
|
};
|
|
594
909
|
}
|
|
595
910
|
}
|
|
@@ -607,19 +922,25 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
607
922
|
success: true,
|
|
608
923
|
scores,
|
|
609
924
|
overallScore,
|
|
925
|
+
baseScore: calculateBaseScore(scores),
|
|
926
|
+
recognitionScore: calculateRecognitionScore(scores),
|
|
610
927
|
passesRequired: parsed.validation?.passes_required ?? true,
|
|
611
928
|
passesForbidden: parsed.validation?.passes_forbidden ?? true,
|
|
612
929
|
requiredMissing: parsed.validation?.required_missing || [],
|
|
613
930
|
forbiddenFound: parsed.validation?.forbidden_found || [],
|
|
614
931
|
summary: parsed.summary,
|
|
615
|
-
|
|
932
|
+
judgeModel: `${judge.provider}/${judge.model}`,
|
|
616
933
|
evaluationTimeMs: Date.now() - startTime,
|
|
617
934
|
};
|
|
618
935
|
} catch (error) {
|
|
619
936
|
return {
|
|
620
937
|
success: false,
|
|
938
|
+
scores: {},
|
|
939
|
+
overallScore: null,
|
|
940
|
+
baseScore: null,
|
|
941
|
+
recognitionScore: null,
|
|
621
942
|
error: error.message,
|
|
622
|
-
|
|
943
|
+
judgeModel: `${judge.provider}/${judge.model}`,
|
|
623
944
|
evaluationTimeMs: Date.now() - startTime,
|
|
624
945
|
};
|
|
625
946
|
}
|
|
@@ -628,18 +949,18 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
628
949
|
/**
|
|
629
950
|
* Evaluate multiple suggestions (batch)
|
|
630
951
|
*/
|
|
631
|
-
export async function evaluateSuggestions(suggestions, scenario, context = {}) {
|
|
952
|
+
export async function evaluateSuggestions(suggestions, scenario, context = {}, overrides = {}) {
|
|
632
953
|
const results = [];
|
|
633
954
|
|
|
634
955
|
for (const suggestion of suggestions) {
|
|
635
|
-
const result = await evaluateSuggestion(suggestion, scenario, context);
|
|
956
|
+
const result = await evaluateSuggestion(suggestion, scenario, context, overrides);
|
|
636
957
|
results.push(result);
|
|
637
958
|
}
|
|
638
959
|
|
|
639
960
|
// Aggregate scores if multiple suggestions
|
|
640
961
|
if (results.length > 0 && results[0].success) {
|
|
641
962
|
const avgScores = {};
|
|
642
|
-
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
963
|
+
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
|
|
643
964
|
|
|
644
965
|
for (const dim of dimensions) {
|
|
645
966
|
const scores = results
|
|
@@ -697,9 +1018,18 @@ export function quickValidate(suggestion, scenario) {
|
|
|
697
1018
|
passesForbidden: true,
|
|
698
1019
|
requiredMissing: [],
|
|
699
1020
|
forbiddenFound: [],
|
|
1021
|
+
// Transformation marker analysis (for multi-turn scenarios)
|
|
1022
|
+
transformationMarkersFound: [],
|
|
1023
|
+
staticMarkersFound: [],
|
|
1024
|
+
learnerGrowthMarkersFound: [],
|
|
1025
|
+
learnerStaticMarkersFound: [],
|
|
1026
|
+
transformationScore: null,
|
|
1027
|
+
learnerGrowthScore: null,
|
|
1028
|
+
bilateralTransformationScore: null,
|
|
700
1029
|
};
|
|
701
1030
|
|
|
702
1031
|
// Check required elements (can appear anywhere including actionTarget, reasoning)
|
|
1032
|
+
// ALL elements in requiredElements must be present
|
|
703
1033
|
for (const required of scenario.requiredElements || []) {
|
|
704
1034
|
const normalizedRequired = required.toLowerCase();
|
|
705
1035
|
const found = fullSuggestionText.includes(normalizedRequired) ||
|
|
@@ -713,6 +1043,23 @@ export function quickValidate(suggestion, scenario) {
|
|
|
713
1043
|
}
|
|
714
1044
|
}
|
|
715
1045
|
|
|
1046
|
+
// Check requiredElementsAny - ANY one of these must be present
|
|
1047
|
+
const anyElements = scenario.requiredElementsAny || [];
|
|
1048
|
+
if (anyElements.length > 0) {
|
|
1049
|
+
const anyFound = anyElements.some(required => {
|
|
1050
|
+
const normalizedRequired = required.toLowerCase();
|
|
1051
|
+
return fullSuggestionText.includes(normalizedRequired) ||
|
|
1052
|
+
(suggestion.actionTarget && suggestion.actionTarget.toLowerCase().includes(normalizedRequired)) ||
|
|
1053
|
+
(suggestion.title && suggestion.title.toLowerCase().includes(normalizedRequired)) ||
|
|
1054
|
+
(suggestion.message && suggestion.message.toLowerCase().includes(normalizedRequired));
|
|
1055
|
+
});
|
|
1056
|
+
|
|
1057
|
+
if (!anyFound) {
|
|
1058
|
+
result.passesRequired = false;
|
|
1059
|
+
result.requiredMissing.push(`one of: ${anyElements.join(', ')}`);
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
|
|
716
1063
|
// Check forbidden elements (only in user-facing text: title, message)
|
|
717
1064
|
// The 'reasoning' field is internal and may legitimately reference context terms
|
|
718
1065
|
for (const forbidden of scenario.forbiddenElements || []) {
|
|
@@ -723,15 +1070,121 @@ export function quickValidate(suggestion, scenario) {
|
|
|
723
1070
|
}
|
|
724
1071
|
}
|
|
725
1072
|
|
|
1073
|
+
// Check transformation markers (for multi-turn scenarios)
|
|
1074
|
+
const markers = scenario.transformationMarkers || scenario.transformation_markers;
|
|
1075
|
+
if (markers) {
|
|
1076
|
+
// Tutor evolving markers (in tutor response)
|
|
1077
|
+
const tutorEvolving = markers.tutor_evolving || markers.tutorEvolving || [];
|
|
1078
|
+
for (const marker of tutorEvolving) {
|
|
1079
|
+
if (userFacingText.includes(marker.toLowerCase())) {
|
|
1080
|
+
result.transformationMarkersFound.push(marker);
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
|
|
1084
|
+
// Tutor static markers (in tutor response)
|
|
1085
|
+
const tutorStatic = markers.tutor_static || markers.tutorStatic || [];
|
|
1086
|
+
for (const marker of tutorStatic) {
|
|
1087
|
+
if (userFacingText.includes(marker.toLowerCase())) {
|
|
1088
|
+
result.staticMarkersFound.push(marker);
|
|
1089
|
+
}
|
|
1090
|
+
}
|
|
1091
|
+
|
|
1092
|
+
// Calculate tutor transformation score
|
|
1093
|
+
const tutorEvolvingCount = result.transformationMarkersFound.length;
|
|
1094
|
+
const tutorStaticCount = result.staticMarkersFound.length;
|
|
1095
|
+
const tutorTotal = tutorEvolvingCount + tutorStaticCount;
|
|
1096
|
+
if (tutorTotal > 0) {
|
|
1097
|
+
result.transformationScore = tutorEvolvingCount / tutorTotal;
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
// Learner growth markers (these will typically be found in context/history, not suggestion)
|
|
1101
|
+
// Included for completeness when analyzing full dialogue
|
|
1102
|
+
const learnerEvolving = markers.learner_evolving || markers.learnerEvolving || [];
|
|
1103
|
+
const learnerStatic = markers.learner_static || markers.learnerStatic || [];
|
|
1104
|
+
|
|
1105
|
+
// Store marker definitions for use by turn analysis
|
|
1106
|
+
result._markerDefinitions = {
|
|
1107
|
+
tutorEvolving,
|
|
1108
|
+
tutorStatic,
|
|
1109
|
+
learnerEvolving,
|
|
1110
|
+
learnerStatic,
|
|
1111
|
+
};
|
|
1112
|
+
}
|
|
1113
|
+
|
|
726
1114
|
return result;
|
|
727
1115
|
}
|
|
728
1116
|
|
|
1117
|
+
// Dimension groups for dual scoring.
// BASE_DIMENSIONS: the 8 core pedagogical dimensions used by calculateBaseScore
// (6 original dimensions plus productive_struggle and epistemic_honesty).
// RECOGNITION_DIMENSIONS: the 6 recognition-theoretic dimensions used by
// calculateRecognitionScore (4 original plus tutor_adaptation and learner_growth).
// Keys here are the *normalized* score keys (e.g. 'pedagogical', not the rubric's
// 'pedagogical_soundness' — see the keyMap in calculateBaseScore).
const BASE_DIMENSIONS = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
const RECOGNITION_DIMENSIONS = ['mutual_recognition', 'dialectical_responsiveness', 'memory_integration', 'transformative_potential', 'tutor_adaptation', 'learner_growth'];
|
|
1120
|
+
|
|
1121
|
+
/**
 * Calculate the base score from the core pedagogical dimensions
 * (the 8 entries of BASE_DIMENSIONS, including productive_struggle and
 * epistemic_honesty — the doc previously said "6", which was stale).
 * Weights come from the rubric config and are re-normalized so they sum
 * to 1.0 across only the base dimensions that actually carry a numeric score.
 *
 * @param {Object} scores - Scores object from evaluation; each value may be a
 *   plain number (1-5) or a {score, reasoning} object.
 * @returns {number} 0-100 score; 0 when no base dimension has a numeric score.
 */
export function calculateBaseScore(scores) {
  const dimensions = evalConfigLoader.getRubricDimensions();
  // Rubric config uses 'pedagogical_soundness'; normalized scores use 'pedagogical'.
  const keyMap = { pedagogical_soundness: 'pedagogical' };

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dimensions)) {
    const normalizedKey = keyMap[key] || key;
    if (!BASE_DIMENSIONS.includes(normalizedKey)) continue;

    // Accept scores keyed by either the normalized or the raw rubric key.
    const scoreData = scores[normalizedKey] || scores[key];
    // Unwrap {score, reasoning} objects; plain numbers pass through.
    const score = scoreData?.score ?? scoreData;

    if (typeof score === 'number') {
      weightedSum += score * (dim.weight || 0);
      totalWeight += dim.weight || 0;
    }
  }

  // No scored base dimensions (or all weights zero): avoid division by zero.
  if (totalWeight === 0) return 0;
  const avgScore = weightedSum / totalWeight;
  // Map the 1-5 rubric scale linearly onto 0-100.
  return ((avgScore - 1) / 4) * 100;
}
|
|
1152
|
+
|
|
1153
|
+
/**
 * Calculate the recognition score from the recognition-theoretic dimensions
 * (the 6 entries of RECOGNITION_DIMENSIONS, including tutor_adaptation and
 * learner_growth — the doc previously said "4", which was stale).
 * Weights come from the rubric config and are re-normalized so they sum
 * to 1.0 across only the recognition dimensions that carry a numeric score.
 *
 * @param {Object} scores - Scores object from evaluation; each value may be a
 *   plain number (1-5) or a {score, reasoning} object.
 * @returns {number} 0-100 score; 0 when no recognition dimension has a numeric score.
 */
export function calculateRecognitionScore(scores) {
  const dimensions = evalConfigLoader.getRubricDimensions();

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dimensions)) {
    // Recognition dimension keys need no normalization (unlike pedagogical_soundness).
    if (!RECOGNITION_DIMENSIONS.includes(key)) continue;

    const scoreData = scores[key];
    // Unwrap {score, reasoning} objects; plain numbers pass through.
    const score = scoreData?.score ?? scoreData;

    if (typeof score === 'number') {
      weightedSum += score * (dim.weight || 0);
      totalWeight += dim.weight || 0;
    }
  }

  // No scored recognition dimensions (or all weights zero): avoid division by zero.
  if (totalWeight === 0) return 0;
  const avgScore = weightedSum / totalWeight;
  // Map the 1-5 rubric scale linearly onto 0-100.
  return ((avgScore - 1) / 4) * 100;
}
|
|
1182
|
+
|
|
729
1183
|
/**
|
|
730
1184
|
* Calculate weighted overall score from dimension scores
|
|
731
1185
|
*/
|
|
732
1186
|
export function calculateOverallScore(scores) {
|
|
733
|
-
const
|
|
734
|
-
const dimensions = rubric?.dimensions || {};
|
|
1187
|
+
const dimensions = evalConfigLoader.getRubricDimensions();
|
|
735
1188
|
|
|
736
1189
|
// Map rubric keys to normalized score keys (pedagogical_soundness -> pedagogical)
|
|
737
1190
|
const keyMap = {
|
|
@@ -773,6 +1226,8 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
773
1226
|
'dialectical_responsiveness',
|
|
774
1227
|
'memory_integration',
|
|
775
1228
|
'transformative_potential',
|
|
1229
|
+
'tutor_adaptation',
|
|
1230
|
+
'learner_growth',
|
|
776
1231
|
];
|
|
777
1232
|
|
|
778
1233
|
const metrics = {
|
|
@@ -780,6 +1235,9 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
780
1235
|
transformationRate: false,
|
|
781
1236
|
memoryUtilization: false,
|
|
782
1237
|
mutualAcknowledgment: false,
|
|
1238
|
+
tutorAdaptation: false,
|
|
1239
|
+
learnerGrowth: false,
|
|
1240
|
+
bilateralTransformation: false,
|
|
783
1241
|
dimensionScores: {},
|
|
784
1242
|
hasRecognitionData: false,
|
|
785
1243
|
};
|
|
@@ -806,9 +1264,18 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
806
1264
|
if (dim === 'mutual_recognition' && score >= 4) {
|
|
807
1265
|
metrics.mutualAcknowledgment = true;
|
|
808
1266
|
}
|
|
1267
|
+
if (dim === 'tutor_adaptation' && score >= 4) {
|
|
1268
|
+
metrics.tutorAdaptation = true;
|
|
1269
|
+
}
|
|
1270
|
+
if (dim === 'learner_growth' && score >= 4) {
|
|
1271
|
+
metrics.learnerGrowth = true;
|
|
1272
|
+
}
|
|
809
1273
|
}
|
|
810
1274
|
}
|
|
811
1275
|
|
|
1276
|
+
// Bilateral transformation: both tutor and learner show adaptation
|
|
1277
|
+
metrics.bilateralTransformation = metrics.tutorAdaptation && metrics.learnerGrowth;
|
|
1278
|
+
|
|
812
1279
|
if (scoredCount > 0) {
|
|
813
1280
|
metrics.recognitionScore = totalScore / scoredCount;
|
|
814
1281
|
metrics.hasRecognitionData = true;
|
|
@@ -817,10 +1284,16 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
817
1284
|
return metrics;
|
|
818
1285
|
}
|
|
819
1286
|
|
|
1287
|
+
// Named export so tests/CLI tools can build the judge prompt without evaluating.
export { buildEvaluationPrompt };

// Default export: the public surface of this evaluator module.
export default {
  evaluateSuggestion,
  evaluateSuggestions,
  quickValidate,
  calculateOverallScore,
  calculateBaseScore,
  calculateRecognitionScore,
  calculateRecognitionMetrics,
  getAvailableJudge,
  buildEvaluationPrompt,
};
|